Update snappy to 1.1.3 release

COUCHDB-2873
diff --git a/c_src/snappy/NEWS b/c_src/snappy/NEWS
index d514787..4eb7a1d 100644
--- a/c_src/snappy/NEWS
+++ b/c_src/snappy/NEWS
@@ -1,3 +1,97 @@
+Snappy v1.1.3, July 6th 2015:
+
+This is the first release to be done from GitHub, which means that
+some minor things like the ChangeLog format have changed (git log
+format instead of svn log).
+
+  * Add support for Uncompress() from a Source to a Sink.
+
+  * Various minor changes to improve MSVC support; in particular,
+    the unit tests now compile and run under MSVC.
+
+
+Snappy v1.1.2, February 28th 2014:
+
+This is a maintenance release with no changes to the actual library
+source code.
+
+  * Stop distributing benchmark data files that have unclear
+    or unsuitable licensing.
+
+  * Add support for padding chunks in the framing format.
+
+
+Snappy v1.1.1, October 15th 2013:
+
+  * Add support for uncompressing to iovecs (scatter I/O).
+    The bulk of this patch was contributed by Mohit Aron.
+
+  * Speed up decompression by ~2%; much more so (~13-20%) on
+    a few benchmarks on given compilers and CPUs.
+
+  * Fix a few issues with MSVC compilation.
+
+  * Support truncated test data in the benchmark.
+
+
+Snappy v1.1.0, January 18th 2013:
+
+  * Snappy now uses a 64 kB block size instead of 32 kB. On average,
+    this means it compresses about 3% denser (more so for some
+    inputs), at the same or better speeds.
+
+  * libsnappy no longer depends on iostream.
+
+  * Some small performance improvements in compression on x86
+    (0.5-1%).
+
+  * Various portability fixes for ARM-based platforms, for MSVC,
+    and for GNU/Hurd.
+
+
+Snappy v1.0.5, February 24th 2012:
+
+  * More speed improvements. Exactly how big they are will depend
+    on the architecture:
+
+    - 3-10% faster decompression for the base case (x86-64).
+
+    - ARMv7 and higher can now use unaligned accesses,
+      and will see about 30% faster decompression and
+      20-40% faster compression.
+
+    - 32-bit platforms (ARM and 32-bit x86) will see 2-5%
+      faster compression.
+
+    These are all cumulative (e.g., ARM gets all three speedups).
+
+  * Fixed an issue where the unit test would crash on systems
+    with less than 256 MB address space available,
+    e.g. some embedded platforms.
+
+  * Added a framing format description, for use over e.g. HTTP,
+    or for a command-line compressor. We do not have any
+    implementations of this at the moment, but there seems
+    to be enough general interest in the topic.
+    Also made the format description slightly clearer.
+
+  * Removed some compile-time warnings under -Wall
+    (mostly signed/unsigned comparisons), for easier embedding
+    into projects that use -Wall -Werror.
+
+
+Snappy v1.0.4, September 15th 2011:
+
+  * Speeded up the decompressor somewhat; typically about 2-8%
+    for Core i7, in 64-bit mode (comparable for Opteron).
+    Somewhat more for some tests, almost no gain for others.
+
+  * Make Snappy compile on certain platforms it didn't before
+    (Solaris with SunPro C++, HP-UX, AIX).
+
+  * Correct some minor errors in the format description.
+
+
 Snappy v1.0.3, June 2nd 2011:
 
   * Speeded up the decompressor somewhat; about 3-6% for Core 2,
diff --git a/c_src/snappy/README b/c_src/snappy/README
index df8f0e1..3bc8888 100644
--- a/c_src/snappy/README
+++ b/c_src/snappy/README
@@ -76,11 +76,11 @@
 
 There are many ways to call Snappy, but the simplest possible is
 
-  snappy::Compress(input, &output);
+  snappy::Compress(input.data(), input.size(), &output);
 
 and similarly
 
-  snappy::Uncompress(input, &output);
+  snappy::Uncompress(input.data(), input.size(), &output);
 
 where "input" and "output" are both instances of std::string.
 
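A minimal round-trip sketch of the updated call pattern, assuming snappy is installed and linked with -lsnappy:

  #include <cassert>
  #include <string>
  #include <snappy.h>

  int main() {
    std::string input = "hello hello hello hello hello hello";
    std::string output, roundtrip;
    snappy::Compress(input.data(), input.size(), &output);
    assert(snappy::Uncompress(output.data(), output.size(), &roundtrip));
    assert(roundtrip == input);
    return 0;
  }
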
diff --git a/c_src/snappy/snappy-internal.h b/c_src/snappy/snappy-internal.h
index a32eda5..0653dc6 100644
--- a/c_src/snappy/snappy-internal.h
+++ b/c_src/snappy/snappy-internal.h
@@ -28,8 +28,8 @@
 //
 // Internals shared between the Snappy implementation and its unittest.
 
-#ifndef UTIL_SNAPPY_SNAPPY_INTERNAL_H_
-#define UTIL_SNAPPY_SNAPPY_INTERNAL_H_
+#ifndef THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_
+#define THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_
 
 #include "snappy-stubs-internal.h"
 
@@ -85,7 +85,7 @@
 static inline int FindMatchLength(const char* s1,
                                   const char* s2,
                                   const char* s2_limit) {
-  DCHECK_GE(s2_limit, s2);
+  assert(s2_limit >= s2);
   int matched = 0;
 
   // Find out how long the match is. We loop over the data 64 bits at a
@@ -93,7 +93,7 @@
   // the first non-matching bit and use that to calculate the total
   // length of the match.
   while (PREDICT_TRUE(s2 <= s2_limit - 8)) {
-    if (PREDICT_FALSE(UNALIGNED_LOAD64(s2) == UNALIGNED_LOAD64(s1 + matched))) {
+    if (UNALIGNED_LOAD64(s2) == UNALIGNED_LOAD64(s1 + matched)) {
       s2 += 8;
       matched += 8;
     } else {
@@ -108,7 +108,7 @@
     }
   }
   while (PREDICT_TRUE(s2 < s2_limit)) {
-    if (PREDICT_TRUE(s1[matched] == *s2)) {
+    if (s1[matched] == *s2) {
       ++s2;
       ++matched;
     } else {
@@ -122,7 +122,7 @@
                                   const char* s2,
                                   const char* s2_limit) {
   // Implementation based on the x86-64 version, above.
-  DCHECK_GE(s2_limit, s2);
+  assert(s2_limit >= s2);
   int matched = 0;
 
   while (s2 <= s2_limit - 4 &&
@@ -147,4 +147,4 @@
 }  // end namespace internal
 }  // end namespace snappy
 
-#endif  // UTIL_SNAPPY_SNAPPY_INTERNAL_H_
+#endif  // THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_
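
FindMatchLength()'s fast path above compares eight bytes at a time and then locates the first mismatching bit. A self-contained sketch of that idea, assuming a little-endian machine and using GCC/Clang's __builtin_ctzll in place of snappy's UNALIGNED_LOAD64 and Bits::FindLSBSetNonZero64 (MatchedBytesIn64 is a hypothetical name):

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  // How many of the first eight bytes of a and b match.
  static int MatchedBytesIn64(const char* a, const char* b) {
    uint64_t x, y;
    memcpy(&x, a, 8);                   // safe stand-in for UNALIGNED_LOAD64
    memcpy(&y, b, 8);
    uint64_t diff = x ^ y;
    if (diff == 0) return 8;            // all eight bytes equal
    return __builtin_ctzll(diff) >> 3;  // index of first differing byte
  }

  int main() {
    assert(MatchedBytesIn64("abcdefgh", "abcdefgh") == 8);
    assert(MatchedBytesIn64("abcdefgh", "abcdeXgh") == 5);
    return 0;
  }
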
diff --git a/c_src/snappy/snappy-sinksource.cc b/c_src/snappy/snappy-sinksource.cc
index 5844552..369a132 100644
--- a/c_src/snappy/snappy-sinksource.cc
+++ b/c_src/snappy/snappy-sinksource.cc
@@ -40,6 +40,21 @@
   return scratch;
 }
 
+char* Sink::GetAppendBufferVariable(
+      size_t min_size, size_t desired_size_hint, char* scratch,
+      size_t scratch_size, size_t* allocated_size) {
+  *allocated_size = scratch_size;
+  return scratch;
+}
+
+void Sink::AppendAndTakeOwnership(
+    char* bytes, size_t n,
+    void (*deleter)(void*, const char*, size_t),
+    void *deleter_arg) {
+  Append(bytes, n);
+  (*deleter)(deleter_arg, bytes, n);
+}
+
 ByteArraySource::~ByteArraySource() { }
 
 size_t ByteArraySource::Available() const { return left_; }
@@ -68,4 +83,22 @@
   return dest_;
 }
 
+void UncheckedByteArraySink::AppendAndTakeOwnership(
+    char* data, size_t n,
+    void (*deleter)(void*, const char*, size_t),
+    void *deleter_arg) {
+  if (data != dest_) {
+    memcpy(dest_, data, n);
+    (*deleter)(deleter_arg, data, n);
+  }
+  dest_ += n;
 }
+
+char* UncheckedByteArraySink::GetAppendBufferVariable(
+      size_t min_size, size_t desired_size_hint, char* scratch,
+      size_t scratch_size, size_t* allocated_size) {
+  *allocated_size = desired_size_hint;
+  return dest_;
+}
+
+}  // namespace snappy
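
The default AppendAndTakeOwnership() above copies and then immediately frees; a sink that can adopt the buffer avoids the copy. A hedged caller-side sketch (ArrayDeleter and HandOff are illustrative names, not part of the API):

  #include <cstring>
  #include <snappy-sinksource.h>

  static void ArrayDeleter(void* /*arg*/, const char* bytes, size_t /*n*/) {
    delete[] bytes;
  }

  void HandOff(snappy::Sink* sink) {
    static const char kMsg[] = "payload";
    char* buf = new char[sizeof(kMsg)];
    std::memcpy(buf, kMsg, sizeof(kMsg));
    // After this call the sink owns buf and will invoke ArrayDeleter
    // exactly once when it is done with the bytes.
    sink->AppendAndTakeOwnership(buf, sizeof(kMsg), &ArrayDeleter, NULL);
  }
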
diff --git a/c_src/snappy/snappy-sinksource.h b/c_src/snappy/snappy-sinksource.h
index faabfa1..8afcdaa 100644
--- a/c_src/snappy/snappy-sinksource.h
+++ b/c_src/snappy/snappy-sinksource.h
@@ -26,12 +26,11 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#ifndef UTIL_SNAPPY_SNAPPY_SINKSOURCE_H_
-#define UTIL_SNAPPY_SNAPPY_SINKSOURCE_H_
+#ifndef THIRD_PARTY_SNAPPY_SNAPPY_SINKSOURCE_H_
+#define THIRD_PARTY_SNAPPY_SNAPPY_SINKSOURCE_H_
 
 #include <stddef.h>
 
-
 namespace snappy {
 
 // A Sink is an interface that consumes a sequence of bytes.
@@ -60,6 +59,47 @@
   // The default implementation always returns the scratch buffer.
   virtual char* GetAppendBuffer(size_t length, char* scratch);
 
+  // For higher performance, Sink implementations can provide custom
+  // AppendAndTakeOwnership() and GetAppendBufferVariable() methods.
+  // These methods can reduce the number of copies done during
+  // compression/decompression.
+
+  // Append "bytes[0,n-1] to the sink. Takes ownership of "bytes"
+  // and calls the deleter function as (*deleter)(deleter_arg, bytes, n)
+  // to free the buffer. deleter function must be non NULL.
+  //
+  // The default implementation just calls Append and frees "bytes".
+  // Other implementations may avoid a copy while appending the buffer.
+  virtual void AppendAndTakeOwnership(
+      char* bytes, size_t n, void (*deleter)(void*, const char*, size_t),
+      void *deleter_arg);
+
+  // Returns a writable buffer for appending and writes the buffer's capacity to
+  // *allocated_size. Guarantees *allocated_size >= min_size.
+  // May return a pointer to the caller-owned scratch buffer which must have
+  // scratch_size >= min_size.
+  //
+  // The returned buffer is only valid until the next operation
+  // on this Sink.
+  //
+  // After writing at most *allocated_size bytes, call Append() with the
+  // pointer returned from this function and the number of bytes written.
+  // Many Append() implementations will avoid copying bytes if this function
+  // returned an internal buffer.
+  //
+  // If the sink implementation allocates or reallocates an internal buffer,
+  // it should use the desired_size_hint if appropriate. If a caller cannot
+  // provide a reasonable guess at the desired capacity, it should set
+  // desired_size_hint = 0.
+  //
+  // If a non-scratch buffer is returned, the caller may only pass
+  // a prefix of it to Append(). That is, it is not correct to pass an
+  // interior pointer to Append().
+  //
+  // The default implementation always returns the scratch buffer.
+  virtual char* GetAppendBufferVariable(
+      size_t min_size, size_t desired_size_hint, char* scratch,
+      size_t scratch_size, size_t* allocated_size);
 
  private:
   // No copying
@@ -122,6 +162,12 @@
   virtual ~UncheckedByteArraySink();
   virtual void Append(const char* data, size_t n);
   virtual char* GetAppendBuffer(size_t len, char* scratch);
+  virtual char* GetAppendBufferVariable(
+      size_t min_size, size_t desired_size_hint, char* scratch,
+      size_t scratch_size, size_t* allocated_size);
+  virtual void AppendAndTakeOwnership(
+      char* bytes, size_t n, void (*deleter)(void*, const char*, size_t),
+      void *deleter_arg);
 
   // Return the current output pointer so that a caller can see how
   // many bytes were produced.
@@ -131,7 +177,6 @@
   char* dest_;
 };
 
+}  // namespace snappy
 
-}
-
-#endif  // UTIL_SNAPPY_SNAPPY_SINKSOURCE_H_
+#endif  // THIRD_PARTY_SNAPPY_SNAPPY_SINKSOURCE_H_
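
A minimal sketch of the GetAppendBufferVariable() protocol documented above: request a buffer, write into it, then hand a prefix of the same pointer back to Append() (WriteGreeting is a hypothetical caller):

  #include <cstring>
  #include <snappy-sinksource.h>

  void WriteGreeting(snappy::Sink* sink) {
    char scratch[64];
    size_t allocated = 0;
    char* buf = sink->GetAppendBufferVariable(
        /*min_size=*/8, /*desired_size_hint=*/64,
        scratch, sizeof(scratch), &allocated);
    // Write at most `allocated` bytes, then append the written prefix.
    const size_t n = 5;
    std::memcpy(buf, "hello", n);
    sink->Append(buf, n);
  }
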
diff --git a/c_src/snappy/snappy-stubs-internal.h b/c_src/snappy/snappy-stubs-internal.h
index 6033cdf..ddca1a8 100644
--- a/c_src/snappy/snappy-stubs-internal.h
+++ b/c_src/snappy/snappy-stubs-internal.h
@@ -28,14 +28,13 @@
 //
 // Various stubs for the open-source version of Snappy.
 
-#ifndef UTIL_SNAPPY_OPENSOURCE_SNAPPY_STUBS_INTERNAL_H_
-#define UTIL_SNAPPY_OPENSOURCE_SNAPPY_STUBS_INTERNAL_H_
+#ifndef THIRD_PARTY_SNAPPY_OPENSOURCE_SNAPPY_STUBS_INTERNAL_H_
+#define THIRD_PARTY_SNAPPY_OPENSOURCE_SNAPPY_STUBS_INTERNAL_H_
 
 #ifdef HAVE_CONFIG_H
 #include "config.h"
 #endif
 
-#include <iostream>
 #include <string>
 
 #include <assert.h>
@@ -95,87 +94,6 @@
 static const uint32 kuint32max = static_cast<uint32>(0xFFFFFFFF);
 static const int64 kint64max = static_cast<int64>(0x7FFFFFFFFFFFFFFFLL);
 
-// Logging.
-
-#define LOG(level) LogMessage()
-#define VLOG(level) true ? (void)0 : \
-    snappy::LogMessageVoidify() & snappy::LogMessage()
-
-class LogMessage {
- public:
-  LogMessage() { }
-  ~LogMessage() {
-    cerr << endl;
-  }
-
-  LogMessage& operator<<(const std::string& msg) {
-    cerr << msg;
-    return *this;
-  }
-  LogMessage& operator<<(int x) {
-    cerr << x;
-    return *this;
-  }
-};
-
-// Asserts, both versions activated in debug mode only,
-// and ones that are always active.
-
-#define CRASH_UNLESS(condition) \
-    PREDICT_TRUE(condition) ? (void)0 : \
-    snappy::LogMessageVoidify() & snappy::LogMessageCrash()
-
-class LogMessageCrash : public LogMessage {
- public:
-  LogMessageCrash() { }
-  ~LogMessageCrash() {
-    cerr << endl;
-    abort();
-  }
-};
-
-// This class is used to explicitly ignore values in the conditional
-// logging macros.  This avoids compiler warnings like "value computed
-// is not used" and "statement has no effect".
-
-class LogMessageVoidify {
- public:
-  LogMessageVoidify() { }
-  // This has to be an operator with a precedence lower than << but
-  // higher than ?:
-  void operator&(const LogMessage&) { }
-};
-
-#define CHECK(cond) CRASH_UNLESS(cond)
-#define CHECK_LE(a, b) CRASH_UNLESS((a) <= (b))
-#define CHECK_GE(a, b) CRASH_UNLESS((a) >= (b))
-#define CHECK_EQ(a, b) CRASH_UNLESS((a) == (b))
-#define CHECK_NE(a, b) CRASH_UNLESS((a) != (b))
-#define CHECK_LT(a, b) CRASH_UNLESS((a) < (b))
-#define CHECK_GT(a, b) CRASH_UNLESS((a) > (b))
-
-#ifdef NDEBUG
-
-#define DCHECK(cond) CRASH_UNLESS(true)
-#define DCHECK_LE(a, b) CRASH_UNLESS(true)
-#define DCHECK_GE(a, b) CRASH_UNLESS(true)
-#define DCHECK_EQ(a, b) CRASH_UNLESS(true)
-#define DCHECK_NE(a, b) CRASH_UNLESS(true)
-#define DCHECK_LT(a, b) CRASH_UNLESS(true)
-#define DCHECK_GT(a, b) CRASH_UNLESS(true)
-
-#else
-
-#define DCHECK(cond) CHECK(cond)
-#define DCHECK_LE(a, b) CHECK_LE(a, b)
-#define DCHECK_GE(a, b) CHECK_GE(a, b)
-#define DCHECK_EQ(a, b) CHECK_EQ(a, b)
-#define DCHECK_NE(a, b) CHECK_NE(a, b)
-#define DCHECK_LT(a, b) CHECK_LT(a, b)
-#define DCHECK_GT(a, b) CHECK_GT(a, b)
-
-#endif
-
 // Potentially unaligned loads and stores.
 
 // x86 and PowerPC can simply do these loads and stores native.
@@ -200,6 +118,8 @@
 // This is a mess, but there's not much we can do about it.
 
 #elif defined(__arm__) && \
+      !defined(__ARM_ARCH_4__) && \
+      !defined(__ARM_ARCH_4T__) && \
       !defined(__ARM_ARCH_5__) && \
       !defined(__ARM_ARCH_5T__) && \
       !defined(__ARM_ARCH_5TE__) && \
@@ -568,4 +488,4 @@
 
 }  // namespace snappy
 
-#endif  // UTIL_SNAPPY_OPENSOURCE_SNAPPY_STUBS_INTERNAL_H_
+#endif  // THIRD_PARTY_SNAPPY_OPENSOURCE_SNAPPY_STUBS_INTERNAL_H_
diff --git a/c_src/snappy/snappy-stubs-public.h b/c_src/snappy/snappy-stubs-public.h
index fb7c8c8..aec79dc 100644
--- a/c_src/snappy/snappy-stubs-public.h
+++ b/c_src/snappy/snappy-stubs-public.h
@@ -40,8 +40,8 @@
 #include <stddef.h>
 
 #define SNAPPY_MAJOR 1
-#define SNAPPY_MINOR 0
-#define SNAPPY_PATCHLEVEL 5
+#define SNAPPY_MINOR 1
+#define SNAPPY_PATCHLEVEL 3
 #define SNAPPY_VERSION \
     ((SNAPPY_MAJOR << 16) | (SNAPPY_MINOR << 8) | SNAPPY_PATCHLEVEL)
 
@@ -49,7 +49,6 @@
 
 namespace snappy {
 
-#if 1
 typedef int8_t int8;
 typedef uint8_t uint8;
 typedef int16_t int16;
@@ -58,16 +57,6 @@
 typedef uint32_t uint32;
 typedef int64_t int64;
 typedef uint64_t uint64;
-#else
-typedef signed char int8;
-typedef unsigned char uint8;
-typedef short int16;
-typedef unsigned short uint16;
-typedef int int32;
-typedef unsigned int uint32;
-typedef long long int64;
-typedef unsigned long long uint64;
-#endif
 
 typedef std::string string;
 
@@ -75,6 +64,11 @@
   TypeName(const TypeName&);               \
   void operator=(const TypeName&)
 
+struct iovec {
+    void* iov_base;
+    size_t iov_len;
+};
+
 }  // namespace snappy
 
 #endif  // UTIL_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_
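
As a sanity check of the version bump above, the packed SNAPPY_VERSION for 1.1.3 works out to (1 << 16) | (1 << 8) | 3 = 0x010103 = 65795:

  #include <cstdio>

  int main() {
    const int v = (1 << 16) | (1 << 8) | 3;  // SNAPPY_VERSION for 1.1.3
    std::printf("0x%06x = %d\n", v, v);      // prints 0x010103 = 65795
    return 0;
  }
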
diff --git a/c_src/snappy/snappy.cc b/c_src/snappy/snappy.cc
index 4d4eb42..b6ca7ec 100644
--- a/c_src/snappy/snappy.cc
+++ b/c_src/snappy/snappy.cc
@@ -82,6 +82,7 @@
   COPY_2_BYTE_OFFSET = 2,
   COPY_4_BYTE_OFFSET = 3
 };
+static const int kMaximumTagLength = 5;  // COPY_4_BYTE_OFFSET plus the actual offset.
 
 // Copy "len" bytes from "src" to "op", one byte at a time.  Used for
 // handling COPY operations where the input and output regions may
@@ -94,8 +95,8 @@
 //    ababababababababababab
 // Note that this does not match the semantics of either memcpy()
 // or memmove().
-static inline void IncrementalCopy(const char* src, char* op, int len) {
-  DCHECK_GT(len, 0);
+static inline void IncrementalCopy(const char* src, char* op, ssize_t len) {
+  assert(len > 0);
   do {
     *op++ = *src++;
   } while (--len > 0);
@@ -136,10 +137,8 @@
 
 const int kMaxIncrementCopyOverflow = 10;
 
-}  // namespace
-
-static inline void IncrementalCopyFastPath(const char* src, char* op, int len) {
-  while (op - src < 8) {
+inline void IncrementalCopyFastPath(const char* src, char* op, ssize_t len) {
+  while (PREDICT_FALSE(op - src < 8)) {
     UnalignedCopy64(src, op);
     len -= op - src;
     op += op - src;
@@ -152,6 +151,8 @@
   }
 }
 
+}  // namespace
+
 static inline char* EmitLiteral(char* op,
                                 const char* literal,
                                 int len,
@@ -195,17 +196,17 @@
 }
 
 static inline char* EmitCopyLessThan64(char* op, size_t offset, int len) {
-  DCHECK_LE(len, 64);
-  DCHECK_GE(len, 4);
-  DCHECK_LT(offset, 65536);
+  assert(len <= 64);
+  assert(len >= 4);
+  assert(offset < 65536);
 
   if ((len < 12) && (offset < 2048)) {
     size_t len_minus_4 = len - 4;
     assert(len_minus_4 < 8);            // Must fit in 3 bits
-    *op++ = COPY_1_BYTE_OFFSET | ((len_minus_4) << 2) | ((offset >> 8) << 5);
+    *op++ = COPY_1_BYTE_OFFSET + ((len_minus_4) << 2) + ((offset >> 8) << 5);
     *op++ = offset & 0xff;
   } else {
-    *op++ = COPY_2_BYTE_OFFSET | ((len-1) << 2);
+    *op++ = COPY_2_BYTE_OFFSET + ((len-1) << 2);
     LittleEndian::Store16(op, offset);
     op += 2;
   }
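
The | to + change above is behavior-preserving because the three fields of the tag byte occupy disjoint bits (tag type in bits 0-1, length in bits 2-4, high offset bits in 5-7), so OR and ADD agree; a worked example for len = 7, offset = 300:

  #include <cassert>

  int main() {
    const unsigned len = 7, offset = 300;
    const unsigned tag_or  = 1 | ((len - 4) << 2) | ((offset >> 8) << 5);
    const unsigned tag_add = 1 + ((len - 4) << 2) + ((offset >> 8) << 5);
    assert(tag_or == tag_add && tag_add == 45);  // 0b00101101
    assert((offset & 0xff) == 44);  // low offset byte, stored next
    return 0;
  }
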
@@ -214,7 +215,7 @@
 
 static inline char* EmitCopy(char* op, size_t offset, int len) {
   // Emit 64 byte copies but make sure to keep at least four bytes reserved
-  while (len >= 68) {
+  while (PREDICT_FALSE(len >= 68)) {
     op = EmitCopyLessThan64(op, offset, 64);
     len -= 64;
   }
@@ -253,8 +254,6 @@
   while (htsize < kMaxHashTableSize && htsize < input_size) {
     htsize <<= 1;
   }
-  CHECK_EQ(0, htsize & (htsize - 1)) << ": must be power of two";
-  CHECK_LE(htsize, kMaxHashTableSize) << ": hash table too large";
 
   uint16* table;
   if (htsize <= ARRAYSIZE(small_table_)) {
@@ -294,8 +293,8 @@
 }
 
 static inline uint32 GetUint32AtOffset(uint64 v, int offset) {
-  DCHECK_GE(offset, 0);
-  DCHECK_LE(offset, 4);
+  assert(offset >= 0);
+  assert(offset <= 4);
   return v >> (LittleEndian::IsLittleEndian() ? 8 * offset : 32 - 8 * offset);
 }
 
@@ -308,8 +307,8 @@
 }
 
 static inline uint32 GetUint32AtOffset(const char* v, int offset) {
-  DCHECK_GE(offset, 0);
-  DCHECK_LE(offset, 4);
+  assert(offset >= 0);
+  assert(offset <= 4);
   return UNALIGNED_LOAD32(v + offset);
 }
 
@@ -334,10 +333,10 @@
                        const int table_size) {
   // "ip" is the input pointer, and "op" is the output pointer.
   const char* ip = input;
-  CHECK_LE(input_size, kBlockSize);
-  CHECK_EQ(table_size & (table_size - 1), 0) << ": table must be power of two";
+  assert(input_size <= kBlockSize);
+  assert((table_size & (table_size - 1)) == 0); // table must be power of two
   const int shift = 32 - Bits::Log2Floor(table_size);
-  DCHECK_EQ(static_cast<int>(kuint32max >> shift), table_size - 1);
+  assert(static_cast<int>(kuint32max >> shift) == table_size - 1);
   const char* ip_end = input + input_size;
   const char* base_ip = ip;
   // Bytes in [next_emit, ip) will be emitted as literal bytes.  Or
@@ -349,7 +348,7 @@
     const char* ip_limit = input + input_size - kInputMarginBytes;
 
     for (uint32 next_hash = Hash(++ip, shift); ; ) {
-      DCHECK_LT(next_emit, ip);
+      assert(next_emit < ip);
       // The body of this loop calls EmitLiteral once and then EmitCopy one or
       // more times.  (The exception is that when we're close to exhausting
       // the input we goto emit_remainder.)
@@ -382,7 +381,7 @@
       do {
         ip = next_ip;
         uint32 hash = next_hash;
-        DCHECK_EQ(hash, Hash(ip, shift));
+        assert(hash == Hash(ip, shift));
         uint32 bytes_between_hash_lookups = skip++ >> 5;
         next_ip = ip + bytes_between_hash_lookups;
         if (PREDICT_FALSE(next_ip > ip_limit)) {
@@ -390,8 +389,8 @@
         }
         next_hash = Hash(next_ip, shift);
         candidate = base_ip + table[hash];
-        DCHECK_GE(candidate, base_ip);
-        DCHECK_LT(candidate, ip);
+        assert(candidate >= base_ip);
+        assert(candidate < ip);
 
         table[hash] = ip - base_ip;
       } while (PREDICT_TRUE(UNALIGNED_LOAD32(ip) !=
@@ -400,7 +399,7 @@
       // Step 2: A 4-byte match has been found.  We'll later see if more
       // than 4 bytes match.  But, prior to the match, input
       // bytes [next_emit, ip) are unmatched.  Emit them as "literal bytes."
-      DCHECK_LE(next_emit + 16, ip_end);
+      assert(next_emit + 16 <= ip_end);
       op = EmitLiteral(op, next_emit, ip - next_emit, true);
 
       // Step 3: Call EmitCopy, and then see if another EmitCopy could
@@ -421,7 +420,7 @@
         int matched = 4 + FindMatchLength(candidate + 4, ip + 4, ip_end);
         ip += matched;
         size_t offset = base - candidate;
-        DCHECK_EQ(0, memcmp(base, candidate, matched));
+        assert(0 == memcmp(base, candidate, matched));
         op = EmitCopy(op, offset, matched);
         // We could immediately start working at ip now, but to improve
         // compression we first update table[Hash(ip - 1, ...)].
@@ -471,21 +470,26 @@
 //   bool Append(const char* ip, size_t length);
 //   bool AppendFromSelf(uint32 offset, size_t length);
 //
-//   // The difference between TryFastAppend and Append is that TryFastAppend
-//   // is allowed to read up to <available> bytes from the input buffer,
-//   // whereas Append is allowed to read <length>.
+//   // The rules for how TryFastAppend differs from Append are somewhat
+//   // convoluted:
 //   //
-//   // Also, TryFastAppend is allowed to return false, declining the append,
-//   // without it being a fatal error -- just "return false" would be
-//   // a perfectly legal implementation of TryFastAppend. The intention
-//   // is for TryFastAppend to allow a fast path in the common case of
-//   // a small append.
+//   //  - TryFastAppend is allowed to decline (return false) at any
+//   //    time, for any reason -- just "return false" would be
+//   //    a perfectly legal implementation of TryFastAppend.
+//   //    The intention is for TryFastAppend to allow a fast path
+//   //    in the common case of a small append.
+//   //  - TryFastAppend is allowed to read up to <available> bytes
+//   //    from the input buffer, whereas Append is allowed to read
+//   //    <length>. However, if it returns true, it must leave
+//   //    at least five (kMaximumTagLength) bytes in the input buffer
+//   //    afterwards, so that there is always enough space to read the
+//   //    next tag without checking for a refill.
+//   //  - TryFastAppend must always decline (return false)
+//   //    if <length> is 61 or more, as in this case the literal length is not
+//   //    decoded fully. In practice, this should not be a big problem,
+//   //    as it is unlikely that one would implement a fast path accepting
+//   //    this much data.
 //   //
-//   // NOTE(user): TryFastAppend must always return decline (return false)
-//   // if <length> is 61 or more, as in this case the literal length is not
-//   // decoded fully. In practice, this should not be a big problem,
-//   // as it is unlikely that one would implement a fast path accepting
-//   // this much data.
 //   bool TryFastAppend(const char* ip, size_t available, size_t length);
 // };
 
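A minimal Writer sketch satisfying the contract above, matching how InternalUncompressAllTags() later in this file drives it (SetExpectedLength, then tags, then Flush and CheckLength); it only counts bytes, much like the SnappyDecompressionValidator below, and exercises its right to decline every TryFastAppend (CountingWriter is an illustrative name):

  #include <cstddef>

  class CountingWriter {
    std::size_t expected_, produced_;
   public:
    CountingWriter() : expected_(0), produced_(0) {}
    void SetExpectedLength(std::size_t len) { expected_ = len; }
    bool CheckLength() const { return produced_ == expected_; }
    bool Append(const char*, std::size_t n) {
      produced_ += n;
      return produced_ <= expected_;
    }
    bool AppendFromSelf(std::size_t offset, std::size_t len) {
      if (offset == 0 || offset > produced_) return false;  // invalid backref
      produced_ += len;
      return produced_ <= expected_;
    }
    // Always declining is explicitly allowed by the rules above.
    bool TryFastAppend(const char*, std::size_t, std::size_t) { return false; }
    void Flush() {}
  };
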
@@ -554,9 +558,9 @@
                         unsigned int len,
                         unsigned int copy_offset) {
   // Check that all of the fields fit within the allocated space
-  DCHECK_EQ(extra,       extra & 0x7);          // At most 3 bits
-  DCHECK_EQ(copy_offset, copy_offset & 0x7);    // At most 3 bits
-  DCHECK_EQ(len,         len & 0x7f);           // At most 7 bits
+  assert(extra       == (extra & 0x7));          // At most 3 bits
+  assert(copy_offset == (copy_offset & 0x7));    // At most 3 bits
+  assert(len         == (len & 0x7f));           // At most 7 bits
   return len | (copy_offset << 8) | (extra << 11);
 }
 
@@ -614,9 +618,15 @@
   }
 
   // Check that each entry was initialized exactly once.
-  CHECK_EQ(assigned, 256);
+  if (assigned != 256) {
+    fprintf(stderr, "ComputeTable: assigned only %d of 256\n", assigned);
+    abort();
+  }
   for (int i = 0; i < 256; i++) {
-    CHECK_NE(dst[i], 0xffff);
+    if (dst[i] == 0xffff) {
+      fprintf(stderr, "ComputeTable: did not assign byte %d\n", i);
+      abort();
+    }
   }
 
   if (FLAGS_snappy_dump_decompression_table) {
@@ -631,7 +641,11 @@
 
   // Check that computed table matched recorded table
   for (int i = 0; i < 256; i++) {
-    CHECK_EQ(dst[i], char_table[i]);
+    if (dst[i] != char_table[i]) {
+      fprintf(stderr, "ComputeTable: byte %d: computed (%x), expect (%x)\n",
+              i, static_cast<int>(dst[i]), static_cast<int>(char_table[i]));
+      abort();
+    }
   }
 }
 #endif /* !NDEBUG */
@@ -644,7 +658,7 @@
   const char*   ip_limit_;       // Points just past buffered bytes
   uint32        peeked_;         // Bytes peeked from reader (need to skip)
   bool          eof_;            // Hit end of input without an error?
-  char          scratch_[5];     // Temporary buffer for PeekFast() boundaries
+  char          scratch_[kMaximumTagLength];  // See RefillTag().
 
   // Ensure that all of the tag metadata for the next tag is available
   // in [ip_..ip_limit_-1].  Also ensures that [ip,ip+4] is readable even
@@ -676,7 +690,7 @@
   // On succcess, stores the length in *result and returns true.
   // On failure, returns false.
   bool ReadUncompressedLength(uint32* result) {
-    DCHECK(ip_ == NULL);       // Must not have read anything yet
+    assert(ip_ == NULL);       // Must not have read anything yet
     // Length is encoded in 1..5 bytes
     *result = 0;
     uint32 shift = 0;
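
The length prefix parsed above is a little-endian varint: seven payload bits per byte, with the high bit set on every byte except the last. A hedged standalone decoder (ParseVarint32 is a hypothetical name):

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  static bool ParseVarint32(const char* p, size_t n, uint32_t* result) {
    uint32_t value = 0;
    for (uint32_t shift = 0; shift <= 28 && n > 0; shift += 7, ++p, --n) {
      uint8_t byte = static_cast<uint8_t>(*p);
      value |= static_cast<uint32_t>(byte & 0x7f) << shift;
      if ((byte & 0x80) == 0) {  // high bit clear: last byte
        *result = value;
        return true;
      }
    }
    return false;  // truncated or over-long encoding
  }

  int main() {
    const char enc[] = "\xfe\x03";  // encodes 510
    uint32_t len = 0;
    assert(ParseVarint32(enc, sizeof(enc) - 1, &len) && len == 510);
    return 0;
  }
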
@@ -707,7 +721,7 @@
     // scope to optimize the <ip_limit_ - ip> expression based on the local
     // context, which overall increases speed.
     #define MAYBE_REFILL() \
-        if (ip_limit_ - ip < 5) { \
+        if (ip_limit_ - ip < kMaximumTagLength) { \
           ip_ = ip; \
           if (!RefillTag()) return; \
           ip = ip_; \
@@ -720,9 +734,11 @@
       if ((c & 0x3) == LITERAL) {
         size_t literal_length = (c >> 2) + 1u;
         if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
-          DCHECK_LT(literal_length, 61);
+          assert(literal_length < 61);
           ip += literal_length;
-          MAYBE_REFILL();
+          // NOTE(user): There is no MAYBE_REFILL() here, as TryFastAppend()
+          // will not return true unless there are already at least five spare
+          // bytes in addition to the literal.
           continue;
         }
         if (PREDICT_FALSE(literal_length >= 61)) {
@@ -787,11 +803,11 @@
   }
 
   // Read the tag character
-  DCHECK_LT(ip, ip_limit_);
+  assert(ip < ip_limit_);
   const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
   const uint32 entry = char_table[c];
   const uint32 needed = (entry >> 11) + 1;  // +1 byte for 'c'
-  DCHECK_LE(needed, sizeof(scratch_));
+  assert(needed <= sizeof(scratch_));
 
   // Read more bytes from reader if needed
   uint32 nbuf = ip_limit_ - ip;
@@ -812,10 +828,10 @@
       nbuf += to_add;
       reader_->Skip(to_add);
     }
-    DCHECK_EQ(nbuf, needed);
+    assert(nbuf == needed);
     ip_ = scratch_;
     ip_limit_ = scratch_ + needed;
-  } else if (nbuf < 5) {
+  } else if (nbuf < kMaximumTagLength) {
     // Have enough bytes, but move into scratch_ so that we do not
     // read past end of input
     memmove(scratch_, ip, nbuf);
@@ -831,31 +847,23 @@
 }
 
 template <typename Writer>
-static bool InternalUncompress(Source* r,
-                               Writer* writer,
-                               uint32 max_len) {
+static bool InternalUncompress(Source* r, Writer* writer) {
   // Read the uncompressed length from the front of the compressed input
   SnappyDecompressor decompressor(r);
   uint32 uncompressed_len = 0;
   if (!decompressor.ReadUncompressedLength(&uncompressed_len)) return false;
-  return InternalUncompressAllTags(
-      &decompressor, writer, uncompressed_len, max_len);
+  return InternalUncompressAllTags(&decompressor, writer, uncompressed_len);
 }
 
 template <typename Writer>
 static bool InternalUncompressAllTags(SnappyDecompressor* decompressor,
                                       Writer* writer,
-                                      uint32 uncompressed_len,
-                                      uint32 max_len) {
-  // Protect against possible DoS attack
-  if (static_cast<uint64>(uncompressed_len) > max_len) {
-    return false;
-  }
-
+                                      uint32 uncompressed_len) {
   writer->SetExpectedLength(uncompressed_len);
 
   // Process the entire input
   decompressor->DecompressAllTags(writer);
+  writer->Flush();
   return (decompressor->eof() && writer->CheckLength());
 }
 
@@ -880,7 +888,7 @@
     // Get next block to compress (without copying if possible)
     size_t fragment_size;
     const char* fragment = reader->Peek(&fragment_size);
-    DCHECK_NE(fragment_size, 0) << ": premature end of input";
+    assert(fragment_size != 0);  // premature end of input
     const size_t num_to_read = min(N, kBlockSize);
     size_t bytes_read = fragment_size;
 
@@ -907,11 +915,11 @@
         bytes_read += n;
         reader->Skip(n);
       }
-      DCHECK_EQ(bytes_read, num_to_read);
+      assert(bytes_read == num_to_read);
       fragment = scratch;
       fragment_size = num_to_read;
     }
-    DCHECK_EQ(fragment_size, num_to_read);
+    assert(fragment_size == num_to_read);
 
     // Get encoding table for compression
     int table_size;
@@ -946,6 +954,184 @@
 }
 
 // -----------------------------------------------------------------------
+// IOVec interfaces
+// -----------------------------------------------------------------------
+
+// A type that writes to an iovec.
+// Note that this is not a "ByteSink", but a type that matches the
+// Writer template argument to SnappyDecompressor::DecompressAllTags().
+class SnappyIOVecWriter {
+ private:
+  const struct iovec* output_iov_;
+  const size_t output_iov_count_;
+
+  // We are currently writing into output_iov_[curr_iov_index_].
+  int curr_iov_index_;
+
+  // Bytes written to output_iov_[curr_iov_index_] so far.
+  size_t curr_iov_written_;
+
+  // Total bytes decompressed into output_iov_ so far.
+  size_t total_written_;
+
+  // Maximum number of bytes that will be decompressed into output_iov_.
+  size_t output_limit_;
+
+  inline char* GetIOVecPointer(int index, size_t offset) {
+    return reinterpret_cast<char*>(output_iov_[index].iov_base) +
+        offset;
+  }
+
+ public:
+  // Does not take ownership of iov. iov must be valid during the
+  // entire lifetime of the SnappyIOVecWriter.
+  inline SnappyIOVecWriter(const struct iovec* iov, size_t iov_count)
+      : output_iov_(iov),
+        output_iov_count_(iov_count),
+        curr_iov_index_(0),
+        curr_iov_written_(0),
+        total_written_(0),
+        output_limit_(-1) {
+  }
+
+  inline void SetExpectedLength(size_t len) {
+    output_limit_ = len;
+  }
+
+  inline bool CheckLength() const {
+    return total_written_ == output_limit_;
+  }
+
+  inline bool Append(const char* ip, size_t len) {
+    if (total_written_ + len > output_limit_) {
+      return false;
+    }
+
+    while (len > 0) {
+      assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
+      if (curr_iov_written_ >= output_iov_[curr_iov_index_].iov_len) {
+        // This iovec is full. Go to the next one.
+        if (curr_iov_index_ + 1 >= output_iov_count_) {
+          return false;
+        }
+        curr_iov_written_ = 0;
+        ++curr_iov_index_;
+      }
+
+      const size_t to_write = std::min(
+          len, output_iov_[curr_iov_index_].iov_len - curr_iov_written_);
+      memcpy(GetIOVecPointer(curr_iov_index_, curr_iov_written_),
+             ip,
+             to_write);
+      curr_iov_written_ += to_write;
+      total_written_ += to_write;
+      ip += to_write;
+      len -= to_write;
+    }
+
+    return true;
+  }
+
+  inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
+    const size_t space_left = output_limit_ - total_written_;
+    if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 &&
+        output_iov_[curr_iov_index_].iov_len - curr_iov_written_ >= 16) {
+      // Fast path, used for the majority (about 95%) of invocations.
+      char* ptr = GetIOVecPointer(curr_iov_index_, curr_iov_written_);
+      UnalignedCopy64(ip, ptr);
+      UnalignedCopy64(ip + 8, ptr + 8);
+      curr_iov_written_ += len;
+      total_written_ += len;
+      return true;
+    }
+
+    return false;
+  }
+
+  inline bool AppendFromSelf(size_t offset, size_t len) {
+    if (offset > total_written_ || offset == 0) {
+      return false;
+    }
+    const size_t space_left = output_limit_ - total_written_;
+    if (len > space_left) {
+      return false;
+    }
+
+    // Locate the iovec from which we need to start the copy.
+    int from_iov_index = curr_iov_index_;
+    size_t from_iov_offset = curr_iov_written_;
+    while (offset > 0) {
+      if (from_iov_offset >= offset) {
+        from_iov_offset -= offset;
+        break;
+      }
+
+      offset -= from_iov_offset;
+      --from_iov_index;
+      assert(from_iov_index >= 0);
+      from_iov_offset = output_iov_[from_iov_index].iov_len;
+    }
+
+    // Copy <len> bytes starting from the iovec pointed to by from_iov_index to
+    // the current iovec.
+    while (len > 0) {
+      assert(from_iov_index <= curr_iov_index_);
+      if (from_iov_index != curr_iov_index_) {
+        const size_t to_copy = std::min(
+            output_iov_[from_iov_index].iov_len - from_iov_offset,
+            len);
+        Append(GetIOVecPointer(from_iov_index, from_iov_offset), to_copy);
+        len -= to_copy;
+        if (len > 0) {
+          ++from_iov_index;
+          from_iov_offset = 0;
+        }
+      } else {
+        assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
+        size_t to_copy = std::min(output_iov_[curr_iov_index_].iov_len -
+                                      curr_iov_written_,
+                                  len);
+        if (to_copy == 0) {
+          // This iovec is full. Go to the next one.
+          if (curr_iov_index_ + 1 >= output_iov_count_) {
+            return false;
+          }
+          ++curr_iov_index_;
+          curr_iov_written_ = 0;
+          continue;
+        }
+        if (to_copy > len) {
+          to_copy = len;
+        }
+        IncrementalCopy(GetIOVecPointer(from_iov_index, from_iov_offset),
+                        GetIOVecPointer(curr_iov_index_, curr_iov_written_),
+                        to_copy);
+        curr_iov_written_ += to_copy;
+        from_iov_offset += to_copy;
+        total_written_ += to_copy;
+        len -= to_copy;
+      }
+    }
+
+    return true;
+  }
+
+  inline void Flush() {}
+};
+
+bool RawUncompressToIOVec(const char* compressed, size_t compressed_length,
+                          const struct iovec* iov, size_t iov_cnt) {
+  ByteArraySource reader(compressed, compressed_length);
+  return RawUncompressToIOVec(&reader, iov, iov_cnt);
+}
+
+bool RawUncompressToIOVec(Source* compressed, const struct iovec* iov,
+                          size_t iov_cnt) {
+  SnappyIOVecWriter output(iov, iov_cnt);
+  return InternalUncompress(compressed, &output);
+}
+
+// -----------------------------------------------------------------------
 // Flat array interfaces
 // -----------------------------------------------------------------------
 
@@ -961,7 +1147,8 @@
  public:
   inline explicit SnappyArrayWriter(char* dst)
       : base_(dst),
-        op_(dst) {
+        op_(dst),
+        op_limit_(dst) {
   }
 
   inline void SetExpectedLength(size_t len) {
@@ -986,7 +1173,7 @@
   inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
     char* op = op_;
     const size_t space_left = op_limit_ - op;
-    if (len <= 16 && available >= 16 && space_left >= 16) {
+    if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16) {
       // Fast path, used for the majority (about 95%) of invocations.
       UnalignedCopy64(ip, op);
       UnalignedCopy64(ip + 8, op + 8);
@@ -1001,7 +1188,16 @@
     char* op = op_;
     const size_t space_left = op_limit_ - op;
 
-    if (op - base_ <= offset - 1u) {  // -1u catches offset==0
+    // Check if we try to append from before the start of the buffer.
+    // Normally this would just be a check for "produced < offset",
+    // but "produced <= offset - 1u" is equivalent for every case
+    // except the one where offset==0, where the right side will wrap around
+    // to a very big number. This is convenient, as offset==0 is another
+    // invalid case that we also want to catch, so that we do not go
+    // into an infinite loop.
+    assert(op >= base_);
+    size_t produced = op - base_;
+    if (produced <= offset - 1u) {
       return false;
     }
     if (len <= 16 && offset >= 8 && space_left >= 16) {
@@ -1022,6 +1218,10 @@
     op_ = op + len;
     return true;
   }
+  inline size_t Produced() const {
+    return op_ - base_;
+  }
+  inline void Flush() {}
 };
 
 bool RawUncompress(const char* compressed, size_t n, char* uncompressed) {
@@ -1031,7 +1231,7 @@
 
 bool RawUncompress(Source* compressed, char* uncompressed) {
   SnappyArrayWriter output(uncompressed);
-  return InternalUncompress(compressed, &output, kuint32max);
+  return InternalUncompress(compressed, &output);
 }
 
 bool Uncompress(const char* compressed, size_t n, string* uncompressed) {
@@ -1039,16 +1239,15 @@
   if (!GetUncompressedLength(compressed, n, &ulength)) {
     return false;
   }
-  // Protect against possible DoS attack
-  if ((static_cast<uint64>(ulength) + uncompressed->size()) >
-      uncompressed->max_size()) {
+  // On 32-bit builds: max_size() < kuint32max.  Check for that instead
+  // of crashing (e.g., consider externally specified compressed data).
+  if (ulength > uncompressed->max_size()) {
     return false;
   }
   STLStringResizeUninitialized(uncompressed, ulength);
   return RawUncompress(compressed, n, string_as_array(uncompressed));
 }
 
-
 // A Writer that drops everything on the floor and just does validation
 class SnappyDecompressionValidator {
  private:
@@ -1056,7 +1255,7 @@
   size_t produced_;
 
  public:
-  inline SnappyDecompressionValidator() : produced_(0) { }
+  inline SnappyDecompressionValidator() : expected_(0), produced_(0) { }
   inline void SetExpectedLength(size_t len) {
     expected_ = len;
   }
@@ -1071,16 +1270,24 @@
     return false;
   }
   inline bool AppendFromSelf(size_t offset, size_t len) {
-    if (produced_ <= offset - 1u) return false;  // -1u catches offset==0
+    // See SnappyArrayWriter::AppendFromSelf for an explanation of
+    // the "offset - 1u" trick.
+    if (produced_ <= offset - 1u) return false;
     produced_ += len;
     return produced_ <= expected_;
   }
+  inline void Flush() {}
 };
 
 bool IsValidCompressedBuffer(const char* compressed, size_t n) {
   ByteArraySource reader(compressed, n);
   SnappyDecompressionValidator writer;
-  return InternalUncompress(&reader, &writer, kuint32max);
+  return InternalUncompress(&reader, &writer);
+}
+
+bool IsValidCompressed(Source* compressed) {
+  SnappyDecompressionValidator writer;
+  return InternalUncompress(compressed, &writer);
 }
 
 void RawCompress(const char* input,
@@ -1106,6 +1313,241 @@
   return compressed_length;
 }
 
+// -----------------------------------------------------------------------
+// Sink interface
+// -----------------------------------------------------------------------
+
+// A type that decompresses into a Sink. The template parameter
+// Allocator must export one method "char* Allocate(int size);", which
+// allocates a buffer of "size" and appends that to the destination.
+template <typename Allocator>
+class SnappyScatteredWriter {
+  Allocator allocator_;
+
+  // We need random access into the data generated so far.  Therefore
+  // we keep track of all of the generated data as an array of blocks.
+  // All of the blocks except the last have length kBlockSize.
+  vector<char*> blocks_;
+  size_t expected_;
+
+  // Total size of all fully generated blocks so far
+  size_t full_size_;
+
+  // Pointer into current output block
+  char* op_base_;       // Base of output block
+  char* op_ptr_;        // Pointer to next unfilled byte in block
+  char* op_limit_;      // Pointer just past block
+
+  inline size_t Size() const {
+    return full_size_ + (op_ptr_ - op_base_);
+  }
+
+  bool SlowAppend(const char* ip, size_t len);
+  bool SlowAppendFromSelf(size_t offset, size_t len);
+
+ public:
+  inline explicit SnappyScatteredWriter(const Allocator& allocator)
+      : allocator_(allocator),
+        full_size_(0),
+        op_base_(NULL),
+        op_ptr_(NULL),
+        op_limit_(NULL) {
+  }
+
+  inline void SetExpectedLength(size_t len) {
+    assert(blocks_.empty());
+    expected_ = len;
+  }
+
+  inline bool CheckLength() const {
+    return Size() == expected_;
+  }
+
+  // Return the number of bytes actually uncompressed so far
+  inline size_t Produced() const {
+    return Size();
+  }
+
+  inline bool Append(const char* ip, size_t len) {
+    size_t avail = op_limit_ - op_ptr_;
+    if (len <= avail) {
+      // Fast path
+      memcpy(op_ptr_, ip, len);
+      op_ptr_ += len;
+      return true;
+    } else {
+      return SlowAppend(ip, len);
+    }
+  }
+
+  inline bool TryFastAppend(const char* ip, size_t available, size_t length) {
+    char* op = op_ptr_;
+    const int space_left = op_limit_ - op;
+    if (length <= 16 && available >= 16 + kMaximumTagLength &&
+        space_left >= 16) {
+      // Fast path, used for the majority (about 95%) of invocations.
+      UNALIGNED_STORE64(op, UNALIGNED_LOAD64(ip));
+      UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(ip + 8));
+      op_ptr_ = op + length;
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  inline bool AppendFromSelf(size_t offset, size_t len) {
+    // See SnappyArrayWriter::AppendFromSelf for an explanation of
+    // the "offset - 1u" trick.
+    if (offset - 1u < op_ptr_ - op_base_) {
+      const size_t space_left = op_limit_ - op_ptr_;
+      if (space_left >= len + kMaxIncrementCopyOverflow) {
+        // Fast path: src and dst in current block.
+        IncrementalCopyFastPath(op_ptr_ - offset, op_ptr_, len);
+        op_ptr_ += len;
+        return true;
+      }
+    }
+    return SlowAppendFromSelf(offset, len);
+  }
+
+  // Called at the end of the decompression. We ask the allocator to
+  // write all blocks to the sink.
+  inline void Flush() { allocator_.Flush(Produced()); }
+};
+
+template<typename Allocator>
+bool SnappyScatteredWriter<Allocator>::SlowAppend(const char* ip, size_t len) {
+  size_t avail = op_limit_ - op_ptr_;
+  while (len > avail) {
+    // Completely fill this block
+    memcpy(op_ptr_, ip, avail);
+    op_ptr_ += avail;
+    assert(op_limit_ - op_ptr_ == 0);
+    full_size_ += (op_ptr_ - op_base_);
+    len -= avail;
+    ip += avail;
+
+    // Bounds check
+    if (full_size_ + len > expected_) {
+      return false;
+    }
+
+    // Make new block
+    size_t bsize = min<size_t>(kBlockSize, expected_ - full_size_);
+    op_base_ = allocator_.Allocate(bsize);
+    op_ptr_ = op_base_;
+    op_limit_ = op_base_ + bsize;
+    blocks_.push_back(op_base_);
+    avail = bsize;
+  }
+
+  memcpy(op_ptr_, ip, len);
+  op_ptr_ += len;
+  return true;
+}
+
+template<typename Allocator>
+bool SnappyScatteredWriter<Allocator>::SlowAppendFromSelf(size_t offset,
+                                                         size_t len) {
+  // Overflow check
+  // See SnappyArrayWriter::AppendFromSelf for an explanation of
+  // the "offset - 1u" trick.
+  const size_t cur = Size();
+  if (offset - 1u >= cur) return false;
+  if (expected_ - cur < len) return false;
+
+  // Currently we shouldn't ever hit this path because Compress() chops the
+  // input into blocks and does not create cross-block copies. However, it is
+  // nice if we do not rely on that, since we can get better compression if we
+  // allow cross-block copies and thus might want to change the compressor in
+  // the future.
+  size_t src = cur - offset;
+  while (len-- > 0) {
+    char c = blocks_[src >> kBlockLog][src & (kBlockSize-1)];
+    Append(&c, 1);
+    src++;
+  }
+  return true;
+}
+
+class SnappySinkAllocator {
+ public:
+  explicit SnappySinkAllocator(Sink* dest): dest_(dest) {}
+  ~SnappySinkAllocator() {}
+
+  char* Allocate(int size) {
+    Datablock block(new char[size], size);
+    blocks_.push_back(block);
+    return block.data;
+  }
+
+  // We flush only at the end, because the writer wants
+  // random access to the blocks and once we hand the
+  // block over to the sink, we can't access it anymore.
+  // Also we don't write more than has actually been written
+  // to the blocks.
+  void Flush(size_t size) {
+    size_t size_written = 0;
+    size_t block_size;
+    for (int i = 0; i < blocks_.size(); ++i) {
+      block_size = min<size_t>(blocks_[i].size, size - size_written);
+      dest_->AppendAndTakeOwnership(blocks_[i].data, block_size,
+                                    &SnappySinkAllocator::Deleter, NULL);
+      size_written += block_size;
+    }
+    blocks_.clear();
+  }
+
+ private:
+  struct Datablock {
+    char* data;
+    size_t size;
+    Datablock(char* p, size_t s) : data(p), size(s) {}
+  };
+
+  static void Deleter(void* arg, const char* bytes, size_t size) {
+    delete[] bytes;
+  }
+
+  Sink* dest_;
+  vector<Datablock> blocks_;
+
+  // Note: copying this object is allowed
+};
+
+size_t UncompressAsMuchAsPossible(Source* compressed, Sink* uncompressed) {
+  SnappySinkAllocator allocator(uncompressed);
+  SnappyScatteredWriter<SnappySinkAllocator> writer(allocator);
+  InternalUncompress(compressed, &writer);
+  return writer.Produced();
+}
+
+bool Uncompress(Source* compressed, Sink* uncompressed) {
+  // Read the uncompressed length from the front of the compressed input
+  SnappyDecompressor decompressor(compressed);
+  uint32 uncompressed_len = 0;
+  if (!decompressor.ReadUncompressedLength(&uncompressed_len)) {
+    return false;
+  }
+
+  char c;
+  size_t allocated_size;
+  char* buf = uncompressed->GetAppendBufferVariable(
+      1, uncompressed_len, &c, 1, &allocated_size);
+
+  // If we can get a flat buffer, then use it, otherwise do block by block
+  // uncompression
+  if (allocated_size >= uncompressed_len) {
+    SnappyArrayWriter writer(buf);
+    bool result = InternalUncompressAllTags(
+        &decompressor, &writer, uncompressed_len);
+    uncompressed->Append(buf, writer.Produced());
+    return result;
+  } else {
+    SnappySinkAllocator allocator(uncompressed);
+    SnappyScatteredWriter<SnappySinkAllocator> writer(allocator);
+    return InternalUncompressAllTags(&decompressor, &writer, uncompressed_len);
+  }
+}
 
 } // end namespace snappy
-
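
The new Sink-based entry points above can be driven with the byte-array adaptors from snappy-sinksource.h; a hedged round-trip sketch (the caller guarantees the destination is large enough, which is what "Unchecked" means):

  #include <cassert>
  #include <string>
  #include <snappy.h>
  #include <snappy-sinksource.h>

  int main() {
    std::string input = "some mildly repetitive input input input input";
    std::string compressed;
    snappy::Compress(input.data(), input.size(), &compressed);

    snappy::ByteArraySource source(compressed.data(), compressed.size());
    std::string output(input.size(), '\0');
    snappy::UncheckedByteArraySink sink(&output[0]);
    assert(snappy::Uncompress(&source, &sink));
    assert(output == input);
    return 0;
  }
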
diff --git a/c_src/snappy/snappy.h b/c_src/snappy/snappy.h
index 8c2075f..4568db8 100644
--- a/c_src/snappy/snappy.h
+++ b/c_src/snappy/snappy.h
@@ -36,8 +36,8 @@
 // using BMDiff and then compressing the output of BMDiff with
 // Snappy.
 
-#ifndef UTIL_SNAPPY_SNAPPY_H__
-#define UTIL_SNAPPY_SNAPPY_H__
+#ifndef THIRD_PARTY_SNAPPY_SNAPPY_H__
+#define THIRD_PARTY_SNAPPY_SNAPPY_H__
 
 #include <stddef.h>
 #include <string>
@@ -56,6 +56,13 @@
   // number of bytes written.
   size_t Compress(Source* source, Sink* sink);
 
+  // Find the uncompressed length of the given stream, as given by the header.
+  // Note that the true length could deviate from this; the stream could e.g.
+  // be truncated.
+  //
+  // Also note that this leaves "*source" in a state that is unsuitable for
+  // further operations, such as RawUncompress(). You will need to rewind
+  // or recreate the source yourself before attempting any further calls.
   bool GetUncompressedLength(Source* source, uint32* result);
 
   // ------------------------------------------------------------------------
@@ -77,6 +84,18 @@
   bool Uncompress(const char* compressed, size_t compressed_length,
                   string* uncompressed);
 
+  // Decompresses "compressed" to "*uncompressed".
+  //
+  // returns false if the message is corrupted and could not be decompressed
+  bool Uncompress(Source* compressed, Sink* uncompressed);
+
+  // This routine uncompresses as much of the "compressed" as possible
+  // into sink.  It returns the number of valid bytes added to sink
+  // (extra invalid bytes may have been added due to errors; the caller
+  // should ignore those). The emitted data typically has length
+  // GetUncompressedLength(), but may be shorter if an error is
+  // encountered.
+  size_t UncompressAsMuchAsPossible(Source* compressed, Sink* uncompressed);
 
   // ------------------------------------------------------------------------
   // Lower-level character array based routines.  May be useful for
@@ -117,6 +136,28 @@
   // returns false if the message is corrupted and could not be decrypted
   bool RawUncompress(Source* compressed, char* uncompressed);
 
+  // Given data in "compressed[0..compressed_length-1]" generated by
+  // calling the Snappy::Compress routine, this routine
+  // stores the uncompressed data to the iovec "iov". The number of physical
+  // buffers in "iov" is given by iov_cnt and their cumulative size
+  // must be at least GetUncompressedLength(compressed). The individual buffers
+  // in "iov" must not overlap with each other.
+  //
+  // returns false if the message is corrupted and could not be decompressed
+  bool RawUncompressToIOVec(const char* compressed, size_t compressed_length,
+                            const struct iovec* iov, size_t iov_cnt);
+
+  // Given data from the byte source 'compressed' generated by calling
+  // the Snappy::Compress routine, this routine stores the uncompressed
+  // data to the iovec "iov". The number of physical
+  // buffers in "iov" is given by iov_cnt and their cumulative size
+  // must be at least GetUncompressedLength(compressed). The individual buffers
+  // in "iov" must not overlap with each other.
+  //
+  // returns false if the message is corrupted and could not be decompressed
+  bool RawUncompressToIOVec(Source* compressed, const struct iovec* iov,
+                            size_t iov_cnt);
+
   // Returns the maximal size of the compressed representation of
   // input data that is "source_bytes" bytes in length;
   size_t MaxCompressedLength(size_t source_bytes);
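
A hedged usage sketch for the iovec routines documented above; note that this bundled copy declares iovec in the snappy namespace (see the snappy-stubs-public.h hunk), and the two buffers' cumulative size exactly matches GetUncompressedLength():

  #include <cassert>
  #include <string>
  #include <snappy.h>

  int main() {
    std::string input(100, 'x'), compressed;
    snappy::Compress(input.data(), input.size(), &compressed);

    // Scatter the 100 uncompressed bytes across two caller-owned buffers.
    char first[40], second[60];
    snappy::iovec iov[2];
    iov[0].iov_base = first;  iov[0].iov_len = sizeof(first);
    iov[1].iov_base = second; iov[1].iov_len = sizeof(second);

    assert(snappy::RawUncompressToIOVec(compressed.data(), compressed.size(),
                                        iov, 2));
    assert(first[0] == 'x' && second[59] == 'x');
    return 0;
  }
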
@@ -135,21 +176,28 @@
   bool IsValidCompressedBuffer(const char* compressed,
                                size_t compressed_length);
 
-  // *** DO NOT CHANGE THE VALUE OF kBlockSize ***
+  // Returns true iff the contents of "compressed" can be uncompressed
+  // successfully.  Does not return the uncompressed data.  Takes
+  // time proportional to *compressed length, but is usually at least
+  // a factor of four faster than actual decompression.
+  // On success, consumes all of *compressed.  On failure, consumes an
+  // unspecified prefix of *compressed.
+  bool IsValidCompressed(Source* compressed);
+
+  // The size of a compression block. Note that many parts of the compression
+  // code assume that kBlockSize <= 65536; in particular, the hash table
+  // can only store 16-bit offsets, and EmitCopy() also assumes the offset
+  // is 65535 bytes or less. Note also that if you change this, it will
+  // affect the framing format (see framing_format.txt).
   //
-  // New Compression code chops up the input into blocks of at most
-  // the following size.  This ensures that back-references in the
-  // output never cross kBlockSize block boundaries.  This can be
-  // helpful in implementing blocked decompression.  However the
-  // decompression code should not rely on this guarantee since older
-  // compression code may not obey it.
-  static const int kBlockLog = 15;
+  // Note that there might be older data around that is compressed with larger
+  // block sizes, so the decompression code should not rely on the
+  // non-existence of long backreferences.
+  static const int kBlockLog = 16;
   static const size_t kBlockSize = 1 << kBlockLog;
 
   static const int kMaxHashTableBits = 14;
   static const size_t kMaxHashTableSize = 1 << kMaxHashTableBits;
-
 }  // end namespace snappy
 
-
-#endif  // UTIL_SNAPPY_SNAPPY_H__
+#endif  // THIRD_PARTY_SNAPPY_SNAPPY_H__