Merge branch 'main' into update-snappy
diff --git a/.gitignore b/.gitignore
index 61b4fa5..d722831 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,41 @@
*.la
*~
*.swp
+_build
+c_src/snappy/*.d
+c_src/*.d
+compile_commands.json
+
+### Elixir ###
+/_build
+/cover
+/deps
+/doc
+/.fetch
+erl_crash.dump
+*.ez
+*.beam
+/config/*.secret.exs
+.elixir_ls/
+
+### Elixir Patch ###
+
+### Erlang ###
+.eunit
+*.o
+*.plt
+.concrete/DEV_MODE
+
+# rebar 2.x
+.rebar
+rel/example_project
+ebin/*.beam
+deps
+
+# rebar 3
+.rebar3
+_build/
+_checkouts/
+
+### Erlang Patch ###
+rebar3.crashdump
diff --git a/Makefile b/Makefile
index 058a7ec..9be4a26 100644
--- a/Makefile
+++ b/Makefile
@@ -1,41 +1,14 @@
-REBAR?=rebar
+all: snappy eunit
+snappy:
+ ./rebar3 compile
-.PHONY: all
-# target: all - Makes everything
-all: build
-
-
-.PHONY: build
-# target: build - Builds the project
-build:
- $(REBAR) compile
-
-
-.PHONY: check
-# target: check - Checks if project builds and passes all the tests
-check: build eunit
-
-
-.PHONY: clean
-# target: clean - Removes build artifacts
-clean:
- $(REBAR) clean
-
-
-.PHONY: distclean
-# target: distclean - Removes all unversioned files
-distclean: clean
- git clean -fxd
-
-
-.PHONY: eunit
-# target: eunit - Runs eunit test suite
eunit:
- $(REBAR) eunit
+ ./rebar3 eunit
+check: eunit
-.PHONY: help
-# target: help - Prints this help
-help:
- @egrep "^# target:" Makefile | sed -e 's/^# target: //g' | sort
+clean:
+ ./rebar3 clean
+ rm -fr priv ebin
+
diff --git a/README.md b/README.md
index 0e54b59..b851eec 100644
--- a/README.md
+++ b/README.md
@@ -8,11 +8,42 @@
Its source is included in this project.
+# changelog
+
+- tag `1.1.2` as tests pass against OTP-24
+- tag `otp-24` compatibility
+- switch build chain to [rebar3] and [enc] requiring OTP20 or newer
+- tag `snappy-1.1.8`, also first version compatible with OTP23 (thanks @skaes)
+- tag `snappy-1.1.3`
+- add a changelog
+
+[rebar3]: https://rebar3.org/
+[enc]: https://github.com/davisp/erlang-native-compiler
# site
-https://github.com/fdmanana/snappy-erlang-nif
+https://github.com/skunkwerks/snappy-erlang-nif
+# credits
+
+Software is built by a few people and maintained by many. Thank you for
+all your patches!
+
+
+- Arne Ehrlich <Arne.Ehrlich@groknet.de>
+- Bryan Chan <bryanpkc@gmail.com>
+- Dave Cottlehuber <dch@skunkwerks.at>
+- Filipe David Manana <fdmanana@apache.org>
+- Joshua Scott <joshua.scott@gmail.com>
+- Kelly McLaughlin <kelly@basho.com>
+- Louis-Philippe Gauthier <lpgauth@gmail.com>
+- Mikhail Uvarov <arcusfelis@gmail.com>
+- Peter Membrey <peter@membrey.hk>
+- Piotr Nosek <piotr.nosek@erlang-solutions.com>
+- benoitc <bchesneau@gmail.com>
+- @dch314
+- glejeune <gregoire.lejeune@free.fr>
+- Stefan Kaes <stkaes@gmail.com>
# performance tests
diff --git a/c_src/snappy/snappy-internal.h b/c_src/snappy/snappy-internal.h
index 0653dc6..1e1c307 100644
--- a/c_src/snappy/snappy-internal.h
+++ b/c_src/snappy/snappy-internal.h
@@ -36,21 +36,30 @@
namespace snappy {
namespace internal {
+// Working memory performs a single allocation to hold all scratch space
+// required for compression.
class WorkingMemory {
public:
- WorkingMemory() : large_table_(NULL) { }
- ~WorkingMemory() { delete[] large_table_; }
+ explicit WorkingMemory(size_t input_size);
+ ~WorkingMemory();
// Allocates and clears a hash table using memory in "*this",
// stores the number of buckets in "*table_size" and returns a pointer to
// the base of the hash table.
- uint16* GetHashTable(size_t input_size, int* table_size);
+ uint16* GetHashTable(size_t fragment_size, int* table_size) const;
+ char* GetScratchInput() const { return input_; }
+ char* GetScratchOutput() const { return output_; }
private:
- uint16 small_table_[1<<10]; // 2KB
- uint16* large_table_; // Allocated only when needed
+ char* mem_; // the allocated memory, never nullptr
+ size_t size_; // the size of the allocated memory, never 0
+ uint16* table_; // the pointer to the hashtable
+ char* input_; // the pointer to the input scratch buffer
+ char* output_; // the pointer to the output scratch buffer
- DISALLOW_COPY_AND_ASSIGN(WorkingMemory);
+ // No copying
+ WorkingMemory(const WorkingMemory&);
+ void operator=(const WorkingMemory&);
};
// Flat array compression that does not emit the "uncompressed length"
@@ -70,57 +79,72 @@
uint16* table,
const int table_size);
-// Return the largest n such that
+// Find the largest n such that
//
// s1[0,n-1] == s2[0,n-1]
// and n <= (s2_limit - s2).
//
+// Return make_pair(n, n < 8).
// Does not read *s2_limit or beyond.
// Does not read *(s1 + (s2_limit - s2)) or beyond.
// Requires that s2_limit >= s2.
//
-// Separate implementation for x86_64, for speed. Uses the fact that
-// x86_64 is little endian.
-#if defined(ARCH_K8)
-static inline int FindMatchLength(const char* s1,
- const char* s2,
- const char* s2_limit) {
+// Separate implementation for 64-bit, little-endian cpus.
+#if !defined(SNAPPY_IS_BIG_ENDIAN) && \
+ (defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM))
+static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
+ const char* s2,
+ const char* s2_limit) {
assert(s2_limit >= s2);
- int matched = 0;
+ size_t matched = 0;
+
+ // This block isn't necessary for correctness; we could just start looping
+ // immediately. As an optimization though, it is useful. It creates some not
+ // uncommon code paths that determine, without extra effort, whether the match
+ // length is less than 8. In short, we are hoping to avoid a conditional
+ // branch, and perhaps get better code layout from the C++ compiler.
+ if (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 8)) {
+ uint64 a1 = UNALIGNED_LOAD64(s1);
+ uint64 a2 = UNALIGNED_LOAD64(s2);
+ if (a1 != a2) {
+ return std::pair<size_t, bool>(Bits::FindLSBSetNonZero64(a1 ^ a2) >> 3,
+ true);
+ } else {
+ matched = 8;
+ s2 += 8;
+ }
+ }
// Find out how long the match is. We loop over the data 64 bits at a
// time until we find a 64-bit block that doesn't match; then we find
// the first non-matching bit and use that to calculate the total
// length of the match.
- while (PREDICT_TRUE(s2 <= s2_limit - 8)) {
+ while (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 8)) {
if (UNALIGNED_LOAD64(s2) == UNALIGNED_LOAD64(s1 + matched)) {
s2 += 8;
matched += 8;
} else {
- // On current (mid-2008) Opteron models there is a 3% more
- // efficient code sequence to find the first non-matching byte.
- // However, what follows is ~10% better on Intel Core 2 and newer,
- // and we expect AMD's bsf instruction to improve.
uint64 x = UNALIGNED_LOAD64(s2) ^ UNALIGNED_LOAD64(s1 + matched);
int matching_bits = Bits::FindLSBSetNonZero64(x);
matched += matching_bits >> 3;
- return matched;
+ assert(matched >= 8);
+ return std::pair<size_t, bool>(matched, false);
}
}
- while (PREDICT_TRUE(s2 < s2_limit)) {
+ while (SNAPPY_PREDICT_TRUE(s2 < s2_limit)) {
if (s1[matched] == *s2) {
++s2;
++matched;
} else {
- return matched;
+ return std::pair<size_t, bool>(matched, matched < 8);
}
}
- return matched;
+ return std::pair<size_t, bool>(matched, matched < 8);
}
#else
-static inline int FindMatchLength(const char* s1,
- const char* s2,
- const char* s2_limit) {
+static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
+ const char* s2,
+ const char* s2_limit) {
// Implementation based on the x86-64 version, above.
assert(s2_limit >= s2);
int matched = 0;
@@ -140,10 +164,67 @@
++matched;
}
}
- return matched;
+ return std::pair<size_t, bool>(matched, matched < 8);
}
#endif
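An editorial sketch, not part of the patch: the contract described above, restated with a plain byte-by-byte reference loop. The pair's first member is the length of the common prefix; the second flags whether that length is below 8.

    // Editor's illustration only; not part of the snappy sources.
    #include <cassert>
    #include <cstddef>
    #include <utility>

    static std::pair<size_t, bool> ReferenceMatchLength(const char* s1,
                                                        const char* s2,
                                                        const char* s2_limit) {
      size_t n = 0;
      while (s2 + n < s2_limit && s1[n] == s2[n]) ++n;  // never reads *s2_limit
      return std::pair<size_t, bool>(n, n < 8);         // flag a "short" match
    }

    int main() {
      const char buf[] = "abcdefghabcdefgh";
      // s1 = buf, s2 = buf + 8: the first eight bytes repeat, so the match
      // length is 8 and the "shorter than 8" flag is false.
      std::pair<size_t, bool> p = ReferenceMatchLength(buf, buf + 8, buf + 16);
      assert(p.first == 8 && !p.second);
      return 0;
    }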
+// Lookup tables for decompression code. Give --snappy_dump_decompression_table
+// to the unit test to recompute char_table.
+
+enum {
+ LITERAL = 0,
+ COPY_1_BYTE_OFFSET = 1, // 3 bit length + 3 bits of offset in opcode
+ COPY_2_BYTE_OFFSET = 2,
+ COPY_4_BYTE_OFFSET = 3
+};
+static const int kMaximumTagLength = 5; // COPY_4_BYTE_OFFSET plus the actual offset.
+
+// Data stored per entry in lookup table:
+// Range Bits-used Description
+// ------------------------------------
+// 1..64 0..7 Literal/copy length encoded in opcode byte
+// 0..7 8..10 Copy offset encoded in opcode byte / 256
+// 0..4 11..13 Extra bytes after opcode
+//
+// We use eight bits for the length even though 7 would have sufficed
+// because of efficiency reasons:
+// (1) Extracting a byte is faster than a bit-field
+// (2) It properly aligns copy offset so we do not need a <<8
+static const uint16 char_table[256] = {
+ 0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
+ 0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
+ 0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
+ 0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008,
+ 0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a,
+ 0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c,
+ 0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e,
+ 0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010,
+ 0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012,
+ 0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014,
+ 0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016,
+ 0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018,
+ 0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a,
+ 0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c,
+ 0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e,
+ 0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020,
+ 0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022,
+ 0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024,
+ 0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026,
+ 0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028,
+ 0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a,
+ 0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c,
+ 0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e,
+ 0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030,
+ 0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032,
+ 0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034,
+ 0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036,
+ 0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038,
+ 0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
+ 0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
+ 0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
+ 0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
+};
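To make the bit layout documented above concrete, here is an editorial sketch (not part of the patch) that unpacks one entry; 0x0804 is the table's value for tag byte 0x01, a one-byte-offset copy.

    // Editor's illustration only; not part of the snappy sources.
    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint16_t entry = 0x0804;                     // char_table[0x01]
      const unsigned length      = entry & 0xff;         // bits 0..7  -> 4
      const unsigned copy_offset = (entry >> 8) & 0x7;   // bits 8..10 -> 0 (offset/256)
      const unsigned extra_bytes = (entry >> 11) & 0x7;  // bits 11..13 -> 1
      std::printf("len=%u offset_hi=%u extra=%u\n", length, copy_offset, extra_bytes);
      return 0;
    }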
+
} // end namespace internal
} // end namespace snappy
diff --git a/c_src/snappy/snappy-stubs-internal.cc b/c_src/snappy/snappy-stubs-internal.cc
index 6ed3343..66ed2e9 100644
--- a/c_src/snappy/snappy-stubs-internal.cc
+++ b/c_src/snappy/snappy-stubs-internal.cc
@@ -33,7 +33,7 @@
namespace snappy {
-void Varint::Append32(string* s, uint32 value) {
+void Varint::Append32(std::string* s, uint32 value) {
char buf[Varint::kMax32];
const char* p = Varint::Encode32(buf, value);
s->append(buf, p - buf);
diff --git a/c_src/snappy/snappy-stubs-internal.h b/c_src/snappy/snappy-stubs-internal.h
index a07b6df..4854689 100644
--- a/c_src/snappy/snappy-stubs-internal.h
+++ b/c_src/snappy/snappy-stubs-internal.h
@@ -45,11 +45,26 @@
#include <sys/mman.h>
#endif
-#ifdef _MSC_VER
-#include <BaseTsd.h>
-typedef SSIZE_T ssize_t;
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
#endif
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif // defined(_MSC_VER)
+
+#ifndef __has_feature
+#define __has_feature(x) 0
+#endif
+
+#if __has_feature(memory_sanitizer)
+#include <sanitizer/msan_interface.h>
+#define SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(address, size) \
+ __msan_unpoison((address), (size))
+#else
+#define SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(address, size) /* empty */
+#endif // __has_feature(memory_sanitizer)
+
#include "snappy-stubs-public.h"
#if defined(__x86_64__)
@@ -57,6 +72,14 @@
// Enable 64-bit optimized versions of some routines.
#define ARCH_K8 1
+#elif defined(__ppc64__)
+
+#define ARCH_PPC 1
+
+#elif defined(__aarch64__)
+
+#define ARCH_ARM 1
+
#endif
// Needed by OS X, among others.
@@ -64,10 +87,6 @@
#define MAP_ANONYMOUS MAP_ANON
#endif
-// Pull in std::min, std::ostream, and the likes. This is safe because this
-// header file is never used from any public header files.
-using namespace std;
-
// The size of an array, if known at compile-time.
// Will give unexpected results if used on a pointer.
// We undefine it first, since some compilers already have a definition.
@@ -78,11 +97,11 @@
// Static prediction hints.
#ifdef HAVE_BUILTIN_EXPECT
-#define PREDICT_FALSE(x) (__builtin_expect(x, 0))
-#define PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
+#define SNAPPY_PREDICT_FALSE(x) (__builtin_expect(x, 0))
+#define SNAPPY_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
#else
-#define PREDICT_FALSE(x) x
-#define PREDICT_TRUE(x) x
+#define SNAPPY_PREDICT_FALSE(x) x
+#define SNAPPY_PREDICT_TRUE(x) x
#endif
// This is only used for recomputing the tag byte table used during
@@ -101,9 +120,10 @@
// Potentially unaligned loads and stores.
-// x86 and PowerPC can simply do these loads and stores native.
+// x86, PowerPC, and ARM64 can simply do these loads and stores native.
-#if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__)
+#if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || \
+ defined(__aarch64__)
#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
@@ -121,6 +141,15 @@
// sub-architectures.
//
// This is a mess, but there's not much we can do about it.
+//
+// To further complicate matters, only LDR instructions (single reads) are
+// allowed to be unaligned, not LDRD (two reads) or LDM (many reads). Unless we
+// explicitly tell the compiler that these accesses can be unaligned, it can and
+// will combine accesses. On armcc, the way to signal this is done by accessing
+// through the type (uint32 __packed *), but GCC has no such attribute
+// (it ignores __attribute__((packed)) on individual variables). However,
+// we can tell it that a _struct_ is unaligned, which has the same effect,
+// so we do that.
#elif defined(__arm__) && \
!defined(__ARM_ARCH_4__) && \
@@ -136,13 +165,41 @@
!defined(__ARM_ARCH_6ZK__) && \
!defined(__ARM_ARCH_6T2__)
-#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
-#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
+#if __GNUC__
+#define ATTRIBUTE_PACKED __attribute__((__packed__))
+#else
+#define ATTRIBUTE_PACKED
+#endif
-#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
-#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
+namespace base {
+namespace internal {
-// TODO(user): NEON supports unaligned 64-bit loads and stores.
+struct Unaligned16Struct {
+ uint16 value;
+ uint8 dummy; // To make the size non-power-of-two.
+} ATTRIBUTE_PACKED;
+
+struct Unaligned32Struct {
+ uint32 value;
+ uint8 dummy; // To make the size non-power-of-two.
+} ATTRIBUTE_PACKED;
+
+} // namespace internal
+} // namespace base
+
+#define UNALIGNED_LOAD16(_p) \
+ ((reinterpret_cast<const ::snappy::base::internal::Unaligned16Struct *>(_p))->value)
+#define UNALIGNED_LOAD32(_p) \
+ ((reinterpret_cast<const ::snappy::base::internal::Unaligned32Struct *>(_p))->value)
+
+#define UNALIGNED_STORE16(_p, _val) \
+ ((reinterpret_cast< ::snappy::base::internal::Unaligned16Struct *>(_p))->value = \
+ (_val))
+#define UNALIGNED_STORE32(_p, _val) \
+ ((reinterpret_cast< ::snappy::base::internal::Unaligned32Struct *>(_p))->value = \
+ (_val))
+
+// TODO: NEON supports unaligned 64-bit loads and stores.
// See if that would be more efficient on platforms supporting it,
// at least for copies.
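The packed-struct trick is small enough to show in isolation. An editorial sketch, not part of the patch, and GCC/Clang only: it reads a 32-bit value from a deliberately misaligned address through a packed struct, the same way the macros above do.

    // Editor's illustration only; not part of the snappy sources.
    #include <cassert>
    #include <cstdint>

    struct Unaligned32 {
      uint32_t value;
      uint8_t dummy;  // keeps sizeof() from being a power of two
    } __attribute__((__packed__));

    int main() {
      unsigned char buf[8] = {0, 0x78, 0x56, 0x34, 0x12, 0, 0, 0};
      // buf + 1 is not 4-byte aligned; going through the packed struct tells
      // the compiler it may not assume alignment for this load.
      uint32_t v = reinterpret_cast<const Unaligned32*>(buf + 1)->value;
      assert(v == 0x12345678 || v == 0x78563412);  // little- or big-endian host
      return 0;
    }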
@@ -193,22 +250,8 @@
#endif
-// This can be more efficient than UNALIGNED_LOAD64 + UNALIGNED_STORE64
-// on some platforms, in particular ARM.
-inline void UnalignedCopy64(const void *src, void *dst) {
- if (sizeof(void *) == 8) {
- UNALIGNED_STORE64(dst, UNALIGNED_LOAD64(src));
- } else {
- const char *src_char = reinterpret_cast<const char *>(src);
- char *dst_char = reinterpret_cast<char *>(dst);
-
- UNALIGNED_STORE32(dst_char, UNALIGNED_LOAD32(src_char));
- UNALIGNED_STORE32(dst_char + 4, UNALIGNED_LOAD32(src_char + 4));
- }
-}
-
// The following guarantees declaration of the byte swap functions.
-#ifdef WORDS_BIGENDIAN
+#if defined(SNAPPY_IS_BIG_ENDIAN)
#ifdef HAVE_SYS_BYTEORDER_H
#include <sys/byteorder.h>
@@ -265,7 +308,7 @@
#endif
-#endif // WORDS_BIGENDIAN
+#endif // defined(SNAPPY_IS_BIG_ENDIAN)
// Convert to little-endian storage, opposite of network format.
// Convert x from host to little endian: x = LittleEndian.FromHost(x);
@@ -279,7 +322,7 @@
class LittleEndian {
public:
// Conversion functions.
-#ifdef WORDS_BIGENDIAN
+#if defined(SNAPPY_IS_BIG_ENDIAN)
static uint16 FromHost16(uint16 x) { return bswap_16(x); }
static uint16 ToHost16(uint16 x) { return bswap_16(x); }
@@ -289,7 +332,7 @@
static bool IsLittleEndian() { return false; }
-#else // !defined(WORDS_BIGENDIAN)
+#else // !defined(SNAPPY_IS_BIG_ENDIAN)
static uint16 FromHost16(uint16 x) { return x; }
static uint16 ToHost16(uint16 x) { return x; }
@@ -299,7 +342,7 @@
static bool IsLittleEndian() { return true; }
-#endif // !defined(WORDS_BIGENDIAN)
+#endif // !defined(SNAPPY_IS_BIG_ENDIAN)
// Functions to do unaligned loads and stores in little-endian order.
static uint16 Load16(const void *p) {
@@ -322,6 +365,9 @@
// Some bit-manipulation functions.
class Bits {
public:
+ // Return floor(log2(n)) for positive integer n.
+ static int Log2FloorNonZero(uint32 n);
+
// Return floor(log2(n)) for positive integer n. Returns -1 iff n == 0.
static int Log2Floor(uint32 n);
@@ -329,31 +375,85 @@
// undefined value if n == 0. FindLSBSetNonZero() is similar to ffs() except
// that it's 0-indexed.
static int FindLSBSetNonZero(uint32 n);
+
+#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
static int FindLSBSetNonZero64(uint64 n);
+#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
private:
- DISALLOW_COPY_AND_ASSIGN(Bits);
+ // No copying
+ Bits(const Bits&);
+ void operator=(const Bits&);
};
#ifdef HAVE_BUILTIN_CTZ
+inline int Bits::Log2FloorNonZero(uint32 n) {
+ assert(n != 0);
+ // (31 ^ x) is equivalent to (31 - x) for x in [0, 31]. An easy proof
+ // represents subtraction in base 2 and observes that there's no carry.
+ //
+ // GCC and Clang represent __builtin_clz on x86 as 31 ^ _bit_scan_reverse(x).
+ // Using "31 ^" here instead of "31 -" allows the optimizer to strip the
+ // function body down to _bit_scan_reverse(x).
+ return 31 ^ __builtin_clz(n);
+}
+
inline int Bits::Log2Floor(uint32 n) {
- return n == 0 ? -1 : 31 ^ __builtin_clz(n);
+ return (n == 0) ? -1 : Bits::Log2FloorNonZero(n);
}
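The "31 ^" identity used in Log2FloorNonZero above is worth checking once in isolation. An editorial sketch, not part of the patch, GCC/Clang only because of __builtin_clz:

    // Editor's illustration only; not part of the snappy sources.
    #include <cassert>
    #include <cstdint>

    static int NaiveLog2Floor(uint32_t n) {
      int log = -1;
      while (n) { n >>= 1; ++log; }
      return log;
    }

    int main() {
      const uint32_t tests[6] = {1u, 2u, 3u, 255u, 256u, 0x80000000u};
      for (int i = 0; i < 6; ++i) {
        // 31 is 0b11111; subtracting any 5-bit value from it never borrows,
        // so 31 - clz(n) and 31 ^ clz(n) are the same number.
        assert((31 ^ __builtin_clz(tests[i])) == NaiveLog2Floor(tests[i]));
      }
      return 0;
    }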
inline int Bits::FindLSBSetNonZero(uint32 n) {
+ assert(n != 0);
return __builtin_ctz(n);
}
+#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
inline int Bits::FindLSBSetNonZero64(uint64 n) {
+ assert(n != 0);
return __builtin_ctzll(n);
}
+#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
+
+#elif defined(_MSC_VER)
+
+inline int Bits::Log2FloorNonZero(uint32 n) {
+ assert(n != 0);
+ unsigned long where;
+ _BitScanReverse(&where, n);
+ return static_cast<int>(where);
+}
+
+inline int Bits::Log2Floor(uint32 n) {
+ unsigned long where;
+ if (_BitScanReverse(&where, n))
+ return static_cast<int>(where);
+ return -1;
+}
+
+inline int Bits::FindLSBSetNonZero(uint32 n) {
+ assert(n != 0);
+ unsigned long where;
+ if (_BitScanForward(&where, n))
+ return static_cast<int>(where);
+ return 32;
+}
+
+#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
+inline int Bits::FindLSBSetNonZero64(uint64 n) {
+ assert(n != 0);
+ unsigned long where;
+ if (_BitScanForward64(&where, n))
+ return static_cast<int>(where);
+ return 64;
+}
+#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
#else // Portable versions.
-inline int Bits::Log2Floor(uint32 n) {
- if (n == 0)
- return -1;
+inline int Bits::Log2FloorNonZero(uint32 n) {
+ assert(n != 0);
+
int log = 0;
uint32 value = n;
for (int i = 4; i >= 0; --i) {
@@ -368,7 +468,13 @@
return log;
}
+inline int Bits::Log2Floor(uint32 n) {
+ return (n == 0) ? -1 : Bits::Log2FloorNonZero(n);
+}
+
inline int Bits::FindLSBSetNonZero(uint32 n) {
+ assert(n != 0);
+
int rc = 31;
for (int i = 4, shift = 1 << 4; i >= 0; --i) {
const uint32 x = n << shift;
@@ -381,8 +487,11 @@
return rc;
}
+#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
// FindLSBSetNonZero64() is defined in terms of FindLSBSetNonZero().
inline int Bits::FindLSBSetNonZero64(uint64 n) {
+ assert(n != 0);
+
const uint32 bottombits = static_cast<uint32>(n);
if (bottombits == 0) {
// Bottom bits are zero, so scan in top bits
@@ -391,6 +500,7 @@
return FindLSBSetNonZero(bottombits);
}
}
+#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
#endif // End portable versions.
@@ -414,7 +524,7 @@
static char* Encode32(char* ptr, uint32 v);
// EFFECTS Appends the varint representation of "value" to "*s".
- static void Append32(string* s, uint32 value);
+ static void Append32(std::string* s, uint32 value);
};
inline const char* Varint::Parse32WithLimit(const char* p,
@@ -471,7 +581,7 @@
// replace this function with one that resizes the string without
// filling the new space with zeros (if applicable) --
// it will be non-portable but faster.
-inline void STLStringResizeUninitialized(string* s, size_t new_size) {
+inline void STLStringResizeUninitialized(std::string* s, size_t new_size) {
s->resize(new_size);
}
@@ -487,7 +597,7 @@
// (http://www.open-std.org/JTC1/SC22/WG21/docs/lwg-defects.html#530)
// proposes this as the method. It will officially be part of the standard
// for C++0x. This should already work on all current implementations.
-inline char* string_as_array(string* str) {
+inline char* string_as_array(std::string* str) {
return str->empty() ? NULL : &*str->begin();
}
diff --git a/c_src/snappy/snappy-stubs-public.h b/c_src/snappy/snappy-stubs-public.h
index aec79dc..c156ba4 100644
--- a/c_src/snappy/snappy-stubs-public.h
+++ b/c_src/snappy/snappy-stubs-public.h
@@ -33,11 +33,20 @@
// which is a public header. Instead, snappy-stubs-public.h is generated by
// from snappy-stubs-public.h.in at configure time.
-#ifndef UTIL_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_
-#define UTIL_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_
+#ifndef THIRD_PARTY_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_
+#define THIRD_PARTY_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_
+#if 1
#include <stdint.h>
+#endif
+
+#if 1
#include <stddef.h>
+#endif
+
+#if 0
+#include <sys/uio.h>
+#endif
#define SNAPPY_MAJOR 1
#define SNAPPY_MINOR 1
@@ -49,6 +58,7 @@
namespace snappy {
+#if 1
typedef int8_t int8;
typedef uint8_t uint8;
typedef int16_t int16;
@@ -57,6 +67,16 @@
typedef uint32_t uint32;
typedef int64_t int64;
typedef uint64_t uint64;
+#else
+typedef signed char int8;
+typedef unsigned char uint8;
+typedef short int16;
+typedef unsigned short uint16;
+typedef int int32;
+typedef unsigned int uint32;
+typedef long long int64;
+typedef unsigned long long uint64;
+#endif
typedef std::string string;
@@ -64,11 +84,15 @@
TypeName(const TypeName&); \
void operator=(const TypeName&)
+#if !0
+// Windows does not have an iovec type, yet the concept is universally useful.
+// It is simple to define it ourselves, so we put it inside our own namespace.
struct iovec {
- void* iov_base;
- size_t iov_len;
+ void* iov_base;
+ size_t iov_len;
};
+#endif
} // namespace snappy
-#endif // UTIL_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_
+#endif // THIRD_PARTY_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_
diff --git a/c_src/snappy/snappy.cc b/c_src/snappy/snappy.cc
index 09eab49..ce1eef4 100644
--- a/c_src/snappy/snappy.cc
+++ b/c_src/snappy/snappy.cc
@@ -30,15 +30,58 @@
#include "snappy-internal.h"
#include "snappy-sinksource.h"
+#if !defined(SNAPPY_HAVE_SSSE3)
+// __SSSE3__ is defined by GCC and Clang. Visual Studio doesn't target SIMD
+// support between SSE2 and AVX (so SSSE3 instructions require AVX support), and
+// defines __AVX__ when AVX support is available.
+#if defined(__SSSE3__) || defined(__AVX__)
+#define SNAPPY_HAVE_SSSE3 1
+#else
+#define SNAPPY_HAVE_SSSE3 0
+#endif
+#endif // !defined(SNAPPY_HAVE_SSSE3)
+
+#if !defined(SNAPPY_HAVE_BMI2)
+// __BMI2__ is defined by GCC and Clang. Visual Studio doesn't target BMI2
+// specifically, but it does define __AVX2__ when AVX2 support is available.
+// Fortunately, AVX2 was introduced in Haswell, just like BMI2.
+//
+// BMI2 is not defined as a subset of AVX2 (unlike SSSE3 and AVX above). So,
+// GCC and Clang can build code with AVX2 enabled but BMI2 disabled, in which
+// case issuing BMI2 instructions results in a compiler error.
+#if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__))
+#define SNAPPY_HAVE_BMI2 1
+#else
+#define SNAPPY_HAVE_BMI2 0
+#endif
+#endif // !defined(SNAPPY_HAVE_BMI2)
+
+#if SNAPPY_HAVE_SSSE3
+// Please do not replace with <x86intrin.h> or with headers that assume more
+// advanced SSE versions without checking with all the OWNERS.
+#include <tmmintrin.h>
+#endif
+
+#if SNAPPY_HAVE_BMI2
+// Please do not replace with <x86intrin.h> or with headers that assume more
+// advanced SSE versions without checking with all the OWNERS.
+#include <immintrin.h>
+#endif
+
#include <stdio.h>
#include <algorithm>
#include <string>
#include <vector>
-
namespace snappy {
+using internal::COPY_1_BYTE_OFFSET;
+using internal::COPY_2_BYTE_OFFSET;
+using internal::LITERAL;
+using internal::char_table;
+using internal::kMaximumTagLength;
+
// Any hash function will produce a valid compressed bitstream, but a good
// hash function reduces the number of collisions and thus yields better
// compression for compressible input, and more speed for incompressible
@@ -76,162 +119,314 @@
return 32 + source_len + source_len/6;
}
-enum {
- LITERAL = 0,
- COPY_1_BYTE_OFFSET = 1, // 3 bit length + 3 bits of offset in opcode
- COPY_2_BYTE_OFFSET = 2,
- COPY_4_BYTE_OFFSET = 3
-};
-static const int kMaximumTagLength = 5; // COPY_4_BYTE_OFFSET plus the actual offset.
-
-// Copy "len" bytes from "src" to "op", one byte at a time. Used for
-// handling COPY operations where the input and output regions may
-// overlap. For example, suppose:
-// src == "ab"
-// op == src + 2
-// len == 20
-// After IncrementalCopy(src, op, len), the result will have
-// eleven copies of "ab"
-// ababababababababababab
-// Note that this does not match the semantics of either memcpy()
-// or memmove().
-static inline void IncrementalCopy(const char* src, char* op, ssize_t len) {
- assert(len > 0);
- do {
- *op++ = *src++;
- } while (--len > 0);
-}
-
-// Equivalent to IncrementalCopy except that it can write up to ten extra
-// bytes after the end of the copy, and that it is faster.
-//
-// The main part of this loop is a simple copy of eight bytes at a time until
-// we've copied (at least) the requested amount of bytes. However, if op and
-// src are less than eight bytes apart (indicating a repeating pattern of
-// length < 8), we first need to expand the pattern in order to get the correct
-// results. For instance, if the buffer looks like this, with the eight-byte
-// <src> and <op> patterns marked as intervals:
-//
-// abxxxxxxxxxxxx
-// [------] src
-// [------] op
-//
-// a single eight-byte copy from <src> to <op> will repeat the pattern once,
-// after which we can move <op> two bytes without moving <src>:
-//
-// ababxxxxxxxxxx
-// [------] src
-// [------] op
-//
-// and repeat the exercise until the two no longer overlap.
-//
-// This allows us to do very well in the special case of one single byte
-// repeated many times, without taking a big hit for more general cases.
-//
-// The worst case of extra writing past the end of the match occurs when
-// op - src == 1 and len == 1; the last copy will read from byte positions
-// [0..7] and write to [4..11], whereas it was only supposed to write to
-// position 1. Thus, ten excess bytes.
-
namespace {
-const int kMaxIncrementCopyOverflow = 10;
+void UnalignedCopy64(const void* src, void* dst) {
+ char tmp[8];
+ memcpy(tmp, src, 8);
+ memcpy(dst, tmp, 8);
+}
-inline void IncrementalCopyFastPath(const char* src, char* op, ssize_t len) {
- while (PREDICT_FALSE(op - src < 8)) {
- UnalignedCopy64(src, op);
- len -= op - src;
- op += op - src;
+void UnalignedCopy128(const void* src, void* dst) {
+ // memcpy gets vectorized when the appropriate compiler options are used.
+ // For example, x86 compilers targeting SSE2+ will optimize to an SSE2 load
+ // and store.
+ char tmp[16];
+ memcpy(tmp, src, 16);
+ memcpy(dst, tmp, 16);
+}
+
+// Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) a byte at a time. Used
+// for handling COPY operations where the input and output regions may overlap.
+// For example, suppose:
+// src == "ab"
+// op == src + 2
+// op_limit == op + 20
+// After IncrementalCopySlow(src, op, op_limit), the result will have eleven
+// copies of "ab"
+// ababababababababababab
+// Note that this does not match the semantics of either memcpy() or memmove().
+inline char* IncrementalCopySlow(const char* src, char* op,
+ char* const op_limit) {
+ // TODO: Remove pragma when LLVM is aware this
+ // function is only called in cold regions and when cold regions don't get
+ // vectorized or unrolled.
+#ifdef __clang__
+#pragma clang loop unroll(disable)
+#endif
+ while (op < op_limit) {
+ *op++ = *src++;
}
- while (len > 0) {
+ return op_limit;
+}
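The "eleven copies of ab" example from the comment above can be run directly. An editorial sketch, not part of the patch, using a byte-wise loop with the same overlapping-copy semantics as IncrementalCopySlow:

    // Editor's illustration only; not part of the snappy sources.
    #include <cassert>
    #include <cstring>

    static void BytewiseCopy(const char* src, char* op, char* op_limit) {
      while (op < op_limit) *op++ = *src++;  // src may trail op by only a few bytes
    }

    int main() {
      char buf[32] = "ab";                   // pattern of length 2 at the start
      BytewiseCopy(buf, buf + 2, buf + 22);  // copy 20 bytes, starting 2 behind
      assert(std::memcmp(buf, "ababababababababababab", 22) == 0);
      return 0;
    }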
+
+#if SNAPPY_HAVE_SSSE3
+
+// This is a table of shuffle control masks that can be used as the source
+// operand for PSHUFB to permute the contents of the destination XMM register
+// into a repeating byte pattern.
+alignas(16) const char pshufb_fill_patterns[7][16] = {
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
+ {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},
+ {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},
+ {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0},
+ {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3},
+ {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1},
+};
+
+#endif // SNAPPY_HAVE_SSSE3
+
+// Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) but faster than
+// IncrementalCopySlow. buf_limit is the address past the end of the writable
+// region of the buffer.
+inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
+ char* const buf_limit) {
+ // Terminology:
+ //
+ // slop = buf_limit - op
+ // pat = op - src
+ // len = limit - op
+ assert(src < op);
+ assert(op <= op_limit);
+ assert(op_limit <= buf_limit);
+ // NOTE: The compressor always emits 4 <= len <= 64. It is ok to assume that
+ // to optimize this function but we have to also handle other cases in case
+ // the input does not satisfy these conditions.
+
+ size_t pattern_size = op - src;
+ // The cases are split into different branches to allow the branch predictor,
+ // FDO, and static prediction hints to work better. For each input we list the
+ // ratio of invocations that match each condition.
+ //
+ // input slop < 16 pat < 8 len > 16
+ // ------------------------------------------
+ // html|html4|cp 0% 1.01% 27.73%
+ // urls 0% 0.88% 14.79%
+ // jpg 0% 64.29% 7.14%
+ // pdf 0% 2.56% 58.06%
+ // txt[1-4] 0% 0.23% 0.97%
+ // pb 0% 0.96% 13.88%
+ // bin 0.01% 22.27% 41.17%
+ //
+ // It is very rare that we don't have enough slop for doing block copies. It
+ // is also rare that we need to expand a pattern. Small patterns are common
+ // for incompressible formats and for those we are plenty fast already.
+ // Lengths are normally not greater than 16 but they vary depending on the
+ // input. In general if we always predict len <= 16 it would be an ok
+ // prediction.
+ //
+ // In order to be fast we want a pattern >= 8 bytes and an unrolled loop
+ // copying 2x 8 bytes at a time.
+
+ // Handle the uncommon case where pattern is less than 8 bytes.
+ if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) {
+#if SNAPPY_HAVE_SSSE3
+ // Load the first eight bytes into an 128-bit XMM register, then use PSHUFB
+ // to permute the register's contents in-place into a repeating sequence of
+ // the first "pattern_size" bytes.
+ // For example, suppose:
+ // src == "abc"
+ // op == op + 3
+ // After _mm_shuffle_epi8(), "pattern" will have five copies of "abc"
+ // followed by one byte of slop: abcabcabcabcabca.
+ //
+ // The non-SSE fallback implementation suffers from store-forwarding stalls
+ // because its loads and stores partly overlap. By expanding the pattern
+ // in-place, we avoid the penalty.
+ if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 16)) {
+ const __m128i shuffle_mask = _mm_load_si128(
+ reinterpret_cast<const __m128i*>(pshufb_fill_patterns)
+ + pattern_size - 1);
+ const __m128i pattern = _mm_shuffle_epi8(
+ _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)), shuffle_mask);
+ // Uninitialized bytes are masked out by the shuffle mask.
+ // TODO: remove annotation and macro defs once MSan is fixed.
+ SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(&pattern, sizeof(pattern));
+ pattern_size *= 16 / pattern_size;
+ char* op_end = std::min(op_limit, buf_limit - 15);
+ while (op < op_end) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
+ op += pattern_size;
+ }
+ if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
+ }
+ return IncrementalCopySlow(src, op, op_limit);
+#else // !SNAPPY_HAVE_SSSE3
+ // If plenty of buffer space remains, expand the pattern to at least 8
+ // bytes. The way the following loop is written, we need 8 bytes of buffer
+ // space if pattern_size >= 4, 11 bytes if pattern_size is 1 or 3, and 10
+ // bytes if pattern_size is 2. Precisely encoding that is probably not
+ // worthwhile; instead, invoke the slow path if we cannot write 11 bytes
+ // (because 11 are required in the worst case).
+ if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 11)) {
+ while (pattern_size < 8) {
+ UnalignedCopy64(src, op);
+ op += pattern_size;
+ pattern_size *= 2;
+ }
+ if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
+ } else {
+ return IncrementalCopySlow(src, op, op_limit);
+ }
+#endif // SNAPPY_HAVE_SSSE3
+ }
+ assert(pattern_size >= 8);
+
+ // Copy 2x 8 bytes at a time. Because op - src can be < 16, a single
+ // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe
+ // because expanding the pattern to at least 8 bytes guarantees that
+ // op - src >= 8.
+ //
+ // Typically, the op_limit is the gating factor so try to simplify the loop
+ // based on that.
+ if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 16)) {
+ // There is at least one, and at most four 16-byte blocks. Writing four
+ // conditionals instead of a loop allows FDO to lay out the code with respect
+ // to the actual probabilities of each length.
+ // TODO: Replace with loop with trip count hint.
+ UnalignedCopy64(src, op);
+ UnalignedCopy64(src + 8, op + 8);
+
+ if (op + 16 < op_limit) {
+ UnalignedCopy64(src + 16, op + 16);
+ UnalignedCopy64(src + 24, op + 24);
+ }
+ if (op + 32 < op_limit) {
+ UnalignedCopy64(src + 32, op + 32);
+ UnalignedCopy64(src + 40, op + 40);
+ }
+ if (op + 48 < op_limit) {
+ UnalignedCopy64(src + 48, op + 48);
+ UnalignedCopy64(src + 56, op + 56);
+ }
+ return op_limit;
+ }
+
+ // Fall back to doing as much as we can with the available slop in the
+ // buffer. This code path is relatively cold however so we save code size by
+ // avoiding unrolling and vectorizing.
+ //
+ // TODO: Remove pragma when cold regions don't get vectorized
+ // or unrolled.
+#ifdef __clang__
+#pragma clang loop unroll(disable)
+#endif
+ for (char *op_end = buf_limit - 16; op < op_end; op += 16, src += 16) {
+ UnalignedCopy64(src, op);
+ UnalignedCopy64(src + 8, op + 8);
+ }
+ if (op >= op_limit)
+ return op_limit;
+
+ // We only take this branch if we didn't have enough slop and we can do a
+ // single 8 byte copy.
+ if (SNAPPY_PREDICT_FALSE(op <= buf_limit - 8)) {
UnalignedCopy64(src, op);
src += 8;
op += 8;
- len -= 8;
}
+ return IncrementalCopySlow(src, op, op_limit);
}
} // namespace
+template <bool allow_fast_path>
static inline char* EmitLiteral(char* op,
const char* literal,
- int len,
- bool allow_fast_path) {
- int n = len - 1; // Zero-length literals are disallowed
- if (n < 60) {
+ int len) {
+ // The vast majority of copies are below 16 bytes, for which a
+ // call to memcpy is overkill. This fast path can sometimes
+ // copy up to 15 bytes too much, but that is okay in the
+ // main loop, since we have a bit to go on for both sides:
+ //
+ // - The input will always have kInputMarginBytes = 15 extra
+ // available bytes, as long as we're in the main loop, and
+ // if not, allow_fast_path = false.
+ // - The output will always have 32 spare bytes (see
+ // MaxCompressedLength).
+ assert(len > 0); // Zero-length literals are disallowed
+ int n = len - 1;
+ if (allow_fast_path && len <= 16) {
// Fits in tag byte
*op++ = LITERAL | (n << 2);
- // The vast majority of copies are below 16 bytes, for which a
- // call to memcpy is overkill. This fast path can sometimes
- // copy up to 15 bytes too much, but that is okay in the
- // main loop, since we have a bit to go on for both sides:
- //
- // - The input will always have kInputMarginBytes = 15 extra
- // available bytes, as long as we're in the main loop, and
- // if not, allow_fast_path = false.
- // - The output will always have 32 spare bytes (see
- // MaxCompressedLength).
- if (allow_fast_path && len <= 16) {
- UnalignedCopy64(literal, op);
- UnalignedCopy64(literal + 8, op + 8);
- return op + len;
- }
+ UnalignedCopy128(literal, op);
+ return op + len;
+ }
+
+ if (n < 60) {
+ // Fits in tag byte
+ *op++ = LITERAL | (n << 2);
} else {
- // Encode in upcoming bytes
- char* base = op;
- int count = 0;
- op++;
- while (n > 0) {
- *op++ = n & 0xff;
- n >>= 8;
- count++;
- }
+ int count = (Bits::Log2Floor(n) >> 3) + 1;
assert(count >= 1);
assert(count <= 4);
- *base = LITERAL | ((59+count) << 2);
+ *op++ = LITERAL | ((59 + count) << 2);
+ // Encode in upcoming bytes.
+ // Write 4 bytes, though we may care about only 1 of them. The output buffer
+ // is guaranteed to have at least 3 more spaces left as 'len >= 61' holds
+ // here and there is a memcpy of size 'len' below.
+ LittleEndian::Store32(op, n);
+ op += count;
}
memcpy(op, literal, len);
return op + len;
}
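To make the literal tag layout concrete, an editorial sketch (not part of the patch) that encodes and re-reads a short and a long literal header by hand, mirroring the constants used above:

    // Editor's illustration only; not part of the snappy sources.
    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned LITERAL = 0;
      // Short form: len = 5 -> the tag stores (len - 1) in its upper six bits.
      uint8_t tag = uint8_t(LITERAL | ((5 - 1) << 2));
      assert((tag & 0x3) == LITERAL && (tag >> 2) + 1 == 5);

      // Long form: len = 1000 -> n = 999 needs 2 extra bytes, so the upper six
      // bits hold 59 + 2 = 61 and n follows little-endian.
      uint8_t long_form[3] = { uint8_t(LITERAL | ((59 + 2) << 2)),
                               uint8_t(999 & 0xff), uint8_t(999 >> 8) };
      unsigned extra = (long_form[0] >> 2) - 59;        // 2 extra length bytes
      unsigned n = long_form[1] | (long_form[2] << 8);  // 999
      assert(extra == 2 && n + 1 == 1000);
      return 0;
    }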
-static inline char* EmitCopyLessThan64(char* op, size_t offset, int len) {
+template <bool len_less_than_12>
+static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len) {
assert(len <= 64);
assert(len >= 4);
assert(offset < 65536);
+ assert(len_less_than_12 == (len < 12));
- if ((len < 12) && (offset < 2048)) {
- size_t len_minus_4 = len - 4;
- assert(len_minus_4 < 8); // Must fit in 3 bits
- *op++ = COPY_1_BYTE_OFFSET + ((len_minus_4) << 2) + ((offset >> 8) << 5);
+ if (len_less_than_12 && SNAPPY_PREDICT_TRUE(offset < 2048)) {
+ // offset fits in 11 bits. The 3 highest go in the top of the first byte,
+ // and the rest go in the second byte.
+ *op++ = COPY_1_BYTE_OFFSET + ((len - 4) << 2) + ((offset >> 3) & 0xe0);
*op++ = offset & 0xff;
} else {
- *op++ = COPY_2_BYTE_OFFSET + ((len-1) << 2);
- LittleEndian::Store16(op, offset);
- op += 2;
+ // Write 4 bytes, though we only care about 3 of them. The output buffer
+ // is required to have some slack, so the extra byte won't overrun it.
+ uint32 u = COPY_2_BYTE_OFFSET + ((len - 1) << 2) + (offset << 8);
+ LittleEndian::Store32(op, u);
+ op += 3;
}
return op;
}
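Likewise for the two copy forms: an editorial sketch (not part of the patch) that round-trips a one-byte-offset and a two-byte-offset copy tag using the same bit packing as EmitCopyAtMost64 above:

    // Editor's illustration only; not part of the snappy sources.
    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned COPY_1_BYTE_OFFSET = 1, COPY_2_BYTE_OFFSET = 2;

      // One-byte-offset form: len = 7, offset = 0x2AB (< 2048). The top three
      // offset bits land in bits 5..7 of the tag, the rest in the next byte.
      unsigned len = 7, offset = 0x2AB;
      uint8_t one[2] = { uint8_t(COPY_1_BYTE_OFFSET + ((len - 4) << 2) +
                                 ((offset >> 3) & 0xe0)),
                         uint8_t(offset & 0xff) };
      assert((((one[0] >> 2) & 0x7) + 4) == 7);
      assert((((one[0] & 0xe0) << 3) | one[1]) == 0x2AB);

      // Two-byte-offset form: len = 40, offset = 30000, little-endian offset.
      len = 40; offset = 30000;
      uint8_t two[3] = { uint8_t(COPY_2_BYTE_OFFSET + ((len - 1) << 2)),
                         uint8_t(offset & 0xff), uint8_t(offset >> 8) };
      assert((two[0] >> 2) + 1 == 40);
      assert((unsigned(two[1]) | (unsigned(two[2]) << 8)) == 30000);
      return 0;
    }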
-static inline char* EmitCopy(char* op, size_t offset, int len) {
- // Emit 64 byte copies but make sure to keep at least four bytes reserved
- while (PREDICT_FALSE(len >= 68)) {
- op = EmitCopyLessThan64(op, offset, 64);
- len -= 64;
- }
+template <bool len_less_than_12>
+static inline char* EmitCopy(char* op, size_t offset, size_t len) {
+ assert(len_less_than_12 == (len < 12));
+ if (len_less_than_12) {
+ return EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
+ } else {
+ // A special case for len <= 64 might help, but so far measurements suggest
+ // it's in the noise.
- // Emit an extra 60 byte copy if have too much data to fit in one copy
- if (len > 64) {
- op = EmitCopyLessThan64(op, offset, 60);
- len -= 60;
- }
+ // Emit 64 byte copies but make sure to keep at least four bytes reserved.
+ while (SNAPPY_PREDICT_FALSE(len >= 68)) {
+ op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 64);
+ len -= 64;
+ }
- // Emit remainder
- op = EmitCopyLessThan64(op, offset, len);
- return op;
+ // One or two copies will now finish the job.
+ if (len > 64) {
+ op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 60);
+ len -= 60;
+ }
+
+ // Emit remainder.
+ if (len < 12) {
+ op = EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
+ } else {
+ op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, len);
+ }
+ return op;
+ }
}
-
bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
uint32 v = 0;
const char* limit = start + n;
@@ -243,31 +438,45 @@
}
}
+namespace {
+uint32 CalculateTableSize(uint32 input_size) {
+ static_assert(
+ kMaxHashTableSize >= kMinHashTableSize,
+ "kMaxHashTableSize should be greater or equal to kMinHashTableSize.");
+ if (input_size > kMaxHashTableSize) {
+ return kMaxHashTableSize;
+ }
+ if (input_size < kMinHashTableSize) {
+ return kMinHashTableSize;
+ }
+ // This is equivalent to Log2Ceiling(input_size), assuming input_size > 1.
+ // 2 << Log2Floor(x - 1) is equivalent to 1 << (1 + Log2Floor(x - 1)).
+ return 2u << Bits::Log2Floor(input_size - 1);
+}
+} // namespace
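The rounding rule above (clamp to the table-size bounds, then round up to the next power of two via 2 << Log2Floor(x - 1)) is easy to sanity-check in isolation. An editorial sketch, not part of the patch; the bounds passed in are illustrative stand-ins for kMinHashTableSize and kMaxHashTableSize:

    // Editor's illustration only; not part of the snappy sources.
    #include <cassert>
    #include <cstdint>

    static uint32_t NaiveLog2Floor32(uint32_t n) {
      uint32_t log = 0;
      while (n >>= 1) ++log;
      return log;
    }

    static uint32_t RoundUpTableSize(uint32_t input_size,
                                     uint32_t min_size, uint32_t max_size) {
      if (input_size > max_size) return max_size;
      if (input_size < min_size) return min_size;
      // 2 << floor(log2(x - 1)) is the smallest power of two >= x, for x > 1.
      return 2u << NaiveLog2Floor32(input_size - 1);
    }

    int main() {
      assert(RoundUpTableSize(5000, 256, 1 << 14) == 8192);          // rounded up
      assert(RoundUpTableSize(100, 256, 1 << 14) == 256);            // clamped to min
      assert(RoundUpTableSize(1 << 20, 256, 1 << 14) == (1 << 14));  // clamped to max
      return 0;
    }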
+
namespace internal {
-uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
- // Use smaller hash table when input.size() is smaller, since we
- // fill the table, incurring O(hash table size) overhead for
- // compression, and if the input is short, we won't need that
- // many hash table entries anyway.
- assert(kMaxHashTableSize >= 256);
- size_t htsize = 256;
- while (htsize < kMaxHashTableSize && htsize < input_size) {
- htsize <<= 1;
- }
+WorkingMemory::WorkingMemory(size_t input_size) {
+ const size_t max_fragment_size = std::min(input_size, kBlockSize);
+ const size_t table_size = CalculateTableSize(max_fragment_size);
+ size_ = table_size * sizeof(*table_) + max_fragment_size +
+ MaxCompressedLength(max_fragment_size);
+ mem_ = std::allocator<char>().allocate(size_);
+ table_ = reinterpret_cast<uint16*>(mem_);
+ input_ = mem_ + table_size * sizeof(*table_);
+ output_ = input_ + max_fragment_size;
+}
- uint16* table;
- if (htsize <= ARRAYSIZE(small_table_)) {
- table = small_table_;
- } else {
- if (large_table_ == NULL) {
- large_table_ = new uint16[kMaxHashTableSize];
- }
- table = large_table_;
- }
+WorkingMemory::~WorkingMemory() {
+ std::allocator<char>().deallocate(mem_, size_);
+}
+uint16* WorkingMemory::GetHashTable(size_t fragment_size,
+ int* table_size) const {
+ const size_t htsize = CalculateTableSize(fragment_size);
+ memset(table_, 0, htsize * sizeof(*table_));
*table_size = htsize;
- memset(table, 0, htsize * sizeof(*table));
- return table;
+ return table_;
}
} // end namespace internal
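The constructor above carves a single allocation into three regions. An editorial sketch of that layout arithmetic, not part of the patch; kBlockSize and the table size used here are assumed example values, and the output bound follows the 32 + n + n/6 formula visible earlier in this file:

    // Editor's illustration only; not part of the snappy sources.
    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    int main() {
      const size_t kBlockSize = 1 << 16;   // assumed fragment limit
      const size_t input_size = 100000;    // caller's input length
      const size_t fragment   = std::min(input_size, kBlockSize);
      const size_t table_size = 1 << 14;   // assumed hash-table entry count
      const size_t max_output = 32 + fragment + fragment / 6;

      std::vector<char> mem(table_size * sizeof(uint16_t) + fragment + max_output);
      uint16_t* table = reinterpret_cast<uint16_t*>(mem.data());
      char* input     = mem.data() + table_size * sizeof(uint16_t);
      char* output    = input + fragment;
      assert(output + max_output == mem.data() + mem.size());
      (void)table;
      return 0;
    }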
@@ -334,7 +543,7 @@
// "ip" is the input pointer, and "op" is the output pointer.
const char* ip = input;
assert(input_size <= kBlockSize);
- assert((table_size & (table_size - 1)) == 0); // table must be power of two
+ assert((table_size & (table_size - 1)) == 0); // table must be power of two
const int shift = 32 - Bits::Log2Floor(table_size);
assert(static_cast<int>(kuint32max >> shift) == table_size - 1);
const char* ip_end = input + input_size;
@@ -344,7 +553,7 @@
const char* next_emit = ip;
const size_t kInputMarginBytes = 15;
- if (PREDICT_TRUE(input_size >= kInputMarginBytes)) {
+ if (SNAPPY_PREDICT_TRUE(input_size >= kInputMarginBytes)) {
const char* ip_limit = input + input_size - kInputMarginBytes;
for (uint32 next_hash = Hash(++ip, shift); ; ) {
@@ -364,9 +573,9 @@
//
// Heuristic match skipping: If 32 bytes are scanned with no matches
// found, start looking only at every other byte. If 32 more bytes are
- // scanned, look at every third byte, etc.. When a match is found,
- // immediately go back to looking at every byte. This is a small loss
- // (~5% performance, ~0.1% density) for compressible data due to more
+ // scanned (or skipped), look at every third byte, etc.. When a match is
+ // found, immediately go back to looking at every byte. This is a small
+ // loss (~5% performance, ~0.1% density) for compressible data due to more
// bookkeeping, but for non-compressible data (such as JPEG) it's a huge
// win since the compressor quickly "realizes" the data is incompressible
// and doesn't bother looking for matches everywhere.
@@ -382,9 +591,10 @@
ip = next_ip;
uint32 hash = next_hash;
assert(hash == Hash(ip, shift));
- uint32 bytes_between_hash_lookups = skip++ >> 5;
+ uint32 bytes_between_hash_lookups = skip >> 5;
+ skip += bytes_between_hash_lookups;
next_ip = ip + bytes_between_hash_lookups;
- if (PREDICT_FALSE(next_ip > ip_limit)) {
+ if (SNAPPY_PREDICT_FALSE(next_ip > ip_limit)) {
goto emit_remainder;
}
next_hash = Hash(next_ip, shift);
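The skipping schedule implied by "skip >> 5" is easier to see numerically. An editorial sketch, not part of the patch; it assumes skip starts at 32, which is how the compressor loop initialises it outside this hunk:

    // Editor's illustration only; not part of the snappy sources.
    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t skip = 32, pos = 0;   // assumed initial value of skip
      for (int probe = 1; probe <= 200; ++probe) {
        uint32_t step = skip >> 5;   // 1 for the first 32 probes, then 2, 3, ...
        skip += step;
        pos += step;
        if (probe % 50 == 0)
          std::printf("after %3d probes with no match: offset %u, step %u\n",
                      probe, (unsigned)pos, (unsigned)step);
      }
      return 0;
    }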
@@ -393,14 +603,14 @@
assert(candidate < ip);
table[hash] = ip - base_ip;
- } while (PREDICT_TRUE(UNALIGNED_LOAD32(ip) !=
- UNALIGNED_LOAD32(candidate)));
+ } while (SNAPPY_PREDICT_TRUE(UNALIGNED_LOAD32(ip) !=
+ UNALIGNED_LOAD32(candidate)));
// Step 2: A 4-byte match has been found. We'll later see if more
// than 4 bytes match. But, prior to the match, input
// bytes [next_emit, ip) are unmatched. Emit them as "literal bytes."
assert(next_emit + 16 <= ip_end);
- op = EmitLiteral(op, next_emit, ip - next_emit, true);
+ op = EmitLiteral</*allow_fast_path=*/true>(op, next_emit, ip - next_emit);
// Step 3: Call EmitCopy, and then see if another EmitCopy could
// be our next move. Repeat until we find no match for the
@@ -417,19 +627,25 @@
// We have a 4-byte match at ip, and no need to emit any
// "literal bytes" prior to ip.
const char* base = ip;
- int matched = 4 + FindMatchLength(candidate + 4, ip + 4, ip_end);
+ std::pair<size_t, bool> p =
+ FindMatchLength(candidate + 4, ip + 4, ip_end);
+ size_t matched = 4 + p.first;
ip += matched;
size_t offset = base - candidate;
assert(0 == memcmp(base, candidate, matched));
- op = EmitCopy(op, offset, matched);
- // We could immediately start working at ip now, but to improve
- // compression we first update table[Hash(ip - 1, ...)].
- const char* insert_tail = ip - 1;
+ if (p.second) {
+ op = EmitCopy</*len_less_than_12=*/true>(op, offset, matched);
+ } else {
+ op = EmitCopy</*len_less_than_12=*/false>(op, offset, matched);
+ }
next_emit = ip;
- if (PREDICT_FALSE(ip >= ip_limit)) {
+ if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) {
goto emit_remainder;
}
- input_bytes = GetEightBytesAt(insert_tail);
+ // We are now looking for a 4-byte match again. We read
+ // table[Hash(ip, shift)] for that. To improve compression,
+ // we also update table[Hash(ip - 1, shift)] and table[Hash(ip, shift)].
+ input_bytes = GetEightBytesAt(ip - 1);
uint32 prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift);
table[prev_hash] = ip - base_ip - 1;
uint32 cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift);
@@ -446,13 +662,18 @@
emit_remainder:
// Emit the remaining bytes as a literal
if (next_emit < ip_end) {
- op = EmitLiteral(op, next_emit, ip_end - next_emit, false);
+ op = EmitLiteral</*allow_fast_path=*/false>(op, next_emit,
+ ip_end - next_emit);
}
return op;
}
} // end namespace internal
+// Called back at every compression call to trace parameters and sizes.
+static inline void Report(const char *algorithm, size_t compressed_size,
+ size_t uncompressed_size) {}
+
// Signature of output types needed by decompression code.
// The decompression code is templatized on a type that obeys this
// signature so that we do not pay virtual function call overhead in
@@ -493,162 +714,28 @@
// bool TryFastAppend(const char* ip, size_t available, size_t length);
// };
-// -----------------------------------------------------------------------
-// Lookup table for decompression code. Generated by ComputeTable() below.
-// -----------------------------------------------------------------------
-
-// Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits
-static const uint32 wordmask[] = {
- 0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu
-};
-
-// Data stored per entry in lookup table:
-// Range Bits-used Description
-// ------------------------------------
-// 1..64 0..7 Literal/copy length encoded in opcode byte
-// 0..7 8..10 Copy offset encoded in opcode byte / 256
-// 0..4 11..13 Extra bytes after opcode
-//
-// We use eight bits for the length even though 7 would have sufficed
-// because of efficiency reasons:
-// (1) Extracting a byte is faster than a bit-field
-// (2) It properly aligns copy offset so we do not need a <<8
-static const uint16 char_table[256] = {
- 0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
- 0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
- 0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
- 0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008,
- 0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a,
- 0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c,
- 0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e,
- 0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010,
- 0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012,
- 0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014,
- 0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016,
- 0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018,
- 0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a,
- 0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c,
- 0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e,
- 0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020,
- 0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022,
- 0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024,
- 0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026,
- 0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028,
- 0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a,
- 0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c,
- 0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e,
- 0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030,
- 0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032,
- 0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034,
- 0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036,
- 0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038,
- 0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
- 0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
- 0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
- 0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
-};
-
-// In debug mode, allow optional computation of the table at startup.
-// Also, check that the decompression table is correct.
-#ifndef NDEBUG
-DEFINE_bool(snappy_dump_decompression_table, false,
- "If true, we print the decompression table at startup.");
-
-static uint16 MakeEntry(unsigned int extra,
- unsigned int len,
- unsigned int copy_offset) {
- // Check that all of the fields fit within the allocated space
- assert(extra == (extra & 0x7)); // At most 3 bits
- assert(copy_offset == (copy_offset & 0x7)); // At most 3 bits
- assert(len == (len & 0x7f)); // At most 7 bits
- return len | (copy_offset << 8) | (extra << 11);
+static inline uint32 ExtractLowBytes(uint32 v, int n) {
+ assert(n >= 0);
+ assert(n <= 4);
+#if SNAPPY_HAVE_BMI2
+ return _bzhi_u32(v, 8 * n);
+#else
+ // This needs to be wider than uint32 otherwise `mask << 32` will be
+ // undefined.
+ uint64 mask = 0xffffffff;
+ return v & ~(mask << (8 * n));
+#endif
}
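An editorial sketch of the portable branch of ExtractLowBytes above (not part of the patch), showing why the mask is widened to 64 bits: with a 32-bit mask, the n == 4 case would shift by 32, which is undefined behaviour.

    // Editor's illustration only; not part of the snappy sources.
    #include <cassert>
    #include <cstdint>

    static uint32_t ExtractLowBytesPortable(uint32_t v, int n) {
      assert(n >= 0 && n <= 4);
      uint64_t mask = 0xffffffff;     // 64-bit so that (mask << 32) is defined
      return v & ~(mask << (8 * n));  // keep only the low n bytes of v
    }

    int main() {
      assert(ExtractLowBytesPortable(0xAABBCCDD, 0) == 0);
      assert(ExtractLowBytesPortable(0xAABBCCDD, 1) == 0xDD);
      assert(ExtractLowBytesPortable(0xAABBCCDD, 3) == 0xBBCCDD);
      assert(ExtractLowBytesPortable(0xAABBCCDD, 4) == 0xAABBCCDD);  // all four bytes
      return 0;
    }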
-static void ComputeTable() {
- uint16 dst[256];
-
- // Place invalid entries in all places to detect missing initialization
- int assigned = 0;
- for (int i = 0; i < 256; i++) {
- dst[i] = 0xffff;
- }
-
- // Small LITERAL entries. We store (len-1) in the top 6 bits.
- for (unsigned int len = 1; len <= 60; len++) {
- dst[LITERAL | ((len-1) << 2)] = MakeEntry(0, len, 0);
- assigned++;
- }
-
- // Large LITERAL entries. We use 60..63 in the high 6 bits to
- // encode the number of bytes of length info that follow the opcode.
- for (unsigned int extra_bytes = 1; extra_bytes <= 4; extra_bytes++) {
- // We set the length field in the lookup table to 1 because extra
- // bytes encode len-1.
- dst[LITERAL | ((extra_bytes+59) << 2)] = MakeEntry(extra_bytes, 1, 0);
- assigned++;
- }
-
- // COPY_1_BYTE_OFFSET.
- //
- // The tag byte in the compressed data stores len-4 in 3 bits, and
- // offset/256 in 5 bits. offset%256 is stored in the next byte.
- //
- // This format is used for length in range [4..11] and offset in
- // range [0..2047]
- for (unsigned int len = 4; len < 12; len++) {
- for (unsigned int offset = 0; offset < 2048; offset += 256) {
- dst[COPY_1_BYTE_OFFSET | ((len-4)<<2) | ((offset>>8)<<5)] =
- MakeEntry(1, len, offset>>8);
- assigned++;
- }
- }
-
- // COPY_2_BYTE_OFFSET.
- // Tag contains len-1 in top 6 bits, and offset in next two bytes.
- for (unsigned int len = 1; len <= 64; len++) {
- dst[COPY_2_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(2, len, 0);
- assigned++;
- }
-
- // COPY_4_BYTE_OFFSET.
- // Tag contents len-1 in top 6 bits, and offset in next four bytes.
- for (unsigned int len = 1; len <= 64; len++) {
- dst[COPY_4_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(4, len, 0);
- assigned++;
- }
-
- // Check that each entry was initialized exactly once.
- if (assigned != 256) {
- fprintf(stderr, "ComputeTable: assigned only %d of 256\n", assigned);
- abort();
- }
- for (int i = 0; i < 256; i++) {
- if (dst[i] == 0xffff) {
- fprintf(stderr, "ComputeTable: did not assign byte %d\n", i);
- abort();
- }
- }
-
- if (FLAGS_snappy_dump_decompression_table) {
- printf("static const uint16 char_table[256] = {\n ");
- for (int i = 0; i < 256; i++) {
- printf("0x%04x%s",
- dst[i],
- ((i == 255) ? "\n" : (((i%8) == 7) ? ",\n " : ", ")));
- }
- printf("};\n");
- }
-
- // Check that computed table matched recorded table
- for (int i = 0; i < 256; i++) {
- if (dst[i] != char_table[i]) {
- fprintf(stderr, "ComputeTable: byte %d: computed (%x), expect (%x)\n",
- i, static_cast<int>(dst[i]), static_cast<int>(char_table[i]));
- abort();
- }
- }
+static inline bool LeftShiftOverflows(uint8 value, uint32 shift) {
+ assert(shift < 32);
+ static const uint8 masks[] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
+ return (value & masks[shift]) != 0;
}
-#endif /* !NDEBUG */
// Helper class for decompression
class SnappyDecompressor {
@@ -687,7 +774,7 @@
}
// Read the uncompressed length stored at the start of the compressed data.
- // On succcess, stores the length in *result and returns true.
+ // On success, stores the length in *result and returns true.
// On failure, returns false.
bool ReadUncompressedLength(uint32* result) {
assert(ip_ == NULL); // Must not have read anything yet
@@ -701,7 +788,9 @@
if (n == 0) return false;
const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
reader_->Skip(1);
- *result |= static_cast<uint32>(c & 0x7f) << shift;
+ uint32 val = c & 0x7f;
+ if (LeftShiftOverflows(static_cast<uint8>(val), shift)) return false;
+ *result |= val << shift;
if (c < 128) {
break;
}
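The length prefix read above is a little-endian base-128 varint. An editorial sketch (not part of the patch) decodes one along the same lines, with a simplified form of the overflow guard:

    // Editor's illustration only; not part of the snappy sources.
    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    static bool DecodeVarint32(const uint8_t* p, size_t n, uint32_t* result) {
      *result = 0;
      for (uint32_t shift = 0; shift <= 28 && n > 0; shift += 7, ++p, --n) {
        uint32_t val = *p & 0x7f;                    // low 7 bits carry payload
        if (shift == 28 && val > 0xf) return false;  // would overflow 32 bits
        *result |= val << shift;
        if (*p < 128) return true;                   // high bit clear: last byte
      }
      return false;                                  // truncated or too long
    }

    int main() {
      // 300 (0x12C) -> bytes 0xAC (low 7 bits, continuation bit set), then 0x02.
      const uint8_t encoded[2] = {0xAC, 0x02};
      uint32_t len = 0;
      assert(DecodeVarint32(encoded, 2, &len) && len == 300);
      return 0;
    }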
@@ -713,9 +802,27 @@
// Process the next item found in the input.
// Returns true if successful, false on error or end of input.
template <class Writer>
+#if defined(__GNUC__) && defined(__x86_64__)
+ __attribute__((aligned(32)))
+#endif
void DecompressAllTags(Writer* writer) {
- const char* ip = ip_;
+ // In x86, pad the function body to start 16 bytes later. This function has
+ // a couple of hotspots that are highly sensitive to alignment: we have
+ // observed regressions by more than 20% in some metrics just by moving the
+ // exact same code to a different position in the benchmark binary.
+ //
+ // Putting this code on a 32-byte-aligned boundary + 16 bytes makes us hit
+ // the "lucky" case consistently. Unfortunately, this is a very brittle
+ // workaround, and future differences in code generation may reintroduce
+ // this regression. If you experience a big, difficult to explain, benchmark
+ // performance regression here, first try removing this hack.
+#if defined(__GNUC__) && defined(__x86_64__)
+ // Two 8-byte "NOP DWORD ptr [EAX + EAX*1 + 00000000H]" instructions.
+ asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
+ asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
+#endif
+ const char* ip = ip_;
// We could have put this refill fragment only at the beginning of the loop.
// However, duplicating it at the end of each branch gives the compiler more
// scope to optimize the <ip_limit_ - ip> expression based on the local
@@ -731,21 +838,34 @@
for ( ;; ) {
const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
- if ((c & 0x3) == LITERAL) {
+ // Ratio of iterations that have LITERAL vs non-LITERAL for different
+ // inputs.
+ //
+ // input LITERAL NON_LITERAL
+ // -----------------------------------
+ // html|html4|cp 23% 77%
+ // urls 36% 64%
+ // jpg 47% 53%
+ // pdf 19% 81%
+ // txt[1-4] 25% 75%
+ // pb 24% 76%
+ // bin 24% 76%
+ if (SNAPPY_PREDICT_FALSE((c & 0x3) == LITERAL)) {
size_t literal_length = (c >> 2) + 1u;
if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
assert(literal_length < 61);
ip += literal_length;
- // NOTE(user): There is no MAYBE_REFILL() here, as TryFastAppend()
+ // NOTE: There is no MAYBE_REFILL() here, as TryFastAppend()
// will not return true unless there's already at least five spare
// bytes in addition to the literal.
continue;
}
- if (PREDICT_FALSE(literal_length >= 61)) {
+ if (SNAPPY_PREDICT_FALSE(literal_length >= 61)) {
// Long literal.
const size_t literal_length_length = literal_length - 60;
literal_length =
- (LittleEndian::Load32(ip) & wordmask[literal_length_length]) + 1;
+ ExtractLowBytes(LittleEndian::Load32(ip), literal_length_length) +
+ 1;
ip += literal_length_length;
}
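
In the loop above, the low two bits of each tag byte select the element type; for short literals the remaining six bits hold length minus one, and encoded lengths above 60 mean the true length follows in one to four extra bytes, which is what the ExtractLowBytes call reads. A rough Erlang sketch of that tag classification (purely illustrative, not the NIF's decoder):

    -module(tag_sketch).
    -export([classify/1]).

    %% Classify one snappy tag byte. Offsets are ignored here; only the element
    %% type and the length information carried in the tag itself are returned.
    classify(C) when C band 3 =:= 0 ->                       % LITERAL
        case (C bsr 2) + 1 of
            Len when Len =< 60 -> {literal, Len};
            Len                -> {long_literal_bytes, Len - 60}  % 1..4 length bytes follow
        end;
    classify(C) when C band 3 =:= 1 ->                       % COPY_1_BYTE_OFFSET
        {copy1, ((C bsr 2) band 7) + 4};                      % lengths 4..11
    classify(C) when C band 3 =:= 2 ->                       % COPY_2_BYTE_OFFSET
        {copy2, (C bsr 2) + 1};                               % lengths 1..64
    classify(C) when C band 3 =:= 3 ->                       % COPY_4_BYTE_OFFSET
        {copy4, (C bsr 2) + 1}.

For instance, classify(8) returns {literal, 3}, and classify(16#F0) returns {long_literal_bytes, 1}.
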
@@ -767,15 +887,16 @@
ip += literal_length;
MAYBE_REFILL();
} else {
- const uint32 entry = char_table[c];
- const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
- const uint32 length = entry & 0xff;
+ const size_t entry = char_table[c];
+ const size_t trailer =
+ ExtractLowBytes(LittleEndian::Load32(ip), entry >> 11);
+ const size_t length = entry & 0xff;
ip += entry >> 11;
// copy_offset/256 is encoded in bits 8..10. By just fetching
// those bits, we get copy_offset (since the bit-field starts at
// bit 8).
- const uint32 copy_offset = entry & 0x700;
+ const size_t copy_offset = entry & 0x700;
if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
return;
}
@@ -795,10 +916,8 @@
size_t n;
ip = reader_->Peek(&n);
peeked_ = n;
- if (n == 0) {
- eof_ = true;
- return false;
- }
+ eof_ = (n == 0);
+ if (eof_) return false;
ip_limit_ = ip + n;
}
@@ -823,7 +942,7 @@
size_t length;
const char* src = reader_->Peek(&length);
if (length == 0) return false;
- uint32 to_add = min<uint32>(needed - nbuf, length);
+ uint32 to_add = std::min<uint32>(needed - nbuf, length);
memcpy(scratch_ + nbuf, src, to_add);
nbuf += to_add;
reader_->Skip(to_add);
@@ -852,13 +971,18 @@
SnappyDecompressor decompressor(r);
uint32 uncompressed_len = 0;
if (!decompressor.ReadUncompressedLength(&uncompressed_len)) return false;
- return InternalUncompressAllTags(&decompressor, writer, uncompressed_len);
+
+ return InternalUncompressAllTags(&decompressor, writer, r->Available(),
+ uncompressed_len);
}
template <typename Writer>
static bool InternalUncompressAllTags(SnappyDecompressor* decompressor,
Writer* writer,
+ uint32 compressed_len,
uint32 uncompressed_len) {
+ Report("snappy_uncompress", compressed_len, uncompressed_len);
+
writer->SetExpectedLength(uncompressed_len);
// Process the entire input
@@ -875,21 +999,20 @@
size_t Compress(Source* reader, Sink* writer) {
size_t written = 0;
size_t N = reader->Available();
+ const size_t uncompressed_size = N;
char ulength[Varint::kMax32];
char* p = Varint::Encode32(ulength, N);
writer->Append(ulength, p-ulength);
written += (p - ulength);
- internal::WorkingMemory wmem;
- char* scratch = NULL;
- char* scratch_output = NULL;
+ internal::WorkingMemory wmem(N);
while (N > 0) {
// Get next block to compress (without copying if possible)
size_t fragment_size;
const char* fragment = reader->Peek(&fragment_size);
assert(fragment_size != 0); // premature end of input
- const size_t num_to_read = min(N, kBlockSize);
+ const size_t num_to_read = std::min(N, kBlockSize);
size_t bytes_read = fragment_size;
size_t pending_advance = 0;
@@ -898,19 +1021,13 @@
pending_advance = num_to_read;
fragment_size = num_to_read;
} else {
- // Read into scratch buffer
- if (scratch == NULL) {
- // If this is the last iteration, we want to allocate N bytes
- // of space, otherwise the max possible kBlockSize space.
- // num_to_read contains exactly the correct value
- scratch = new char[num_to_read];
- }
+ char* scratch = wmem.GetScratchInput();
memcpy(scratch, fragment, bytes_read);
reader->Skip(bytes_read);
while (bytes_read < num_to_read) {
fragment = reader->Peek(&fragment_size);
- size_t n = min<size_t>(fragment_size, num_to_read - bytes_read);
+ size_t n = std::min<size_t>(fragment_size, num_to_read - bytes_read);
memcpy(scratch + bytes_read, fragment, n);
bytes_read += n;
reader->Skip(n);
@@ -930,16 +1047,13 @@
// Need a scratch buffer for the output, in case the byte sink doesn't
// have room for us directly.
- if (scratch_output == NULL) {
- scratch_output = new char[max_output];
- } else {
- // Since we encode kBlockSize regions followed by a region
- // which is <= kBlockSize in length, a previously allocated
- // scratch_output[] region is big enough for this iteration.
- }
- char* dest = writer->GetAppendBuffer(max_output, scratch_output);
- char* end = internal::CompressFragment(fragment, fragment_size,
- dest, table, table_size);
+
+ // Since we encode kBlockSize regions followed by a region
+ // which is <= kBlockSize in length, a previously allocated
+ // scratch_output[] region is big enough for this iteration.
+ char* dest = writer->GetAppendBuffer(max_output, wmem.GetScratchOutput());
+ char* end = internal::CompressFragment(fragment, fragment_size, dest, table,
+ table_size);
writer->Append(dest, end - dest);
written += (end - dest);
@@ -947,8 +1061,7 @@
reader->Skip(pending_advance);
}
- delete[] scratch;
- delete[] scratch_output;
+ Report("snappy_compress", written, uncompressed_size);
return written;
}
@@ -962,14 +1075,22 @@
// Writer template argument to SnappyDecompressor::DecompressAllTags().
class SnappyIOVecWriter {
private:
+ // output_iov_end_ is set to iov + count and used to determine when
+ // the end of the iovs is reached.
+ const struct iovec* output_iov_end_;
+
+#if !defined(NDEBUG)
const struct iovec* output_iov_;
- const size_t output_iov_count_;
+#endif // !defined(NDEBUG)
- // We are currently writing into output_iov_[curr_iov_index_].
- size_t curr_iov_index_;
+ // Current iov that is being written into.
+ const struct iovec* curr_iov_;
- // Bytes written to output_iov_[curr_iov_index_] so far.
- size_t curr_iov_written_;
+ // Pointer to current iov's write location.
+ char* curr_iov_output_;
+
+ // Remaining bytes to write into curr_iov_output.
+ size_t curr_iov_remaining_;
// Total bytes decompressed into output_iov_ so far.
size_t total_written_;
@@ -977,22 +1098,24 @@
// Maximum number of bytes that will be decompressed into output_iov_.
size_t output_limit_;
- inline char* GetIOVecPointer(size_t index, size_t offset) {
- return reinterpret_cast<char*>(output_iov_[index].iov_base) +
- offset;
+ static inline char* GetIOVecPointer(const struct iovec* iov, size_t offset) {
+ return reinterpret_cast<char*>(iov->iov_base) + offset;
}
public:
// Does not take ownership of iov. iov must be valid during the
// entire lifetime of the SnappyIOVecWriter.
inline SnappyIOVecWriter(const struct iovec* iov, size_t iov_count)
- : output_iov_(iov),
- output_iov_count_(iov_count),
- curr_iov_index_(0),
- curr_iov_written_(0),
+ : output_iov_end_(iov + iov_count),
+#if !defined(NDEBUG)
+ output_iov_(iov),
+#endif // !defined(NDEBUG)
+ curr_iov_(iov),
+ curr_iov_output_(iov_count ? reinterpret_cast<char*>(iov->iov_base)
+ : nullptr),
+ curr_iov_remaining_(iov_count ? iov->iov_len : 0),
total_written_(0),
- output_limit_(-1) {
- }
+ output_limit_(-1) {}
inline void SetExpectedLength(size_t len) {
output_limit_ = len;
@@ -1007,23 +1130,25 @@
return false;
}
+ return AppendNoCheck(ip, len);
+ }
+
+ inline bool AppendNoCheck(const char* ip, size_t len) {
while (len > 0) {
- assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
- if (curr_iov_written_ >= output_iov_[curr_iov_index_].iov_len) {
+ if (curr_iov_remaining_ == 0) {
// This iovec is full. Go to the next one.
- if (curr_iov_index_ + 1 >= output_iov_count_) {
+ if (curr_iov_ + 1 >= output_iov_end_) {
return false;
}
- curr_iov_written_ = 0;
- ++curr_iov_index_;
+ ++curr_iov_;
+ curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
+ curr_iov_remaining_ = curr_iov_->iov_len;
}
- const size_t to_write = std::min(
- len, output_iov_[curr_iov_index_].iov_len - curr_iov_written_);
- memcpy(GetIOVecPointer(curr_iov_index_, curr_iov_written_),
- ip,
- to_write);
- curr_iov_written_ += to_write;
+ const size_t to_write = std::min(len, curr_iov_remaining_);
+ memcpy(curr_iov_output_, ip, to_write);
+ curr_iov_output_ += to_write;
+ curr_iov_remaining_ -= to_write;
total_written_ += to_write;
ip += to_write;
len -= to_write;
@@ -1035,12 +1160,11 @@
inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
const size_t space_left = output_limit_ - total_written_;
if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 &&
- output_iov_[curr_iov_index_].iov_len - curr_iov_written_ >= 16) {
+ curr_iov_remaining_ >= 16) {
// Fast path, used for the majority (about 95%) of invocations.
- char* ptr = GetIOVecPointer(curr_iov_index_, curr_iov_written_);
- UnalignedCopy64(ip, ptr);
- UnalignedCopy64(ip + 8, ptr + 8);
- curr_iov_written_ += len;
+ UnalignedCopy128(ip, curr_iov_output_);
+ curr_iov_output_ += len;
+ curr_iov_remaining_ -= len;
total_written_ += len;
return true;
}
@@ -1049,7 +1173,9 @@
}
inline bool AppendFromSelf(size_t offset, size_t len) {
- if (offset > total_written_ || offset == 0) {
+ // See SnappyArrayWriter::AppendFromSelf for an explanation of
+ // the "offset - 1u" trick.
+ if (offset - 1u >= total_written_) {
return false;
}
const size_t space_left = output_limit_ - total_written_;
@@ -1058,8 +1184,8 @@
}
// Locate the iovec from which we need to start the copy.
- size_t from_iov_index = curr_iov_index_;
- size_t from_iov_offset = curr_iov_written_;
+ const iovec* from_iov = curr_iov_;
+ size_t from_iov_offset = curr_iov_->iov_len - curr_iov_remaining_;
while (offset > 0) {
if (from_iov_offset >= offset) {
from_iov_offset -= offset;
@@ -1067,46 +1193,47 @@
}
offset -= from_iov_offset;
- assert(from_iov_index > 0);
- --from_iov_index;
- from_iov_offset = output_iov_[from_iov_index].iov_len;
+ --from_iov;
+#if !defined(NDEBUG)
+ assert(from_iov >= output_iov_);
+#endif // !defined(NDEBUG)
+ from_iov_offset = from_iov->iov_len;
}
// Copy <len> bytes starting from the iovec pointed to by from_iov_index to
// the current iovec.
while (len > 0) {
- assert(from_iov_index <= curr_iov_index_);
- if (from_iov_index != curr_iov_index_) {
- const size_t to_copy = std::min(
- output_iov_[from_iov_index].iov_len - from_iov_offset,
- len);
- Append(GetIOVecPointer(from_iov_index, from_iov_offset), to_copy);
+ assert(from_iov <= curr_iov_);
+ if (from_iov != curr_iov_) {
+ const size_t to_copy =
+ std::min(from_iov->iov_len - from_iov_offset, len);
+ AppendNoCheck(GetIOVecPointer(from_iov, from_iov_offset), to_copy);
len -= to_copy;
if (len > 0) {
- ++from_iov_index;
+ ++from_iov;
from_iov_offset = 0;
}
} else {
- assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
- size_t to_copy = std::min(output_iov_[curr_iov_index_].iov_len -
- curr_iov_written_,
- len);
+ size_t to_copy = curr_iov_remaining_;
if (to_copy == 0) {
// This iovec is full. Go to the next one.
- if (curr_iov_index_ + 1 >= output_iov_count_) {
+ if (curr_iov_ + 1 >= output_iov_end_) {
return false;
}
- ++curr_iov_index_;
- curr_iov_written_ = 0;
+ ++curr_iov_;
+ curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
+ curr_iov_remaining_ = curr_iov_->iov_len;
continue;
}
if (to_copy > len) {
to_copy = len;
}
- IncrementalCopy(GetIOVecPointer(from_iov_index, from_iov_offset),
- GetIOVecPointer(curr_iov_index_, curr_iov_written_),
- to_copy);
- curr_iov_written_ += to_copy;
+
+ IncrementalCopy(GetIOVecPointer(from_iov, from_iov_offset),
+ curr_iov_output_, curr_iov_output_ + to_copy,
+ curr_iov_output_ + curr_iov_remaining_);
+ curr_iov_output_ += to_copy;
+ curr_iov_remaining_ -= to_copy;
from_iov_offset += to_copy;
total_written_ += to_copy;
len -= to_copy;
@@ -1175,8 +1302,7 @@
const size_t space_left = op_limit_ - op;
if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16) {
// Fast path, used for the majority (about 95%) of invocations.
- UnalignedCopy64(ip, op);
- UnalignedCopy64(ip + 8, op + 8);
+ UnalignedCopy128(ip, op);
op_ = op + len;
return true;
} else {
@@ -1185,8 +1311,7 @@
}
inline bool AppendFromSelf(size_t offset, size_t len) {
- char* op = op_;
- const size_t space_left = op_limit_ - op;
+ char* const op_end = op_ + len;
// Check if we try to append from before the start of the buffer.
// Normally this would just be a check for "produced < offset",
@@ -1195,30 +1320,13 @@
// to a very big number. This is convenient, as offset==0 is another
// invalid case that we also want to catch, so that we do not go
// into an infinite loop.
- assert(op >= base_);
- size_t produced = op - base_;
- if (produced <= offset - 1u) {
- return false;
- }
- if (len <= 16 && offset >= 8 && space_left >= 16) {
- // Fast path, used for the majority (70-80%) of dynamic invocations.
- UnalignedCopy64(op - offset, op);
- UnalignedCopy64(op - offset + 8, op + 8);
- } else {
- if (space_left >= len + kMaxIncrementCopyOverflow) {
- IncrementalCopyFastPath(op - offset, op, len);
- } else {
- if (space_left < len) {
- return false;
- }
- IncrementalCopy(op - offset, op, len);
- }
- }
+ if (Produced() <= offset - 1u || op_end > op_limit_) return false;
+ op_ = IncrementalCopy(op_ - offset, op_, op_end, op_limit_);
- op_ = op + len;
return true;
}
inline size_t Produced() const {
+ assert(op_ >= base_);
return op_ - base_;
}
inline void Flush() {}
@@ -1234,7 +1342,7 @@
return InternalUncompress(compressed, &output);
}
-bool Uncompress(const char* compressed, size_t n, string* uncompressed) {
+bool Uncompress(const char* compressed, size_t n, std::string* uncompressed) {
size_t ulength;
if (!GetUncompressedLength(compressed, n, &ulength)) {
return false;
@@ -1302,9 +1410,10 @@
*compressed_length = (writer.CurrentDestination() - compressed);
}
-size_t Compress(const char* input, size_t input_length, string* compressed) {
+size_t Compress(const char* input, size_t input_length,
+ std::string* compressed) {
// Pre-grow the buffer to the max length of the compressed output
- compressed->resize(MaxCompressedLength(input_length));
+ STLStringResizeUninitialized(compressed, MaxCompressedLength(input_length));
size_t compressed_length;
RawCompress(input, input_length, string_as_array(compressed),
@@ -1327,7 +1436,7 @@
// We need random access into the data generated so far. Therefore
// we keep track of all of the generated data as an array of blocks.
// All of the blocks except the last have length kBlockSize.
- vector<char*> blocks_;
+ std::vector<char*> blocks_;
size_t expected_;
// Total size of all fully generated blocks so far
@@ -1386,8 +1495,7 @@
if (length <= 16 && available >= 16 + kMaximumTagLength &&
space_left >= 16) {
// Fast path, used for the majority (about 95%) of invocations.
- UNALIGNED_STORE64(op, UNALIGNED_LOAD64(ip));
- UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(ip + 8));
+ UnalignedCopy128(ip, op);
op_ptr_ = op + length;
return true;
} else {
@@ -1396,16 +1504,14 @@
}
inline bool AppendFromSelf(size_t offset, size_t len) {
+ char* const op_end = op_ptr_ + len;
// See SnappyArrayWriter::AppendFromSelf for an explanation of
// the "offset - 1u" trick.
- if (offset - 1u < op_ptr_ - op_base_) {
- const size_t space_left = op_limit_ - op_ptr_;
- if (space_left >= len + kMaxIncrementCopyOverflow) {
- // Fast path: src and dst in current block.
- IncrementalCopyFastPath(op_ptr_ - offset, op_ptr_, len);
- op_ptr_ += len;
- return true;
- }
+ if (SNAPPY_PREDICT_TRUE(offset - 1u < op_ptr_ - op_base_ &&
+ op_end <= op_limit_)) {
+ // Fast path: src and dst in current block.
+ op_ptr_ = IncrementalCopy(op_ptr_ - offset, op_ptr_, op_end, op_limit_);
+ return true;
}
return SlowAppendFromSelf(offset, len);
}
@@ -1433,7 +1539,7 @@
}
// Make new block
- size_t bsize = min<size_t>(kBlockSize, expected_ - full_size_);
+ size_t bsize = std::min<size_t>(kBlockSize, expected_ - full_size_);
op_base_ = allocator_.Allocate(bsize);
op_ptr_ = op_base_;
op_limit_ = op_base_ + bsize;
@@ -1489,8 +1595,8 @@
void Flush(size_t size) {
size_t size_written = 0;
size_t block_size;
- for (size_t i = 0; i < blocks_.size(); ++i) {
- block_size = min<size_t>(blocks_[i].size, size - size_written);
+ for (int i = 0; i < blocks_.size(); ++i) {
+ block_size = std::min<size_t>(blocks_[i].size, size - size_written);
dest_->AppendAndTakeOwnership(blocks_[i].data, block_size,
&SnappySinkAllocator::Deleter, NULL);
size_written += block_size;
@@ -1510,7 +1616,7 @@
}
Sink* dest_;
- vector<Datablock> blocks_;
+ std::vector<Datablock> blocks_;
// Note: copying this object is allowed
};
@@ -1535,19 +1641,21 @@
char* buf = uncompressed->GetAppendBufferVariable(
1, uncompressed_len, &c, 1, &allocated_size);
+ const size_t compressed_len = compressed->Available();
// If we can get a flat buffer, then use it, otherwise do block by block
// uncompression
if (allocated_size >= uncompressed_len) {
SnappyArrayWriter writer(buf);
- bool result = InternalUncompressAllTags(
- &decompressor, &writer, uncompressed_len);
+ bool result = InternalUncompressAllTags(&decompressor, &writer,
+ compressed_len, uncompressed_len);
uncompressed->Append(buf, writer.Produced());
return result;
} else {
SnappySinkAllocator allocator(uncompressed);
SnappyScatteredWriter<SnappySinkAllocator> writer(allocator);
- return InternalUncompressAllTags(&decompressor, &writer, uncompressed_len);
+ return InternalUncompressAllTags(&decompressor, &writer, compressed_len,
+ uncompressed_len);
}
}
-} // end namespace snappy
+} // namespace snappy
diff --git a/c_src/snappy/snappy.h b/c_src/snappy/snappy.h
index 4568db8..e9805bf 100644
--- a/c_src/snappy/snappy.h
+++ b/c_src/snappy/snappy.h
@@ -39,7 +39,7 @@
#ifndef THIRD_PARTY_SNAPPY_SNAPPY_H__
#define THIRD_PARTY_SNAPPY_SNAPPY_H__
-#include <stddef.h>
+#include <cstddef>
#include <string>
#include "snappy-stubs-public.h"
@@ -69,11 +69,12 @@
// Higher-level string based routines (should be sufficient for most users)
// ------------------------------------------------------------------------
- // Sets "*output" to the compressed version of "input[0,input_length-1]".
- // Original contents of *output are lost.
+ // Sets "*compressed" to the compressed version of "input[0,input_length-1]".
+ // Original contents of *compressed are lost.
//
- // REQUIRES: "input[]" is not an alias of "*output".
- size_t Compress(const char* input, size_t input_length, string* output);
+ // REQUIRES: "input[]" is not an alias of "*compressed".
+ size_t Compress(const char* input, size_t input_length,
+ std::string* compressed);
// Decompresses "compressed[0,compressed_length-1]" to "*uncompressed".
// Original contents of "*uncompressed" are lost.
@@ -82,7 +83,7 @@
//
// returns false if the message is corrupted and could not be decompressed
bool Uncompress(const char* compressed, size_t compressed_length,
- string* uncompressed);
+ std::string* uncompressed);
// Decompresses "compressed" to "*uncompressed".
//
@@ -193,11 +194,14 @@
// Note that there might be older data around that is compressed with larger
// block sizes, so the decompression code should not rely on the
// non-existence of long backreferences.
- static const int kBlockLog = 16;
- static const size_t kBlockSize = 1 << kBlockLog;
+ static constexpr int kBlockLog = 16;
+ static constexpr size_t kBlockSize = 1 << kBlockLog;
- static const int kMaxHashTableBits = 14;
- static const size_t kMaxHashTableSize = 1 << kMaxHashTableBits;
+ static constexpr int kMinHashTableBits = 8;
+ static constexpr size_t kMinHashTableSize = 1 << kMinHashTableBits;
+
+ static constexpr int kMaxHashTableBits = 14;
+ static constexpr size_t kMaxHashTableSize = 1 << kMaxHashTableBits;
} // end namespace snappy
#endif // THIRD_PARTY_SNAPPY_SNAPPY_H__
diff --git a/c_src/snappy_nif.cc b/c_src/snappy_nif.cc
index 50e9018..f9d0139 100644
--- a/c_src/snappy_nif.cc
+++ b/c_src/snappy_nif.cc
@@ -22,6 +22,10 @@
#include "snappy/snappy.h"
#include "snappy/snappy-sinksource.h"
+#ifdef OTP_R13B03
+#error OTP R13B03 not supported. Upgrade to R13B04 or later.
+#endif
+
#ifdef __cplusplus
#define BEGIN_C extern "C" {
#define END_C }
@@ -40,7 +44,7 @@
void Append(const char* data, size_t n);
char* GetAppendBuffer(size_t len, char* scratch);
- ErlNifBinary& GetBin();
+ ErlNifBinary& getBin();
private:
void EnsureSize(size_t append_length);
@@ -83,7 +87,7 @@
}
ErlNifBinary&
-SnappyNifSink::GetBin()
+SnappyNifSink::getBin()
{
if(bin.size > length) {
if(!enif_realloc_binary_compat(env, &bin, length)) {
@@ -93,15 +97,13 @@
return bin;
}
-
void
SnappyNifSink::EnsureSize(size_t append_length)
{
+ size_t sz;
+
if((length + append_length) > bin.size) {
- size_t sz = append_length * 4;
- if(sz < 8192) {
- sz = 8192;
- }
+ sz = (append_length * 4) < 8192 ? 8192 : (append_length * 4);
if(!enif_realloc_binary_compat(env, &bin, bin.size + sz)) {
throw std::bad_alloc();
@@ -140,7 +142,7 @@
ERL_NIF_TERM
-snappy_compress(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
+snappy_compress_erl(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
ErlNifBinary input;
@@ -148,12 +150,22 @@
return enif_make_badarg(env);
}
+ // If empty binary has been provided, return an empty binary.
+ // Snappy will do this in any case, so might as well skip the
+ // overhead...
+ if(input.size == 0) {
+ ErlNifBinary empty;
+        // zero-initialise the empty binary before handing it to enif_make_binary
+        memset(&empty, 0, sizeof(ErlNifBinary));
+ return make_ok(env, enif_make_binary(env,&empty));
+ }
+
try {
snappy::ByteArraySource source(SC_PTR(input.data), input.size);
SnappyNifSink sink(env);
snappy::Compress(&source, &sink);
- return make_ok(env, enif_make_binary(env, &sink.GetBin()));
- } catch(std::bad_alloc e) {
+ return make_ok(env, enif_make_binary(env, &sink.getBin()));
+ } catch(const std::bad_alloc & e) {
return make_error(env, "insufficient_memory");
} catch(...) {
return make_error(env, "unknown");
@@ -162,7 +174,7 @@
ERL_NIF_TERM
-snappy_decompress(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
+snappy_decompress_erl(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
ErlNifBinary bin;
ErlNifBinary ret;
@@ -172,6 +184,15 @@
return enif_make_badarg(env);
}
+ // Check that the binary is not empty
+ if(bin.size == 0) {
+ // Snappy library cannot decompress an empty binary - although
+ // it will unfortunately let you compress one. If an empty binary
+ // has been passed - send an empty binary back.
+ memset(&ret,0,sizeof(ErlNifBinary));
+ return make_ok(env, enif_make_binary(env,&ret));
+ }
+
try {
if(!snappy::GetUncompressedLength(SC_PTR(bin.data), bin.size, &len)) {
return make_error(env, "data_not_compressed");
@@ -194,7 +215,7 @@
ERL_NIF_TERM
-snappy_uncompressed_length(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
+snappy_uncompressed_length_erl(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
ErlNifBinary bin;
size_t len;
@@ -257,9 +278,9 @@
static ErlNifFunc nif_functions[] = {
- {"compress", 1, snappy_compress},
- {"decompress", 1, snappy_decompress},
- {"uncompressed_length", 1, snappy_uncompressed_length},
+ {"compress", 1, snappy_compress_erl},
+ {"decompress", 1, snappy_decompress_erl},
+ {"uncompressed_length", 1, snappy_uncompressed_length_erl},
{"is_valid", 1, snappy_is_valid}
};
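
Taken together, the two guards above ensure that neither direction ever hands a zero-length buffer to the C++ library. Seen from Erlang, the behaviour this change pins down (and which the new eunit cases further below exercise) is simply:

    %% Both directions treat an empty binary as a no-op round trip.
    {ok, <<>>} = snappy:compress(<<>>),
    {ok, <<>>} = snappy:decompress(<<>>).
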
diff --git a/enc b/enc
new file mode 100755
index 0000000..9bcea06
--- /dev/null
+++ b/enc
Binary files differ
diff --git a/rebar.config b/rebar.config
index 3b93f8a..cb89949 100644
--- a/rebar.config
+++ b/rebar.config
@@ -1,12 +1,12 @@
-{require_otp_vsn, "R14|R15|R16|17|18|19|20|21|22|23|24|25"}.
-
{erl_opts, [debug_info, warn_unused_vars, warn_shadow_vars, warn_unused_import]}.
-{port_sources, ["c_src/*.cc",
- "c_src/snappy/*.cc"]}.
-{port_env, [
- {"CXXFLAGS", "$CXXFLAGS -DNDEBUG"},
- {"(linux|solaris|freebsd|netbsd|openbsd|dragonfly|darwin)",
- "LDFLAGS", "$LDFLAGS -lstdc++"}
-]}.
-{so_name, "snappy_nif.so"}.
{eunit_opts, [verbose]}.
+
+{artifacts, ["priv/snappy_nif.so"]}.
+
+{port_specs, [{".*", "priv/snappy_nif.so",
+ ["c_src/snappy/*.cc", "c_src/*.cc"],
+ [{env, [{"LDFLAGS", "$LDFLAGS -lstdc++"},
+ {"CXXFLAGS", "$CXXFLAGS -std=c++11"}]}]}]}.
+
+{pre_hooks, [{"", compile, "./enc compile"}]}.
+{post_hooks, [{"", clean, "./enc clean"}]}.
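
With port_specs, artifacts and the enc hooks in place, the library builds like any other rebar3 dependency. A hypothetical consumer's rebar.config entry might look like the sketch below; the URL is a placeholder and the tag simply mirrors the application version set in src/snappy.app.src further down:

    %% rebar.config of a consuming application (sketch only).
    {deps, [
        {snappy,
         {git, "https://example.invalid/snappy-erlang-nif.git", {tag, "1.1.1"}}}
    ]}.
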
diff --git a/rebar.lock b/rebar.lock
new file mode 100644
index 0000000..57afcca
--- /dev/null
+++ b/rebar.lock
@@ -0,0 +1 @@
+[].
diff --git a/rebar3 b/rebar3
new file mode 100755
index 0000000..c7d4824
--- /dev/null
+++ b/rebar3
Binary files differ
diff --git a/src/snappy.app.src b/src/snappy.app.src
index 25d37b5..965a190 100644
--- a/src/snappy.app.src
+++ b/src/snappy.app.src
@@ -1,7 +1,7 @@
{application, snappy,
[
{description, "snappy compressor/decompressor Erlang NIF wrapper"},
- {vsn, "1.0.5"},
+ {vsn, "1.1.1"},
{registered, []},
{applications, [
kernel,
diff --git a/src/snappy.erl b/src/snappy.erl
index 7d3d36a..ebf1587 100644
--- a/src/snappy.erl
+++ b/src/snappy.erl
@@ -33,24 +33,20 @@
Dir ->
filename:join(Dir, "snappy_nif")
end,
- (catch erlang:load_nif(SoName, 0)),
- case erlang:system_info(otp_release) of
- "R13B03" -> true;
- _ -> ok
- end.
+ erlang:load_nif(SoName, 0).
compress(_IoList) ->
- exit(snappy_nif_not_loaded).
+ erlang:nif_error(snappy_nif_not_loaded).
decompress(_IoList) ->
- exit(snappy_nif_not_loaded).
+ erlang:nif_error(snappy_nif_not_loaded).
uncompressed_length(_IoList) ->
- exit(snappy_nif_not_loaded).
+ erlang:nif_error(snappy_nif_not_loaded).
is_valid(_IoList) ->
- exit(snappy_nif_not_loaded).
+ erlang:nif_error(snappy_nif_not_loaded).
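
Because the stubs now raise erlang:nif_error/1, a missing or unloadable NIF surfaces as a proper error exception naming the stub, rather than a bare exit. Ordinary use of the four exported functions, with illustrative values, looks like:

    Data = <<"the quick brown fox jumps over the lazy dog">>,
    {ok, Compressed} = snappy:compress(Data),
    true = snappy:is_valid(Compressed),
    {ok, Len} = snappy:uncompressed_length(Compressed),
    Len = byte_size(Data),
    {ok, Data} = snappy:decompress(Compressed).
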
diff --git a/test/snappy_tests.erl b/test/snappy_tests.erl
index ac39c58..37a0fc7 100644
--- a/test/snappy_tests.erl
+++ b/test/snappy_tests.erl
@@ -72,3 +72,36 @@
?assertEqual({ok, BigData}, snappy:decompress(Compressed3)),
ok.
+check_double_decompress_doesnt_cause_segfault_test() ->
+ % Try to decompress an empty binary
+ ?assertMatch({ok,<<>>}, snappy:decompress(<<>>)),
+ % And once more...
+ ?assertMatch({ok,<<>>}, snappy:decompress(<<>>)),
+ ok.
+
+check_double_compress_doesnt_cause_segfault_test() ->
+ % Try to compress an empty binary
+ ?assertMatch({ok,<<>>},snappy:compress(<<>>)),
+ % And once more...
+ ?assertMatch({ok,<<>>},snappy:compress(<<>>)),
+ ok.
+
+can_compress_and_decompress_empty_binary_test() ->
+ % Try to compress an empty binary...
+ {ok, Compressed} = snappy:compress(<<>>),
+ % Try to decompress the result of the compression...
+    ?assertMatch({ok, <<>>}, snappy:decompress(Compressed)),
+ ok.
+
+can_compress_an_empty_binary_test() ->
+ % Try to compress an empty binary...
+ {ok, Compressed} = snappy:compress(<<>>),
+ % Check an empty binary was returned
+    ?assertMatch(<<>>, Compressed),
+ ok.
+
+can_decompress_an_empty_binary_test() ->
+ % Try to decompress an empty binary
+ {ok, Decompressed} = snappy:decompress(<<>>),
+    ?assertMatch(<<>>, Decompressed),
+ ok.
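
The five cases above all target the empty-binary edge case. A compact way to extend the suite, sketched here but not part of this commit, would be a single round-trip generator (placed in this same module) over a handful of inputs:

    roundtrip_test_() ->
        Inputs = [<<>>, <<0>>, <<"abc">>, binary:copy(<<"0123456789">>, 10000)],
        [?_assertEqual({ok, In},
                       snappy:decompress(element(2, snappy:compress(In))))
         || In <- Inputs].
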