blob: 55fa3b879da4913f56cff68c819213030e34db74 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
// author Kevin Lang, Oath Research
#ifndef CPC_COMPRESSOR_HPP_
#define CPC_COMPRESSOR_HPP_
#include "cpc_common.hpp"
namespace datasketches {
/*
* This is a very efficient compressor specialized for use by the CPC Sketch.
* There are two very different compression schemes here: one for the sliding window
* and another for the table of so-called surprising values.
* These two compression schemes are designed for specific probability distributions of entries
* in these data structures and make some compromises for performance. As a result
* the compression is slightly less effective than theoretically achievable but is very fast.
*/
// forward declarations
template<typename A> class cpc_sketch_alloc;
template<typename A> class cpc_compressor;
// the compressor is not instantiated directly
// the sketch implementation uses this global function to statically allocate and construct on the first use
template<typename A>
inline cpc_compressor<A>& get_compressor();
template<typename A>
class cpc_compressor {
public:
void compress(const cpc_sketch_alloc<A>& source, compressed_state<A>& target) const;
void uncompress(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint64_t num_coupons) const;
// methods below are public for testing
// This returns the number of compressed words that were actually used. It is the caller's
// responsibility to ensure that the compressed_words array is long enough to prevent over-run.
size_t low_level_compress_bytes(
const uint8_t* byte_array, // input
size_t num_bytes_to_encode,
const uint16_t* encoding_table,
uint32_t* compressed_words // output
) const;
void low_level_uncompress_bytes(
uint8_t* byte_array, // output
size_t num_bytes_to_decode,
const uint16_t* decoding_table,
const uint32_t* compressed_words,
size_t num_compressed_words // input
) const;
// Here "pairs" refers to row-column pairs that specify
// the positions of surprising values in the bit matrix.
// returns the number of compressedWords actually used
size_t low_level_compress_pairs(
const uint32_t* pair_array, // input
size_t num_pairs_to_encode,
size_t num_base_bits,
uint32_t* compressed_words // output
) const;
void low_level_uncompress_pairs(
uint32_t* pair_array, // output
size_t num_pairs_to_decode,
size_t num_base_bits,
const uint32_t* compressed_words, // input
size_t num_compressed_words // input
) const;
private:
// These decoding tables are created at library startup time by inverting the encoding tables
uint16_t* decoding_tables_for_high_entropy_byte[22] = {
// sixteen tables for the steady state (chosen based on the "phase" of C/K)
NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL,
// six more tables for the gradual transition between warmup mode and the steady state.
NULL, NULL, NULL, NULL, NULL, NULL
};
uint16_t* length_limited_unary_decoding_table65;
uint8_t* column_permutations_for_decoding[16] = {
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
};
cpc_compressor();
template<typename T> friend cpc_compressor<T>& get_compressor();
~cpc_compressor();
void make_decoding_tables(); // call this at startup
void free_decoding_tables(); // call this at the end
void compress_sparse_flavor(const cpc_sketch_alloc<A>& source, compressed_state<A>& target) const;
void compress_hybrid_flavor(const cpc_sketch_alloc<A>& source, compressed_state<A>& target) const;
void compress_pinned_flavor(const cpc_sketch_alloc<A>& source, compressed_state<A>& target) const;
void compress_sliding_flavor(const cpc_sketch_alloc<A>& source, compressed_state<A>& target) const;
void uncompress_sparse_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k) const;
void uncompress_hybrid_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k) const;
void uncompress_pinned_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint32_t num_coupons) const;
void uncompress_sliding_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint32_t num_coupons) const;
uint8_t* make_inverse_permutation(const uint8_t* permu, int length);
uint16_t* make_decoding_table(const uint16_t* encoding_table, int num_byte_values);
void validate_decoding_table(const uint16_t* decoding_table, const uint16_t* encoding_table) const;
void compress_surprising_values(const vector_u32<A>& pairs, uint8_t lg_k, compressed_state<A>& result) const;
void compress_sliding_window(const uint8_t* window, uint8_t lg_k, uint32_t num_coupons, compressed_state<A>& target) const;
vector_u32<A> uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs, uint8_t lg_k) const;
void uncompress_sliding_window(const uint32_t* data, size_t data_words, vector_u8<A>& window, uint8_t lg_k, uint32_t num_coupons) const;
static size_t safe_length_for_compressed_pair_buf(uint64_t k, size_t num_pairs, size_t num_base_bits);
static size_t safe_length_for_compressed_window_buf(uint64_t k);
static uint8_t determine_pseudo_phase(uint8_t lg_k, uint64_t c);
static inline vector_u32<A> tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get, uint32_t empty_space);
static inline uint64_t golomb_choose_number_of_base_bits(uint64_t k, uint64_t count);
};
} /* namespace datasketches */
#include "cpc_compressor_impl.hpp"
#endif