blob: 25d392a4ebc00a54fd8c979e505c2e39f7d12166 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <string>
#include <gtest/gtest.h>
#include "util/compress.h"
#include "common/names.h"
namespace impala {
// Utility benchmark to test how well we can compress random string data.
// NumStrings=1000000 MinLen=10 MaxLen=10 Codec=SNAPPY
// Uncompressed len: 10000000
// Compressed len: 10006377
// Sorted Compressed len: 9346971
// NumStrings=1000000 MinLen=10 MaxLen=10 Codec=GZIP
// Uncompressed len: 10000000
// Compressed len: 6352396
// Sorted Compressed len: 5712650
// NumStrings=1000000 MinLen=5 MaxLen=15 Codec=SNAPPY
// Uncompressed len: 9498531
// Compressed len: 9503924
// Sorted Compressed len: 8825841
// NumStrings=1000000 MinLen=5 MaxLen=15 Codec=GZIP
// Uncompressed len: 9497973
// Compressed len: 6033310
// Sorted Compressed len: 5429661
// Generates num strings between min_len and max_len.
// Outputs the uncompressed/compressed/sorted_compressed sizes.
void TestCompression(int num, int min_len, int max_len, THdfsCompression::type format) {
vector<string> strings;
uint8_t* buffer = (uint8_t*)malloc(max_len * num);
int offset = 0;
int len_delta = max_len - min_len;
len_delta = max(len_delta, 1);
for (int i = 0; i < num; ++i) {
int len = rand() % len_delta + min_len;
int start = offset;
for (int j = 0; j < len; ++j) {
buffer[offset++] = rand() % 26 + 'a';
}
strings.push_back(string((char*)buffer + start, len));
}
// Sort the input and make a new buffer
uint8_t* sorted_buffer = (uint8_t*)malloc(offset);
int sorted_offset = 0;
sort(strings.begin(), strings.end());
for (int i = 0; i < strings.size(); ++i) {
memcpy(sorted_buffer + sorted_offset, strings[i].data(), strings[i].size());
sorted_offset += strings[i].size();
}
scoped_ptr<Codec> compressor;
Codec::CodecInfo codec_info(format);
Status status = Codec::CreateCompressor(NULL, false, codec_info, &compressor);
DCHECK(status.ok());
int64_t compressed_len = compressor->MaxOutputLen(offset);
uint8_t* compressed_buffer = (uint8_t*)malloc(compressed_len);
ABORT_IF_ERROR(
compressor->ProcessBlock(true, offset, buffer, &compressed_len, &compressed_buffer));
int64_t sorted_compressed_len = compressor->MaxOutputLen(offset);
uint8_t* sorted_compressed_buffer = (uint8_t*)malloc(sorted_compressed_len);
ABORT_IF_ERROR(compressor->ProcessBlock(true, offset, sorted_buffer,
&sorted_compressed_len, &sorted_compressed_buffer));
cout << "NumStrings=" << num << " MinLen=" << min_len << " MaxLen=" << max_len
<< " Codec=" << codec_info.format_ << endl;
cout << " Uncompressed len: " << offset << endl;
cout << " Compressed len: " << compressed_len << endl;
cout << " Sorted Compressed len: " << sorted_compressed_len << endl;
compressor->Close();
free(buffer);
free(compressed_buffer);
free(sorted_buffer);
free(sorted_compressed_buffer);
}
}
int main(int argc, char **argv) {
impala::TestCompression(1000000, 10, 10, impala::THdfsCompression::SNAPPY);
impala::TestCompression(1000000, 10, 10, impala::THdfsCompression::GZIP);
impala::TestCompression(1000000, 5, 15, impala::THdfsCompression::SNAPPY);
impala::TestCompression(1000000, 5, 15, impala::THdfsCompression::GZIP);
return 0;
}