// tools/lzo/lzo_writer.cpp - doris - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #include <fstream>
 #include <iostream>
 #include <string>
 #include <vector>
 #include <cstdint>
 #include <lzo/lzo1x.h>
 #include <lzo/lzoconf.h>

 // Constants describing the lzop container emitted by this tool.
 // Magic bytes that open the file; they are written ahead of the header
 // fields and are not covered by the header checksum.
 constexpr uint8_t LZOP_MAGIC[9] = {0x89, 0x4c, 0x5a, 0x4f, 0x00, 0x0d, 0x0a, 0x1a, 0x0a};
 // Version fields stored (big-endian) at the start of the header.
 constexpr uint16_t LZOP_VERSION = 0x1040;
 constexpr uint16_t MY_LZO_VERSION = 0x2080;  // LZO library version
 constexpr uint16_t LZOP_VERSION_NEEDED = 0x0940;
 // Compression method/level bytes stored in the header.
 constexpr uint8_t COMPRESSION_METHOD = 1;  // LZO1X
 constexpr uint8_t COMPRESSION_LEVEL = 5;
 constexpr uint32_t LZOP_FLAGS = 0x00003;  // Use ADLER32 for all checksums
 // NOTE(review): not referenced by the code visible in this file.
 constexpr uint32_t HEADER_SIZE = 34;  // Minimum header size without filename
 constexpr uint32_t ADLER32_INIT_VALUE = 1;  // Initial value for Adler32

 // Adler-32 over `len` bytes of `buf`, continuing from the running value
 // `adler` (low 16 bits: byte sum, high 16 bits: sum of the byte sums).
 // Pass 1 as `adler` to start a fresh checksum. Per the original note, this
 // is meant to match the implementation used in Doris.
 uint32_t olap_adler32(uint32_t adler, const char* buf, size_t len) {
     const uint32_t kModulus = 65521;
     uint32_t low = adler & 0xffff;
     uint32_t high = (adler >> 16) & 0xffff;

     // Walk the input as unsigned bytes so values >= 0x80 are not sign-extended.
     const unsigned char* cursor = reinterpret_cast<const unsigned char*>(buf);
     const unsigned char* const stop = cursor + len;
     while (cursor != stop) {
         low = (low + *cursor) % kModulus;
         high = (high + low) % kModulus;
         ++cursor;
     }

     return (high << 16) + low;
 }

 // Writes an lzop-style container: the magic bytes, a header followed by its
 // Adler-32, then a sequence of length-prefixed blocks. All multi-byte fields
 // are emitted big-endian. Progress and checksum details are logged to stdout.
 // Usage: construct, init(), any number of write_*_block() calls, close().
 class LzoWriter {
 public:
     // Stores the output path; nothing is opened until init().
     explicit LzoWriter(const std::string& filename) : _filename(filename) {}

     // Initializes the LZO library, opens the output file, allocates the
     // compressor work memory and writes the file header.
     // Returns false (after reporting on stderr) if any of these steps fails,
     // including a failed header write.
     bool init() {
         // Initialize LZO library
         if (lzo_init() != LZO_E_OK) {
             std::cerr << "Failed to initialize LZO library" << std::endl;
             return false;
         }

         _out_file.open(_filename, std::ios::binary);
         if (!_out_file.is_open()) {
             std::cerr << "Failed to open output file: " << _filename << std::endl;
             return false;
         }

         // Allocate work memory for compression
         _wrkmem.resize(LZO1X_1_MEM_COMPRESS);

         // Write file header and make sure it actually reached the stream.
         write_header();
         if (!_out_file) {
             std::cerr << "Failed to write header to output file: " << _filename << std::endl;
             return false;
         }
         return true;
     }

     // Writes the magic bytes, the header fields and the header's Adler-32.
     // The checksum covers the header fields only, not the magic bytes.
     void write_header() {
         // Write magic number (not included in checksum)
         _out_file.write(reinterpret_cast<const char*>(LZOP_MAGIC), sizeof(LZOP_MAGIC));

         // Collect the checksummed fields in a buffer so they can be both
         // written and checksummed as one unit.
         std::vector<uint8_t> header_data;
         append_be16(header_data, LZOP_VERSION);
         append_be16(header_data, MY_LZO_VERSION);
         append_be16(header_data, LZOP_VERSION_NEEDED);
         header_data.push_back(COMPRESSION_METHOD);
         header_data.push_back(COMPRESSION_LEVEL);
         append_be32(header_data, LZOP_FLAGS);
         append_be32(header_data, 0);  // mode
         append_be32(header_data, 0);  // mtime, first 32-bit word
         append_be32(header_data, 0);  // mtime, second 32-bit word
         header_data.push_back(0);     // filename length (no filename stored)

         // Write all header data
         _out_file.write(reinterpret_cast<const char*>(header_data.data()), header_data.size());

         // Calculate and write header checksum
         const uint32_t header_checksum = compute_adler32(header_data.data(), header_data.size());
         write_uint32(header_checksum);
     }

     // Compresses `data` with LZO1X-1 and writes one block:
     //   uncompressed size, stored size, Adler-32 of the uncompressed data,
     //   then (only when the compressed form is smaller) Adler-32 of the
     //   compressed data followed by the compressed bytes; otherwise the
     //   original bytes are stored as-is.
     // On failure an error is printed to stderr and nothing is written.
     void write_normal_block(const std::string& data) {
         // The compressor needs the work memory that init() allocates.
         if (_wrkmem.empty()) {
             std::cerr << "Compression failed: writer is not initialized" << std::endl;
             return;
         }
         // Block lengths are stored in 32-bit fields; refuse to truncate.
         if (static_cast<uint64_t>(data.size()) > UINT32_MAX) {
             std::cerr << "Compression failed: block does not fit a 32-bit length" << std::endl;
             return;
         }

         // Output buffer sized with the same expansion margin as the original
         // code: input + input/16 + 64 + 3 bytes.
         std::vector<uint8_t> compressed_data(data.size() + data.size() / 16 + 64 + 3);
         lzo_uint compressed_len = 0;

         // Compress the data
         const int r = lzo1x_1_compress(
             reinterpret_cast<const uint8_t*>(data.data()),
             data.size(),
             compressed_data.data(),
             &compressed_len,
             _wrkmem.data());

         if (r != LZO_E_OK) {
             std::cerr << "Compression failed" << std::endl;
             return;
         }

         std::cout << "Block info:" << std::endl;
         std::cout << "  Original data size: " << data.size() << std::endl;
         std::cout << "  Compressed size: " << compressed_len << std::endl;
         std::cout << "  Original data: '" << data << "'" << std::endl;

         // Write uncompressed size
         write_uint32(static_cast<uint32_t>(data.size()));

         // If compressed size is not smaller than original size,
         // we will store the original data without compression
         const bool is_compressed = compressed_len < data.size();

         // Write compressed size (or original size if not compressed)
         write_uint32(static_cast<uint32_t>(is_compressed ? compressed_len : data.size()));

         // Write uncompressed checksum
         const uint32_t uncompressed_checksum = compute_adler32(
             reinterpret_cast<const uint8_t*>(data.data()), data.size());
         write_uint32(uncompressed_checksum);

         std::cout << "  Uncompressed checksum calculation:" << std::endl;
         std::cout << "    Data length: " << data.size() << std::endl;
         std::cout << "    First few bytes:";
         const size_t preview_len = data.size() < size_t(16) ? data.size() : size_t(16);
         for (size_t i = 0; i < preview_len; ++i) {
             print_hex_byte(static_cast<uint8_t>(data[i]));
         }
         std::cout << std::endl;
         std::cout << "    Computed checksum: " << std::hex << uncompressed_checksum << std::dec << std::endl;

         if (is_compressed) {
             // Detailed logging of compressed data
             std::cout << "  Complete compressed data:" << std::endl;
             std::cout << "    All bytes:";
             dump_bytes(compressed_data.data(), compressed_len);

             // Write compressed checksum
             const uint32_t compressed_checksum = compute_adler32(compressed_data.data(), compressed_len);
             write_uint32(compressed_checksum);

             std::cout << "  Compressed checksum calculation:" << std::endl;
             std::cout << "    Data length: " << compressed_len << std::endl;
             std::cout << "    Bytes used for checksum:";
             dump_bytes(compressed_data.data(), compressed_len);
             std::cout << "    Computed checksum: " << std::hex << compressed_checksum << std::dec << std::endl;

             // Write compressed data
             _out_file.write(reinterpret_cast<const char*>(compressed_data.data()), compressed_len);
         } else {
             std::cout << "  Data not compressed (compressed size >= original size)" << std::endl;
             // Write original data directly
             _out_file.write(data.data(), data.size());
         }
         std::cout << "----------------------------------------" << std::endl;
     }

     // Writes a single 32-bit zero, i.e. a block whose uncompressed size is 0
     // (the end-of-file marker).
     void write_zero_block() {
         write_uint32(0);
     }

     // Closes the output file if it is open; safe to call more than once.
     void close() {
         if (_out_file.is_open()) {
             _out_file.close();
         }
     }

 private:
     // Appends `value` to `out` as two big-endian bytes, independent of host
     // byte order.
     static void append_be16(std::vector<uint8_t>& out, uint16_t value) {
         out.push_back(static_cast<uint8_t>(value >> 8));
         out.push_back(static_cast<uint8_t>(value));
     }

     // Appends `value` to `out` as four big-endian bytes, independent of host
     // byte order.
     static void append_be32(std::vector<uint8_t>& out, uint32_t value) {
         out.push_back(static_cast<uint8_t>(value >> 24));
         out.push_back(static_cast<uint8_t>(value >> 16));
         out.push_back(static_cast<uint8_t>(value >> 8));
         out.push_back(static_cast<uint8_t>(value));
     }

     // Prints one byte to stdout as a space followed by two lowercase hex
     // digits, restoring the stream's formatting state afterwards.
     static void print_hex_byte(uint8_t byte) {
         const std::ios_base::fmtflags saved_flags = std::cout.flags();
         const char saved_fill = std::cout.fill('0');
         std::cout << ' ' << std::hex;
         std::cout.width(2);  // applies to the next insertion only
         std::cout << static_cast<unsigned>(byte);
         std::cout.fill(saved_fill);
         std::cout.flags(saved_flags);
     }

     // Hex-dumps `len` bytes to stdout, 16 per row; every row starts on a new
     // line with a four-space indent, and the dump ends with a newline.
     static void dump_bytes(const uint8_t* bytes, size_t len) {
         for (size_t i = 0; i < len; ++i) {
             if (i % 16 == 0) std::cout << std::endl << "    ";
             print_hex_byte(bytes[i]);
         }
         std::cout << std::endl;
     }

     // Writes a single byte to the output file.
     void write_uint8(uint8_t value) {
         _out_file.write(reinterpret_cast<const char*>(&value), sizeof(value));
     }

     // Writes `value` to the output file as two big-endian bytes.
     void write_uint16(uint16_t value) {
         const uint8_t bytes[2] = {static_cast<uint8_t>(value >> 8),
                                   static_cast<uint8_t>(value)};
         _out_file.write(reinterpret_cast<const char*>(bytes), sizeof(bytes));
     }

     // Writes `value` to the output file as four big-endian bytes.
     void write_uint32(uint32_t value) {
         const uint8_t bytes[4] = {static_cast<uint8_t>(value >> 24),
                                   static_cast<uint8_t>(value >> 16),
                                   static_cast<uint8_t>(value >> 8),
                                   static_cast<uint8_t>(value)};
         _out_file.write(reinterpret_cast<const char*>(bytes), sizeof(bytes));
     }

     // Computes the Adler-32 of `len` bytes at `data`, seeded with
     // ADLER32_INIT_VALUE, and logs the inputs and result to stdout.
     uint32_t compute_adler32(const uint8_t* data, size_t len) {
         const uint32_t checksum = olap_adler32(ADLER32_INIT_VALUE, reinterpret_cast<const char*>(data), len);
         std::cout << "  Adler32 details:" << std::endl;
         std::cout << "    Input length: " << len << std::endl;
         std::cout << "    Initial value: " << ADLER32_INIT_VALUE << std::endl;
         std::cout << "    Final checksum: " << std::hex << checksum << std::dec << std::endl;
         return checksum;
     }

     std::string _filename;         // output path given to the constructor
     std::ofstream _out_file;       // binary output stream, opened by init()
     std::vector<uint8_t> _wrkmem;  // compressor work memory, sized by init()
 };

 int main(int argc, char** argv) {
     if (argc != 2) {
         std::cerr << "Usage: " << argv[0] << " <output_file>" << std::endl;
         return 1;
     }

     LzoWriter writer(argv[1]);
     if (!writer.init()) {
         return 1;
     }

     // Write a zero-sized block at the begin
     writer.write_zero_block();

     // Write first normal block with test data
     std::string test_data1 = "This is the first block of test data for LZO compression!\n";
     writer.write_normal_block(test_data1);

     // Write a zero-sized block in the middle
     writer.write_zero_block();

     // Write third normal block with different test data
     std::string test_data2 = "This is the third block with more test data for LZO compression!";
     writer.write_normal_block(test_data2);

     // Write a zero-sized block in the end
     writer.write_zero_block();

     // Write a zero-sized block in the end
     writer.write_zero_block();

     writer.close();
     std::cout << "Successfully created LZO file with three blocks (middle block size = 0)" << std::endl;

     return 0;
 }
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#include <fstream>
	#include <iostream>
	#include <string>
	#include <vector>
	#include <cstdint>
	#include <lzo/lzo1x.h>
	#include <lzo/lzoconf.h>

	// Constants describing the lzop container emitted by this tool.
	// Magic bytes that open the file; they are written ahead of the header
	// fields and are not covered by the header checksum.
	constexpr uint8_t LZOP_MAGIC[9] = {0x89, 0x4c, 0x5a, 0x4f, 0x00, 0x0d, 0x0a, 0x1a, 0x0a};
	// Version fields stored (big-endian) at the start of the header.
	constexpr uint16_t LZOP_VERSION = 0x1040;
	constexpr uint16_t MY_LZO_VERSION = 0x2080; // LZO library version
	constexpr uint16_t LZOP_VERSION_NEEDED = 0x0940;
	// Compression method/level bytes stored in the header.
	constexpr uint8_t COMPRESSION_METHOD = 1; // LZO1X
	constexpr uint8_t COMPRESSION_LEVEL = 5;
	constexpr uint32_t LZOP_FLAGS = 0x00003; // Use ADLER32 for all checksums
	// NOTE(review): not referenced by the code visible in this file.
	constexpr uint32_t HEADER_SIZE = 34; // Minimum header size without filename
	constexpr uint32_t ADLER32_INIT_VALUE = 1; // Initial value for Adler32

	// Adler-32 over `len` bytes of `buf`, continuing from the running value
	// `adler` (low 16 bits: byte sum, high 16 bits: sum of the byte sums).
	// Pass 1 as `adler` to start a fresh checksum. Per the original note, this
	// is meant to match the implementation used in Doris.
	uint32_t olap_adler32(uint32_t adler, const char* buf, size_t len) {
	    const uint32_t kModulus = 65521;
	    uint32_t low = adler & 0xffff;
	    uint32_t high = (adler >> 16) & 0xffff;

	    // Walk the input as unsigned bytes so values >= 0x80 are not sign-extended.
	    const unsigned char* cursor = reinterpret_cast<const unsigned char*>(buf);
	    const unsigned char* const stop = cursor + len;
	    while (cursor != stop) {
	        low = (low + *cursor) % kModulus;
	        high = (high + low) % kModulus;
	        ++cursor;
	    }

	    return (high << 16) + low;
	}

	// Writes an lzop-style container: the magic bytes, a header followed by its
	// Adler-32, then a sequence of length-prefixed blocks. All multi-byte fields
	// are emitted big-endian. Progress and checksum details are logged to stdout.
	// Usage: construct, init(), any number of write_*_block() calls, close().
	class LzoWriter {
	public:
	    // Stores the output path; nothing is opened until init().
	    explicit LzoWriter(const std::string& filename) : _filename(filename) {}

	    // Initializes the LZO library, opens the output file, allocates the
	    // compressor work memory and writes the file header.
	    // Returns false (after reporting on stderr) if any of these steps fails,
	    // including a failed header write.
	    bool init() {
	        // Initialize LZO library
	        if (lzo_init() != LZO_E_OK) {
	            std::cerr << "Failed to initialize LZO library" << std::endl;
	            return false;
	        }

	        _out_file.open(_filename, std::ios::binary);
	        if (!_out_file.is_open()) {
	            std::cerr << "Failed to open output file: " << _filename << std::endl;
	            return false;
	        }

	        // Allocate work memory for compression
	        _wrkmem.resize(LZO1X_1_MEM_COMPRESS);

	        // Write file header and make sure it actually reached the stream.
	        write_header();
	        if (!_out_file) {
	            std::cerr << "Failed to write header to output file: " << _filename << std::endl;
	            return false;
	        }
	        return true;
	    }

	    // Writes the magic bytes, the header fields and the header's Adler-32.
	    // The checksum covers the header fields only, not the magic bytes.
	    void write_header() {
	        // Write magic number (not included in checksum)
	        _out_file.write(reinterpret_cast<const char*>(LZOP_MAGIC), sizeof(LZOP_MAGIC));

	        // Collect the checksummed fields in a buffer so they can be both
	        // written and checksummed as one unit.
	        std::vector<uint8_t> header_data;
	        append_be16(header_data, LZOP_VERSION);
	        append_be16(header_data, MY_LZO_VERSION);
	        append_be16(header_data, LZOP_VERSION_NEEDED);
	        header_data.push_back(COMPRESSION_METHOD);
	        header_data.push_back(COMPRESSION_LEVEL);
	        append_be32(header_data, LZOP_FLAGS);
	        append_be32(header_data, 0);  // mode
	        append_be32(header_data, 0);  // mtime, first 32-bit word
	        append_be32(header_data, 0);  // mtime, second 32-bit word
	        header_data.push_back(0);     // filename length (no filename stored)

	        // Write all header data
	        _out_file.write(reinterpret_cast<const char*>(header_data.data()), header_data.size());

	        // Calculate and write header checksum
	        const uint32_t header_checksum = compute_adler32(header_data.data(), header_data.size());
	        write_uint32(header_checksum);
	    }

	    // Compresses `data` with LZO1X-1 and writes one block:
	    //   uncompressed size, stored size, Adler-32 of the uncompressed data,
	    //   then (only when the compressed form is smaller) Adler-32 of the
	    //   compressed data followed by the compressed bytes; otherwise the
	    //   original bytes are stored as-is.
	    // On failure an error is printed to stderr and nothing is written.
	    void write_normal_block(const std::string& data) {
	        // The compressor needs the work memory that init() allocates.
	        if (_wrkmem.empty()) {
	            std::cerr << "Compression failed: writer is not initialized" << std::endl;
	            return;
	        }
	        // Block lengths are stored in 32-bit fields; refuse to truncate.
	        if (static_cast<uint64_t>(data.size()) > UINT32_MAX) {
	            std::cerr << "Compression failed: block does not fit a 32-bit length" << std::endl;
	            return;
	        }

	        // Output buffer sized with the same expansion margin as the original
	        // code: input + input/16 + 64 + 3 bytes.
	        std::vector<uint8_t> compressed_data(data.size() + data.size() / 16 + 64 + 3);
	        lzo_uint compressed_len = 0;

	        // Compress the data
	        const int r = lzo1x_1_compress(
	            reinterpret_cast<const uint8_t*>(data.data()),
	            data.size(),
	            compressed_data.data(),
	            &compressed_len,
	            _wrkmem.data());

	        if (r != LZO_E_OK) {
	            std::cerr << "Compression failed" << std::endl;
	            return;
	        }

	        std::cout << "Block info:" << std::endl;
	        std::cout << " Original data size: " << data.size() << std::endl;
	        std::cout << " Compressed size: " << compressed_len << std::endl;
	        std::cout << " Original data: '" << data << "'" << std::endl;

	        // Write uncompressed size
	        write_uint32(static_cast<uint32_t>(data.size()));

	        // If compressed size is not smaller than original size,
	        // we will store the original data without compression
	        const bool is_compressed = compressed_len < data.size();

	        // Write compressed size (or original size if not compressed)
	        write_uint32(static_cast<uint32_t>(is_compressed ? compressed_len : data.size()));

	        // Write uncompressed checksum
	        const uint32_t uncompressed_checksum = compute_adler32(
	            reinterpret_cast<const uint8_t*>(data.data()), data.size());
	        write_uint32(uncompressed_checksum);

	        std::cout << " Uncompressed checksum calculation:" << std::endl;
	        std::cout << " Data length: " << data.size() << std::endl;
	        std::cout << " First few bytes:";
	        const size_t preview_len = data.size() < size_t(16) ? data.size() : size_t(16);
	        for (size_t i = 0; i < preview_len; ++i) {
	            print_hex_byte(static_cast<uint8_t>(data[i]));
	        }
	        std::cout << std::endl;
	        std::cout << " Computed checksum: " << std::hex << uncompressed_checksum << std::dec << std::endl;

	        if (is_compressed) {
	            // Detailed logging of compressed data
	            std::cout << " Complete compressed data:" << std::endl;
	            std::cout << " All bytes:";
	            dump_bytes(compressed_data.data(), compressed_len);

	            // Write compressed checksum
	            const uint32_t compressed_checksum = compute_adler32(compressed_data.data(), compressed_len);
	            write_uint32(compressed_checksum);

	            std::cout << " Compressed checksum calculation:" << std::endl;
	            std::cout << " Data length: " << compressed_len << std::endl;
	            std::cout << " Bytes used for checksum:";
	            dump_bytes(compressed_data.data(), compressed_len);
	            std::cout << " Computed checksum: " << std::hex << compressed_checksum << std::dec << std::endl;

	            // Write compressed data
	            _out_file.write(reinterpret_cast<const char*>(compressed_data.data()), compressed_len);
	        } else {
	            std::cout << " Data not compressed (compressed size >= original size)" << std::endl;
	            // Write original data directly
	            _out_file.write(data.data(), data.size());
	        }
	        std::cout << "----------------------------------------" << std::endl;
	    }

	    // Writes a single 32-bit zero, i.e. a block whose uncompressed size is 0
	    // (the end-of-file marker).
	    void write_zero_block() {
	        write_uint32(0);
	    }

	    // Closes the output file if it is open; safe to call more than once.
	    void close() {
	        if (_out_file.is_open()) {
	            _out_file.close();
	        }
	    }

	private:
	    // Appends `value` to `out` as two big-endian bytes, independent of host
	    // byte order.
	    static void append_be16(std::vector<uint8_t>& out, uint16_t value) {
	        out.push_back(static_cast<uint8_t>(value >> 8));
	        out.push_back(static_cast<uint8_t>(value));
	    }

	    // Appends `value` to `out` as four big-endian bytes, independent of host
	    // byte order.
	    static void append_be32(std::vector<uint8_t>& out, uint32_t value) {
	        out.push_back(static_cast<uint8_t>(value >> 24));
	        out.push_back(static_cast<uint8_t>(value >> 16));
	        out.push_back(static_cast<uint8_t>(value >> 8));
	        out.push_back(static_cast<uint8_t>(value));
	    }

	    // Prints one byte to stdout as a space followed by two lowercase hex
	    // digits, restoring the stream's formatting state afterwards.
	    static void print_hex_byte(uint8_t byte) {
	        const std::ios_base::fmtflags saved_flags = std::cout.flags();
	        const char saved_fill = std::cout.fill('0');
	        std::cout << ' ' << std::hex;
	        std::cout.width(2);  // applies to the next insertion only
	        std::cout << static_cast<unsigned>(byte);
	        std::cout.fill(saved_fill);
	        std::cout.flags(saved_flags);
	    }

	    // Hex-dumps `len` bytes to stdout, 16 per row; every row starts on a new
	    // line followed by a single space, and the dump ends with a newline.
	    static void dump_bytes(const uint8_t* bytes, size_t len) {
	        for (size_t i = 0; i < len; ++i) {
	            if (i % 16 == 0) std::cout << std::endl << " ";
	            print_hex_byte(bytes[i]);
	        }
	        std::cout << std::endl;
	    }

	    // Writes a single byte to the output file.
	    void write_uint8(uint8_t value) {
	        _out_file.write(reinterpret_cast<const char*>(&value), sizeof(value));
	    }

	    // Writes `value` to the output file as two big-endian bytes.
	    void write_uint16(uint16_t value) {
	        const uint8_t bytes[2] = {static_cast<uint8_t>(value >> 8),
	                                  static_cast<uint8_t>(value)};
	        _out_file.write(reinterpret_cast<const char*>(bytes), sizeof(bytes));
	    }

	    // Writes `value` to the output file as four big-endian bytes.
	    void write_uint32(uint32_t value) {
	        const uint8_t bytes[4] = {static_cast<uint8_t>(value >> 24),
	                                  static_cast<uint8_t>(value >> 16),
	                                  static_cast<uint8_t>(value >> 8),
	                                  static_cast<uint8_t>(value)};
	        _out_file.write(reinterpret_cast<const char*>(bytes), sizeof(bytes));
	    }

	    // Computes the Adler-32 of `len` bytes at `data`, seeded with
	    // ADLER32_INIT_VALUE, and logs the inputs and result to stdout.
	    uint32_t compute_adler32(const uint8_t* data, size_t len) {
	        const uint32_t checksum = olap_adler32(ADLER32_INIT_VALUE, reinterpret_cast<const char*>(data), len);
	        std::cout << " Adler32 details:" << std::endl;
	        std::cout << " Input length: " << len << std::endl;
	        std::cout << " Initial value: " << ADLER32_INIT_VALUE << std::endl;
	        std::cout << " Final checksum: " << std::hex << checksum << std::dec << std::endl;
	        return checksum;
	    }

	    std::string _filename;         // output path given to the constructor
	    std::ofstream _out_file;       // binary output stream, opened by init()
	    std::vector<uint8_t> _wrkmem;  // compressor work memory, sized by init()
	};

	int main(int argc, char** argv) {
	if (argc != 2) {
	std::cerr << "Usage: " << argv[0] << " <output_file>" << std::endl;
	return 1;
	}

	LzoWriter writer(argv[1]);
	if (!writer.init()) {
	return 1;
	}

	// Write a zero-sized block at the begin
	writer.write_zero_block();

	// Write first normal block with test data
	std::string test_data1 = "This is the first block of test data for LZO compression!\n";
	writer.write_normal_block(test_data1);

	// Write a zero-sized block in the middle
	writer.write_zero_block();

	// Write third normal block with different test data
	std::string test_data2 = "This is the third block with more test data for LZO compression!";
	writer.write_normal_block(test_data2);

	// Write a zero-sized block in the end
	writer.write_zero_block();

	// Write a zero-sized block in the end
	writer.write_zero_block();

	writer.close();
	std::cout << "Successfully created LZO file with three blocks (middle block size = 0)" << std::endl;

	return 0;
	}