blob: 1f1ec2c8b0706e02daf72d59ed0443c55fc0de9d [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <string>
#include "common/status.h"
namespace impala {
class MemPool;
class MemTracker;
class ScannerContext;
/// Class for encoding and decoding character buffers between different encodings and
/// UTF-8. Empolys the Boost.Locale library for encoding and decoding.
class CharCodec {
public:
static const int MAX_SYMBOL;
CharCodec(MemPool* memory_pool, const std::string& encoding, char tuple_delim = '\n',
bool reuse_buffer = false);
/// Decodes 'buffer' from 'encoding_' to UTF-8, handling partial symbols and delimiters.
///
/// The function processes the buffer in three parts:
/// 1. Prefix: attempts to complete partial_symbol_, stored from previous DecodeBuffer
/// call, by adding first bytes from buffer one by one.
/// 2. Core: Converts the main part of the buffer up to the last delimiter found.
/// 3. Suffix: in case buffer is split in the middle of a symbol, progressively
/// determines the incomplete part and stores it into partial_symbol_.
Status DecodeBuffer(uint8_t** buffer, int64_t* bytes_read, MemPool* pool, bool eosr,
bool decompress, ScannerContext* context);
/// Encodes 'str' from UTF-8 into a given 'encoding_'. Since
/// HdfsTextTableWriter::Flush(), currently being the only client of this function,
/// always flushes the buffer at the end of the row, we don't need to handle partial
/// symbols here.
Status EncodeBuffer(const std::string& str, std::string* result);
private:
Status HandlePrefix(uint8_t** buf_start, uint8_t* buf_end, std::string* result_prefix);
Status HandleCore(uint8_t** buf_start, uint8_t* buf_end, std::string* result_core);
Status HandleSuffix(uint8_t** buf_start, uint8_t* buf_end, std::string* result_suffix);
/// Pool to allocate the buffer to hold transformed data.
MemPool* memory_pool_ = nullptr;
/// Name of the encoding of the input / output data.
std::string encoding_;
/// The following members are only used by DecodeBuffer:
/// Delimiter used to separate tuples.
const char tuple_delim_;
/// Buffer to hold the partial symbol that could not be decoded in the previous call to
/// DecodeBuffer.
std::vector<uint8_t> partial_symbol_;
/// Can we reuse the output buffer or do we need to allocate on each call?
bool reuse_buffer_;
/// Buffer to hold transformed data.
uint8_t* out_buffer_ = nullptr;
/// Length of the output buffer.
int64_t buffer_length_ = 0;
};
}