blob: 97611e65bd6d1e3048d0d1f63f0819acf463953e [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "Dictionary.hh"
#include <google/protobuf/arena.h>
#include <memory>
#include <utility>
using google::protobuf::Arena;
namespace orc {
SortedStringDictionary::SortedStringDictionary()
: totalLength_(0), arena_(std::make_unique<Arena>()) {
#ifdef BUILD_SPARSEHASH
/// Need to set empty key otherwise dense_hash_map will not work correctly
keyToIndex_.set_empty_key(std::string_view{});
#endif
}
SortedStringDictionary::~SortedStringDictionary() = default;
// insert a new string into dictionary, return its insertion order
size_t SortedStringDictionary::insert(const char* str, size_t len) {
size_t index = keyToIndex_.size();
auto it = keyToIndex_.find(std::string_view{str, len});
if (it != keyToIndex_.end()) {
return it->second;
} else {
auto s = Arena::Create<std::string>(arena_.get(), str, len);
keyToIndex_.emplace(std::string_view{s->data(), s->length()}, index);
totalLength_ += len;
return index;
}
}
// reserve space for dictionary entries
void SortedStringDictionary::reserve(size_t size) {
keyToIndex_.reserve(size);
}
/**
* Write dictionary data & length to output buffer and
* reorder input index buffer from insertion order to dictionary order
*
* We require this function because string values are buffered by indexes
* in their insertion order. Until the entire dictionary is complete can
* we get their sorted indexes in the dictionary in that ORC specification
* demands dictionary should be ordered. Therefore this function transforms
* the indexes from insertion order to dictionary value order for final
* output.
*/
void SortedStringDictionary::flush(AppendOnlyBufferedStream* dataStream,
RleEncoder* lengthEncoder,
std::vector<int64_t>& idxBuffer) const {
std::vector<std::pair<std::string_view, size_t>> flatDict;
flatDict.reserve(keyToIndex_.size());
for (auto [key, index] : keyToIndex_) {
flatDict.emplace_back(key, index);
}
std::sort(flatDict.begin(), flatDict.end());
for (const auto& [entry, _] : flatDict) {
dataStream->write(entry.data(), entry.size());
lengthEncoder->write(static_cast<int64_t>(entry.size()));
}
std::vector<size_t> mapping(flatDict.size());
for (size_t i = 0; i < flatDict.size(); ++i) {
mapping[flatDict[i].second] = i;
}
// do the transformation
for (size_t i = 0; i != idxBuffer.size(); ++i) {
idxBuffer[i] = static_cast<int64_t>(mapping[static_cast<size_t>(idxBuffer[i])]);
}
}
// get dict entries in insertion order
std::vector<std::string_view> SortedStringDictionary::getEntriesInInsertionOrder() const {
std::vector<std::string_view> entries(keyToIndex_.size());
for (auto [key, index] : keyToIndex_) {
entries[index] = key;
}
return entries;
}
// return count of entries
size_t SortedStringDictionary::size() const {
return keyToIndex_.size();
}
// return capacity of dictionary
size_t SortedStringDictionary::capacity() const {
return keyToIndex_.bucket_count() * keyToIndex_.max_load_factor();
}
// return total length of strings in the dictioanry
uint64_t SortedStringDictionary::length() const {
return totalLength_;
}
void SortedStringDictionary::clear() {
totalLength_ = 0;
keyToIndex_.clear();
arena_->Reset();
}
} // namespace orc