| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #pragma once |
| |
| #include <cassert> |
| #include <cstdint> |
| #include <cstring> |
| #include <iosfwd> |
| #include <limits> |
| #include <string> |
| #include <utility> |
| #include <vector> |
| |
| #include "arrow/status.h" |
| #include "arrow/util/macros.h" |
| #include "arrow/util/string_view.h" |
| #include "arrow/util/visibility.h" |
| |
| namespace arrow { |
| namespace internal { |
| |
| // A non-zero-terminated small string class. |
| // std::string usually has a small string optimization |
| // (see review at https://shaharmike.com/cpp/std-string/) |
| // but this one allows tight control and optimization of memory layout. |
| template <uint8_t N> |
| class SmallString { |
| public: |
| SmallString() : length_(0) {} |
| |
| template <typename T> |
| SmallString(const T& v) { // NOLINT implicit constructor |
| *this = util::string_view(v); |
| } |
| |
| SmallString& operator=(const util::string_view s) { |
| #ifndef NDEBUG |
| CheckSize(s.size()); |
| #endif |
| length_ = static_cast<uint8_t>(s.size()); |
| std::memcpy(data_, s.data(), length_); |
| return *this; |
| } |
| |
| SmallString& operator=(const std::string& s) { |
| *this = util::string_view(s); |
| return *this; |
| } |
| |
| SmallString& operator=(const char* s) { |
| *this = util::string_view(s); |
| return *this; |
| } |
| |
| explicit operator util::string_view() const { |
| return util::string_view(data_, length_); |
| } |
| |
| const char* data() const { return data_; } |
| size_t length() const { return length_; } |
| bool empty() const { return length_ == 0; } |
| char operator[](size_t pos) const { |
| #ifdef NDEBUG |
| assert(pos <= length_); |
| #endif |
| return data_[pos]; |
| } |
| |
| SmallString substr(size_t pos) const { |
| return SmallString(util::string_view(*this).substr(pos)); |
| } |
| |
| SmallString substr(size_t pos, size_t count) const { |
| return SmallString(util::string_view(*this).substr(pos, count)); |
| } |
| |
| template <typename T> |
| bool operator==(T&& other) const { |
| return util::string_view(*this) == util::string_view(std::forward<T>(other)); |
| } |
| |
| template <typename T> |
| bool operator!=(T&& other) const { |
| return util::string_view(*this) != util::string_view(std::forward<T>(other)); |
| } |
| |
| protected: |
| uint8_t length_; |
| char data_[N]; |
| |
| void CheckSize(size_t n) { assert(n <= N); } |
| }; |
| |
| template <uint8_t N> |
| std::ostream& operator<<(std::ostream& os, const SmallString<N>& str) { |
| return os << util::string_view(str); |
| } |
| |
| // A trie class for byte strings, optimized for small sets of short strings. |
| // This class is immutable by design, use a TrieBuilder to construct it. |
| class ARROW_EXPORT Trie { |
| using index_type = int16_t; |
| using fast_index_type = int_fast16_t; |
| static constexpr auto kMaxIndex = std::numeric_limits<index_type>::max(); |
| |
| public: |
| Trie() : size_(0) {} |
| Trie(Trie&&) = default; |
| Trie& operator=(Trie&&) = default; |
| |
| int32_t Find(util::string_view s) const { |
| const Node* node = &nodes_[0]; |
| fast_index_type pos = 0; |
| if (s.length() > static_cast<size_t>(kMaxIndex)) { |
| return -1; |
| } |
| fast_index_type remaining = static_cast<fast_index_type>(s.length()); |
| |
| while (remaining > 0) { |
| auto substring_length = node->substring_length(); |
| if (substring_length > 0) { |
| auto substring_data = node->substring_data(); |
| if (remaining < substring_length) { |
| // Input too short |
| return -1; |
| } |
| for (fast_index_type i = 0; i < substring_length; ++i) { |
| if (s[pos++] != substring_data[i]) { |
| // Mismatching substring |
| return -1; |
| } |
| --remaining; |
| } |
| if (remaining == 0) { |
| // Matched node exactly |
| return node->found_index_; |
| } |
| } |
| // Lookup child using next input character |
| if (node->child_lookup_ == -1) { |
| // Input too long |
| return -1; |
| } |
| auto c = static_cast<uint8_t>(s[pos++]); |
| --remaining; |
| auto child_index = lookup_table_[node->child_lookup_ * 256 + c]; |
| if (child_index == -1) { |
| // Child not found |
| return -1; |
| } |
| node = &nodes_[child_index]; |
| } |
| |
| // Input exhausted |
| if (node->substring_.empty()) { |
| // Matched node exactly |
| return node->found_index_; |
| } else { |
| return -1; |
| } |
| } |
| |
| Status Validate() const; |
| |
| void Dump() const; |
| |
| protected: |
| static constexpr size_t kNodeSize = 16; |
| static constexpr auto kMaxSubstringLength = |
| kNodeSize - 2 * sizeof(index_type) - sizeof(int8_t); |
| |
| struct Node { |
| // If this node is a valid end of string, index of found string, otherwise -1 |
| index_type found_index_; |
| // Base index for child lookup in lookup_table_ (-1 if no child nodes) |
| index_type child_lookup_; |
| // The substring for this node. |
| SmallString<kMaxSubstringLength> substring_; |
| |
| fast_index_type substring_length() const { |
| return static_cast<fast_index_type>(substring_.length()); |
| } |
| const char* substring_data() const { return substring_.data(); } |
| }; |
| |
| static_assert(sizeof(Node) == kNodeSize, "Unexpected node size"); |
| |
| ARROW_DISALLOW_COPY_AND_ASSIGN(Trie); |
| |
| void Dump(const Node* node, const std::string& indent) const; |
| |
| // Node table: entry 0 is the root node |
| std::vector<Node> nodes_; |
| |
| // Indexed lookup structure: gives index in node table, or -1 if not found |
| std::vector<index_type> lookup_table_; |
| |
| // Number of entries |
| index_type size_; |
| |
| friend class TrieBuilder; |
| }; |
| |
| class ARROW_EXPORT TrieBuilder { |
| using index_type = Trie::index_type; |
| using fast_index_type = Trie::fast_index_type; |
| |
| public: |
| TrieBuilder(); |
| Status Append(util::string_view s, bool allow_duplicate = false); |
| Trie Finish(); |
| |
| protected: |
| // Extend the lookup table by 256 entries, return the index of the new span |
| Status ExtendLookupTable(index_type* out_lookup_index); |
| // Split the node given by the index at the substring index `split_at` |
| Status SplitNode(fast_index_type node_index, fast_index_type split_at); |
| // Append an already constructed child node to the parent |
| Status AppendChildNode(Trie::Node* parent, uint8_t ch, Trie::Node&& node); |
| // Create a matching child node from this parent |
| Status CreateChildNode(Trie::Node* parent, uint8_t ch, util::string_view substring); |
| Status CreateChildNode(Trie::Node* parent, char ch, util::string_view substring); |
| |
| Trie trie_; |
| |
| static constexpr auto kMaxIndex = std::numeric_limits<index_type>::max(); |
| }; |
| |
| } // namespace internal |
| } // namespace arrow |