| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #pragma once |
| |
| #include <cstdint> |
| #include <memory> |
| |
| #include "arrow/status.h" |
| #include "arrow/util/macros.h" |
| #include "arrow/util/string_view.h" |
| #include "arrow/util/visibility.h" |
| |
| namespace arrow { |
| |
| class Buffer; |
| |
| class ARROW_EXPORT BoundaryFinder { |
| public: |
| BoundaryFinder() = default; |
| |
| virtual ~BoundaryFinder(); |
| |
| /// \brief Find the position of the first delimiter inside block |
| /// |
| /// `partial` is taken to be the beginning of the block, and `block` |
| /// its continuation. Also, `partial` doesn't contain a delimiter. |
| /// |
| /// The returned `out_pos` is relative to `block`'s start and should point |
| /// to the first character after the first delimiter. |
| /// `out_pos` will be -1 if no delimiter is found. |
| virtual Status FindFirst(util::string_view partial, util::string_view block, |
| int64_t* out_pos) = 0; |
| |
| /// \brief Find the position of the last delimiter inside block |
| /// |
| /// The returned `out_pos` is relative to `block`'s start and should point |
| /// to the first character after the last delimiter. |
| /// `out_pos` will be -1 if no delimiter is found. |
| virtual Status FindLast(util::string_view block, int64_t* out_pos) = 0; |
| |
| static constexpr int64_t kNoDelimiterFound = -1; |
| |
| protected: |
| ARROW_DISALLOW_COPY_AND_ASSIGN(BoundaryFinder); |
| }; |
| |
| ARROW_EXPORT |
| std::shared_ptr<BoundaryFinder> MakeNewlineBoundaryFinder(); |
| |
| /// \brief A reusable block-based chunker for delimited data |
| /// |
| /// The chunker takes a block of delimited data and helps carve a sub-block |
| /// which begins and ends on delimiters (suitable for consumption by parsers |
| /// which can only parse whole objects). |
| class ARROW_EXPORT Chunker { |
| public: |
| explicit Chunker(std::shared_ptr<BoundaryFinder> delimiter); |
| ~Chunker(); |
| |
| /// \brief Carve up a chunk in a block of data to contain only whole objects |
| /// |
| /// Pre-conditions: |
| /// - `block` is the start of a valid block of delimited data |
| /// (i.e. starts just after a delimiter) |
| /// |
| /// Post-conditions: |
| /// - block == whole + partial |
| /// - `whole` is a valid block of delimited data |
| /// (i.e. starts just after a delimiter and ends with a delimiter) |
| /// - `partial` doesn't contain an entire delimited object |
| /// (IOW: `partial` is generally small) |
| /// |
| /// This method will look for the last delimiter in `block` and may |
| /// therefore be costly. |
| /// |
| /// \param[in] block data to be chunked |
| /// \param[out] whole subrange of block containing whole delimited objects |
| /// \param[out] partial subrange of block starting with a partial delimited object |
| Status Process(std::shared_ptr<Buffer> block, std::shared_ptr<Buffer>* whole, |
| std::shared_ptr<Buffer>* partial); |
| |
| /// \brief Carve the completion of a partial object out of a block |
| /// |
| /// Pre-conditions: |
| /// - `partial` is the start of a valid block of delimited data |
| /// (i.e. starts just after a delimiter) |
| /// - `block` follows `partial` in file order |
| /// |
| /// Post-conditions: |
| /// - block == completion + rest |
| /// - `partial + completion` is a valid block of delimited data |
| /// (i.e. starts just after a delimiter and ends with a delimiter) |
| /// - `completion` doesn't contain an entire delimited object |
| /// (IOW: `completion` is generally small) |
| /// |
| /// This method will look for the first delimiter in `block` and should |
| /// therefore be reasonably cheap. |
| /// |
| /// \param[in] partial incomplete delimited data |
| /// \param[in] block delimited data following partial |
| /// \param[out] completion subrange of block containing the completion of partial |
| /// \param[out] rest subrange of block containing what completion does not cover |
| Status ProcessWithPartial(std::shared_ptr<Buffer> partial, |
| std::shared_ptr<Buffer> block, |
| std::shared_ptr<Buffer>* completion, |
| std::shared_ptr<Buffer>* rest); |
| |
| /// \brief Like ProcessWithPartial, but for the last block of a file |
| /// |
| /// This method allows for a final delimited object without a trailing delimiter |
| /// (ProcessWithPartial would return an error in that case). |
| /// |
| /// Pre-conditions: |
| /// - `partial` is the start of a valid block of delimited data |
| /// - `block` follows `partial` in file order and is the last data block |
| /// |
| /// Post-conditions: |
| /// - block == completion + rest |
| /// - `partial + completion` is a valid block of delimited data |
| /// - `completion` doesn't contain an entire delimited object |
| /// (IOW: `completion` is generally small) |
| /// |
| Status ProcessFinal(std::shared_ptr<Buffer> partial, std::shared_ptr<Buffer> block, |
| std::shared_ptr<Buffer>* completion, std::shared_ptr<Buffer>* rest); |
| |
| protected: |
| ARROW_DISALLOW_COPY_AND_ASSIGN(Chunker); |
| |
| std::shared_ptr<BoundaryFinder> boundary_finder_; |
| }; |
| |
| } // namespace arrow |