| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #include "byte_stream_split.h" |
| |
| #include <glog/logging.h> |
| |
| #include <array> |
| #include <bit> // IWYU pragma: keep |
| #include <cstring> |
| #include <vector> |
| |
| namespace doris { |
| |
| inline void do_merge_streams(const uint8_t** src_streams, int width, int64_t nvalues, |
| uint8_t* dest) { |
| // Value empirically chosen to provide the best performance on the author's machine |
| constexpr int kBlockSize = 128; |
| |
| while (nvalues >= kBlockSize) { |
| for (int stream = 0; stream < width; ++stream) { |
| // Take kBlockSize bytes from the given stream and spread them |
| // to their logical places in destination. |
| const uint8_t* src = src_streams[stream]; |
| for (int i = 0; i < kBlockSize; i += 8) { |
| uint64_t v; |
| std::memcpy(&v, src + i, sizeof(v)); |
| if constexpr (std::endian::native == std::endian::little) { |
| dest[stream + i * width] = static_cast<uint8_t>(v); |
| dest[stream + (i + 1) * width] = static_cast<uint8_t>(v >> 8); |
| dest[stream + (i + 2) * width] = static_cast<uint8_t>(v >> 16); |
| dest[stream + (i + 3) * width] = static_cast<uint8_t>(v >> 24); |
| dest[stream + (i + 4) * width] = static_cast<uint8_t>(v >> 32); |
| dest[stream + (i + 5) * width] = static_cast<uint8_t>(v >> 40); |
| dest[stream + (i + 6) * width] = static_cast<uint8_t>(v >> 48); |
| dest[stream + (i + 7) * width] = static_cast<uint8_t>(v >> 56); |
| } else if constexpr (std::endian::native == std::endian::big) { |
| dest[stream + i * width] = static_cast<uint8_t>(v >> 56); |
| dest[stream + (i + 1) * width] = static_cast<uint8_t>(v >> 48); |
| dest[stream + (i + 2) * width] = static_cast<uint8_t>(v >> 40); |
| dest[stream + (i + 3) * width] = static_cast<uint8_t>(v >> 32); |
| dest[stream + (i + 4) * width] = static_cast<uint8_t>(v >> 24); |
| dest[stream + (i + 5) * width] = static_cast<uint8_t>(v >> 16); |
| dest[stream + (i + 6) * width] = static_cast<uint8_t>(v >> 8); |
| dest[stream + (i + 7) * width] = static_cast<uint8_t>(v); |
| } |
| } |
| src_streams[stream] += kBlockSize; |
| } |
| dest += width * kBlockSize; |
| nvalues -= kBlockSize; |
| } |
| |
| // Epilog |
| for (int stream = 0; stream < width; ++stream) { |
| const uint8_t* src = src_streams[stream]; |
| for (int64_t i = 0; i < nvalues; ++i) { |
| dest[stream + i * width] = src[i]; |
| } |
| } |
| } |
| |
| template <int kNumStreams> |
| void byte_stream_split_decode_scalar(const uint8_t* src, int width, int64_t offset, |
| int64_t num_values, int64_t stride, uint8_t* dest) { |
| DCHECK(width == kNumStreams); |
| std::array<const uint8_t*, kNumStreams> src_streams; |
| for (int stream = 0; stream < kNumStreams; ++stream) { |
| src_streams[stream] = &src[stream * stride + offset]; |
| } |
| do_merge_streams(src_streams.data(), kNumStreams, num_values, dest); |
| } |
| |
| inline void byte_stream_split_decode_scalar_dynamic(const uint8_t* src, int width, int64_t offset, |
| int64_t num_values, int64_t stride, |
| uint8_t* dest) { |
| std::vector<const uint8_t*> src_streams; |
| src_streams.resize(width); |
| for (int stream = 0; stream < width; ++stream) { |
| src_streams[stream] = &src[stream * stride + offset]; |
| } |
| do_merge_streams(src_streams.data(), width, num_values, dest); |
| } |
| |
| // TODO: optimize using simd: https://github.com/apache/arrow/pull/38529 |
| void byte_stream_split_decode(const uint8_t* src, int width, int64_t offset, int64_t num_values, |
| int64_t stride, uint8_t* dest) { |
| switch (width) { |
| case 1: |
| memcpy(dest, src + offset * width, num_values); |
| return; |
| case 2: |
| return byte_stream_split_decode_scalar<2>(src, width, offset, num_values, stride, dest); |
| case 4: |
| return byte_stream_split_decode_scalar<4>(src, width, offset, num_values, stride, dest); |
| case 8: |
| return byte_stream_split_decode_scalar<8>(src, width, offset, num_values, stride, dest); |
| case 16: |
| return byte_stream_split_decode_scalar<16>(src, width, offset, num_values, stride, dest); |
| } |
| return byte_stream_split_decode_scalar_dynamic(src, width, offset, num_values, stride, dest); |
| } |
| |
| } // namespace doris |