blob: 54f558acfb7a4188a096f8199ffe9e37fbbec66d [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "byte_stream_split.h"
#include <glog/logging.h>
#include <array>
#include <bit> // IWYU pragma: keep
#include <cstring>
#include <vector>
namespace doris {
// Interleaves `width` byte streams back into `nvalues` fixed-width values.
//
// Byte `s` of output value `v` is read from `src_streams[s][v]` and stored at
// `dest[v * width + s]`. `dest` therefore receives `nvalues * width` bytes and
// `nvalues` bytes are read from every stream.
//
// Side effect: every pointer in `src_streams` is advanced by kBlockSize for
// each full block that was processed. The trailing partial block is read
// without moving the pointers any further.
inline void do_merge_streams(const uint8_t** src_streams, int width, int64_t nvalues,
                             uint8_t* dest) {
    // Value empirically chosen to provide the best performance on the author's machine
    constexpr int kBlockSize = 128;
    // Number of bytes fetched from a stream at once inside a block.
    constexpr int kChunkSize = 8;
    static_assert(kBlockSize % kChunkSize == 0,
                  "a block must consist of a whole number of chunks");

    while (nvalues >= kBlockSize) {
        for (int stream = 0; stream < width; ++stream) {
            // Take kBlockSize bytes from the given stream and spread them
            // to their logical places in destination.
            const uint8_t* src = src_streams[stream];
            for (int i = 0; i < kBlockSize; i += kChunkSize) {
                // Snapshot the whole chunk before the first store to `dest`, so
                // the eight bytes can be fetched with one load. Keeping them as
                // a byte array (instead of shifting a 64-bit integer) preserves
                // memory order on any host byte order, so no endian-specific
                // branches are needed.
                uint8_t chunk[kChunkSize];
                std::memcpy(chunk, src + i, sizeof(chunk));

                // Byte k of the chunk belongs to value (i + k); consecutive
                // values are `width` bytes apart in `dest`. The stores are
                // written out with constant indices on purpose.
                uint8_t* out = dest + stream + i * width;
                out[0] = chunk[0];
                out[width] = chunk[1];
                out[2 * width] = chunk[2];
                out[3 * width] = chunk[3];
                out[4 * width] = chunk[4];
                out[5 * width] = chunk[5];
                out[6 * width] = chunk[6];
                out[7 * width] = chunk[7];
            }
            src_streams[stream] += kBlockSize;
        }
        dest += width * kBlockSize;
        nvalues -= kBlockSize;
    }

    // Epilog: fewer than kBlockSize values remain, copy them byte by byte.
    for (int stream = 0; stream < width; ++stream) {
        const uint8_t* src = src_streams[stream];
        for (int64_t i = 0; i < nvalues; ++i) {
            dest[stream + i * width] = src[i];
        }
    }
}
// Scalar byte-stream-split decode for a stream count fixed at compile time.
//
// Stream `s` is taken to start at `src + s * stride + offset`. The kNumStreams
// start pointers are collected in a stack array and handed to
// do_merge_streams() together with `num_values` and `dest`. `width` is only
// cross-checked against kNumStreams through a DCHECK.
template <int kNumStreams>
void byte_stream_split_decode_scalar(const uint8_t* src, int width, int64_t offset,
                                     int64_t num_values, int64_t stride, uint8_t* dest) {
    DCHECK(width == kNumStreams);

    std::array<const uint8_t*, kNumStreams> stream_heads {};
    int stream = 0;
    for (const uint8_t*& head : stream_heads) {
        head = src + (stream * stride + offset);
        ++stream;
    }

    do_merge_streams(stream_heads.data(), kNumStreams, num_values, dest);
}
// Scalar byte-stream-split decode for a stream count known only at run time.
//
// Builds a heap-allocated table of `width` stream start pointers, where stream
// `s` starts at `src + s * stride + offset`, and passes it to
// do_merge_streams() together with `num_values` and `dest`.
inline void byte_stream_split_decode_scalar_dynamic(const uint8_t* src, int width, int64_t offset,
                                                    int64_t num_values, int64_t stride,
                                                    uint8_t* dest) {
    std::vector<const uint8_t*> stream_heads(width);
    int stream = 0;
    for (const uint8_t*& head : stream_heads) {
        head = src + (stream * stride + offset);
        ++stream;
    }

    do_merge_streams(stream_heads.data(), width, num_values, dest);
}
// TODO: optimize using simd: https://github.com/apache/arrow/pull/38529
//
// Entry point of the byte-stream-split decoder: dispatches on `width`.
//   - width == 1: there is a single stream, so `num_values` bytes are copied
//     straight from `src + offset` to `dest` (`stride` is not used).
//   - width == 2, 4, 8, 16: handled by the matching
//     byte_stream_split_decode_scalar<N> instantiation.
//   - any other width: handled by byte_stream_split_decode_scalar_dynamic.
void byte_stream_split_decode(const uint8_t* src, int width, int64_t offset, int64_t num_values,
                              int64_t stride, uint8_t* dest) {
    if (width == 1) {
        std::memcpy(dest, src + offset, num_values);
        return;
    }

    switch (width) {
    case 2:
        byte_stream_split_decode_scalar<2>(src, width, offset, num_values, stride, dest);
        break;
    case 4:
        byte_stream_split_decode_scalar<4>(src, width, offset, num_values, stride, dest);
        break;
    case 8:
        byte_stream_split_decode_scalar<8>(src, width, offset, num_values, stride, dest);
        break;
    case 16:
        byte_stream_split_decode_scalar<16>(src, width, offset, num_values, stride, dest);
        break;
    default:
        byte_stream_split_decode_scalar_dynamic(src, width, offset, num_values, stride, dest);
        break;
    }
}
} // namespace doris