// blob: 6102adf4ba4c42f945b1b25657297a0da9c2d811
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "kudu/util/char_util.h"
#ifdef __aarch64__
#include "kudu/util/sse2neon.h"
#else
#include <emmintrin.h>
#include <smmintrin.h>
#endif //__aarch64__
#include <algorithm>
#include <cstring>
namespace kudu {
// Returns a copy of the longest prefix of 'val' that contains at most
// 'max_utf8_length' UTF-8 characters.
//
// A character is counted at every byte that is not a UTF-8 continuation byte
// (i.e. whose top two bits are not '10'). Continuation bytes that follow the
// last counted byte are therefore kept, so for well-formed input a multi-byte
// character is not cut in half. The input is not validated as UTF-8.
//
// The returned Slice refers to a buffer allocated here with 'new uint8_t[]'.
// Nothing in this function releases that buffer.
// NOTE(review): presumably the caller takes ownership and must 'delete[]' it;
// confirm against the call sites.
Slice UTF8Truncate(Slice val, size_t max_utf8_length) {
  const uint8_t* const begin = val.data();
  const size_t size = val.size();

  const uint8_t* cur = begin;
  size_t num_bytes = 0;
  size_t num_utf8_chars = 0;

  // Masks selecting the high bit of every byte; a chunk is pure ASCII iff none
  // of those bits is set.
  const __m128i high_bits_16 = _mm_set1_epi32(0x80808080);
  constexpr uint64_t kHighBits8 = 0x8080808080808080ULL;

  while (num_bytes < size) {
    // Invariant: num_utf8_chars <= max_utf8_length at the top of every
    // iteration, so the subtraction below cannot wrap around.
    const size_t bytes_left = size - num_bytes;
    const size_t chars_left = max_utf8_length - num_utf8_chars;

    // Fast path: 16 ASCII bytes are exactly 16 characters.
    if (bytes_left >= 16 && chars_left >= 16 &&
        _mm_test_all_zeros(
            _mm_loadu_si128(reinterpret_cast<const __m128i*>(cur)),
            high_bits_16) == 1) {
      num_utf8_chars += 16;
      num_bytes += 16;
      cur += 16;
      continue;
    }

    // Fast path: 8 ASCII bytes are exactly 8 characters.
    if (bytes_left >= 8 && chars_left >= 8) {
      // 'cur' has no alignment guarantee and points at uint8_t data, so the
      // word is loaded with memcpy instead of dereferencing a casted pointer
      // (which would be a misaligned, aliasing-violating read).
      uint64_t word;
      memcpy(&word, cur, sizeof(word));
      if ((word & kHighBits8) == 0) {
        num_utf8_chars += 8;
        num_bytes += 8;
        cur += 8;
        continue;
      }
    }

    // Slow path: examine a single byte. Only bytes that are not continuation
    // bytes (10xxxxxx) start a new character.
    if ((*cur++ & 0xc0) != 0x80) {
      ++num_utf8_chars;
    }
    ++num_bytes;
    if (num_utf8_chars > max_utf8_length) {
      // The byte just consumed starts the first character beyond the limit;
      // exclude it and stop.
      --num_bytes;
      --num_utf8_chars;
      break;
    }
  }

  // Defensive clamp; the loop never advances past 'size'.
  num_bytes = std::min<size_t>(size, num_bytes);

  uint8_t* relocated = new uint8_t[num_bytes];
  // Skip the copy for an empty result so memcpy is never handed a source
  // pointer that might be null.
  if (num_bytes > 0) {
    memcpy(relocated, begin, num_bytes);
  }
  return Slice(relocated, num_bytes);
}
} // namespace kudu