// blob: 32ccf0a8ec14a75e89544187793edf8c1063b51f [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "benchmark/benchmark.h"
#include <string>
#include <type_traits>
#include <vector>
#include "arrow/testing/gtest_util.h"
#include "arrow/util/utf8.h"
namespace arrow {
namespace util {
// Input fixtures for the validation benchmarks below. All of them are passed
// to the benchmarks with an expected validation result of `true`.
// NOTE(review): the "non-ASCII" descriptions assume this source file is stored
// as UTF-8, so that accented and Japanese characters occupy several bytes.

// Very short, pure-ASCII word.
static const char* tiny_valid_ascii = "characters";
// Very short word containing one non-ASCII character.
static const char* tiny_valid_non_ascii = "caractères";
// One English sentence made only of ASCII characters.
static const char* valid_ascii =
"UTF-8 is a variable width character encoding capable of encoding all 1,112,064 "
"valid code points in Unicode using one to four 8-bit bytes";
// One French sentence: mostly ASCII, with a few accented letters and
// typographic punctuation marks mixed in.
static const char* valid_almost_ascii =
"UTF-8 est un codage de caractères informatiques conçu pour coder l’ensemble des "
"caractères du « répertoire universel de caractères codés »";
// One Japanese sentence: predominantly non-ASCII characters.
static const char* valid_non_ascii =
"UTF-8 はISO/IEC 10646 (UCS) "
"とUnicodeで使える8ビット符号単位の文字符号化形式及び文字符号化スキーム。 ";
// Build a string of at least `nbytes` bytes by concatenating whole copies of
// `base`.
//
// The result consists of ceil(nbytes / base.size()) copies of `base`, i.e. its
// length is the smallest multiple of base.size() that is >= nbytes. `base` is
// never truncated, so a multi-byte sequence inside it is never cut in half.
//
// \param base   the pattern to repeat
// \param nbytes the minimum desired length of the result, in bytes
// \return the repeated string; empty if `base` is empty or `nbytes` <= 0
static std::string MakeLargeString(const std::string& base, int64_t nbytes) {
  std::string s;
  // Nothing to repeat, or nothing requested: avoid dividing by zero and avoid
  // a negative size wrapping around to a huge repeat count.
  if (base.empty() || nbytes <= 0) {
    return s;
  }
  const auto base_size = static_cast<int64_t>(base.size());
  // Ceiling division, written so that it cannot overflow for large `nbytes`.
  const int64_t nrepeats = (nbytes - 1) / base_size + 1;
  // Reserve exactly the final length (repeat count times the pattern length).
  s.reserve(static_cast<std::string::size_type>(nrepeats * base_size));
  for (int64_t i = 0; i < nrepeats; ++i) {
    s += base;
  }
  return s;
}
// Time ValidateUTF8() over the bytes of `s`.
//
// The input is validated once before the timed loop; if that result differs
// from `expected`, a message is printed to stderr and the process aborts.
// After the loop, the number of processed bytes (iterations times the length
// of `s`) is reported to the benchmark state.
static void BenchmarkUTF8Validation(
    benchmark::State& state,  // NOLINT non-const reference
    const std::string& s, bool expected) {
  const auto* bytes = reinterpret_cast<const uint8_t*>(s.data());
  const auto nbytes = static_cast<int64_t>(s.size());

  InitializeUTF8();

  // Untimed sanity check of the input against the caller's expectation.
  if (ValidateUTF8(bytes, nbytes) != expected) {
    std::cerr << "Unexpected validation result" << std::endl;
    std::abort();
  }

  while (state.KeepRunning()) {
    bool is_valid = ValidateUTF8(bytes, nbytes);
    // Keep the result observable so the call is not optimized away.
    benchmark::DoNotOptimize(is_valid);
  }
  state.SetBytesProcessed(state.iterations() * s.size());
}
// Time ValidateAscii() over the bytes of `s`.
//
// The input is validated once before the timed loop; if that result differs
// from `expected`, a message is printed to stderr and the process aborts.
// After the loop, the number of processed bytes (iterations times the length
// of `s`) is reported to the benchmark state.
static void BenchmarkASCIIValidation(
    benchmark::State& state,  // NOLINT non-const reference
    const std::string& s, bool expected) {
  const auto* bytes = reinterpret_cast<const uint8_t*>(s.data());
  const auto nbytes = static_cast<int64_t>(s.size());

  InitializeUTF8();

  // Untimed sanity check of the input against the caller's expectation.
  if (ValidateAscii(bytes, nbytes) != expected) {
    std::cerr << "Unexpected validation result" << std::endl;
    std::abort();
  }

  while (state.KeepRunning()) {
    bool is_ascii = ValidateAscii(bytes, nbytes);
    // Keep the result observable so the call is not optimized away.
    benchmark::DoNotOptimize(is_ascii);
  }
  state.SetBytesProcessed(state.iterations() * s.size());
}
// ASCII validation benchmark over the short `tiny_valid_ascii` word.
static void ValidateTinyAscii(benchmark::State& state) {  // NOLINT non-const reference
  const std::string input(tiny_valid_ascii);
  BenchmarkASCIIValidation(state, input, /*expected=*/true);
}
// UTF-8 validation benchmark over the short `tiny_valid_non_ascii` word.
static void ValidateTinyNonAscii(benchmark::State& state) {  // NOLINT non-const reference
  const std::string input(tiny_valid_non_ascii);
  BenchmarkUTF8Validation(state, input, /*expected=*/true);
}
// ASCII validation benchmark over the single-sentence `valid_ascii` text.
static void ValidateSmallAscii(benchmark::State& state) {  // NOLINT non-const reference
  const std::string input(valid_ascii);
  BenchmarkASCIIValidation(state, input, /*expected=*/true);
}
// UTF-8 validation benchmark over the single-sentence `valid_almost_ascii` text.
static void ValidateSmallAlmostAscii(
    benchmark::State& state) {  // NOLINT non-const reference
  const std::string input(valid_almost_ascii);
  BenchmarkUTF8Validation(state, input, /*expected=*/true);
}
// UTF-8 validation benchmark over the single-sentence `valid_non_ascii` text.
static void ValidateSmallNonAscii(
    benchmark::State& state) {  // NOLINT non-const reference
  const std::string input(valid_non_ascii);
  BenchmarkUTF8Validation(state, input, /*expected=*/true);
}
// ASCII validation benchmark over `valid_ascii` repeated by MakeLargeString()
// with a requested size of 100000 bytes.
static void ValidateLargeAscii(benchmark::State& state) {  // NOLINT non-const reference
  const std::string input = MakeLargeString(valid_ascii, 100000);
  BenchmarkASCIIValidation(state, input, /*expected=*/true);
}
// UTF-8 validation benchmark over `valid_almost_ascii` repeated by
// MakeLargeString() with a requested size of 100000 bytes.
static void ValidateLargeAlmostAscii(
    benchmark::State& state) {  // NOLINT non-const reference
  const std::string input = MakeLargeString(valid_almost_ascii, 100000);
  BenchmarkUTF8Validation(state, input, /*expected=*/true);
}
// UTF-8 validation benchmark over `valid_non_ascii` repeated by
// MakeLargeString() with a requested size of 100000 bytes.
static void ValidateLargeNonAscii(
    benchmark::State& state) {  // NOLINT non-const reference
  const std::string input = MakeLargeString(valid_non_ascii, 100000);
  BenchmarkUTF8Validation(state, input, /*expected=*/true);
}
// Register every benchmark function defined above with the benchmark
// framework, grouped by input size: tiny, small, then large.
BENCHMARK(ValidateTinyAscii);
BENCHMARK(ValidateTinyNonAscii);
BENCHMARK(ValidateSmallAscii);
BENCHMARK(ValidateSmallAlmostAscii);
BENCHMARK(ValidateSmallNonAscii);
BENCHMARK(ValidateLargeAscii);
BENCHMARK(ValidateLargeAlmostAscii);
BENCHMARK(ValidateLargeNonAscii);
} // namespace util
} // namespace arrow