blob: 2342db323fc71ac48c4796fb97f10af482f4a8db [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
#include "string_util.h"
#include <fmt/format.h>
#include <regex>
#include <string>
#include "parse_util.h"
namespace util {
std::string Float2String(double d) {
if (std::isinf(d)) {
return d > 0 ? "inf" : "-inf";
}
return fmt::format("{:.17g}", d);
}
std::string ToLower(std::string in) {
std::transform(in.begin(), in.end(), in.begin(), [](char c) -> char { return static_cast<char>(std::tolower(c)); });
return in;
}
std::string ToUpper(std::string in) {
std::transform(in.begin(), in.end(), in.begin(), [](char c) -> char { return static_cast<char>(std::toupper(c)); });
return in;
}
bool EqualICase(std::string_view lhs, std::string_view rhs) {
return lhs.size() == rhs.size() && std::equal(lhs.begin(), lhs.end(), rhs.begin(),
[](char l, char r) { return std::tolower(l) == std::tolower(r); });
}
std::string Trim(std::string in, std::string_view chars) {
if (in.empty()) return in;
in.erase(0, in.find_first_not_of(chars));
in.erase(in.find_last_not_of(chars) + 1);
return in;
}
std::vector<std::string> Split(std::string_view in, std::string_view delim) {
std::vector<std::string> out;
if (in.empty()) {
return out;
}
if (delim.empty()) {
out.resize(in.size());
std::transform(in.begin(), in.end(), out.begin(), [](char c) -> std::string { return {c}; });
return out;
}
size_t begin = 0, end = in.find_first_of(delim);
do {
std::string_view elem = in.substr(begin, end - begin);
if (!elem.empty()) out.emplace_back(elem.begin(), elem.end());
if (end == std::string::npos) break;
begin = end + 1;
end = in.find_first_of(delim, begin);
} while (true);
return out;
}
std::vector<std::string> Split2KV(const std::string &in, const std::string &delim) {
std::vector<std::string> out;
std::string::size_type pos = in.find_first_of(delim);
if (pos != std::string::npos) {
std::string key = in.substr(0, pos);
if (!key.empty()) out.push_back(std::move(key));
std::string value = Trim(in.substr(pos + 1), delim);
if (!value.empty()) out.push_back(std::move(value));
}
return out;
}
bool StartsWith(std::string_view str, std::string_view prefix) {
return str.size() >= prefix.size() && str.compare(0, prefix.size(), prefix) == 0;
}
bool StartsWithICase(std::string_view str, std::string_view prefix) {
return str.size() >= prefix.size() && EqualICase(str.substr(0, prefix.size()), prefix);
}
bool EndsWith(std::string_view str, std::string_view suffix) {
return str.size() >= suffix.size() && str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
}
bool EndsWithICase(std::string_view str, std::string_view suffix) {
return str.size() >= suffix.size() && EqualICase(str.substr(str.size() - suffix.size()), suffix);
}
Status ValidateGlob(std::string_view glob) {
for (size_t idx = 0; idx < glob.size(); ++idx) {
switch (glob[idx]) {
case '*':
case '?':
break;
case ']':
return {Status::NotOK, "Unmatched unescaped ]"};
case '\\':
if (idx == glob.size() - 1) {
return {Status::NotOK, "Trailing unescaped backslash"};
}
// Skip the next character: this is a literal so nothing can go wrong
idx++;
break;
case '[':
idx++; // Skip the opening bracket
while (idx < glob.size() && glob[idx] != ']') {
if (glob[idx] == '\\') {
idx += 2;
continue;
} else if (idx + 1 < glob.size() && glob[idx + 1] == '-') {
if (idx + 2 >= glob.size()) {
return {Status::NotOK, "Unterminated character range"};
}
// Skip the - and the end of the range
idx += 2;
}
idx++;
}
if (idx == glob.size()) {
return {Status::NotOK, "Unterminated [ group"};
}
break;
default:
// This is a literal: nothing can go wrong
break;
}
}
return Status::OK();
}
constexpr bool StringMatchImpl(std::string_view pattern, std::string_view string, bool ignore_case,
bool *skip_longer_matches, size_t recursion_depth = 0) {
// If we want to ignore case, this is equivalent to converting both the pattern and the string to lowercase
const auto canonicalize = [ignore_case](unsigned char c) -> unsigned char {
return ignore_case ? static_cast<unsigned char>(std::tolower(c)) : c;
};
if (recursion_depth > 1000) return false;
while (!pattern.empty() && !string.empty()) {
switch (pattern[0]) {
case '*':
// Optimization: collapse multiple * into one
while (pattern.size() >= 2 && pattern[1] == '*') {
pattern.remove_prefix(1);
}
// Optimization: If the '*' is the last character in the pattern, it can match anything
if (pattern.length() == 1) return true;
while (!string.empty()) {
if (StringMatchImpl(pattern.substr(1), string, ignore_case, skip_longer_matches, recursion_depth + 1))
return true;
if (*skip_longer_matches) return false;
string.remove_prefix(1);
}
// There was no match for the rest of the pattern starting
// from anywhere in the rest of the string. If there were
// any '*' earlier in the pattern, we can terminate the
// search early without trying to match them to longer
// substrings. This is because a longer match for the
// earlier part of the pattern would require the rest of the
// pattern to match starting later in the string, and we
// have just determined that there is no match for the rest
// of the pattern starting from anywhere in the current
// string.
*skip_longer_matches = true;
return false;
case '?':
if (string.empty()) return false;
string.remove_prefix(1);
break;
case '[': {
pattern.remove_prefix(1);
const bool invert = pattern[0] == '^';
if (invert) pattern.remove_prefix(1);
bool match = false;
while (true) {
if (pattern.empty()) {
// unterminated [ group: reject invalid pattern
return false;
} else if (pattern[0] == ']') {
break;
} else if (pattern.length() >= 2 && pattern[0] == '\\') {
pattern.remove_prefix(1);
if (pattern[0] == string[0]) match = true;
} else if (pattern.length() >= 3 && pattern[1] == '-') {
unsigned char start = canonicalize(pattern[0]);
unsigned char end = canonicalize(pattern[2]);
if (start > end) std::swap(start, end);
const int c = canonicalize(string[0]);
pattern.remove_prefix(2);
if (c >= start && c <= end) match = true;
} else if (canonicalize(pattern[0]) == canonicalize(string[0])) {
match = true;
}
pattern.remove_prefix(1);
}
if (invert) match = !match;
if (!match) return false;
string.remove_prefix(1);
break;
}
case '\\':
if (pattern.length() >= 2) {
pattern.remove_prefix(1);
}
[[fallthrough]];
default:
// Just a normal character
if (!ignore_case) {
if (pattern[0] != string[0]) return false;
} else {
if (std::tolower((int)pattern[0]) != std::tolower((int)string[0])) return false;
}
string.remove_prefix(1);
break;
}
pattern.remove_prefix(1);
}
// Now that either the pattern is empty or the string is empty, this is a match iff
// the pattern consists only of '*', and the string is empty.
return string.empty() && std::all_of(pattern.begin(), pattern.end(), [](char c) { return c == '*'; });
}
// Given a glob [pattern] and a string [string], return true iff the string matches the glob.
// If [ignore_case] is true, the match is case-insensitive.
bool StringMatch(std::string_view glob, std::string_view str, bool ignore_case) {
bool skip_longer_matches = false;
return StringMatchImpl(glob, str, ignore_case, &skip_longer_matches);
}
// Split a glob pattern into a literal prefix and a suffix containing wildcards.
// For example, if the user calls [KEYS bla*bla], this function will return {"bla", "*bla"}.
// This allows the caller of this function to optimize this call by performing a
// prefix-scan on "bla" and then filtering the results using the GlobMatches function.
std::pair<std::string, std::string> SplitGlob(std::string_view glob) {
// Stores the prefix of the glob pattern, with backslashes removed
std::string prefix;
// Find the first un-escaped '*', '?' or '[' character in [glob]
for (size_t idx = 0; idx < glob.size(); ++idx) {
if (glob[idx] == '*' || glob[idx] == '?' || glob[idx] == '[') {
// Return a pair of views: the part of the glob before the wildcard, and the part after
return {prefix, std::string(glob.substr(idx))};
} else if (glob[idx] == '\\') {
// Skip checking whether the next character is a special character
++idx;
// Append the escaped special character to the prefix
if (idx < glob.size()) prefix.push_back(glob[idx]);
} else {
prefix.push_back(glob[idx]);
}
}
// No wildcard found, return the entire string (without the backslashes) as the prefix
return {prefix, ""};
}
static int HexDigitToInt(const char c) {
if (c >= '0' && c <= '9') {
return c - '0';
} else if (c >= 'a' && c <= 'f') {
return c - 'a' + 10;
} else if (c >= 'A' && c <= 'F') {
return c - 'A' + 10;
}
return 0;
}
StatusOr<std::vector<std::string>> SplitArguments(std::string_view in) {
std::vector<std::string> arguments;
std::string current_string;
enum State { NORMAL, DOUBLE_QUOTED, SINGLE_QUOTED, ESCAPE } state = NORMAL;
bool done = false;
for (size_t i = 0; i < in.size() && !done; i++) {
const auto c = in[i];
switch (state) {
case NORMAL:
if (std::isspace(c)) {
if (!current_string.empty()) {
arguments.emplace_back(std::move(current_string));
current_string.clear();
}
} else if (c == '\r' || c == '\n' || c == '\t') {
done = true;
} else if (c == '"') {
state = DOUBLE_QUOTED;
} else if (c == '\'') {
state = SINGLE_QUOTED;
} else {
current_string.push_back(c);
}
break;
case SINGLE_QUOTED:
if (c == '\\' && (i + 1) < in.size() && in[i + 1] == '\'') {
current_string.push_back('\'');
i++;
} else if (c == '\'') {
//
if (i + 1 < in.size() && !std::isspace(in[i + 1])) {
return {Status::NotOK, "the closed single quote must be followed by a space"};
}
state = NORMAL;
} else {
current_string.push_back(c);
}
break;
case DOUBLE_QUOTED:
if (c == '\\') {
state = ESCAPE;
} else if (c == '"') {
if (i + 1 < in.size() && !std::isspace(in[i + 1])) {
return {Status::NotOK, "the closed double quote must be followed by a space"};
}
state = NORMAL;
} else {
current_string.push_back(c);
}
break;
case ESCAPE:
// It's the hex digit after the \x
if (c == 'x' && (i + 2) < in.size() && std::isxdigit(in[i + 1]) && std::isxdigit(in[i + 2])) {
// Convert the hex digit to a char
auto hex_byte = static_cast<char>(HexDigitToInt(in[i + 1]) * 16 | HexDigitToInt(in[i + 2]));
current_string.push_back(hex_byte);
i += 2;
} else if (c == '"' || c == '\'' || c == '\\') {
current_string.push_back(c);
} else if (c == 'n') {
current_string.push_back('\n');
} else if (c == 'r') {
current_string.push_back('\r');
} else if (c == 't') {
current_string.push_back('\t');
} else if (c == 'b') {
current_string.push_back('\b');
} else if (c == 'a') {
current_string.push_back('\a');
} else {
current_string.push_back(c);
}
state = DOUBLE_QUOTED;
break;
}
}
if (state == DOUBLE_QUOTED || state == SINGLE_QUOTED) {
return {Status::NotOK, "unclosed quote string"};
}
if (state == ESCAPE) {
return {Status::NotOK, "unexpected trailing escape character"};
}
if (!current_string.empty()) {
arguments.emplace_back(std::move(current_string));
}
return arguments;
}
std::vector<std::string> RegexMatch(const std::string &str, const std::string &regex) {
std::regex base_regex(regex);
std::smatch pieces_match;
std::vector<std::string> out;
if (std::regex_match(str, pieces_match, base_regex)) {
for (const auto &piece : pieces_match) {
out.emplace_back(piece.str());
}
}
return out;
}
std::string StringToHex(std::string_view input) {
static const char hex_digits[] = "0123456789ABCDEF";
std::string output;
output.reserve(input.length() * 2);
for (unsigned char c : input) {
output.push_back(hex_digits[c >> 4]);
output.push_back(hex_digits[c & 15]);
}
return output;
}
constexpr unsigned long long ExpTo1024(unsigned n) { return 1ULL << (n * 10); }
std::string BytesToHuman(uint64_t n) {
if (n < ExpTo1024(1)) {
return fmt::format("{}B", n);
} else if (n < ExpTo1024(2)) {
return fmt::format("{:.2f}K", static_cast<double>(n) / ExpTo1024(1));
} else if (n < ExpTo1024(3)) {
return fmt::format("{:.2f}M", static_cast<double>(n) / ExpTo1024(2));
} else if (n < ExpTo1024(4)) {
return fmt::format("{:.2f}G", static_cast<double>(n) / ExpTo1024(3));
} else if (n < ExpTo1024(5)) {
return fmt::format("{:.2f}T", static_cast<double>(n) / ExpTo1024(4));
} else if (n < ExpTo1024(6)) {
return fmt::format("{:.2f}P", static_cast<double>(n) / ExpTo1024(5));
} else {
return fmt::format("{}B", n);
}
}
std::vector<std::string> TokenizeRedisProtocol(const std::string &value) {
std::vector<std::string> tokens;
if (value.empty()) {
return tokens;
}
enum ParserState { stateArrayLen, stateBulkLen, stateBulkData };
uint64_t array_len = 0, bulk_len = 0;
int state = stateArrayLen;
const char *start = value.data(), *end = start + value.size(), *p = nullptr;
while (start != end) {
switch (state) {
case stateArrayLen: {
if (start[0] != '*') {
return tokens;
}
p = strchr(start, '\r');
if (!p || (p == end) || p[1] != '\n') {
tokens.clear();
return tokens;
}
// parse_result expects to must be parsed successfully here
array_len = *ParseInt<uint64_t>(std::string(start + 1, p), 10);
start = p + 2;
state = stateBulkLen;
break;
}
case stateBulkLen: {
if (start[0] != '$') {
return tokens;
}
p = strchr(start, '\r');
if (!p || (p == end) || p[1] != '\n') {
tokens.clear();
return tokens;
}
// parse_result expects to must be parsed successfully here
bulk_len = *ParseInt<uint64_t>(std::string(start + 1, p), 10);
start = p + 2;
state = stateBulkData;
break;
}
case stateBulkData: {
if (bulk_len + 2 > static_cast<uint64_t>(end - start)) {
tokens.clear();
return tokens;
}
tokens.emplace_back(start, start + bulk_len);
start += bulk_len + 2;
state = stateBulkLen;
break;
}
}
}
if (array_len != tokens.size()) {
tokens.clear();
}
return tokens;
}
/* escape string where all the non-printable characters
* (tested with isprint()) are turned into escapes in
* the form "\n\r\a...." or "\x<hex-number>". */
std::string EscapeString(std::string_view s) {
std::string str;
str.reserve(s.size());
for (auto ch : s) {
switch (ch) {
case '\\':
case '"':
str += "\\";
str += ch;
break;
case '\n':
str += "\\n";
break;
case '\r':
str += "\\r";
break;
case '\t':
str += "\\t";
break;
case '\a':
str += "\\a";
break;
case '\b':
str += "\\b";
break;
case '\v':
str += "\\v";
break;
case '\f':
str += "\\f";
break;
default:
if (isprint(ch)) {
str += ch;
} else {
str += fmt::format("\\x{:02x}", (unsigned char)ch);
}
}
}
return str;
}
std::string StringNext(std::string s) {
for (auto iter = s.rbegin(); iter != s.rend(); ++iter) {
if (*iter != char(0xff)) {
(*iter)++;
break;
}
}
return s;
}
} // namespace util