blob: c06de5366326197a7386cc89283a3e7677a57674 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/formatIPv6.h
// and modified by Doris
#pragma once
#include <vec/common/hex.h>
#include <vec/common/string_utils/string_utils.h>
#include <vec/core/types.h>
#include <algorithm>
#include <array>
#include <bit>
#include <cstdint>
#include <cstring>
#include <utility>
/// Size in bytes of a binary IPv4 address (one 32-bit value).
constexpr size_t IPV4_BINARY_LENGTH = sizeof(uint32_t);
/// Longest IPv4 text: four 3-digit octets plus three dots. Does not count tail zero byte.
constexpr size_t IPV4_MAX_TEXT_LENGTH = 4 * 3 + 3;
/// Longest IPv6 text: eight 4-digit hex groups plus seven colons.
constexpr size_t IPV6_MAX_TEXT_LENGTH = 8 * 4 + 7;
/// Numeric value of '0.0.0.0'.
constexpr size_t IPV4_MIN_NUM_VALUE = 0;
/// Numeric value of '255.255.255.255'.
constexpr size_t IPV4_MAX_NUM_VALUE = UINT32_MAX;
/// Largest value a single IPv4 octet may hold.
constexpr int IPV4_MAX_OCTET_VALUE = 0xFF;
/// Number of bits in one IPv4 octet.
constexpr size_t IPV4_OCTET_BITS = 8;
/// Radix used when parsing the decimal octets of an IPv4 address.
constexpr size_t DECIMAL_BASE = 10;
/// Size in bytes of a binary IPv6 address (128 bits).
constexpr size_t IPV6_BINARY_LENGTH = 128 / IPV4_OCTET_BITS;
namespace doris::vectorized {
#include "common/compile_check_begin.h"
/// Lookup table with one entry per byte value (256 entries); only declared here, the
/// definition lives outside this header. format_ipv4() indexes it with an octet value and
/// copies `.second` bytes starting at `.first` as that octet's text.
extern const std::array<std::pair<const char*, size_t>, 256> one_byte_to_string_lookup_table;
/** Format a 4-byte binary sequence as IPv4 text: 'aaa.bbb.ccc.ddd'.
 * The input is expected to hold the address as a host-endian integer,
 * that is 0x7f000001 => "127.0.0.1".
 *
 * Any number of the tail octets can be replaced with the given mask string.
 * When src_size is smaller than 4, the missing leading octets are printed as "0".
 *
 * Assumptions:
 *     src is src_size bytes long (src_size <= IPV4_BINARY_LENGTH),
 *     dst has room for the produced text (IPV4_MAX_TEXT_LENGTH when the mask is at most 3 chars),
 *     mask_tail_octets <= IPV4_BINARY_LENGTH,
 *     mask_string is NON-NULL, if mask_tail_octets > 0.
 *
 * No terminating zero byte is written; on return dst points right after the last
 * character produced.
 *
 * Examples:
 *     format_ipv4(&0x7f000001, dst, mask_tail_octets = 0, nullptr);
 *         > dst == "127.0.0.1"
 *     format_ipv4(&0x7f000001, dst, mask_tail_octets = 1, "xxx");
 *         > dst == "127.0.0.xxx"
 *     format_ipv4(&0x7f000001, dst, mask_tail_octets = 1, "0");
 *         > dst == "127.0.0.0"
 */
inline void format_ipv4(const unsigned char* src, size_t src_size, char*& dst,
                        uint8_t mask_tail_octets = 0, const char* mask_string = "xxx") {
    /// Appends `count` bytes of `text` to dst, followed by the '.' separator.
    const auto append_with_dot = [&dst](const char* text, size_t count) {
        memcpy(dst, text, count);
        dst += count;
        *dst++ = '.';
    };

    const size_t mask_text_size = (mask_string != nullptr) ? strlen(mask_string) : 0;
    /// Index of the first octet that is actually present in src.
    const size_t first_present_octet = 4 - src_size;
    /// Index of the first octet that is replaced by the mask.
    const size_t first_masked_octet =
            std::min(IPV4_BINARY_LENGTH, IPV4_BINARY_LENGTH - mask_tail_octets);
    const size_t zero_filled_octets = std::min(first_present_octet, first_masked_octet);

    /// Octets missing from a short src are rendered as zeroes.
    for (size_t i = 0; i != zero_filled_octets; ++i) {
        *dst++ = '0';
        *dst++ = '.';
    }

    /// Octets taken from src, most significant first.
    for (size_t octet = first_present_octet; octet < first_masked_octet; ++octet) {
        size_t src_index = octet;
        if constexpr (std::endian::native == std::endian::little) {
            /// On little-endian hosts the most significant octet is stored last.
            src_index = IPV4_BINARY_LENGTH - 1 - octet;
        }
        const auto octet_value = static_cast<uint8_t>(src[src_index]);
        const auto& [text, text_size] = one_byte_to_string_lookup_table[octet_value];
        append_with_dot(text, static_cast<uint8_t>(text_size));
    }

    /// Masked tail octets.
    for (size_t i = 0; i < mask_tail_octets; ++i) {
        append_with_dot(mask_string, mask_text_size);
    }

    /// Every part above ends with a '.', so drop the trailing one.
    --dst;
}
inline void format_ipv4(const unsigned char* src, char*& dst, uint8_t mask_tail_octets = 0,
const char* mask_string = "xxx") {
format_ipv4(src, 4, dst, mask_tail_octets, mask_string);
}
/** Unsafe (no bounds-checking for src nor dst), optimized version of parsing IPv4 string.
 *
 * Parses the input string `src` and stores binary host-endian value into buffer pointed by `dst`,
 * which should be long enough.
 * That is "127.0.0.1" becomes 0x7f000001.
 *
 * In case of failure doesn't modify buffer pointed by `dst`.
 * In case of success exactly IPV4_BINARY_LENGTH bytes of `dst` are written and `src` points
 * right after the last consumed digit.
 *
 * WARNING - this function is adapted to work with ReadBuffer, where src is the position reference (ReadBuffer::position())
 * and eof is the ReadBuffer::eof() - therefore algorithm below does not rely on buffer's continuity.
 * To parse strings use overloads below.
 *
 * @param src - iterator (reference to pointer) over input string - warning - continuity is not guaranteed.
 * @param eof - function returning true if iterator reached the end - warning - can break iterator's continuity.
 * @param dst - where to put output bytes, expected to be non-null and at least IPV4_BINARY_LENGTH-long.
 * @param first_octet - preparsed first octet, or a negative value if all four octets are still in `src`.
 * @return - true if parsed successfully, false otherwise.
 */
template <typename T, typename EOFfunction>
    requires(std::is_same<typename std::remove_cv<T>::type, char>::value)
inline bool parse_ipv4(T*& src, EOFfunction eof, unsigned char* dst, int64_t first_octet = -1) {
    if (src == nullptr || first_octet > IPV4_MAX_OCTET_VALUE) {
        return false;
    }
    int64_t result = 0;
    /// Bit position of the octet currently being parsed, starting with the most significant one.
    int offset = (IPV4_BINARY_LENGTH - 1) * IPV4_OCTET_BITS;
    if (first_octet >= 0) {
        result |= first_octet << offset;
        offset -= IPV4_OCTET_BITS;
    }
    /// The loop increment skips the '.' that the checks below found after each non-final octet.
    for (; true; offset -= IPV4_OCTET_BITS, ++src) {
        if (eof()) {
            return false;
        }
        int64_t value = 0;
        size_t len = 0;
        /// Consumes at most four digits; values above 255 are rejected right after the loop.
        while (is_numeric_ascii(*src) && len <= 3) {
            value = value * DECIMAL_BASE + (*src - '0');
            ++len;
            ++src;
            if (eof()) {
                break;
            }
        }
        /// An octet needs at least one digit, must fit into a byte and, unless it is the
        /// last one, must be followed by a '.'.
        if (len == 0 || value > IPV4_MAX_OCTET_VALUE || (offset > 0 && (eof() || *src != '.'))) {
            return false;
        }
        result |= value << offset;
        if (offset == 0) {
            break;
        }
    }
    /// Every octet is at most 255, so the accumulated value always fits into 32 bits.
    /// Write exactly IPV4_BINARY_LENGTH bytes: copying sizeof(int64_t) bytes would overrun
    /// 4-byte destinations, e.g. the last four bytes of the IPv6 buffer that parse_ipv6()
    /// passes for an embedded IPv4 address.
    const auto address = static_cast<uint32_t>(result);
    static_assert(sizeof(address) == IPV4_BINARY_LENGTH);
    memcpy(dst, &address, IPV4_BINARY_LENGTH);
    return true;
}
/// returns pointer to the right after parsed sequence or null on failed parsing
inline const char* parse_ipv4(const char* src, const char* end, unsigned char* dst) {
if (parse_ipv4(
src, [&src, end]() { return src == end; }, dst)) {
return src;
}
return nullptr;
}
/// returns true if whole buffer was parsed successfully
inline bool parse_ipv4_whole(const char* src, const char* end, unsigned char* dst) {
return parse_ipv4(src, end, dst) == end;
}
/// returns pointer to the right after parsed sequence or null on failed parsing
inline const char* parse_ipv4(const char* src, unsigned char* dst) {
if (parse_ipv4(
src, []() { return false; }, dst)) {
return src;
}
return nullptr;
}
/// returns true if whole null-terminated string was parsed successfully
inline bool parse_ipv4_whole(const char* src, unsigned char* dst) {
const char* end = parse_ipv4(src, dst);
return end != nullptr && *end == '\0';
}
/// Integer logarithm: returns ceil(log(value, base)), i.e. the smallest integer that is
/// greater than or equal to log(value, base).
/// `carry` tells whether a division in an outer recursion step left a non-zero remainder,
/// in which case the result has to be rounded up.
inline constexpr UInt32 int_log(const UInt32 value, const UInt32 base, const bool carry) {
    const UInt32 remainder = value % base;
    if (value < base) {
        /// Last step: round up if anything above 1 is left or a remainder was dropped earlier.
        return (remainder > 1 || carry) ? 1 : 0;
    }
    return 1 + int_log(value / base, base, remainder != 0 || carry);
}
/// Writes `value` in the given base to `out` and advances `out` past the written digits.
/// Faster than sprintf; no terminating zero byte is written.
/// Digits are produced by hex_digit_lowercase(), least significant first, and then copied
/// to `out` in reverse. A value that is not greater than zero and not equal to zero
/// produces no output.
/// NOTE This is not the best way. See https://github.com/miloyip/itoa-benchmark
/// But it doesn't matter here.
template <UInt32 base, typename T>
inline void print_integer(char*& out, T value) {
    if (value == 0) {
        *out++ = '0';
        return;
    }
    /// Enough room for all digits: digits needed per byte times the number of bytes in T.
    constexpr size_t max_digits = sizeof(T) * int_log(256, base, false);
    char digits[max_digits];
    char* digits_end = digits;
    for (; value > 0; value /= base) {
        *digits_end++ = hex_digit_lowercase(value % base);
    }
    /// The scratch buffer holds the digits in reverse order.
    out = std::reverse_copy(digits, digits_end, out);
}
/** Rewritten inet_ntop6 from http://svn.apache.org/repos/asf/apr/apr/trunk/network_io/unix/inet_pton.c
* performs significantly faster than the reference implementation due to the absence of sprintf calls,
* bounds checking, unnecessary string copying and length calculation.
* @param src - pointer to IPv6 (16 bytes) stored in little-endian byte order
* @param dst - where to put format result bytes
* @param zeroed_tail_bytes_count - the parameter is currently not being used
*/
inline void format_ipv6(unsigned char* src, char*& dst, uint8_t zeroed_tail_bytes_count = 0) {
struct {
Int64 base, len;
} best {-1, 0}, cur {-1, 0};
std::array<UInt16, IPV6_BINARY_LENGTH / sizeof(UInt16)> words {};
// the current function logic is processed in big endian manner
// but ipv6 in doris is stored in little-endian byte order
// so transfer to big-endian byte order first
// compatible with parse_ipv6 function in format_ip.h
std::reverse(src, src + IPV6_BINARY_LENGTH);
/** Preprocess:
* Copy the input (bytewise) array into a wordwise array.
* Find the longest run of 0x00's in src[] for :: shorthanding. */
for (size_t i = 0; i < (IPV6_BINARY_LENGTH - zeroed_tail_bytes_count); i += 2) {
words[i / 2] = (uint16_t)(src[i] << 8) | src[i + 1];
}
for (size_t i = 0; i < words.size(); i++) {
if (words[i] == 0) {
if (cur.base == -1) {
cur.base = i;
cur.len = 1;
} else {
cur.len++;
}
} else {
if (cur.base != -1) {
if (best.base == -1 || cur.len > best.len) {
best = cur;
}
cur.base = -1;
}
}
}
if (cur.base != -1) {
if (best.base == -1 || cur.len > best.len) {
best = cur;
}
}
if (best.base != -1 && best.len < 2) {
best.base = -1;
}
/// Format the result.
for (size_t i = 0; i < words.size(); i++) {
/// Are we inside the best run of 0x00's?
if (best.base != -1) {
auto best_base = static_cast<size_t>(best.base);
if (i >= best_base && i < (best_base + best.len)) {
if (i == best_base) {
*dst++ = ':';
}
continue;
}
}
/// Are we following an initial run of 0x00s or any real hex?
if (i != 0) {
*dst++ = ':';
}
/// Is this address an encapsulated IPv4?
if (i == 6 && best.base == 0 && (best.len == 6 || (best.len == 5 && words[5] == 0xffffu))) {
uint8_t ipv4_buffer[IPV4_BINARY_LENGTH] = {0};
memcpy(ipv4_buffer, src + 12, IPV4_BINARY_LENGTH);
// Due to historical reasons format_ipv4() takes ipv4 in BE format, but inside ipv6 we store it in LE-format.
if constexpr (std::endian::native == std::endian::little) {
std::reverse(std::begin(ipv4_buffer), std::end(ipv4_buffer));
}
format_ipv4(ipv4_buffer, dst,
std::min(zeroed_tail_bytes_count, static_cast<uint8_t>(IPV4_BINARY_LENGTH)),
"0");
// format_ipv4 has already added a null-terminator for us.
return;
}
print_integer<16>(dst, words[i]);
}
/// Was it a trailing run of 0x00's?
if (best.base != -1 &&
static_cast<size_t>(best.base) + static_cast<size_t>(best.len) == words.size()) {
*dst++ = ':';
}
}
/** Unsafe (no bounds-checking for src nor dst), optimized version of parsing IPv6 string.
 *
 * Parses the input string `src` and stores binary little-endian value into buffer pointed by `dst`,
 * which should be long enough. In case of failure zeroes IPV6_BINARY_LENGTH bytes of buffer pointed by `dst`.
 *
 * The groups are first written to `dst` in big-endian (textual) order; the whole buffer is
 * reversed once at the very end to get the little-endian layout.
 *
 * WARNING - this function is adapted to work with ReadBuffer, where src is the position reference (ReadBuffer::position())
 * and eof is the ReadBuffer::eof() - therefore algorithm below does not rely on buffer's continuity.
 * To parse strings use overloads below.
 *
 * @param src - iterator (reference to pointer) over input string - warning - continuity is not guaranteed.
 * @param eof - function returning true if iterator reached the end - warning - can break iterator's continuity.
 * @param dst - where to put output bytes in little-endian byte order, expected to be non-null and at least IPV6_BINARY_LENGTH-long.
 * @param first_block - preparsed first block, or a negative value if nothing was preparsed
 * @return - true if parsed successfully, false otherwise.
 */
template <typename T, typename EOFfunction>
    requires(std::is_same<typename std::remove_cv<T>::type, char>::value)
inline bool parse_ipv6(T*& src, EOFfunction eof, unsigned char* dst, int32_t first_block = -1) {
    /// Failure helper: zeroes the output buffer and yields false, so that every error path
    /// can simply `return clear_dst();`.
    const auto clear_dst = [dst]() {
        std::memset(dst, '\0', IPV6_BINARY_LENGTH);
        return false;
    };
    if (src == nullptr || eof()) return clear_dst();
    int groups = 0; /// number of parsed groups
    unsigned char* iter = dst; /// iterator over dst buffer
    unsigned char* zptr =
            nullptr; /// pointer into dst buffer array where all-zeroes block ("::") is started
    std::memset(dst, '\0', IPV6_BINARY_LENGTH);
    /// A preparsed first group is stored as the first two bytes, high byte first.
    if (first_block >= 0) {
        *iter++ = static_cast<unsigned char>((first_block >> 8) & 0xffu);
        *iter++ = static_cast<unsigned char>(first_block & 0xffu);
        /// A ':' right here is recorded as the start of the all-zeroes block.
        if (*src == ':') {
            zptr = iter;
            ++src;
        }
        ++groups;
    }
    /// True when a group separator has just been consumed (or nothing has been parsed yet),
    /// i.e. a new hex group is allowed to start at the current position.
    bool group_start = true;
    while (!eof() && groups < 8) {
        if (*src == ':') {
            ++src;
            if (eof()) /// trailing colon is not allowed
                return clear_dst();
            group_start = true;
            if (*src == ':') {
                if (zptr != nullptr) /// multiple all-zeroes blocks are not allowed
                    return clear_dst();
                zptr = iter;
                ++src;
                continue;
            }
            if (groups == 0) /// leading colon is not allowed
                return clear_dst();
        }
        /// mixed IPv4 parsing
        if (*src == '.') {
            if (groups <= 1 && zptr == nullptr) /// IPv4 block can't be the first
                return clear_dst();
            if (group_start) /// first octet of IPv4 should be already parsed as an IPv6 group
                return clear_dst();
            ++src;
            if (eof()) return clear_dst();
            /// last parsed group should be reinterpreted as a decimal value - it's the first octet of IPv4
            --groups;
            iter -= 2;
            UInt16 num = 0;
            /// Each nibble of the two stored bytes is one character of the original text,
            /// so it must be a decimal digit (0..9) to be part of an IPv4 octet.
            for (int i = 0; i < 2; ++i) {
                unsigned char first = (iter[i] >> 4) & 0x0fu;
                unsigned char second = iter[i] & 0x0fu;
                if (first > 9 || second > 9) return clear_dst();
                (num *= 100) += first * 10 + second;
            }
            if (num > 255) return clear_dst();
            /// parse IPv4 with known first octet
            /// NOTE(review): iter can point at the last four bytes of dst here, so parse_ipv4
            /// must not write more than IPV4_BINARY_LENGTH bytes - verify against its memcpy.
            if (!parse_ipv4(src, eof, iter, num)) return clear_dst();
            /// parse_ipv4 stores a host-endian value; flip it so that these four bytes are in
            /// the same big-endian order as the groups written so far.
            if constexpr (std::endian::native == std::endian::little)
                std::reverse(iter, iter + IPV4_BINARY_LENGTH);
            iter += 4;
            groups += 2;
            break; /// IPv4 block is the last - end of parsing
        }
        if (!group_start) /// end of parsing
            break;
        group_start = false;
        UInt16 val = 0; /// current decoded group
        int xdigits = 0; /// number of decoded hex digits in current group
        /// Read up to four hex digits; unhex() returning 0xFF ends the group.
        for (; !eof() && xdigits < 4; ++src, ++xdigits) {
            UInt8 num = unhex(*src);
            if (num == 0xFF) break;
            (val <<= 4) |= num;
        }
        if (xdigits == 0) /// end of parsing
            break;
        *iter++ = static_cast<unsigned char>((val >> 8) & 0xffu);
        *iter++ = static_cast<unsigned char>(val & 0xffu);
        ++groups;
    }
    /// either all 8 groups or all-zeroes block should be present
    if (groups < 8 && zptr == nullptr) return clear_dst();
    /// process all-zeroes block
    if (zptr != nullptr) {
        /// Move everything parsed after "::" to the end of the buffer and zero the gap
        /// that opens up between zptr and the moved bytes.
        size_t msize = iter - zptr;
        std::memmove(dst + IPV6_BINARY_LENGTH - msize, zptr, msize);
        std::memset(zptr, '\0', IPV6_BINARY_LENGTH - (iter - dst));
    }
    /// the current function logic is processed in big endian manner
    /// but ipv6 in doris is stored in little-endian byte order
    /// so transfer to little-endian
    std::reverse(dst, dst + IPV6_BINARY_LENGTH);
    return true;
}
/// returns pointer to the right after parsed sequence or null on failed parsing
inline const char* parse_ipv6(const char* src, const char* end, unsigned char* dst) {
if (parse_ipv6(
src, [&src, end]() { return src == end; }, dst))
return src;
return nullptr;
}
/// returns true if whole buffer was parsed successfully
inline bool parse_ipv6_whole(const char* src, const char* end, unsigned char* dst) {
return parse_ipv6(src, end, dst) == end;
}
/// returns pointer to the right after parsed sequence or null on failed parsing
inline const char* parse_ipv6(const char* src, unsigned char* dst) {
if (parse_ipv6(
src, []() { return false; }, dst))
return src;
return nullptr;
}
/// returns true if whole null-terminated string was parsed successfully
inline bool parse_ipv6_whole(const char* src, unsigned char* dst) {
const char* end = parse_ipv6(src, dst);
return end != nullptr && *end == '\0';
}
#include "common/compile_check_end.h"
} // namespace doris::vectorized