blob: b2ec5e0c9d9a4be60ad3490d05da481548e2349e [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/domain.h
// and modified by Doris
#pragma once
#include <algorithm>
#include <array>
#include <cstring>
#include "vec/common/string_utils/string_utils.h"
#include "vec/functions/url/find_symbols.h"
#include "vec/functions/url/protocol.h"
#include "vec/functions/url/tldLookup.h"
namespace doris::vectorized {
/// Validates a host candidate and returns it as a StringRef.
///
/// @param pos            one past the last character of the candidate host.
/// @param dot_pos        position of the most recent '.' seen before pos, or null if none was seen.
/// @param start_of_host  first character of the candidate host.
/// @return [start_of_host, pos) when the candidate is accepted; an empty StringRef when
///         no dot was seen, the candidate is empty, the dot is the last character before pos,
///         or the character right after the dot is one of ':', '/', '?', '#'.
inline StringRef check_and_return_host(const Pos& pos, const Pos& dot_pos,
                                       const Pos& start_of_host) {
    // A host without any dot is rejected outright.
    if (dot_pos == nullptr) {
        return StringRef {};
    }
    // Nothing between the start of the host and its end.
    if (start_of_host >= pos) {
        return StringRef {};
    }
    // The dot is the final character before pos: nothing follows it.
    if (pos - dot_pos == 1) {
        return StringRef {};
    }

    // At this point dot_pos + 1 < pos, so the character after the dot can be inspected.
    switch (dot_pos[1]) {
    case ':':
    case '/':
    case '?':
    case '#':
        return StringRef {};
    default:
        return StringRef(start_of_host, pos - start_of_host);
    }
}
/// Extracts the host from the given URL.
///
/// The URL may start with "//", with a "scheme://" prefix (the "://" has to lie within the
/// first 16 bytes), or with the host itself. Everything up to and including an '@' is skipped,
/// so for "user@example.com" the host starts after the '@'. The host ends at the first
/// ':', '/', '?' or '#', or at the end of the input.
///
/// @param data  pointer to the URL bytes; only [data, data + size) is read.
/// @param size  number of bytes available at data; may be 0.
/// @return the host as a view into data, or an empty StringRef if the host is not valid
///         (i.e. no dot was seen, there is no symbol after the dot, or a restricted symbol
///         was met while scanning).
inline StringRef get_url_host(const char* data, size_t size) {
    // Nothing to parse. Returning early also keeps all pointer arithmetic below inside
    // [data, data + size].
    if (size == 0) {
        return StringRef {};
    }

    // Symbols that make the whole URL invalid, both in the scheme and in the host part.
    constexpr auto is_restricted_symbol = [](char c) {
        switch (c) {
        case ' ':
        case '\t':
        case '<':
        case '>':
        case '%':
        case '{':
        case '}':
        case '|':
        case '\\':
        case '^':
        case '~':
        case '[':
        case ']':
        case ';':
        case '=':
        case '&':
            return true;
        default:
            return false;
        }
    };

    Pos pos = data;
    Pos end = data + size;

    // The second byte is only inspected when it exists.
    if (size >= 2 && pos[0] == '/' && pos[1] == '/') {
        pos += 2;
    } else {
        // Look for "scheme://" within the first 16 bytes only.
        Pos scheme_end = data + std::min<size_t>(size, 16);
        // The first character is not validated; scanning starts at the second one.
        for (++pos; pos < scheme_end; ++pos) {
            const char c = *pos;
            if (is_alpha_numeric_ascii(c) || c == '.' || c == '-' || c == '+') {
                continue;
            }
            if (is_restricted_symbol(c)) {
                return StringRef {};
            }
            // Any other symbol ends the scheme candidate.
            break;
        }

        // Requires three readable bytes before scheme_end, hence the "> 2".
        if ((scheme_end - pos) > 2 && pos[0] == ':' && pos[1] == '/' && pos[2] == '/') {
            pos += 3;
        } else {
            // No scheme found: the host is searched from the very beginning.
            pos = data;
        }
    }

    Pos dot_pos = nullptr;
    const auto* start_of_host = pos;
    for (; pos < end; ++pos) {
        switch (*pos) {
        case '.':
            dot_pos = pos;
            break;
        case ':': /// end symbols
        case '/':
        case '?':
        case '#':
            return check_and_return_host(pos, dot_pos, start_of_host);
        case '@': /// myemail@gmail.com
            start_of_host = pos + 1;
            break;
        default:
            /// restricted symbols in whole URL
            if (is_restricted_symbol(*pos)) {
                return StringRef {};
            }
            break;
        }
    }

    return check_and_return_host(pos, dot_pos, start_of_host);
}
template <bool without_www>
struct ExtractDomain {
static size_t get_reserve_length_for_element() { return 15; }
static void execute(Pos data, size_t size, Pos& res_data, size_t& res_size) {
StringRef host = get_url_host(data, size);
if (host.size == 0) {
res_data = data;
res_size = 0;
} else {
if (without_www && host.size > 4 && !strncmp(host.data, "www.", 4))
host = {host.data + 4, host.size - 4};
res_data = host.data;
res_size = host.size;
}
}
};
struct ExtractTopLevelDomain {
static size_t get_reserve_length_for_element() { return 5; }
static void execute(const char* data, size_t size, const char*& res_data, size_t& res_size) {
res_data = data;
res_size = 0;
StringRef host = get_url_host(data, size);
if (host.size == 0) {
return;
} else {
auto host_view = host.to_string_view();
if (host_view[host_view.size() - 1] == '.') {
host_view.remove_suffix(1);
}
const auto* host_end = host_view.data() + host_view.size();
const char* last_dot = find_last_symbols_or_null<'.'>(host_view.data(), host_end);
if (!last_dot) {
return;
}
/// For IPv4 addresses select nothing.
///
/// NOTE: it is safe to access last_dot[1]
/// since getURLHost() will not return a host if there is symbol after dot.
if (is_numeric_ascii(last_dot[1])) {
return;
}
res_data = last_dot + 1;
res_size = host_end - res_data;
}
}
};
struct ExtractFirstSignificantSubdomain {
static size_t get_reserve_length_for_element() { return 10; }
static void execute(const Pos data, const size_t size, Pos& res_data, size_t& res_size,
Pos* out_domain_end = nullptr) {
res_data = data;
res_size = 0;
Pos tmp;
size_t domain_length = 0;
ExtractDomain<true>::execute(data, size, tmp, domain_length);
if (domain_length == 0) {
return;
}
if (out_domain_end) {
*out_domain_end = tmp + domain_length;
}
/// cut useless dot
if (tmp[domain_length - 1] == '.') {
--domain_length;
}
res_data = tmp;
res_size = domain_length;
const auto* begin = tmp;
const auto* end = begin + domain_length;
std::array<const char*, 3> last_periods {};
const auto* pos = find_first_symbols<'.'>(begin, end);
while (pos < end) {
last_periods[2] = last_periods[1];
last_periods[1] = last_periods[0];
last_periods[0] = pos;
pos = find_first_symbols<'.'>(pos + 1, end);
}
if (!last_periods[0]) {
return;
}
if (!last_periods[1]) {
res_size = last_periods[0] - begin;
return;
}
if (!last_periods[2]) {
last_periods[2] = begin - 1;
}
const auto* end_of_level_domain = find_first_symbols<'/'>(last_periods[0], end);
if (!end_of_level_domain) {
end_of_level_domain = end;
}
auto host_len = static_cast<size_t>(end_of_level_domain - last_periods[1] - 1);
StringRef host {last_periods[1] + 1, host_len};
if (tldLookup::is_valid(host.data, host.size)) {
res_data += last_periods[2] + 1 - begin;
res_size = last_periods[1] - last_periods[2] - 1;
} else {
res_data += last_periods[1] + 1 - begin;
res_size = last_periods[0] - last_periods[1] - 1;
}
}
};
struct CutToFirstSignificantSubdomain {
static size_t get_reserve_length_for_element() { return 15; }
static void execute(const Pos data, const size_t size, Pos& res_data, size_t& res_size) {
res_data = data;
res_size = 0;
Pos tmp_data = data;
size_t tmp_length;
Pos domain_end = data;
ExtractFirstSignificantSubdomain::execute(data, size, tmp_data, tmp_length, &domain_end);
if (tmp_length == 0) {
return;
}
res_data = tmp_data;
res_size = domain_end - tmp_data;
}
};
} // namespace doris::vectorized