blob: b6736496d24345cbdbfe582f281f27ae40f03b4c [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/FunctionsURL.h
// and modified by Doris
#pragma once
#include "vec/columns/column_string.h"
#include "vec/common/memcpy_small.h"
namespace doris::vectorized {
/** URL processing functions. See implementation in separate .cpp files.
* None of these functions strictly follows the RFC; they are deliberately simplified as much as possible for performance reasons.
*
* Functions for extracting parts of a URL.
* If the URL has no such part, an empty string is returned.
*
* domain
* domainWithoutWWW
* topLevelDomain
* protocol
* path
* queryString
* fragment
* queryStringAndFragment
* netloc
*
* Functions for removing parts from a URL.
* If the URL has no such part, it is returned unchanged.
*
* cutWWW
* cutFragment
* cutQueryString
* cutQueryStringAndFragment
*
* Extract value of parameter in query string or in fragment identifier. Return empty string, if URL has no such parameter.
* If there are many parameters with same name - return value of first one. Value is not %-decoded.
*
* extractURLParameter(URL, name)
*
* Extract all parameters from URL in form of array of strings name=value.
* extractURLParameters(URL)
*
* Extract names of all parameters from URL in form of array of strings.
* extractURLParameterNames(URL)
*
* Remove specified parameter from URL.
* cutURLParameter(URL, name)
*
* Get array of URL 'hierarchy' as in web-analytics tree-like reports. See the docs.
* URLHierarchy(URL)
*/
/// Position inside a character buffer. The implementations below pass a Pos to
/// Extractor::execute to receive the start of the matched part.
using Pos = const char*;
/** Select part of string using the Extractor.
*/
template <typename Extractor>
struct ExtractSubstringImpl {
static void vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets,
ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
size_t size = offsets.size();
res_offsets.resize(size);
res_data.reserve(size * Extractor::get_reserve_length_for_element());
size_t prev_offset = 0;
size_t res_offset = 0;
/// Matched part.
Pos start;
size_t length;
for (size_t i = 0; i < size; ++i) {
Extractor::execute(reinterpret_cast<const char*>(&data[prev_offset]),
offsets[i] - prev_offset, start, length);
res_data.resize(res_data.size() + length);
memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length);
res_offset += length;
res_offsets[i] = res_offset;
prev_offset = offsets[i];
}
}
static void constant(const std::string& data, std::string& res_data) {
Pos start;
size_t length;
Extractor::execute(data.data(), data.size(), start, length);
res_data.assign(start, length);
}
};
/** Removes from each string the part that the Extractor selects and keeps the rest.
 *
 * The Extractor is used through the static member execute(ptr, size, start, length):
 * it is called with a row's bytes, and afterwards `start`/`length` are read as the range
 * to cut out. Both are passed in uninitialized, so the Extractor is expected to always
 * set them, with `start` pointing inside (or at the end of) the row it was given.
 */
template <typename Extractor>
struct CutSubstringImpl {
    /// Column version: `data`/`offsets` describe the input rows (offsets[i] is the end of
    /// row i inside `data`). For every row the bytes before and after the selected range
    /// are concatenated into `res_data`; the end position of each result row is stored
    /// in `res_offsets`.
    static void vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets,
                       ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
        /// The result can never be longer than the input.
        res_data.reserve(data.size());
        const size_t size = offsets.size();
        res_offsets.resize(size);

        size_t prev_offset = 0;
        size_t res_offset = 0;

        /// Matched part.
        Pos start;
        size_t length;

        for (size_t i = 0; i < size; ++i) {
            const char* current = reinterpret_cast<const char*>(&data[prev_offset]);
            const size_t row_size = offsets[i] - prev_offset;
            Extractor::execute(current, row_size, start, length);

            /// Work with plain lengths instead of mixing integers into pointer expressions:
            /// `res_offset + start - current` would first form the pointer `start + res_offset`,
            /// which can lie far outside the source buffer.
            const size_t prefix_size = start - current;
            const size_t suffix_size = row_size - prefix_size - length;
            const size_t kept_size = row_size - length;

            res_data.resize(res_data.size() + kept_size);

            /// Keep this order: prefix first, suffix second.
            /// NOTE(review): going by its name, the copy helper may write up to 15 bytes past
            /// the requested range; the suffix copy then overwrites whatever the prefix copy
            /// spilled - confirm against the helper's contract.
            memcpy_small_allow_read_write_overflow15(&res_data[res_offset], current, prefix_size);
            memcpy_small_allow_read_write_overflow15(&res_data[res_offset + prefix_size],
                                                     start + length, suffix_size);

            res_offset += kept_size;
            res_offsets[i] = res_offset;
            prev_offset = offsets[i];
        }
    }

    /// Single-string version: appends to `res_data` everything in `data` except the range
    /// selected by the Extractor. Existing content of `res_data` is not cleared.
    static void constant(const std::string& data, std::string& res_data) {
        Pos start;
        size_t length;
        Extractor::execute(data.data(), data.size(), start, length);
        res_data.reserve(data.size() - length);
        /// Bytes before the match, then bytes after it (both are [first, last) ranges).
        res_data.append(data.data(), start);
        res_data.append(start + length, data.data() + data.size());
    }
};
} // namespace doris::vectorized