be/src/vec/functions/url/functions_url.h - doris - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
 // This file is copied from
 // https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/FunctionsURL.h
 // and modified by Doris

 #pragma once

 #include "vec/columns/column_string.h"
 #include "vec/common/memcpy_small.h"

 namespace doris::vectorized {

 /** URL processing functions. See implementation in separate .cpp files.
   * These functions do not strictly follow the RFC; instead, they are maximally simplified for performance reasons.
   *
   * Functions for extracting parts of a URL.
   * If the URL has no such part, an empty string is returned.
   *
   *  domain
   *  domainWithoutWWW
   *  topLevelDomain
   *  protocol
   *  path
   *  queryString
   *  fragment
   *  queryStringAndFragment
   *  netloc
   *
   * Functions for removing parts from a URL.
   * If the URL has no such part, it is returned unchanged.
   *
   *  cutWWW
   *  cutFragment
   *  cutQueryString
   *  cutQueryStringAndFragment
   *
   * Extract value of parameter in query string or in fragment identifier. Return empty string, if URL has no such parameter.
   * If there are many parameters with same name - return value of first one. Value is not %-decoded.
   *
   *  extractURLParameter(URL, name)
   *
   * Extract all parameters from URL in form of array of strings name=value.
   *  extractURLParameters(URL)
   *
   * Extract names of all parameters from URL in form of array of strings.
   *  extractURLParameterNames(URL)
   *
   * Remove specified parameter from URL.
   *  cutURLParameter(URL, name)
   *
   * Get array of URL 'hierarchy' as in web-analytics tree-like reports. See the docs.
   *  URLHierarchy(URL)
   */

 using Pos = const char*;

 /** Select part of string using the Extractor.
   *
   * The Extractor must provide two static members, both of which are called below:
   *  - get_reserve_length_for_element(): per-row figure used to pre-size the result buffer;
   *  - execute(data, size, res_start, res_length): reports the selected part through
   *    its last two (output) arguments.
   */
 template <typename Extractor>
 struct ExtractSubstringImpl {
     /// Column version.
     ///
     /// Row i of the input occupies bytes [offsets[i - 1], offsets[i]) of `data`
     /// (row 0 starts at byte 0). For every row the part selected by the Extractor is
     /// copied to `res_data` and the running end position is stored in `res_offsets[i]`.
     ///
     /// NOTE(review): result bytes are written starting at index 0 of `res_data`, while the
     /// buffer is grown relative to its current size, so this presumably expects `res_data`
     /// to be empty on entry - confirm against callers.
     static void vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets,
                        ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
         const size_t size = offsets.size();
         res_offsets.resize(size);
         res_data.reserve(size * Extractor::get_reserve_length_for_element());

         size_t prev_offset = 0;
         size_t res_offset = 0;

         for (size_t i = 0; i < size; ++i) {
             const char* row = reinterpret_cast<const char*>(&data[prev_offset]);

             /// Matched part. Reset to an empty match for every row, so that an Extractor
             /// which leaves its outputs untouched produces an empty string instead of
             /// reading indeterminate values or the previous row's match.
             Pos start = row;
             size_t length = 0;
             Extractor::execute(row, offsets[i] - prev_offset, start, length);

             res_data.resize(res_data.size() + length);
             /// The helper's name says it may read/write up to 15 bytes past the requested
             /// range; presumably the column buffers are padded for that - TODO confirm.
             memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length);
             res_offset += length;

             res_offsets[i] = res_offset;
             prev_offset = offsets[i];
         }
     }

     /// Single-string version: `res_data` is overwritten with the part of `data`
     /// selected by the Extractor.
     static void constant(const std::string& data, std::string& res_data) {
         /// Default to an empty match in case the Extractor does not set its outputs.
         Pos start = data.data();
         size_t length = 0;
         Extractor::execute(data.data(), data.size(), start, length);
         res_data.assign(start, length);
     }
 };

 /** Delete part of string using the Extractor.
   *
   * The Extractor must provide a static execute(data, size, res_start, res_length) that
   * reports the part to delete through its last two (output) arguments. The reported
   * part is expected to lie inside the row that was passed in.
   */
 template <typename Extractor>
 struct CutSubstringImpl {
     /// Column version.
     ///
     /// Row i of the input occupies bytes [offsets[i - 1], offsets[i]) of `data`
     /// (row 0 starts at byte 0). For every row the bytes before and after the part
     /// reported by the Extractor are copied to `res_data`, and the running end position
     /// is stored in `res_offsets[i]`.
     ///
     /// NOTE(review): result bytes are written starting at index 0 of `res_data`, while the
     /// buffer is grown relative to its current size, so this presumably expects `res_data`
     /// to be empty on entry - confirm against callers.
     static void vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets,
                        ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
         res_data.reserve(data.size());
         const size_t size = offsets.size();
         res_offsets.resize(size);

         size_t prev_offset = 0;
         size_t res_offset = 0;

         for (size_t i = 0; i < size; ++i) {
             const char* current = reinterpret_cast<const char*>(&data[prev_offset]);
             const size_t row_size = offsets[i] - prev_offset;

             /// Part to delete. Reset to "nothing, at the start of the row" for every row,
             /// so that an Extractor which leaves its outputs untouched yields the row
             /// unchanged instead of using indeterminate values or the previous row's match.
             Pos start = current;
             size_t length = 0;
             Extractor::execute(current, row_size, start, length);

             /// Bytes kept before and after the deleted part.
             const size_t head_size = start - current;
             const size_t tail_size = row_size - head_size - length;

             res_data.resize(res_data.size() + row_size - length);
             /// The helper's name says it may read/write up to 15 bytes past the requested
             /// range; presumably the column buffers are padded for that - TODO confirm.
             memcpy_small_allow_read_write_overflow15(&res_data[res_offset], current, head_size);
             memcpy_small_allow_read_write_overflow15(&res_data[res_offset + head_size],
                                                      start + length, tail_size);
             res_offset += row_size - length;

             res_offsets[i] = res_offset;
             prev_offset = offsets[i];
         }
     }

     /// Single-string version: `data` without the part reported by the Extractor is
     /// appended to `res_data` (existing content of `res_data` is kept).
     static void constant(const std::string& data, std::string& res_data) {
         /// Default to "nothing to delete" in case the Extractor does not set its outputs.
         Pos start = data.data();
         size_t length = 0;
         Extractor::execute(data.data(), data.size(), start, length);
         res_data.reserve(data.size() - length);
         /// Both calls use the pointer-range form of append: [begin, start) and
         /// [start + length, end).
         res_data.append(data.data(), start);
         res_data.append(start + length, data.data() + data.size());
     }
 };

 } // namespace doris::vectorized
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.
	// This file is copied from
	// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/FunctionsURL.h
	// and modified by Doris

	#pragma once

	#include "vec/columns/column_string.h"
	#include "vec/common/memcpy_small.h"

	namespace doris::vectorized {

	/** URL processing functions. See implementation in separate .cpp files.
	* These functions do not strictly follow the RFC; instead, they are maximally simplified for performance reasons.
	*
	* Functions for extracting parts of a URL.
	* If the URL has no such part, an empty string is returned.
	*
	* domain
	* domainWithoutWWW
	* topLevelDomain
	* protocol
	* path
	* queryString
	* fragment
	* queryStringAndFragment
	* netloc
	*
	* Functions for removing parts from a URL.
	* If the URL has no such part, it is returned unchanged.
	*
	* cutWWW
	* cutFragment
	* cutQueryString
	* cutQueryStringAndFragment
	*
	* Extract value of parameter in query string or in fragment identifier. Return empty string, if URL has no such parameter.
	* If there are many parameters with same name - return value of first one. Value is not %-decoded.
	*
	* extractURLParameter(URL, name)
	*
	* Extract all parameters from URL in form of array of strings name=value.
	* extractURLParameters(URL)
	*
	* Extract names of all parameters from URL in form of array of strings.
	* extractURLParameterNames(URL)
	*
	* Remove specified parameter from URL.
	* cutURLParameter(URL, name)
	*
	* Get array of URL 'hierarchy' as in web-analytics tree-like reports. See the docs.
	* URLHierarchy(URL)
	*/

	using Pos = const char*;

	/** Select part of string using the Extractor.
	 *
	 * The Extractor must provide two static members, both of which are called below:
	 *  - get_reserve_length_for_element(): per-row figure used to pre-size the result buffer;
	 *  - execute(data, size, res_start, res_length): reports the selected part through
	 *    its last two (output) arguments.
	 */
	template <typename Extractor>
	struct ExtractSubstringImpl {
	    /// Column version.
	    ///
	    /// Row i of the input occupies bytes [offsets[i - 1], offsets[i]) of `data`
	    /// (row 0 starts at byte 0). For every row the part selected by the Extractor is
	    /// copied to `res_data` and the running end position is stored in `res_offsets[i]`.
	    ///
	    /// NOTE(review): result bytes are written starting at index 0 of `res_data`, while the
	    /// buffer is grown relative to its current size, so this presumably expects `res_data`
	    /// to be empty on entry - confirm against callers.
	    static void vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets,
	                       ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
	        const size_t size = offsets.size();
	        res_offsets.resize(size);
	        res_data.reserve(size * Extractor::get_reserve_length_for_element());

	        size_t prev_offset = 0;
	        size_t res_offset = 0;

	        for (size_t i = 0; i < size; ++i) {
	            const char* row = reinterpret_cast<const char*>(&data[prev_offset]);

	            /// Matched part. Reset to an empty match for every row, so that an Extractor
	            /// which leaves its outputs untouched produces an empty string instead of
	            /// reading indeterminate values or the previous row's match.
	            Pos start = row;
	            size_t length = 0;
	            Extractor::execute(row, offsets[i] - prev_offset, start, length);

	            res_data.resize(res_data.size() + length);
	            /// The helper's name says it may read/write up to 15 bytes past the requested
	            /// range; presumably the column buffers are padded for that - TODO confirm.
	            memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length);
	            res_offset += length;

	            res_offsets[i] = res_offset;
	            prev_offset = offsets[i];
	        }
	    }

	    /// Single-string version: `res_data` is overwritten with the part of `data`
	    /// selected by the Extractor.
	    static void constant(const std::string& data, std::string& res_data) {
	        /// Default to an empty match in case the Extractor does not set its outputs.
	        Pos start = data.data();
	        size_t length = 0;
	        Extractor::execute(data.data(), data.size(), start, length);
	        res_data.assign(start, length);
	    }
	};

	/** Delete part of string using the Extractor.
	 *
	 * The Extractor must provide a static execute(data, size, res_start, res_length) that
	 * reports the part to delete through its last two (output) arguments. The reported
	 * part is expected to lie inside the row that was passed in.
	 */
	template <typename Extractor>
	struct CutSubstringImpl {
	    /// Column version.
	    ///
	    /// Row i of the input occupies bytes [offsets[i - 1], offsets[i]) of `data`
	    /// (row 0 starts at byte 0). For every row the bytes before and after the part
	    /// reported by the Extractor are copied to `res_data`, and the running end position
	    /// is stored in `res_offsets[i]`.
	    ///
	    /// NOTE(review): result bytes are written starting at index 0 of `res_data`, while the
	    /// buffer is grown relative to its current size, so this presumably expects `res_data`
	    /// to be empty on entry - confirm against callers.
	    static void vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets,
	                       ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
	        res_data.reserve(data.size());
	        const size_t size = offsets.size();
	        res_offsets.resize(size);

	        size_t prev_offset = 0;
	        size_t res_offset = 0;

	        for (size_t i = 0; i < size; ++i) {
	            const char* current = reinterpret_cast<const char*>(&data[prev_offset]);
	            const size_t row_size = offsets[i] - prev_offset;

	            /// Part to delete. Reset to "nothing, at the start of the row" for every row,
	            /// so that an Extractor which leaves its outputs untouched yields the row
	            /// unchanged instead of using indeterminate values or the previous row's match.
	            Pos start = current;
	            size_t length = 0;
	            Extractor::execute(current, row_size, start, length);

	            /// Bytes kept before and after the deleted part.
	            const size_t head_size = start - current;
	            const size_t tail_size = row_size - head_size - length;

	            res_data.resize(res_data.size() + row_size - length);
	            /// The helper's name says it may read/write up to 15 bytes past the requested
	            /// range; presumably the column buffers are padded for that - TODO confirm.
	            memcpy_small_allow_read_write_overflow15(&res_data[res_offset], current, head_size);
	            memcpy_small_allow_read_write_overflow15(&res_data[res_offset + head_size],
	                                                     start + length, tail_size);
	            res_offset += row_size - length;

	            res_offsets[i] = res_offset;
	            prev_offset = offsets[i];
	        }
	    }

	    /// Single-string version: `data` without the part reported by the Extractor is
	    /// appended to `res_data` (existing content of `res_data` is kept).
	    static void constant(const std::string& data, std::string& res_data) {
	        /// Default to "nothing to delete" in case the Extractor does not set its outputs.
	        Pos start = data.data();
	        size_t length = 0;
	        Extractor::execute(data.data(), data.size(), start, length);
	        res_data.reserve(data.size() - length);
	        /// Both calls use the pointer-range form of append: [begin, start) and
	        /// [start + length, end).
	        res_data.append(data.data(), start);
	        res_data.append(start + length, data.data() + data.size());
	    }
	};

	} // namespace doris::vectorized