be/src/util/url_parser.cpp - doris - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #include "util/url_parser.h"

 #include <ctype.h>
 #include <stdint.h>

 #include <algorithm>
 #include <string>

 #include "runtime/string_search.hpp"
 #include "vec/common/string_ref.h"

 namespace doris {

 const StringRef UrlParser::_s_url_authority(const_cast<char*>("AUTHORITY"), 9);
 const StringRef UrlParser::_s_url_file(const_cast<char*>("FILE"), 4);
 const StringRef UrlParser::_s_url_host(const_cast<char*>("HOST"), 4);
 const StringRef UrlParser::_s_url_path(const_cast<char*>("PATH"), 4);
 const StringRef UrlParser::_s_url_protocol(const_cast<char*>("PROTOCOL"), 8);
 const StringRef UrlParser::_s_url_query(const_cast<char*>("QUERY"), 5);
 const StringRef UrlParser::_s_url_ref(const_cast<char*>("REF"), 3);
 const StringRef UrlParser::_s_url_userinfo(const_cast<char*>("USERINFO"), 8);
 const StringRef UrlParser::_s_url_port(const_cast<char*>("PORT"), 4);
 const StringRef UrlParser::_s_protocol(const_cast<char*>("://"), 3);
 const StringRef UrlParser::_s_at(const_cast<char*>("@"), 1);
 const StringRef UrlParser::_s_slash(const_cast<char*>("/"), 1);
 const StringRef UrlParser::_s_colon(const_cast<char*>(":"), 1);
 const StringRef UrlParser::_s_question(const_cast<char*>("?"), 1);
 const StringRef UrlParser::_s_hash(const_cast<char*>("#"), 1);
 const StringSearch UrlParser::_s_protocol_search(&_s_protocol);
 const StringSearch UrlParser::_s_at_search(&_s_at);
 const StringSearch UrlParser::_s_slash_search(&_s_slash);
 const StringSearch UrlParser::_s_colon_search(&_s_colon);
 const StringSearch UrlParser::_s_question_search(&_s_question);
 const StringSearch UrlParser::_s_hash_search(&_s_hash);

 bool UrlParser::parse_url(const StringRef& url, UrlPart part, StringRef* result) {
     result->data = nullptr;
     result->size = 0;
     // Remove leading and trailing spaces.
     StringRef trimmed_url = url.trim();

     // All parts require checking for the _s_protocol.
     int32_t protocol_pos = _s_protocol_search.search(&trimmed_url);
     if (protocol_pos < 0) {
         return false;
     }

     // Positioned to first char after '://'.
     StringRef protocol_end = trimmed_url.substring(protocol_pos + _s_protocol.size);

     switch (part) {
     case AUTHORITY: {
         // Find first '/'.
         int32_t end_pos = _s_slash_search.search(&protocol_end);
         *result = protocol_end.substring(0, end_pos);
         break;
     }

     case FILE:
     case PATH: {
         // Find first '/'.
         int32_t start_pos = _s_slash_search.search(&protocol_end);

         if (start_pos < 0) {
             // Return empty string. This is what Hive does.
             return true;
         }

         StringRef path_start = protocol_end.substring(start_pos);
         int32_t end_pos;

         if (part == FILE) {
             // End _s_at '#'.
             end_pos = _s_hash_search.search(&path_start);
         } else {
             // End string _s_at next '?' or '#'.
             end_pos = _s_question_search.search(&path_start);

             if (end_pos < 0) {
                 // No '?' was found, look for '#'.
                 end_pos = _s_hash_search.search(&path_start);
             }
         }

         *result = path_start.substring(0, end_pos);
         break;
     }

     case HOST: {
         // Find '@'.
         int32_t start_pos = _s_at_search.search(&protocol_end);

         if (start_pos < 0) {
             // No '@' was found, i.e., no user:pass info was given, start after _s_protocol.
             start_pos = 0;
         } else {
             // Skip '@'.
             start_pos += _s_at.size;
         }

         StringRef host_start = protocol_end.substring(start_pos);
         // Find first '?'.
         int32_t query_start_pos = _s_question_search.search(&host_start);
         if (query_start_pos > 0) {
             host_start = host_start.substring(0, query_start_pos);
         }
         // Find ':' to strip out port.
         int32_t end_pos = _s_colon_search.search(&host_start);

         if (end_pos < 0) {
             // No port was given. search for '/' to determine ending position.
             end_pos = _s_slash_search.search(&host_start);
         }

         *result = host_start.substring(0, end_pos);
         break;
     }

     case PROTOCOL: {
         *result = trimmed_url.substring(0, protocol_pos);
         break;
     }

     case QUERY: {
         // Find first '?'.
         int32_t start_pos = _s_question_search.search(&protocol_end);

         if (start_pos < 0) {
             // Indicate no query was found.
             return false;
         }

         StringRef query_start = protocol_end.substring(start_pos + _s_question.size);
         // End string _s_at next '#'.
         int32_t end_pos = _s_hash_search.search(&query_start);
         *result = query_start.substring(0, end_pos);
         break;
     }

     case REF: {
         // Find '#'.
         int32_t start_pos = _s_hash_search.search(&protocol_end);

         if (start_pos < 0) {
             // Indicate no user and pass were given.
             return false;
         }

         *result = protocol_end.substring(start_pos + _s_hash.size);
         break;
     }

     case USERINFO: {
         // Find '@'.
         int32_t end_pos = _s_at_search.search(&protocol_end);

         if (end_pos < 0) {
             // Indicate no user and pass were given.
             return false;
         }

         *result = protocol_end.substring(0, end_pos);
         break;
     }

     case PORT: {
         // Find '@'.
         int32_t start_pos = _s_at_search.search(&protocol_end);

         if (start_pos < 0) {
             // No '@' was found, i.e., no user:pass info was given, start after _s_protocol.
             start_pos = 0;
         } else {
             // Skip '@'.
             start_pos += _s_at.size;
         }

         StringRef host_start = protocol_end.substring(start_pos);
         // Find ':' to strip out port.
         int32_t end_pos = _s_colon_search.search(&host_start);
         //no port found
         if (end_pos < 0) {
             return false;
         }

         StringRef port_start_str = protocol_end.substring(end_pos + _s_colon.size);
         int32_t port_end_pos = _s_slash_search.search(&port_start_str);
         //if '/' not found, try to find '?'
         if (port_end_pos < 0) {
             port_end_pos = _s_question_search.search(&port_start_str);
         }
         *result = port_start_str.substring(0, port_end_pos);
         break;
     }

     case INVALID:
         return false;
     }

     return true;
 }

 bool UrlParser::parse_url_key(const StringRef& url, UrlPart part, const StringRef& key,
                               StringRef* result) {
     // Part must be query to ask for a specific query key.
     if (part != QUERY) {
         return false;
     }

     // Remove leading and trailing spaces.
     StringRef trimmed_url = url.trim();

     // Search for the key in the url, ignoring malformed URLs for now.
     StringSearch key_search(&key);

     while (trimmed_url.size > 0) {
         // Search for the key in the current substring.
         int32_t key_pos = key_search.search(&trimmed_url);
         bool match = true;

         if (key_pos < 0) {
             return false;
         }

         // Key pos must be != 0 because it must be preceded by a '?' or a '&'.
         // Check that the char before key_pos is either '?' or '&'.
         if (key_pos == 0 ||
             (trimmed_url.data[key_pos - 1] != '?' && trimmed_url.data[key_pos - 1] != '&')) {
             match = false;
         }

         // Advance substring beyond matching key.
         trimmed_url = trimmed_url.substring(key_pos + key.size);

         if (!match) {
             continue;
         }

         if (trimmed_url.size <= 0) {
             break;
         }

         // Next character must be '=', otherwise the match cannot be a key in the query part.
         if (trimmed_url.data[0] != '=') {
             continue;
         }

         int32_t pos = 1;

         // Find ending position of key's value by matching '#' or '&'.
         while (pos < trimmed_url.size) {
             switch (trimmed_url.data[pos]) {
             case '#':
             case '&':
                 *result = trimmed_url.substring(1, pos - 1);
                 return true;
             }

             ++pos;
         }

         // Ending position is end of string.
         *result = trimmed_url.substring(1);
         return true;
     }

     return false;
 }

 UrlParser::UrlPart UrlParser::get_url_part(const StringRef& part) {
     // Quick filter on requested URL part, based on first character.
     // Hive requires the requested URL part to be all upper case.
     std::string part_str = part.to_string();
     transform(part_str.begin(), part_str.end(), part_str.begin(), ::toupper);
     StringRef newPart = StringRef(part_str);
     switch (newPart.data[0]) {
     case 'A': {
         if (!newPart.eq(_s_url_authority)) {
             return INVALID;
         }

         return AUTHORITY;
     }

     case 'F': {
         if (!newPart.eq(_s_url_file)) {
             return INVALID;
         }

         return FILE;
     }

     case 'H': {
         if (!newPart.eq(_s_url_host)) {
             return INVALID;
         }

         return HOST;
     }

     case 'P': {
         if (newPart.eq(_s_url_path)) {
             return PATH;
         } else if (newPart.eq(_s_url_protocol)) {
             return PROTOCOL;
         } else if (newPart.eq(_s_url_port)) {
             return PORT;
         } else {
             return INVALID;
         }
     }

     case 'Q': {
         if (!newPart.eq(_s_url_query)) {
             return INVALID;
         }

         return QUERY;
     }

     case 'R': {
         if (!newPart.eq(_s_url_ref)) {
             return INVALID;
         }

         return REF;
     }

     case 'U': {
         if (!newPart.eq(_s_url_userinfo)) {
             return INVALID;
         }

         return USERINFO;
     }

     default:
         return INVALID;
     }
 }

 StringRef UrlParser::extract_url(StringRef url, StringRef name) {
     StringRef result("", 0);
     // Remove leading and trailing spaces.
     StringRef trimmed_url = url.trim();
     // find '?'
     int32_t question_pos = _s_question_search.search(&trimmed_url);
     if (question_pos < 0) {
         // this url no parameters.
         // Example: https://doris.apache.org/
         return result;
     }

     // find '#'
     int32_t hash_pos = _s_hash_search.search(&trimmed_url);
     StringRef sub_url;
     if (hash_pos < 0) {
         sub_url = trimmed_url.substring(question_pos + 1, trimmed_url.size - question_pos - 1);
     } else {
         sub_url = trimmed_url.substring(question_pos + 1, hash_pos - question_pos - 1);
     }

     // find '&' and '=', and extract target parameter
     // Example: k1=aa&k2=bb&k3=cc&test=dd
     int64_t and_pod;
     auto len = sub_url.size;
     StringRef key_url;
     while (true) {
         if (len <= 0) {
             break;
         }
         and_pod = sub_url.find_first_of('&');
         if (and_pod != -1) {
             key_url = sub_url.substring(0, and_pod);
             sub_url = sub_url.substring(and_pod + 1, len - and_pod - 1);
         } else {
             auto end_pos = sub_url.find_first_of('#');
             key_url = end_pos == -1 ? sub_url : sub_url.substring(0, end_pos);
             sub_url = result;
         }
         len = sub_url.size;

         auto eq_pod = key_url.find_first_of('=');
         if (eq_pod == -1) {
             // invalid url. like: k1&k2=bb
             continue;
         }
         int32_t key_len = key_url.size;
         auto key = key_url.substring(0, eq_pod);
         if (name == key) {
             return key_url.substring(eq_pod + 1, key_len - eq_pod - 1);
         }
     }
     return result;
 }
 } // namespace doris
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#include "util/url_parser.h"

	#include <ctype.h>
	#include <stdint.h>

	#include <algorithm>
	#include <string>

	#include "runtime/string_search.hpp"
	#include "vec/common/string_ref.h"

	namespace doris {

	const StringRef UrlParser::_s_url_authority(const_cast<char*>("AUTHORITY"), 9);
	const StringRef UrlParser::_s_url_file(const_cast<char*>("FILE"), 4);
	const StringRef UrlParser::_s_url_host(const_cast<char*>("HOST"), 4);
	const StringRef UrlParser::_s_url_path(const_cast<char*>("PATH"), 4);
	const StringRef UrlParser::_s_url_protocol(const_cast<char*>("PROTOCOL"), 8);
	const StringRef UrlParser::_s_url_query(const_cast<char*>("QUERY"), 5);
	const StringRef UrlParser::_s_url_ref(const_cast<char*>("REF"), 3);
	const StringRef UrlParser::_s_url_userinfo(const_cast<char*>("USERINFO"), 8);
	const StringRef UrlParser::_s_url_port(const_cast<char*>("PORT"), 4);
	const StringRef UrlParser::_s_protocol(const_cast<char*>("://"), 3);
	const StringRef UrlParser::_s_at(const_cast<char*>("@"), 1);
	const StringRef UrlParser::_s_slash(const_cast<char*>("/"), 1);
	const StringRef UrlParser::_s_colon(const_cast<char*>(":"), 1);
	const StringRef UrlParser::_s_question(const_cast<char*>("?"), 1);
	const StringRef UrlParser::_s_hash(const_cast<char*>("#"), 1);
	const StringSearch UrlParser::_s_protocol_search(&_s_protocol);
	const StringSearch UrlParser::_s_at_search(&_s_at);
	const StringSearch UrlParser::_s_slash_search(&_s_slash);
	const StringSearch UrlParser::_s_colon_search(&_s_colon);
	const StringSearch UrlParser::_s_question_search(&_s_question);
	const StringSearch UrlParser::_s_hash_search(&_s_hash);

	bool UrlParser::parse_url(const StringRef& url, UrlPart part, StringRef* result) {
	result->data = nullptr;
	result->size = 0;
	// Remove leading and trailing spaces.
	StringRef trimmed_url = url.trim();

	// All parts require checking for the _s_protocol.
	int32_t protocol_pos = _s_protocol_search.search(&trimmed_url);
	if (protocol_pos < 0) {
	return false;
	}

	// Positioned to first char after '://'.
	StringRef protocol_end = trimmed_url.substring(protocol_pos + _s_protocol.size);

	switch (part) {
	case AUTHORITY: {
	// Find first '/'.
	int32_t end_pos = _s_slash_search.search(&protocol_end);
	*result = protocol_end.substring(0, end_pos);
	break;
	}

	case FILE:
	case PATH: {
	// Find first '/'.
	int32_t start_pos = _s_slash_search.search(&protocol_end);

	if (start_pos < 0) {
	// Return empty string. This is what Hive does.
	return true;
	}

	StringRef path_start = protocol_end.substring(start_pos);
	int32_t end_pos;

	if (part == FILE) {
	// End _s_at '#'.
	end_pos = _s_hash_search.search(&path_start);
	} else {
	// End string _s_at next '?' or '#'.
	end_pos = _s_question_search.search(&path_start);

	if (end_pos < 0) {
	// No '?' was found, look for '#'.
	end_pos = _s_hash_search.search(&path_start);
	}
	}

	*result = path_start.substring(0, end_pos);
	break;
	}

	case HOST: {
	// Find '@'.
	int32_t start_pos = _s_at_search.search(&protocol_end);

	if (start_pos < 0) {
	// No '@' was found, i.e., no user:pass info was given, start after _s_protocol.
	start_pos = 0;
	} else {
	// Skip '@'.
	start_pos += _s_at.size;
	}

	StringRef host_start = protocol_end.substring(start_pos);
	// Find first '?'.
	int32_t query_start_pos = _s_question_search.search(&host_start);
	if (query_start_pos > 0) {
	host_start = host_start.substring(0, query_start_pos);
	}
	// Find ':' to strip out port.
	int32_t end_pos = _s_colon_search.search(&host_start);

	if (end_pos < 0) {
	// No port was given. search for '/' to determine ending position.
	end_pos = _s_slash_search.search(&host_start);
	}

	*result = host_start.substring(0, end_pos);
	break;
	}

	case PROTOCOL: {
	*result = trimmed_url.substring(0, protocol_pos);
	break;
	}

	case QUERY: {
	// Find first '?'.
	int32_t start_pos = _s_question_search.search(&protocol_end);

	if (start_pos < 0) {
	// Indicate no query was found.
	return false;
	}

	StringRef query_start = protocol_end.substring(start_pos + _s_question.size);
	// End string _s_at next '#'.
	int32_t end_pos = _s_hash_search.search(&query_start);
	*result = query_start.substring(0, end_pos);
	break;
	}

	case REF: {
	// Find '#'.
	int32_t start_pos = _s_hash_search.search(&protocol_end);

	if (start_pos < 0) {
	// Indicate no user and pass were given.
	return false;
	}

	*result = protocol_end.substring(start_pos + _s_hash.size);
	break;
	}

	case USERINFO: {
	// Find '@'.
	int32_t end_pos = _s_at_search.search(&protocol_end);

	if (end_pos < 0) {
	// Indicate no user and pass were given.
	return false;
	}

	*result = protocol_end.substring(0, end_pos);
	break;
	}

	case PORT: {
	// Find '@'.
	int32_t start_pos = _s_at_search.search(&protocol_end);

	if (start_pos < 0) {
	// No '@' was found, i.e., no user:pass info was given, start after _s_protocol.
	start_pos = 0;
	} else {
	// Skip '@'.
	start_pos += _s_at.size;
	}

	StringRef host_start = protocol_end.substring(start_pos);
	// Find ':' to strip out port.
	int32_t end_pos = _s_colon_search.search(&host_start);
	//no port found
	if (end_pos < 0) {
	return false;
	}

	StringRef port_start_str = protocol_end.substring(end_pos + _s_colon.size);
	int32_t port_end_pos = _s_slash_search.search(&port_start_str);
	//if '/' not found, try to find '?'
	if (port_end_pos < 0) {
	port_end_pos = _s_question_search.search(&port_start_str);
	}
	*result = port_start_str.substring(0, port_end_pos);
	break;
	}

	case INVALID:
	return false;
	}

	return true;
	}

	bool UrlParser::parse_url_key(const StringRef& url, UrlPart part, const StringRef& key,
	StringRef* result) {
	// Part must be query to ask for a specific query key.
	if (part != QUERY) {
	return false;
	}

	// Remove leading and trailing spaces.
	StringRef trimmed_url = url.trim();

	// Search for the key in the url, ignoring malformed URLs for now.
	StringSearch key_search(&key);

	while (trimmed_url.size > 0) {
	// Search for the key in the current substring.
	int32_t key_pos = key_search.search(&trimmed_url);
	bool match = true;

	if (key_pos < 0) {
	return false;
	}

	// Key pos must be != 0 because it must be preceded by a '?' or a '&'.
	// Check that the char before key_pos is either '?' or '&'.
	if (key_pos == 0 \|\|
	(trimmed_url.data[key_pos - 1] != '?' && trimmed_url.data[key_pos - 1] != '&')) {
	match = false;
	}

	// Advance substring beyond matching key.
	trimmed_url = trimmed_url.substring(key_pos + key.size);

	if (!match) {
	continue;
	}

	if (trimmed_url.size <= 0) {
	break;
	}

	// Next character must be '=', otherwise the match cannot be a key in the query part.
	if (trimmed_url.data[0] != '=') {
	continue;
	}

	int32_t pos = 1;

	// Find ending position of key's value by matching '#' or '&'.
	while (pos < trimmed_url.size) {
	switch (trimmed_url.data[pos]) {
	case '#':
	case '&':
	*result = trimmed_url.substring(1, pos - 1);
	return true;
	}

	++pos;
	}

	// Ending position is end of string.
	*result = trimmed_url.substring(1);
	return true;
	}

	return false;
	}

	UrlParser::UrlPart UrlParser::get_url_part(const StringRef& part) {
	// Quick filter on requested URL part, based on first character.
	// Hive requires the requested URL part to be all upper case.
	std::string part_str = part.to_string();
	transform(part_str.begin(), part_str.end(), part_str.begin(), ::toupper);
	StringRef newPart = StringRef(part_str);
	switch (newPart.data[0]) {
	case 'A': {
	if (!newPart.eq(_s_url_authority)) {
	return INVALID;
	}

	return AUTHORITY;
	}

	case 'F': {
	if (!newPart.eq(_s_url_file)) {
	return INVALID;
	}

	return FILE;
	}

	case 'H': {
	if (!newPart.eq(_s_url_host)) {
	return INVALID;
	}

	return HOST;
	}

	case 'P': {
	if (newPart.eq(_s_url_path)) {
	return PATH;
	} else if (newPart.eq(_s_url_protocol)) {
	return PROTOCOL;
	} else if (newPart.eq(_s_url_port)) {
	return PORT;
	} else {
	return INVALID;
	}
	}

	case 'Q': {
	if (!newPart.eq(_s_url_query)) {
	return INVALID;
	}

	return QUERY;
	}

	case 'R': {
	if (!newPart.eq(_s_url_ref)) {
	return INVALID;
	}

	return REF;
	}

	case 'U': {
	if (!newPart.eq(_s_url_userinfo)) {
	return INVALID;
	}

	return USERINFO;
	}

	default:
	return INVALID;
	}
	}

	StringRef UrlParser::extract_url(StringRef url, StringRef name) {
	StringRef result("", 0);
	// Remove leading and trailing spaces.
	StringRef trimmed_url = url.trim();
	// find '?'
	int32_t question_pos = _s_question_search.search(&trimmed_url);
	if (question_pos < 0) {
	// this url no parameters.
	// Example: https://doris.apache.org/
	return result;
	}

	// find '#'
	int32_t hash_pos = _s_hash_search.search(&trimmed_url);
	StringRef sub_url;
	if (hash_pos < 0) {
	sub_url = trimmed_url.substring(question_pos + 1, trimmed_url.size - question_pos - 1);
	} else {
	sub_url = trimmed_url.substring(question_pos + 1, hash_pos - question_pos - 1);
	}

	// find '&' and '=', and extract target parameter
	// Example: k1=aa&k2=bb&k3=cc&test=dd
	int64_t and_pod;
	auto len = sub_url.size;
	StringRef key_url;
	while (true) {
	if (len <= 0) {
	break;
	}
	and_pod = sub_url.find_first_of('&');
	if (and_pod != -1) {
	key_url = sub_url.substring(0, and_pod);
	sub_url = sub_url.substring(and_pod + 1, len - and_pod - 1);
	} else {
	auto end_pos = sub_url.find_first_of('#');
	key_url = end_pos == -1 ? sub_url : sub_url.substring(0, end_pos);
	sub_url = result;
	}
	len = sub_url.size;

	auto eq_pod = key_url.find_first_of('=');
	if (eq_pod == -1) {
	// invalid url. like: k1&k2=bb
	continue;
	}
	int32_t key_len = key_url.size;
	auto key = key_url.substring(0, eq_pod);
	if (name == key) {
	return key_url.substring(eq_pod + 1, key_len - eq_pod - 1);
	}
	}
	return result;
	}
	} // namespace doris