// blob: a0bdd153e6b0577ef15db1bc2d6b3fad0a635eeb [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "util/url_parser.h"
#include <ctype.h>
#include <stdint.h>
#include <algorithm>
#include <string>
#include "runtime/string_search.hpp"
#include "vec/common/string_ref.h"
namespace doris {
// Names of the URL parts, in upper case. get_url_part() upper-cases the
// requested name and compares it against these. The second constructor
// argument is the length of the literal, without the terminating NUL.
// NOTE(review): the const_cast<char*> presumably dates from a StringRef whose
// data pointer was non-const -- confirm whether it is still required.
const StringRef UrlParser::_s_url_authority(const_cast<char*>("AUTHORITY"), 9);
const StringRef UrlParser::_s_url_file(const_cast<char*>("FILE"), 4);
const StringRef UrlParser::_s_url_host(const_cast<char*>("HOST"), 4);
const StringRef UrlParser::_s_url_path(const_cast<char*>("PATH"), 4);
const StringRef UrlParser::_s_url_protocol(const_cast<char*>("PROTOCOL"), 8);
const StringRef UrlParser::_s_url_query(const_cast<char*>("QUERY"), 5);
const StringRef UrlParser::_s_url_ref(const_cast<char*>("REF"), 3);
const StringRef UrlParser::_s_url_userinfo(const_cast<char*>("USERINFO"), 8);
const StringRef UrlParser::_s_url_port(const_cast<char*>("PORT"), 4);
// Delimiters that separate the components of a URL.
const StringRef UrlParser::_s_protocol(const_cast<char*>("://"), 3);
const StringRef UrlParser::_s_at(const_cast<char*>("@"), 1);
const StringRef UrlParser::_s_slash(const_cast<char*>("/"), 1);
const StringRef UrlParser::_s_colon(const_cast<char*>(":"), 1);
const StringRef UrlParser::_s_question(const_cast<char*>("?"), 1);
const StringRef UrlParser::_s_hash(const_cast<char*>("#"), 1);
// One searcher per delimiter, each constructed from the address of the
// matching pattern above. Keep these definitions after the patterns: objects
// in one translation unit are initialized in definition order, so the
// pattern is already constructed when its searcher is built.
const StringSearch UrlParser::_s_protocol_search(&_s_protocol);
const StringSearch UrlParser::_s_at_search(&_s_at);
const StringSearch UrlParser::_s_slash_search(&_s_slash);
const StringSearch UrlParser::_s_colon_search(&_s_colon);
const StringSearch UrlParser::_s_question_search(&_s_question);
const StringSearch UrlParser::_s_hash_search(&_s_hash);
// Extracts one component of `url` into `*result`.
//
// `*result` is first reset to (nullptr, 0) and, on success, is assigned a
// substring() of the trimmed input, so no characters are copied here.
//
// Returns false when:
//   - the URL contains no "://",
//   - `part` is QUERY, REF, USERINFO or PORT and the URL has no such component,
//   - `part` is INVALID.
// Returns true otherwise. For FILE/PATH on a URL without any '/' after the
// protocol, true is returned with `*result` left empty.
//
// Several cases pass the (possibly negative) result of a failed search
// straight to substring() as the length.
// NOTE(review): this presumably relies on StringRef::substring() treating a
// negative length as "up to the end of the string" -- confirm in StringRef.
//
// NOTE(review): the '@' and ':' searches below scan everything after "://",
// not just the authority, so such a character appearing only in the path or
// query is still treated as a userinfo/port delimiter.
bool UrlParser::parse_url(const StringRef& url, UrlPart part, StringRef* result) {
    result->data = nullptr;
    result->size = 0;
    // Remove leading and trailing spaces.
    StringRef trimmed_url = url.trim();

    // All parts require checking for the protocol separator "://".
    int32_t protocol_pos = _s_protocol_search.search(&trimmed_url);
    if (protocol_pos < 0) {
        return false;
    }

    // Positioned to first char after '://'.
    StringRef protocol_end = trimmed_url.substring(protocol_pos + _s_protocol.size);

    switch (part) {
    case AUTHORITY: {
        // Find first '/'.
        int32_t end_pos = _s_slash_search.search(&protocol_end);
        *result = protocol_end.substring(0, end_pos);
        break;
    }

    case FILE:
    case PATH: {
        // Find first '/'.
        int32_t start_pos = _s_slash_search.search(&protocol_end);
        if (start_pos < 0) {
            // Return empty string. This is what Hive does.
            return true;
        }
        StringRef path_start = protocol_end.substring(start_pos);
        int32_t end_pos;
        if (part == FILE) {
            // FILE keeps the query string: end at '#'.
            end_pos = _s_hash_search.search(&path_start);
        } else {
            // PATH ends at the next '?' or '#'.
            end_pos = _s_question_search.search(&path_start);
            if (end_pos < 0) {
                // No '?' was found, look for '#'.
                end_pos = _s_hash_search.search(&path_start);
            }
        }
        *result = path_start.substring(0, end_pos);
        break;
    }

    case HOST: {
        // Find '@'.
        int32_t start_pos = _s_at_search.search(&protocol_end);
        if (start_pos < 0) {
            // No '@' was found, i.e., no user:pass info was given, start after the protocol.
            start_pos = 0;
        } else {
            // Skip '@'.
            start_pos += _s_at.size;
        }
        StringRef host_start = protocol_end.substring(start_pos);
        // Cut off the query string (first '?') so a ':' or '/' inside it is not
        // mistaken for the end of the host.
        int32_t query_start_pos = _s_question_search.search(&host_start);
        if (query_start_pos > 0) {
            host_start = host_start.substring(0, query_start_pos);
        }
        // Find ':' to strip out port.
        int32_t end_pos = _s_colon_search.search(&host_start);
        if (end_pos < 0) {
            // No port was given. Search for '/' to determine ending position.
            end_pos = _s_slash_search.search(&host_start);
        }
        *result = host_start.substring(0, end_pos);
        break;
    }

    case PROTOCOL: {
        *result = trimmed_url.substring(0, protocol_pos);
        break;
    }

    case QUERY: {
        // Find first '?'.
        int32_t start_pos = _s_question_search.search(&protocol_end);
        if (start_pos < 0) {
            // Indicate no query was found.
            return false;
        }
        StringRef query_start = protocol_end.substring(start_pos + _s_question.size);
        // End string at next '#'.
        int32_t end_pos = _s_hash_search.search(&query_start);
        *result = query_start.substring(0, end_pos);
        break;
    }

    case REF: {
        // Find '#'.
        int32_t start_pos = _s_hash_search.search(&protocol_end);
        if (start_pos < 0) {
            // Indicate no ref (fragment) was found.
            return false;
        }
        *result = protocol_end.substring(start_pos + _s_hash.size);
        break;
    }

    case USERINFO: {
        // Find '@'.
        int32_t end_pos = _s_at_search.search(&protocol_end);
        if (end_pos < 0) {
            // Indicate no user and pass were given.
            return false;
        }
        *result = protocol_end.substring(0, end_pos);
        break;
    }

    case PORT: {
        // Find '@'.
        int32_t start_pos = _s_at_search.search(&protocol_end);
        if (start_pos < 0) {
            // No '@' was found, i.e., no user:pass info was given, start after the protocol.
            start_pos = 0;
        } else {
            // Skip '@'.
            start_pos += _s_at.size;
        }
        StringRef host_start = protocol_end.substring(start_pos);
        // Find ':' separating host and port.
        int32_t end_pos = _s_colon_search.search(&host_start);
        // No port found.
        if (end_pos < 0) {
            return false;
        }
        // end_pos is an offset into host_start, so the port must be sliced from
        // host_start as well. Slicing protocol_end would be off by the length of
        // the "user:pass@" prefix whenever userinfo is present.
        StringRef port_start_str = host_start.substring(end_pos + _s_colon.size);
        int32_t port_end_pos = _s_slash_search.search(&port_start_str);
        // If '/' is not found, try to find '?'.
        if (port_end_pos < 0) {
            port_end_pos = _s_question_search.search(&port_start_str);
        }
        *result = port_start_str.substring(0, port_end_pos);
        break;
    }

    case INVALID:
        return false;
    }
    return true;
}
bool UrlParser::parse_url_key(const StringRef& url, UrlPart part, const StringRef& key,
StringRef* result) {
// Part must be query to ask for a specific query key.
if (part != QUERY) {
return false;
}
// Remove leading and trailing spaces.
StringRef trimmed_url = url.trim();
// Search for the key in the url, ignoring malformed URLs for now.
StringSearch key_search(&key);
while (trimmed_url.size > 0) {
// Search for the key in the current substring.
int32_t key_pos = key_search.search(&trimmed_url);
bool match = true;
if (key_pos < 0) {
return false;
}
// Key pos must be != 0 because it must be preceded by a '?' or a '&'.
// Check that the char before key_pos is either '?' or '&'.
if (key_pos == 0 ||
(trimmed_url.data[key_pos - 1] != '?' && trimmed_url.data[key_pos - 1] != '&')) {
match = false;
}
// Advance substring beyond matching key.
trimmed_url = trimmed_url.substring(key_pos + key.size);
if (!match) {
continue;
}
if (trimmed_url.size <= 0) {
break;
}
// Next character must be '=', otherwise the match cannot be a key in the query part.
if (trimmed_url.data[0] != '=') {
continue;
}
int32_t pos = 1;
// Find ending position of key's value by matching '#' or '&'.
while (pos < trimmed_url.size) {
switch (trimmed_url.data[pos]) {
case '#':
case '&':
*result = trimmed_url.substring(1, pos - 1);
return true;
}
++pos;
}
// Ending position is end of string.
*result = trimmed_url.substring(1);
return true;
}
return false;
}
// Maps a URL part name to its UrlPart enumerator.
//
// The name is upper-cased before comparison, so matching is case-insensitive
// ("host", "Host" and "HOST" all yield HOST). Returns INVALID for an empty
// name or any name that is not one of AUTHORITY, FILE, HOST, PATH, PROTOCOL,
// PORT, QUERY, REF, USERINFO.
UrlParser::UrlPart UrlParser::get_url_part(const StringRef& part) {
    std::string part_str = part.to_string();
    if (part_str.empty()) {
        return INVALID;
    }
    // toupper() has undefined behaviour for negative values other than EOF, so
    // every char is converted to unsigned char first (matters for non-ASCII bytes).
    std::transform(part_str.begin(), part_str.end(), part_str.begin(),
                   [](unsigned char c) { return static_cast<char>(::toupper(c)); });
    StringRef newPart = StringRef(part_str);

    // Quick filter on requested URL part, based on first character.
    switch (part_str.front()) {
    case 'A': {
        if (!newPart.eq(_s_url_authority)) {
            return INVALID;
        }
        return AUTHORITY;
    }
    case 'F': {
        if (!newPart.eq(_s_url_file)) {
            return INVALID;
        }
        return FILE;
    }
    case 'H': {
        if (!newPart.eq(_s_url_host)) {
            return INVALID;
        }
        return HOST;
    }
    case 'P': {
        // Three part names share the leading 'P'.
        if (newPart.eq(_s_url_path)) {
            return PATH;
        } else if (newPart.eq(_s_url_protocol)) {
            return PROTOCOL;
        } else if (newPart.eq(_s_url_port)) {
            return PORT;
        } else {
            return INVALID;
        }
    }
    case 'Q': {
        if (!newPart.eq(_s_url_query)) {
            return INVALID;
        }
        return QUERY;
    }
    case 'R': {
        if (!newPart.eq(_s_url_ref)) {
            return INVALID;
        }
        return REF;
    }
    case 'U': {
        if (!newPart.eq(_s_url_userinfo)) {
            return INVALID;
        }
        return USERINFO;
    }
    default:
        return INVALID;
    }
}
// Returns the value of the query parameter `name` in `url`, or an empty
// StringRef ("", 0) when the URL has no '?' or no parameter with that name.
//
// The query string is taken from just after the first '?' up to the first
// '#' (or the end of the URL), then split on '&'. Each piece is split on its
// first '='; pieces without '=' are skipped. The first piece whose name
// equals `name` wins.
// Example: for "...?k1=aa&k2=bb&k3=cc&test=dd" and name "k2" the result is "bb".
StringRef UrlParser::extract_url(StringRef url, StringRef name) {
    const StringRef empty("", 0);

    // Remove leading and trailing spaces.
    StringRef trimmed = url.trim();

    const int32_t question_pos = _s_question_search.search(&trimmed);
    if (question_pos < 0) {
        // No parameters at all, e.g. https://doris.apache.org/
        return empty;
    }

    // Isolate the query string: everything after '?' and, when a '#' is
    // present, before it.
    const int32_t hash_pos = _s_hash_search.search(&trimmed);
    StringRef pending =
            (hash_pos < 0)
                    ? trimmed.substring(question_pos + 1, trimmed.size - question_pos - 1)
                    : trimmed.substring(question_pos + 1, hash_pos - question_pos - 1);

    // Consume one "key=value" piece per iteration from the front of `pending`.
    while (pending.size > 0) {
        StringRef pair;
        const int64_t amp_pos = pending.find_first_of('&');
        if (amp_pos == -1) {
            // Last piece: take it (up to a '#', if any) and leave nothing pending.
            const auto frag_pos = pending.find_first_of('#');
            pair = (frag_pos == -1) ? pending : pending.substring(0, frag_pos);
            pending = empty;
        } else {
            pair = pending.substring(0, amp_pos);
            pending = pending.substring(amp_pos + 1, pending.size - amp_pos - 1);
        }

        const auto eq_pos = pair.find_first_of('=');
        if (eq_pos == -1) {
            // Piece has no value, e.g. the "k1" in "k1&k2=bb"; skip it.
            continue;
        }

        const int32_t pair_len = pair.size;
        auto param_name = pair.substring(0, eq_pos);
        if (name == param_name) {
            return pair.substring(eq_pos + 1, pair_len - eq_pos - 1);
        }
    }
    return empty;
}
} // namespace doris