blob: 35c6b8981778b731b1ff09ff1d0f41a8796a2098 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "arrow/util/uri.h"
#include <cstring>
#include <sstream>
#include <vector>
#include "arrow/util/string_view.h"
#include "arrow/util/value_parsing.h"
#include "arrow/vendored/uriparser/Uri.h"
namespace arrow {
namespace internal {
namespace {
// View the characters delimited by a uriparser text range without copying.
// A range whose `first` pointer is null yields an empty view.
util::string_view TextRangeToView(const UriTextRangeStructA& range) {
  if (range.first != nullptr) {
    const auto length = static_cast<size_t>(range.afterLast - range.first);
    return {range.first, length};
  }
  return "";
}
std::string TextRangeToString(const UriTextRangeStructA& range) {
return std::string(TextRangeToView(range));
}
// An absent field is not the same thing as an empty field.
// For example, "unix:/tmp/foo" has no host at all, whereas
// "unix:///tmp/foo" has a host that is present but empty.
// This helper tells the two apart: it returns true when the range is set
// (its `first` pointer is non-null), even if the range is empty.
bool IsTextRangeSet(const UriTextRangeStructA& range) {
  const bool is_absent = (range.first == nullptr);
  return !is_absent;
}
#ifdef _WIN32
// Whether `s` starts with a drive specification, i.e. an ASCII letter
// immediately followed by ':' (for example "C:" or "c:foo").
bool IsDriveSpec(const util::string_view s) {
  if (s.length() < 2 || s[1] != ':') {
    return false;
  }
  const char letter = s[0];
  const bool is_upper = (letter >= 'A' && letter <= 'Z');
  const bool is_lower = (letter >= 'a' && letter <= 'z');
  return is_upper || is_lower;
}
#endif
} // namespace
// Percent-escape `s` with uriparser's uriEscapeExA.
// Spaces are not converted to '+' and line breaks are not normalized.
// An empty input is returned unchanged.
std::string UriEscape(const std::string& s) {
  if (s.empty()) {
    // Avoid passing null pointer to uriEscapeExA
    return s;
  }
  // Each input character can expand to three output characters ("%XX"), and
  // uriEscapeExA additionally writes a terminating NUL after the escaped text.
  // Size the buffer for that terminator explicitly so that, in the worst case,
  // the NUL is written to a regular element of the string rather than one past
  // its last element.
  std::string escaped;
  escaped.resize(3 * s.length() + 1);
  auto end = uriEscapeExA(s.data(), s.data() + s.length(), &escaped[0],
                          /*spaceToPlus=*/URI_FALSE, /*normalizeBreaks=*/URI_FALSE);
  // `end` points at the terminator; trim the string to the escaped text only.
  escaped.resize(end - &escaped[0]);
  return escaped;
}
// Decode percent-escapes in `s` and return the result as a new string.
// The decoding is done in place on a private copy by uriUnescapeInPlaceA;
// an empty input is returned as an empty string without calling uriparser.
std::string UriUnescape(const util::string_view s) {
  std::string unescaped(s);
  if (unescaped.empty()) {
    return unescaped;
  }
  char* const begin = &unescaped[0];
  const auto new_end = uriUnescapeInPlaceA(begin);
  // Shrink the copy to the length of the decoded text.
  unescaped.resize(new_end - begin);
  return unescaped;
}
// Return `host` in a form suitable for the host part of a URI.
// The check is deliberately naive: any host containing a ':' is assumed to
// be an IPv6 address and is wrapped in brackets; every other host is
// returned unchanged.
std::string UriEncodeHost(const std::string& host) {
  const bool looks_like_ipv6 = (host.find(':') != std::string::npos);
  if (!looks_like_ipv6) {
    return host;
  }
  std::string bracketed = "[";
  bracketed += host;
  bracketed += ']';
  return bracketed;
}
// Private state of a Uri: the uriparser structure plus the data derived
// from it when a URI string is parsed.
struct Uri::Impl {
  // Start from an empty state: zeroed uriparser struct, no port (-1), and both
  // path flags false so that accessors never read indeterminate values.
  Impl() : string_rep_(""), port_(-1), is_file_uri_(false), is_absolute_path_(false) {
    memset(&uri_, 0, sizeof(uri_));
  }

  ~Impl() { uriFreeUriMembersA(&uri_); }

  // Release everything owned by the uriparser struct and return to the same
  // empty state the constructor establishes.
  void Reset() {
    uriFreeUriMembersA(&uri_);
    memset(&uri_, 0, sizeof(uri_));
    data_.clear();
    string_rep_.clear();
    path_segments_.clear();
    port_ = -1;
    // Clear the flags as well, otherwise values from a previous parse would
    // survive into a state that has no path segments.
    is_file_uri_ = false;
    is_absolute_path_ = false;
  }

  // Store a copy of `s` in data_ and return a reference to the stored copy.
  const std::string& KeepString(const std::string& s) {
    data_.push_back(s);
    return data_.back();
  }

  UriUriA uri_;
  // Keep alive strings that uriparser stores pointers to
  std::vector<std::string> data_;
  // Textual representation of the URI; empty after construction and Reset().
  std::string string_rep_;
  // Port number, or -1 when none has been set.
  int32_t port_;
  // Path segments as non-owning views.
  // NOTE(review): presumably these point into a string held in data_ — confirm
  // against the code that fills them.
  std::vector<util::string_view> path_segments_;
  // Whether the URI uses the "file" scheme; false until set by parsing.
  bool is_file_uri_;
  // Whether the URI path is considered absolute; false until set by parsing.
  bool is_absolute_path_;
};
// Create an empty Uri backed by a newly allocated Impl.
Uri::Uri() : impl_(new Impl()) {}

// Nothing to do beyond destroying the members.
Uri::~Uri() = default;

// Move construction: take over the other Uri's implementation object.
Uri::Uri(Uri&& u) : impl_(std::move(u.impl_)) {}

// Move assignment: take over the other Uri's implementation object.
Uri& Uri::operator=(Uri&& u) {
  impl_ = std::move(u.impl_);
  return *this;
}
// The scheme text of the parsed URI (empty string if unset).
std::string Uri::scheme() const {
  const auto& range = impl_->uri_.scheme;
  return TextRangeToString(range);
}

// The host text of the parsed URI (empty string if unset).
std::string Uri::host() const {
  const auto& range = impl_->uri_.hostText;
  return TextRangeToString(range);
}

// Whether a host is present in the parsed URI, even if it is empty.
bool Uri::has_host() const {
  const auto& range = impl_->uri_.hostText;
  return IsTextRangeSet(range);
}

// The port of the parsed URI as text (empty string if unset).
std::string Uri::port_text() const {
  const auto& range = impl_->uri_.portText;
  return TextRangeToString(range);
}

// The port number stored by the implementation object.
int32_t Uri::port() const {
  const int32_t port_number = impl_->port_;
  return port_number;
}
std::string Uri::username() const {
auto userpass = TextRangeToView(impl_->uri_.userInfo);
auto sep_pos = userpass.find_first_of(':');
if (sep_pos == util::string_view::npos) {
return UriUnescape(userpass);
} else {
return UriUnescape(userpass.substr(0, sep_pos));
}
}
std::string Uri::password() const {
auto userpass = TextRangeToView(impl_->uri_.userInfo);
auto sep_pos = userpass.find_first_of(':');
if (sep_pos == util::string_view::npos) {
return std::string();
} else {
return UriUnescape(userpass.substr(sep_pos + 1));
}
}
// Rebuild the URI path by joining the stored path segments with '/'.
// A leading '/' is emitted when the path is flagged as absolute.
std::string Uri::path() const {
  const auto& segments = impl_->path_segments_;
  bool leading_slash = impl_->is_absolute_path_;
#ifdef _WIN32
  // On Windows, "file:///C:/foo" should have path "C:/foo", not "/C:/foo",
  // despite it being absolute.
  // (see https://tools.ietf.org/html/rfc8089#page-13)
  const bool starts_with_drive = segments.size() > 0 && IsDriveSpec(segments[0]);
  if (impl_->is_absolute_path_ && impl_->is_file_uri_ && starts_with_drive) {
    leading_slash = false;
  }
#endif
  std::stringstream out;
  if (leading_slash) {
    out << "/";
  }
  for (size_t i = 0; i < segments.size(); ++i) {
    // Separator goes between segments, never before the first one.
    if (i > 0) {
      out << "/";
    }
    out << segments[i];
  }
  return std::move(out).str();
}
// The query text of the parsed URI (empty string if unset).
std::string Uri::query_string() const {
  const auto& range = impl_->uri_.query;
  return TextRangeToString(range);
}
// Split the query string into (key, value) pairs using uriparser.
// Returns an empty vector when the URI has no query, and Status::Invalid when
// uriparser fails to dissect the query. An item with a null value is
// reported with an empty string as its value.
Result<std::vector<std::pair<std::string, std::string>>> Uri::query_items() const {
  std::vector<std::pair<std::string, std::string>> items;
  const auto& query = impl_->uri_.query;
  if (query.first == nullptr) {
    return items;
  }

  UriQueryListA* head = nullptr;
  int item_count = 0;
  const int rc =
      uriDissectQueryMallocA(&head, &item_count, query.first, query.afterLast);
  if (rc != URI_SUCCESS) {
    return Status::Invalid("Cannot parse query string: '", query_string(), "'");
  }
  // Free the list with uriFreeQueryListA on every exit path from here on.
  std::unique_ptr<UriQueryListA, decltype(&uriFreeQueryListA)> query_guard(
      head, uriFreeQueryListA);

  items.reserve(item_count);
  for (const UriQueryListA* node = head; node != nullptr; node = node->next) {
    const char* value = (node->value != nullptr) ? node->value : "";
    items.emplace_back(node->key, value);
  }
  return items;
}
// The string representation stored by the implementation object.
// The returned reference refers to a member of that object.
const std::string& Uri::ToString() const {
  const std::string& representation = impl_->string_rep_;
  return representation;
}
// Parse `uri_string` and populate this Uri from it, discarding any previous
// state first.
//
// Returns Status::Invalid when:
//  - uriparser rejects the string,
//  - the scheme is empty,
//  - the scheme is "file" but the path is not absolute,
//  - the port text is non-empty but does not parse as a 16-bit unsigned value.
Status Uri::Parse(const std::string& uri_string) {
  impl_->Reset();

  // Parse a copy owned by impl_, since uriparser stores pointers into the
  // text it is given.
  const std::string& owned = impl_->KeepString(uri_string);
  impl_->string_rep_ = owned;

  const char* error_pos;
  const int rc = uriParseSingleUriExA(&impl_->uri_, owned.data(),
                                      owned.data() + owned.size(), &error_pos);
  if (rc != URI_SUCCESS) {
    return Status::Invalid("Cannot parse URI: '", uri_string, "'");
  }

  const auto scheme = TextRangeToView(impl_->uri_.scheme);
  if (scheme.empty()) {
    return Status::Invalid("URI has empty scheme: '", uri_string, "'");
  }
  impl_->is_file_uri_ = (scheme == "file");

  // Gather path segments
  for (auto seg = impl_->uri_.pathHead; seg != nullptr; seg = seg->next) {
    impl_->path_segments_.push_back(TextRangeToView(seg->text));
  }
  const bool has_segments = impl_->path_segments_.size() > 0;

  // Decide whether URI path is absolute.
  // When there's a host (even empty), uriparser considers the path relative.
  // Several URI parsers for Python all consider it absolute, though.
  // For example, the path for "file:///tmp/foo" is "/tmp/foo", not "tmp/foo".
  // Similarly, the path for "file://localhost/" is "/".
  // However, the path for "file://localhost" is "".
  impl_->is_absolute_path_ =
      (impl_->uri_.absolutePath == URI_TRUE) || (has_host() && has_segments);
#ifdef _WIN32
  // There's an exception on Windows: "file:/C:foo/bar" is relative.
  if (impl_->is_file_uri_ && has_segments) {
    const auto& first_seg = impl_->path_segments_[0];
    const bool drive_with_tail =
        IsDriveSpec(first_seg) && first_seg.length() >= 3 && first_seg[2] != '/';
    if (drive_with_tail) {
      impl_->is_absolute_path_ = false;
    }
  }
#endif
  if (impl_->is_file_uri_ && !impl_->is_absolute_path_) {
    return Status::Invalid("File URI cannot be relative: '", uri_string, "'");
  }

  // Parse port number; an absent or empty port leaves the stored port untouched.
  const auto port_text = TextRangeToView(impl_->uri_.portText);
  if (port_text.size() != 0) {
    uint16_t port_num;
    const bool parsed =
        ParseValue<UInt16Type>(port_text.data(), port_text.size(), &port_num);
    if (!parsed) {
      return Status::Invalid("Invalid port number '", port_text, "' in URI '", uri_string,
                             "'");
    }
    impl_->port_ = port_num;
  }
  return Status::OK();
}
} // namespace internal
} // namespace arrow