blob: ad50e6b3b877c73ff6846aa8445617f9b94b1eac [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "exprs/timezone_db.h"
#include <libgen.h>
#include <iostream>
#include <string>
#include <regex>
#include <boost/algorithm/string.hpp>
#include "common/compiler-util.h"
#include "common/logging.h"
#include "kudu/util/path_util.h"
#include "gutil/strings/ascii_ctype.h"
#include "gutil/strings/substitute.h"
#include "runtime/hdfs-fs-cache.h"
#include "util/debug-util.h"
#include "util/filesystem-util.h"
#include "util/hdfs-util.h"
#include "util/string-parser.h"
#include "common/names.h"
using std::ios_base;
using std::istream;
using std::regex;
using std::regex_constants::ECMAScript;
using std::regex_match;
using boost::algorithm::trim;
using kudu::JoinPathSegments;
// Optional location of a zipped IANA time-zone database. When this flag is empty,
// TimezoneDatabase::Initialize() loads the database from
// TimezoneDatabase::ZONE_INFO_DIR instead.
DEFINE_string(hdfs_zone_info_zip, "",
"HDFS/S3A/ADLS path to a zip archive of the IANA time-zone database to use.");
// Optional location of a time-zone alias config file. When this flag is empty,
// TimezoneDatabase::Initialize() parses the built-in
// TimezoneDatabase::TIMEZONE_ALIASES definitions instead.
DEFINE_string(hdfs_zone_alias_conf, "",
"HDFS/S3A/ADLS path to config file defining non-standard time-zone aliases.");
// Defined elsewhere. Initialize() passes it to LoadZoneInfoFromHdfs() as the local
// directory under which the temporary time-zone db directory is created.
DECLARE_string(local_library_dir);
namespace impala {
// Size of the buffer used by LoadZoneAliasesFromHdfs() for each hdfsRead() call.
static const int HDFS_READ_SIZE = 64 * 1024; // bytes
// Directory the time-zone database is loaded from when FLAGS_hdfs_zone_info_zip is
// empty. LocalZoneName() also uses it to turn the target of /etc/localtime into a
// time-zone name.
const string TimezoneDatabase::ZONE_INFO_DIR = "/usr/share/zoneinfo";
// Built-in alias definitions, parsed by LoadZoneAliases() when
// FLAGS_hdfs_zone_alias_conf is empty. Each line has the "alias = value" form and
// everything after a '#' is treated as a comment.
const string TimezoneDatabase::TIMEZONE_ALIASES = \
"#\n"
"# Java supports these non-standard time-zone abbreviations\n"
"#\n"
"ACT = Australia/Darwin\n"
"AET = Australia/Sydney\n"
"AGT = America/Argentina/Buenos_Aires\n"
"ART = Africa/Cairo\n"
"AST = America/Anchorage\n"
"BET = America/Sao_Paulo\n"
"BST = Asia/Dhaka\n"
"CAT = Africa/Harare\n"
"CNT = America/St_Johns\n"
"CST = America/Chicago\n"
"CTT = Asia/Shanghai\n"
"EAT = Africa/Addis_Ababa\n"
"ECT = Europe/Paris\n"
"IET = America/Indiana/Indianapolis\n"
"IST = Asia/Kolkata\n"
"JST = Asia/Tokyo\n"
"MIT = Pacific/Apia\n"
"NET = Asia/Yerevan\n"
"NST = Pacific/Auckland\n"
"PLT = Asia/Karachi\n"
"PNT = America/Phoenix\n"
"PRT = America/Puerto_Rico\n"
"PST = America/Los_Angeles\n"
"SST = Pacific/Guadalcanal\n"
"VST = Asia/Ho_Chi_Minh\n";
// The UTC time-zone as provided by cctz.
const Timezone TimezoneDatabase::UTC_TIMEZONE_ = cctz::utc_time_zone();
// Maps time-zone names and aliases to 'Timezone' objects. Populated by LoadZoneInfo()
// and LoadZoneAliases().
TimezoneDatabase::TimezoneMap TimezoneDatabase::tz_name_map_;
// Location the time-zone database was requested from. Set by Initialize() to either
// FLAGS_hdfs_zone_info_zip or ZONE_INFO_DIR.
string TimezoneDatabase::tz_db_path_;
bool TimezoneDatabase::IsTimezoneNameSegmentValid(const string& tz_segment) {
static const regex reg("[A-Z][A-Za-z0-9:_+-]*", ECMAScript);
return regex_match(tz_segment, reg);
}
bool TimezoneDatabase::IsTimezoneNameValid(const string& tz_name) {
static const regex reg("[A-Z][A-Za-z0-9:_+-]*(/[A-Z][A-Za-z0-9:_+-]*)*", ECMAScript);
return regex_match(tz_name, reg);
}
bool TimezoneDatabase::IsTimezoneOffsetValid(const string& tz_offset,
int64_t* offset_sec) {
if (tz_offset.empty()) return false;
// The absolute value of the offset_sec cannot be greater than or equal to 24 hours.
constexpr int64_t max_abs_offset_sec = 24*60*60 - 1;
StringParser::ParseResult result;
*offset_sec = StringParser::StringToInt<int64_t>(
tz_offset.c_str(), tz_offset.length(), &result);
return result == StringParser::PARSE_SUCCESS &&
*offset_sec <= max_abs_offset_sec && *offset_sec >= -max_abs_offset_sec;
}
// The implementation here was adapted from
// https://github.com/HowardHinnant/date/blob/040eed838bb1f695c31c6016dbe74bddc0302bb8/
// src/tz.cpp#L3652
// available under MIT license.
//
// Returns the name of the local time-zone. The following sources are tried in order and
// the first one that yields a name wins:
//   1. The TZ environment variable, in the "[:]<zone-name>" form.
//   2. The target of the /etc/localtime symbolic link, relative to ZONE_INFO_DIR.
//   3. The first line of /etc/timezone.
//   4. The first ZONE="<zone-name>" entry in /etc/sysconfig/clock.
// If none of them works, a warning is logged and an empty string is returned.
string TimezoneDatabase::LocalZoneName() {
  {
    // Allow ${TZ} to override the default zone.
    const char* zone = ":localtime";
    const char* tz_env = getenv("TZ");
    if (tz_env != nullptr) zone = tz_env;
    // We only support the "[:]<zone-name>" form.
    if (*zone == ':') ++zone;
    if (strcmp(zone, "localtime") != 0) return string(zone);
    // Fall through to try other means.
  }
  {
    // Check /etc/localtime.
    // - If it exists and is a symlink it should point to the current timezone file in the
    //   zoneinfo directory.
    // - If it doesn't exist or is not a symlink we want to try other means.
    const char* localtime = "/etc/localtime";
    bool is_symbolic_link;
    string canonical_path;
    Status status = FileSystemUtil::IsSymbolicLink(localtime, &is_symbolic_link,
        &canonical_path);
    if (!status.ok()) {
      LOG(WARNING) << status.GetDetail();
    } else if (is_symbolic_link) {
      string linked_tz;
      if (FileSystemUtil::GetRelativePath(canonical_path, ZONE_INFO_DIR, &linked_tz)
          && !linked_tz.empty()) {
        return linked_tz;
      }
      LOG(WARNING) << "Symbolic link " << localtime << " resolved to the wrong path: "
                   << canonical_path;
    }
    // Fall through to try other means.
  }
  {
    // On some versions of some linux distro's (e.g. Ubuntu), the current timezone might
    // be in the first line of the /etc/timezone file.
    ifstream timezone_file("/etc/timezone");
    if (timezone_file.is_open()) {
      string result;
      getline(timezone_file, result);
      if (!result.empty()) return result;
    }
    // Fall through to try other means.
  }
  {
    // On some versions of some linux distro's (e.g. Red Hat), the current timezone might
    // be in the /etc/sysconfig/clock file as: ZONE="US/Eastern"
    ifstream clock_file("/etc/sysconfig/clock");
    const string zone_key = "ZONE=\"";
    string line;
    while (getline(clock_file, line)) {
      const size_t key_pos = line.find(zone_key);
      if (key_pos == string::npos) continue;
      // The zone name starts right after the opening quote, wherever the key is located
      // in the line, and ends before the last quote of the line.
      const size_t value_begin = key_pos + zone_key.size();
      const size_t closing_quote = line.rfind('"');
      // rfind() always finds at least the opening quote. If that is the last quote, the
      // closing quote is missing: use the rest of the line instead of failing.
      if (closing_quote < value_begin) return line.substr(value_begin);
      return line.substr(value_begin, closing_quote - value_begin);
    }
    // Fall through to try other means.
  }
  LOG(WARNING) << "Could not get local timezone.";
  return "";
}
// Loads the time-zone database from the zip archive at 'hdfs_zone_info_zip'.
// The archive is copied into a newly created temporary directory under 'local_dir',
// extracted there and loaded with LoadZoneInfo(). The temporary directory is removed
// again before returning, both on success and on failure; a failure to remove it is
// logged as a warning and does not change the returned status.
// Returns an error if a filesystem connection cannot be obtained, if the temporary
// directory cannot be created, or if copying, extracting or loading fails.
Status TimezoneDatabase::LoadZoneInfoFromHdfs(const string& hdfs_zone_info_zip,
    const string& local_dir) {
  DCHECK(!hdfs_zone_info_zip.empty());

  hdfsFS hdfs_conn, local_conn;
  RETURN_IF_ERROR(HdfsFsCache::instance()->GetConnection(hdfs_zone_info_zip, &hdfs_conn));
  RETURN_IF_ERROR(HdfsFsCache::instance()->GetLocalConnection(&local_conn));

  // Create a temporary directory to copy the timezone db to. The CCTZ interface only
  // loads timezone info from a directory. We abort the startup if this initialization
  // fails for some reason.
  string pathname = JoinPathSegments(local_dir, "impala.tzdb.XXXXXXX");

  // mkdtemp operates in place, so we need a mutable array.
  vector<char> local_path(pathname.c_str(), pathname.c_str() + pathname.size() + 1);
  if (mkdtemp(local_path.data()) == nullptr) {
    return Status(Substitute("Could not create temporary timezone directory: $0. Check "
        "that the directory $1 is writable by the user running Impala.",
        local_path.data(), local_dir));
  }

  // Removes the temporary directory. Used on every exit path from here on, so that a
  // failed removal is reported the same way everywhere.
  const auto remove_local_path = [&local_path]() {
    Status rm_status = FileSystemUtil::RemovePaths({local_path.data()});
    if (!rm_status.ok()) LOG(WARNING) << rm_status.GetDetail();
  };

  Status status = CopyHdfsFile(hdfs_conn, hdfs_zone_info_zip, local_conn,
      local_path.data());
  if (!status.ok()) {
    remove_local_path();
    return status;
  }

  // Extract files from the zip archive.
  string archive_file = JoinPathSegments(local_path.data(),
      GetBaseName(hdfs_zone_info_zip.c_str()));
  string destination_dir = JoinPathSegments(local_path.data(), "tzdb");
  status = ZipUtil::ExtractFiles(archive_file, destination_dir);
  if (!status.ok()) {
    remove_local_path();
    return status;
  }

  // Find the root directory to load the time-zone db from.
  // - If 'destination_dir' contains only one subdirectory, root directory should be set
  //   to that subdirectory.
  // - Otherwise, root directory is 'destination_dir'.
  vector<string> entry_names;
  status = FileSystemUtil::Directory::GetEntryNames(destination_dir, &entry_names, 2);
  if (!status.ok()) {
    remove_local_path();
    return status;
  }

  string zone_info_root_dir = destination_dir;
  if (entry_names.size() == 1) {
    string entry_path = JoinPathSegments(destination_dir, entry_names[0]);
    Status is_dir = FileSystemUtil::VerifyIsDirectory(entry_path);
    if (is_dir.ok()) zone_info_root_dir = entry_path;
  }

  status = LoadZoneInfo(zone_info_root_dir);
  remove_local_path();
  return status;
}
// Loads every time-zone found under 'zone_info_dir' and registers it in 'tz_name_map_'
// under its time-zone name, i.e. its path relative to the canonical form of
// 'zone_info_dir'. Entries whose relative path cannot be determined or is not a valid
// time-zone name are skipped with a warning.
// Returns an error if 'zone_info_dir' cannot be canonicalized or if traversing the
// directory tree fails.
Status TimezoneDatabase::LoadZoneInfo(const string& zone_info_dir) {
  string root_dir;
  RETURN_IF_ERROR(FileSystemUtil::GetCanonicalPath(zone_info_dir, &root_dir));

  // First collect the 'Timezone' objects keyed by the path they were loaded from.
  TimezoneMap loaded_by_path;
  RETURN_IF_ERROR(LoadZoneInfoHelper(root_dir, root_dir, loaded_by_path));

  // Then re-key them by time-zone name.
  for (const auto& entry : loaded_by_path) {
    const string& file_path = entry.first;
    string zone_name;
    const bool usable =
        FileSystemUtil::GetRelativePath(file_path, root_dir, &zone_name)
        && IsTimezoneNameValid(zone_name);
    if (!usable) {
      LOG(WARNING) << "Skipped adding " << file_path << " to timezone db.";
      continue;
    }
    tz_name_map_[zone_name] = entry.second;
  }
  return Status::OK();
}
// Walks the directory 'path' and adds the time-zones found in it to 'tz_path_map'.
// Sub-directories are processed recursively; every other entry is handed to
// LoadTimezone(). Entries whose name is not a valid time-zone name segment are skipped
// with a warning. 'zone_info_dir' is passed through unchanged to the recursive calls
// and to LoadTimezone().
// Returns the first error reported by a recursive call, otherwise the last status of
// the directory iteration.
Status TimezoneDatabase::LoadZoneInfoHelper(const string& path,
    const string& zone_info_dir, TimezoneMap& tz_path_map) {
  FileSystemUtil::Directory dir(path);
  string name;
  while (dir.GetNextEntryName(&name)) {
    if (!IsTimezoneNameSegmentValid(name)) {
      LOG(WARNING) << "Skipping " << path << "/" << name
                   << " path: " << name
                   << " is not a valid time-zone segment name.";
      continue;
    }

    const string child_path = JoinPathSegments(path, name);
    Status dir_check = FileSystemUtil::VerifyIsDirectory(child_path);
    if (!dir_check.ok()) {
      // Not a directory: try to load it as a time-zone data file. LoadTimezone() logs a
      // warning if that is not possible.
      LoadTimezone(child_path, zone_info_dir, tz_path_map);
      continue;
    }

    // A directory: descend into it and stop at the first error.
    Status sub_status = LoadZoneInfoHelper(child_path, zone_info_dir, tz_path_map);
    if (!sub_status.ok()) return sub_status;
  }
  return dir.GetLastStatus();
}
void TimezoneDatabase::LoadTimezone(const string& path, const string& zone_info_dir,
TimezoneMap& tz_path_map) {
bool is_symbolic_link;
string canonical_path;
Status status = FileSystemUtil::IsSymbolicLink(path, &is_symbolic_link,
&canonical_path);
if (!status.ok()) {
LOG(WARNING) << status.GetDetail();
return;
}
// 'path' is not a symbolic link. Read 'Timezone' from 'path'.
if (!is_symbolic_link) {
shared_ptr<Timezone> tz = LoadTimezoneHelper(path);
if (tz != nullptr) tz_path_map[path] = tz;
return;
}
// 'path' is a symbolic link. Check that the resolved canonical path is also under
// 'zone_info_dir'.
if (!FileSystemUtil::IsPrefixPath(zone_info_dir, canonical_path)) {
LOG(WARNING) << "Symbolic link " << path << " resolved to the wrong path: "
<< canonical_path;
return;
}
// Check if 'canonical_path' has already been added as a key.
auto it = tz_path_map.find(canonical_path);
if (it != tz_path_map.end()) {
tz_path_map[path] = it->second;
return;
}
// 'canonical_path' hasn't been added as a key yet. Load 'Timezone' object and add it to
// 'tz_path_map' as a value mapped both to 'path' and 'canonical_path'.
shared_ptr<Timezone> tz = LoadTimezoneHelper(canonical_path);
if (tz != nullptr) {
tz_path_map[canonical_path] = tz_path_map[path] = tz;
}
}
// Loads the time-zone stored in the file 'path' using cctz. Returns the loaded
// 'Timezone' object, or nullptr (after logging a warning) if cctz could not load it.
shared_ptr<Timezone> TimezoneDatabase::LoadTimezoneHelper(const string& path) {
  auto loaded_tz = make_shared<Timezone>();
  if (cctz::load_time_zone(path, loaded_tz.get())) return loaded_tz;

  LOG(WARNING) << "Could not load timezone: " << path;
  return nullptr;
}
// Reads the whole alias config file at 'hdfs_zone_alias_conf' into memory and passes
// its contents to LoadZoneAliases().
// Returns an error if the filesystem connection cannot be obtained, or if the file
// cannot be opened, read or closed; the aliases are not parsed in that case. Otherwise
// returns the status of LoadZoneAliases().
Status TimezoneDatabase::LoadZoneAliasesFromHdfs(const string& hdfs_zone_alias_conf) {
  DCHECK(!hdfs_zone_alias_conf.empty());

  hdfsFS hdfs_conn;
  RETURN_IF_ERROR(
      HdfsFsCache::instance()->GetConnection(hdfs_zone_alias_conf, &hdfs_conn));

  hdfsFile hdfs_file = hdfsOpenFile(
      hdfs_conn, hdfs_zone_alias_conf.c_str(), O_RDONLY, 0, 0, 0);
  if (hdfs_file == nullptr) {
    return Status(GetHdfsErrorMsg("Failed to open HDFS file for reading: ",
        hdfs_zone_alias_conf));
  }

  // Read the file chunk by chunk until the end of the file or the first read error.
  Status status = Status::OK();
  stringstream contents;
  vector<char> chunk(HDFS_READ_SIZE);
  for (;;) {
    const int bytes_read = hdfsRead(hdfs_conn, hdfs_file, chunk.data(), chunk.size());
    if (bytes_read > 0) {
      contents.write(chunk.data(), bytes_read);
      continue;
    }
    if (bytes_read < 0) {
      status = Status(TErrorCode::DISK_IO_ERROR, GetBackendString(),
          GetHdfsErrorMsg("Error reading from HDFS file: ", hdfs_zone_alias_conf));
    }
    break;
  }

  // The file is closed even if reading failed. A close failure is merged into 'status'.
  if (hdfsCloseFile(hdfs_conn, hdfs_file) != 0) {
    status.MergeStatus(
        Status(ErrorMsg(TErrorCode::GENERAL,
            GetHdfsErrorMsg("Failed to close HDFS file: ", hdfs_zone_alias_conf))));
  }

  if (!status.ok()) return status;
  return LoadZoneAliases(contents, hdfs_zone_alias_conf.c_str());
}
// Parses time-zone alias definitions from 'is' and adds them to 'tz_name_map_'.
// Each line has the "alias = value" form, where the value is either a fixed offset in
// seconds or the name of a time-zone that is already in 'tz_name_map_'. Everything
// after a '#' is a comment; blank lines are ignored.
// 'path' is only used in messages to tell where the definitions came from and may be
// nullptr.
// - A line without '=', without an alias name or without a value stops the parsing and
//   an error is returned.
// - A line whose alias is already in 'tz_name_map_', or whose value is neither a valid
//   offset nor a known time-zone name, is skipped with a warning.
Status TimezoneDatabase::LoadZoneAliases(istream &is, const char* path) {
  const string err_msg_path_part = (path == nullptr) ? "" : string(" in ") + path;
  string line;
  for (int line_no = 1; is.good() && !is.eof(); ++line_no) {
    getline(is, line);

    // Strip off comments.
    const size_t comment_pos = line.find('#');
    if (comment_pos != string::npos) line.erase(comment_pos);
    trim(line);
    if (line.empty()) continue;

    // Parse lines formatted as "alias = value".
    const size_t equal_pos = line.find('=');
    if (equal_pos == string::npos) {
      return Status(Substitute("Error in line $0$1. '=' is missing.", line_no,
          err_msg_path_part));
    }

    // Check if alias name is present.
    string alias = line.substr(0, equal_pos);
    trim(alias);
    if (alias.empty()) {
      return Status(Substitute("Error in line $0$1. Time-zone alias name is missing.",
          line_no, err_msg_path_part));
    }

    // Names that are already in 'tz_name_map_' are never overwritten.
    if (tz_name_map_.find(alias) != tz_name_map_.end()) {
      LOG(WARNING) << "Skipping line " << line_no << err_msg_path_part
                   << ". Duplicate time-zone alias: " << alias;
      continue;
    }

    // Value is either a fix offset in seconds or a time-zone name.
    string value = line.substr(equal_pos + 1);
    trim(value);
    if (value.empty()) {
      return Status(Substitute("Error in line $0$1. Missing value.", line_no,
          err_msg_path_part));
    }

    int64_t offset_sec;
    if (IsTimezoneOffsetValid(value, &offset_sec)) {
      // Add time-zone with a fix offset to the map.
      tz_name_map_[alias] = make_shared<Timezone>(
          cctz::fixed_time_zone(cctz::sys_seconds(offset_sec)));
      continue;
    }

    // Not an offset: the value has to be the name of a known time-zone.
    auto target = tz_name_map_.find(value);
    if (target == tz_name_map_.end()) {
      LOG(WARNING) << "Skipping line " << line_no << err_msg_path_part
                   << ". Unknown time-zone name or invalid offset: " << value;
      continue;
    }
    shared_ptr<Timezone> target_tz = target->second;
    tz_name_map_[alias] = target_tz;
  }
  return Status::OK();
}
// Populates 'tz_name_map_' with the time-zone database and the time-zone aliases.
// - The database is loaded from FLAGS_hdfs_zone_info_zip if that flag is set, otherwise
//   from ZONE_INFO_DIR. 'tz_db_path_' records which of the two was used.
// - The aliases are loaded from FLAGS_hdfs_zone_alias_conf if that flag is set,
//   otherwise from the built-in TIMEZONE_ALIASES definitions.
// Returns an error if loading the database or the aliases fails, or if the loaded
// database has no "UTC" entry.
Status TimezoneDatabase::Initialize() {
  // Load 'Timezone' objects into 'tz_name_map_', keyed by time-zone name.
  if (FLAGS_hdfs_zone_info_zip.empty()) {
    tz_db_path_ = ZONE_INFO_DIR;
    RETURN_IF_ERROR(LoadZoneInfo(ZONE_INFO_DIR));
  } else {
    tz_db_path_ = FLAGS_hdfs_zone_info_zip;
    RETURN_IF_ERROR(
        LoadZoneInfoFromHdfs(FLAGS_hdfs_zone_info_zip, FLAGS_local_library_dir));
  }

  // Sanity check.
  if (tz_name_map_.count("UTC") == 0) {
    return Status("Failed to load UTC timezone info.");
  }

  // Add timezone aliases.
  if (FLAGS_hdfs_zone_alias_conf.empty()) {
    stringstream builtin_aliases(TIMEZONE_ALIASES, ios_base::in);
    RETURN_IF_ERROR(LoadZoneAliases(builtin_aliases));
  } else {
    RETURN_IF_ERROR(LoadZoneAliasesFromHdfs(FLAGS_hdfs_zone_alias_conf));
  }
  return Status::OK();
}
}