blob: 35bcc41a6d9796cc80a96d9f1469cfa19fd67c5f [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <time.h>
#include <unistd.h>
#include <algorithm>
#include <cerrno>
#include <iostream>
#include <map>
#include <memory>
#include <sstream>
#include <string>
#include <vector>
#include "dbcommon/log/logger.h"
#include "storage/format/orc/timezone.h"
namespace orc {
// Find the position of the last element that is less than or equal to the
// target in a vector sorted in ascending order.
// @param array the sorted values to search
// @param target the value to locate
// @return -1 if the array is empty or target < array[0]; otherwise the
//   largest i such that array[i] <= target. When the target occurs more
//   than once, the index of its last occurrence is returned.
int64_t binarySearch(const std::vector<int64_t>& array, int64_t target) {
  // upper_bound yields the first element strictly greater than the target,
  // so the element just before it is the last one that is <= target. For an
  // empty array or a target below array[0] the distance is 0, giving -1.
  const std::vector<int64_t>::const_iterator firstGreater =
      std::upper_bound(array.begin(), array.end(), target);
  return static_cast<int64_t>(firstGreater - array.begin()) - 1;
}
// Out-of-line destructor definition; there is nothing to release here.
FutureRule::~FutureRule() = default;
std::unique_ptr<FutureRule> parseFutureRule(const std::string& ruleString) {
std::unique_ptr<FutureRule> result(new FutureRuleImpl());
FutureRuleParser parser(ruleString,
dynamic_cast<FutureRuleImpl*>(result.get()));
return result;
}
// Out-of-line destructor definition; members clean up after themselves.
FutureRuleImpl::~FutureRuleImpl() = default;
// A rule counts as defined when its rule string is not empty.
bool FutureRuleImpl::isDefined() const {
  return !ruleString.empty();
}
// Pick the variant (standard or dst) that this rule applies at a time.
// @param clk the time to look up
// @return the standard variant when the rule has no dst, otherwise the
//   variant selected by the position of clk within the 400 year cycle
const TimezoneVariant& FutureRuleImpl::getVariant(int64_t clk) const {
  if (!hasDst) {
    return standard;
  }
  // Fold the time into a non-negative offset within one 400 year cycle.
  int64_t adjusted = clk % SECONDS_PER_400_YEARS;
  if (adjusted < 0) {
    adjusted += SECONDS_PER_400_YEARS;
  }
  // The parity of the slot found in offsets, combined with startInStd,
  // decides which of the two variants is in effect.
  const int64_t slot = binarySearch(offsets, adjusted);
  const bool evenSlot = (slot % 2 == 0);
  if (startInStd != evenSlot) {
    return dst;
  }
  return standard;
}
// Write a description of this rule to a stream.
// Nothing is written when the rule is not defined; the dst, start and end
// lines are written only when the rule has dst.
// @param out the stream to write to
void FutureRuleImpl::print(std::ostream* out) const {
  if (!isDefined()) {
    return;
  }
  std::ostream& os = *out;
  os << " Future rule: " << ruleString << "\n";
  os << " standard " << standard.toString() << "\n";
  if (!hasDst) {
    return;
  }
  os << " dst " << dst.toString() << "\n";
  os << " start " << start.toString() << "\n";
  os << " end " << end.toString() << "\n";
}
// Out-of-line destructor definition; there is nothing to release here.
VersionParser::~VersionParser() = default;
// Decode a 32-bit unsigned integer stored most significant byte first.
// @param ptr points at the 4 bytes to decode
// @return the decoded value
static uint32_t decode32(const unsigned char* ptr) {
  // Widen each byte to uint32_t before shifting. Shifting the byte as a
  // promoted int would move a set high bit of ptr[0] into the sign bit.
  return (static_cast<uint32_t>(ptr[0]) << 24) |
         (static_cast<uint32_t>(ptr[1]) << 16) |
         (static_cast<uint32_t>(ptr[2]) << 8) | static_cast<uint32_t>(ptr[3]);
}
// Parser for the version 1 layout of a timezone file, which stores each
// time in 4 bytes and carries no future string.
class Version1Parser : public VersionParser {
 public:
  virtual ~Version1Parser();

  // The layout version handled by this parser.
  uint64_t getVersion() const override { return 1; }

  // The width in bytes of one encoded time.
  uint64_t getTimeSize() const override { return 4; }

  // Decode the 4 byte time at ptr, sign extending it to 64 bits.
  int64_t parseTime(const unsigned char* ptr) const override {
    const int32_t narrow = static_cast<int32_t>(decode32(ptr));
    return narrow;
  }

  // Version 1 has no future string, so the result is always empty.
  std::string parseFutureString(const unsigned char*, uint64_t,
                                uint64_t) const override {
    return std::string();
  }
};
// Out-of-line destructor definition; the parser holds no state.
Version1Parser::~Version1Parser() = default;
// Parser for the version 2 layout of a timezone file, which stores each
// time in 8 bytes and ends with a future string.
class Version2Parser : public VersionParser {
 public:
  virtual ~Version2Parser();

  // The layout version handled by this parser.
  uint64_t getVersion() const override { return 2; }

  // The width in bytes of one encoded time.
  uint64_t getTimeSize() const override { return 8; }

  // Decode the 8 byte time at ptr (two 32-bit halves, high half first).
  int64_t parseTime(const unsigned char* ptr) const override {
    // Assemble the value as unsigned so that a set top bit is never
    // shifted through the sign bit of a signed type.
    const uint64_t high = decode32(ptr);
    const uint64_t low = decode32(ptr + 4);
    return static_cast<int64_t>((high << 32) | low);
  }

  // Extract the future string from the tail of the file.
  // @param ptr the start of the file contents
  // @param offset where the tail begins
  // @param length the number of bytes in the tail
  // @return the tail without its first and last byte, or an empty string
  //   when the tail is too short to hold both of those bytes
  std::string parseFutureString(const unsigned char* ptr, uint64_t offset,
                                uint64_t length) const override {
    // Without this guard length - 2 would wrap around to a huge count.
    if (length < 2) {
      return std::string();
    }
    return std::string(reinterpret_cast<const char*>(ptr) + offset + 1,
                       length - 2);
  }
};
// Out-of-line destructor definition; the parser holds no state.
Version2Parser::~Version2Parser() = default;
// Cache of parsed timezones keyed by the filename they were loaded from.
// getTimezoneByFilename adds the entries; nothing in this file removes or
// deletes them.
// NOTE(review): no locking is visible around this map - confirm whether
// callers may reach it from more than one thread.
static std::map<std::string, Timezone*> timezoneCache;
// Out-of-line destructor definition; there is nothing to release here.
Timezone::~Timezone() = default;
// Parse the contents of a timezone file and compute the ORC epoch for it.
// @param _filename the name recorded for this timezone and used in error
//   messages
// @param buffer the raw bytes of the timezone file
TimezoneImpl::TimezoneImpl(const std::string& _filename,
                           const std::vector<unsigned char> buffer)
    : filename(_filename) {
  // data() is valid even for an empty buffer, unlike &buffer[0];
  // parseZoneFile checks the length before it reads any byte.
  parseZoneFile(buffer.data(), 0, buffer.size(), Version1Parser());
  // Build the literal for the ORC epoch
  // 2015 Jan 1 00:00:00
  // Value-initialize so that the fields not set below are zero rather than
  // indeterminate.
  tm epochStruct = tm();
  epochStruct.tm_sec = 0;
  epochStruct.tm_min = 0;
  epochStruct.tm_hour = 0;
  epochStruct.tm_mday = 1;
  epochStruct.tm_mon = 0;
  epochStruct.tm_year = 2015 - 1900;
  epochStruct.tm_isdst = 0;
  time_t utcEpoch = timegm(&epochStruct);
  // Shift the UTC value by the offset in effect at that time in this zone.
  epoch = utcEpoch - getVariant(utcEpoch).gmtOffset;
}
// Get the directory that holds the timezone files.
// @return the value of the TZDIR environment variable when it is set,
//   otherwise DEFAULT_TZDIR
const char* getTimezoneDirectory() {
  if (const char* fromEnvironment = getenv("TZDIR")) {
    return fromEnvironment;
  }
  return DEFAULT_TZDIR;
}
// Get a timezone by absolute filename.
// Results are cached: the first call for a filename reads and parses the
// file, later calls return the cached object.
// @param filename the absolute path of the timezone file
// @return the parsed timezone, owned by the cache
// Failures to open, stat, read or close the file, a path that is not a
// regular file, and a file that ends before the size reported by fstat are
// all reported through LOG_ERROR.
// NOTE(review): the error paths rely on LOG_ERROR not returning (presumably
// it throws) - confirm against dbcommon/log/logger.h.
const Timezone& getTimezoneByFilename(const std::string& filename) {
  std::map<std::string, Timezone*>::iterator itr = timezoneCache.find(filename);
  if (itr != timezoneCache.end()) {
    return *(itr->second);
  }
  int in = open(filename.c_str(), O_RDONLY);
  if (in == -1) {
    std::stringstream msg;
    msg << "failed to open " << filename << " - " << strerror(errno);
    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "%s", msg.str().c_str());
  }
  struct stat fileInfo;
  if (fstat(in, &fileInfo) == -1) {
    // Format the message first: close() may overwrite errno.
    std::stringstream msg;
    msg << "failed to stat " << filename << " - " << strerror(errno);
    close(in);
    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "%s", msg.str().c_str());
  }
  if ((fileInfo.st_mode & S_IFMT) != S_IFREG) {
    std::stringstream msg;
    msg << "non-file in tzfile reader " << filename;
    close(in);
    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "%s", msg.str().c_str());
  }
  size_t size = static_cast<size_t>(fileInfo.st_size);
  std::vector<unsigned char> buffer(size);
  size_t posn = 0;
  while (posn < size) {
    ssize_t ret = read(in, buffer.data() + posn, size - posn);
    if (ret == -1) {
      if (errno == EINTR) {
        // Interrupted before any data was transferred; just try again.
        continue;
      }
      const int readErrno = errno;
      close(in);
      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Failure to read timezone file %s - %s",
                filename.c_str(), strerror(readErrno));
    }
    if (ret == 0) {
      // End of file before the expected size was reached. Without this
      // check the loop would never make progress again.
      close(in);
      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Failure to read timezone file %s - %s",
                filename.c_str(), "unexpected end of file");
    }
    posn += static_cast<size_t>(ret);
  }
  if (close(in) == -1) {
    std::stringstream err;
    err << "failed to close " << filename << " - " << strerror(errno);
    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "%s", err.str().c_str());
  }
  // Hold the new object in a smart pointer until the cache has taken it, so
  // that it is not leaked if inserting into the map throws.
  std::unique_ptr<Timezone> result(new TimezoneImpl(filename, buffer));
  timezoneCache[filename] = result.get();
  return *result.release();
}
// Get the local timezone, which is loaded (and cached) from the file named
// by LOCAL_TIMEZONE.
const Timezone& getLocalTimezone() {
  const Timezone& local = getTimezoneByFilename(LOCAL_TIMEZONE);
  return local;
}
// Get a timezone by name (eg. America/Los_Angeles).
// The name is resolved relative to the timezone directory and the result
// is cached by getTimezoneByFilename.
const Timezone& getTimezoneByName(const std::string& zone) {
  const std::string path = std::string(getTimezoneDirectory()) + "/" + zone;
  return getTimezoneByFilename(path);
}
// Parse a set of bytes as a timezone file as if they came from filename.
// The result is not placed in the cache; the caller owns it.
std::unique_ptr<Timezone> getTimezone(const std::string& filename,
                                      const std::vector<unsigned char>& b) {
  std::unique_ptr<Timezone> parsed(new TimezoneImpl(filename, b));
  return parsed;
}
// Out-of-line destructor definition; members clean up after themselves.
TimezoneImpl::~TimezoneImpl() = default;
// Fill in the variants table from the variant records of a zone file.
// Each record is 6 bytes: a 4 byte gmt offset, a 1 byte dst flag and a
// 1 byte index into the name table.
// @param ptr the start of the file contents
// @param variantOffset where the variant records begin
// @param variantCount the number of variant records; variants must already
//   have at least this many elements
// @param nameOffset where the name table begins
// @param nameCount the size of the name table in bytes
// A name index that lies outside the name table is reported through
// LOG_ERROR.
// NOTE(review): this relies on LOG_ERROR not returning (presumably it
// throws) - confirm against dbcommon/log/logger.h.
void TimezoneImpl::parseTimeVariants(const unsigned char* ptr,
                                     uint64_t variantOffset,
                                     uint64_t variantCount, uint64_t nameOffset,
                                     uint64_t nameCount) {
  for (uint64_t variant = 0; variant < variantCount; ++variant) {
    const unsigned char* record = ptr + variantOffset + 6 * variant;
    variants[variant].gmtOffset = static_cast<int32_t>(decode32(record));
    variants[variant].isDst = record[4];
    const uint64_t nameStart = record[5];
    if (nameStart >= nameCount) {
      std::stringstream buffer;
      buffer << "name out of range in variant " << variant << " - " << nameStart
             << " >= " << nameCount;
      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "%s", buffer.str().c_str());
    }
    // Stop at the terminating NUL, but never read beyond the end of the
    // name table when a name is not terminated.
    const char* nameBegin =
        reinterpret_cast<const char*>(ptr) + nameOffset + nameStart;
    const size_t maxLength = static_cast<size_t>(nameCount - nameStart);
    const void* terminator = memchr(nameBegin, '\0', maxLength);
    const size_t nameLength =
        terminator == nullptr
            ? maxLength
            : static_cast<size_t>(static_cast<const char*>(terminator) -
                                  nameBegin);
    variants[variant].name = std::string(nameBegin, nameLength);
  }
}
//
// Parse the zone file to get the bits we need.
// There are two versions of the timezone file:
//
// Version 1(version = 0x00):
// Magic(version)
// Header
// TransitionTimes(4 byte)
// TransitionRules
// Rules
// LeapSeconds(4 byte)
// IsStd
// IsGmt
//
// Version2:
// Version1(0x32) = a version 1 copy of the data for old clients
// Magic(0x32)
// Header
// TransitionTimes(8 byte)
// TransitionRules
// Rules
// LeapSeconds(8 byte)
// IsStd
// IsGmt
// FutureString
// Parse one section of a zone file into this object.
// @param ptr the start of the file contents
// @param sectionOffset where the section to parse begins
// @param fileLength the total number of bytes available at ptr
// @param versionParser decodes the parts that differ between versions
// When called for the section at offset 0 of a file whose version byte is
// not zero, the section is only measured and the one that follows it is
// parsed with a Version2Parser instead.
void TimezoneImpl::parseZoneFile(const unsigned char* ptr,
                                 uint64_t sectionOffset, uint64_t fileLength,
                                 const VersionParser& versionParser) {
  const uint64_t magicOffset = sectionOffset;
  const uint64_t headerOffset = magicOffset + 20;

  // The section must hold the magic and the six 4 byte counts, and it must
  // begin with the magic string.
  const bool headerTruncated = fileLength < headerOffset + 6 * 4;
  if (headerTruncated ||
      strncmp(reinterpret_cast<const char*>(ptr) + magicOffset, "TZif", 4) !=
          0) {
    std::stringstream message;
    message << "non-tzfile " << filename;
    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "%s", message.str().c_str());
  }

  // The six counts, in the order they are stored.
  const unsigned char* const counts = ptr + headerOffset;
  const uint64_t isGmtCount = decode32(counts);
  const uint64_t isStdCount = decode32(counts + 4);
  const uint64_t leapCount = decode32(counts + 8);
  const uint64_t timeCount = decode32(counts + 12);
  const uint64_t variantCount = decode32(counts + 16);
  const uint64_t nameCount = decode32(counts + 20);

  // Where each table starts, and where the whole section ends.
  const uint64_t timeSize = versionParser.getTimeSize();
  const uint64_t timeOffset = headerOffset + 24;
  const uint64_t timeVariantOffset = timeOffset + timeSize * timeCount;
  const uint64_t variantOffset = timeVariantOffset + timeCount;
  const uint64_t nameOffset = variantOffset + 6 * variantCount;
  const uint64_t sectionLength = nameOffset + nameCount +
                                 (timeSize + 4) * leapCount + isGmtCount +
                                 isStdCount;
  if (fileLength < sectionLength) {
    std::stringstream message;
    message << "tzfile too short " << filename << " needs " << sectionLength
            << " and has " << fileLength;
    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "%s", message.str().c_str());
  }

  // if it is version 2, skip over the old layout and read the new one.
  const bool isFirstSection = (sectionOffset == 0);
  if (isFirstSection && ptr[magicOffset + 4] != 0) {
    parseZoneFile(ptr, sectionLength, fileLength, Version2Parser());
    return;
  }

  version = versionParser.getVersion();
  variants.resize(variantCount);
  transitions.resize(timeCount);
  currentVariant.resize(timeCount);
  parseTimeVariants(ptr, variantOffset, variantCount, nameOffset, nameCount);

  bool foundAncient = false;
  for (uint64_t t = 0; t < timeCount; ++t) {
    transitions[t] = versionParser.parseTime(ptr + timeOffset + t * timeSize);
    currentVariant[t] = ptr[timeVariantOffset + t];
    if (currentVariant[t] >= variantCount) {
      std::stringstream message;
      message << "tzfile rule out of range " << filename << " references rule "
              << currentVariant[t] << " of " << variantCount;
      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "%s", message.str().c_str());
    }
    // find the oldest standard time and use that as the ancient value
    if (foundAncient || variants[currentVariant[t]].isDst) {
      continue;
    }
    foundAncient = true;
    ancientVariant = currentVariant[t];
  }
  if (!foundAncient) {
    ancientVariant = 0;
  }

  // Whatever follows the section is handed over as the future string.
  const std::string futureString = versionParser.parseFutureString(
      ptr, sectionLength, fileLength - sectionLength);
  futureRule = parseFutureRule(futureString);

  // find the lower bound for applying the future rule
  if (!futureRule->isDefined()) {
    lastTransition = INT64_MAX;
  } else if (timeCount > 0) {
    lastTransition = transitions[timeCount - 1];
  } else {
    lastTransition = INT64_MIN;
  }
}
// Get the variant that is in effect at a given time.
// @param clk the time to look up
// @return the future rule's answer when clk is after the last explicit
//   transition; otherwise the variant of the latest transition at or before
//   clk, or the ancient variant when clk precedes every transition
const TimezoneVariant& TimezoneImpl::getVariant(int64_t clk) const {
  if (clk > lastTransition) {
    return futureRule->getVariant(clk);
  }
  const int64_t transition = binarySearch(transitions, clk);
  uint64_t idx = ancientVariant;
  if (transition >= 0) {
    idx = currentVariant[static_cast<size_t>(transition)];
  }
  return variants[idx];
}
// Write a description of this timezone to a stream: the filename, the
// version, the future rule, every variant and every transition.
// @param out the stream that receives all of the output
void TimezoneImpl::print(std::ostream& out) const {
  out << "Timezone file: " << filename << "\n";
  out << " Version: " << version << "\n";
  futureRule->print(&out);
  for (uint64_t r = 0; r < variants.size(); ++r) {
    out << " Variant " << r << ": " << variants[r].toString() << "\n";
  }
  for (uint64_t t = 0; t < transitions.size(); ++t) {
    tm timeStruct;
    tm* result = nullptr;
    char buffer[25];
    // Only format the time when time_t can hold the 64 bit value.
    if (sizeof(time_t) >= 8) {
      time_t val = transitions[t];
      result = gmtime_r(&val, &timeStruct);
      // strftime returns 0 when the text does not fit and then leaves the
      // buffer contents unspecified, so treat that like a failed conversion.
      if (result &&
          strftime(buffer, sizeof(buffer), "%F %H:%M:%S", &timeStruct) == 0) {
        result = nullptr;
      }
    }
    // Write to the caller's stream, like the rest of this function.
    out << " Transition: " << (result == nullptr ? "null" : buffer) << " ("
        << transitions[t] << ") -> " << variants[currentVariant[t]].name
        << "\n";
  }
}
// Create an error whose message is the given text.
TimezoneError::TimezoneError(const std::string& what)
    : std::runtime_error(what) {}
// Copy an error by copying its std::runtime_error base.
TimezoneError::TimezoneError(const TimezoneError& other)
    : std::runtime_error(other) {}
// Out-of-line destructor definition; there is nothing to release here.
TimezoneError::~TimezoneError() noexcept = default;
} // namespace orc