blob: d0c595f3dc9eeaa0f708e7289cd15fc751a33e83 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "runtime/timestamp-parse-util.h"
#include <boost/assign/list_of.hpp>
#include <boost/date_time/gregorian/gregorian.hpp>
#include <boost/date_time/posix_time/posix_time.hpp>
#include <boost/unordered_map.hpp>
#include "runtime/string-value.inline.h"
#include "util/string-parser.h"
namespace assign = boost::assign;
using boost::unordered_map;
using boost::gregorian::date;
using boost::gregorian::date_duration;
using boost::posix_time::hours;
using boost::posix_time::not_a_date_time;
using boost::posix_time::time_duration;
namespace impala {
/// Holds the individual fields extracted from a date/time string. Every field starts
/// out as zero (and the timezone offset as a zero-length duration) and is overwritten
/// only when the corresponding token is present in the format being parsed.
struct DateTimeParseResult {
  /// Calendar fields.
  int year = 0;
  int month = 0;
  int day = 0;
  /// Time-of-day fields.
  int hour = 0;
  int minute = 0;
  int second = 0;
  /// Sub-second part; the parser scales it up to nine digits.
  int32_t fraction = 0;
  /// Signed offset taken from a trailing "+hh:mm"-style token, zero if there was none.
  boost::posix_time::time_duration tz_offset =
      boost::posix_time::time_duration(0, 0, 0, 0);

  DateTimeParseResult() = default;
};
/// Set to true as the last step of Init(). Init() returns immediately once it is set,
/// and the Parse()/Format() entry points DCHECK it before doing any work.
bool TimestampParser::initialized_ = false;
/// Maps lower-case three-letter month abbreviations ("jan" .. "dec") to the month
/// numbers 1 .. 12. It is filled in by Init() rather than during static initialization
/// because inserting a StringValue invokes its hash function, which expects
/// CpuInfo::Init() to have been called already.
static unordered_map<StringValue, int> REV_MONTH_INDEX;
/// Pre-tokenized contexts for the built-in formats; all of them are configured in Init().
/// "yyyy-MM-dd HH:mm:ss"
DateTimeFormatContext TimestampParser::DEFAULT_SHORT_DATE_TIME_CTX;
/// "yyyy-MM-ddTHH:mm:ss"
DateTimeFormatContext TimestampParser::DEFAULT_SHORT_ISO_DATE_TIME_CTX;
/// "yyyy-MM-dd"
DateTimeFormatContext TimestampParser::DEFAULT_DATE_CTX;
/// "HH:mm:ss"
DateTimeFormatContext TimestampParser::DEFAULT_TIME_CTX;
/// The three arrays below hold one context per fractional-seconds width. Init() gives
/// entry i the corresponding nine-digit fractional format shortened by (9 - i) trailing
/// characters, so entry 9 is configured with the full format length.
/// Based on "yyyy-MM-dd HH:mm:ss.SSSSSSSSS".
DateTimeFormatContext TimestampParser::DEFAULT_DATE_TIME_CTX[10];
/// Based on "yyyy-MM-ddTHH:mm:ss.SSSSSSSSS".
DateTimeFormatContext TimestampParser::DEFAULT_ISO_DATE_TIME_CTX[10];
/// Based on "HH:mm:ss.SSSSSSSSS".
DateTimeFormatContext TimestampParser::DEFAULT_TIME_FRAC_CTX[10];
void TimestampParser::Init() {
if (TimestampParser::initialized_) return;
// This needs to be lazily init'd because a StringValues hash function will be invoked
// for each entry that's placed in the map. The hash function expects that
// CpuInfo::Init() has already been called.
REV_MONTH_INDEX = boost::unordered_map<StringValue, int>({
{StringValue("jan"), 1}, {StringValue("feb"), 2},
{StringValue("mar"), 3}, {StringValue("apr"), 4},
{StringValue("may"), 5}, {StringValue("jun"), 6},
{StringValue("jul"), 7}, {StringValue("aug"), 8},
{StringValue("sep"), 9}, {StringValue("oct"), 10},
{StringValue("nov"), 11}, {StringValue("dec"), 12}
});
// Setup the default date/time context yyyy-MM-dd HH:mm:ss.SSSSSSSSS
const char* DATE_TIME_CTX_FMT = "yyyy-MM-dd HH:mm:ss.SSSSSSSSS";
const int FRACTIONAL_MAX_LEN = 9;
for (int i = FRACTIONAL_MAX_LEN; i >= 0; --i) {
DEFAULT_DATE_TIME_CTX[i].Reset(DATE_TIME_CTX_FMT,
DEFAULT_DATE_TIME_FMT_LEN - (FRACTIONAL_MAX_LEN - i));
ParseFormatTokens(&DEFAULT_DATE_TIME_CTX[i]);
}
// Setup the default ISO date/time context yyyy-MM-ddTHH:mm:ss.SSSSSSSSS
for (int i = FRACTIONAL_MAX_LEN; i >= 0; --i) {
DEFAULT_ISO_DATE_TIME_CTX[i].Reset("yyyy-MM-ddTHH:mm:ss.SSSSSSSSS",
DEFAULT_DATE_TIME_FMT_LEN - (FRACTIONAL_MAX_LEN - i));
ParseFormatTokens(&DEFAULT_ISO_DATE_TIME_CTX[i]);
}
// Setup the short default date/time context yyyy-MM-dd HH:mm:ss
DEFAULT_SHORT_DATE_TIME_CTX.Reset("yyyy-MM-dd HH:mm:ss",
DEFAULT_SHORT_DATE_TIME_FMT_LEN);
ParseFormatTokens(&DEFAULT_SHORT_DATE_TIME_CTX);
// Setup the short default ISO date/time context yyyy-MM-ddTHH:mm:ss
DEFAULT_SHORT_ISO_DATE_TIME_CTX.Reset("yyyy-MM-ddTHH:mm:ss",
DEFAULT_SHORT_DATE_TIME_FMT_LEN);
ParseFormatTokens(&DEFAULT_SHORT_ISO_DATE_TIME_CTX);
// Setup the default short date context yyyy-MM-dd
DEFAULT_DATE_CTX.Reset("yyyy-MM-dd", DEFAULT_DATE_FMT_LEN);
ParseFormatTokens(&DEFAULT_DATE_CTX);
// Setup the default short time context HH:mm:ss
DEFAULT_TIME_CTX.Reset("HH:mm:ss", DEFAULT_TIME_FMT_LEN);
ParseFormatTokens(&DEFAULT_TIME_CTX);
// Setup the default short time context with fractional seconds HH:mm:ss.SSSSSSSSS
for (int i = FRACTIONAL_MAX_LEN; i >= 0; --i) {
DEFAULT_TIME_FRAC_CTX[i].Reset(DATE_TIME_CTX_FMT + 11,
DEFAULT_TIME_FRAC_FMT_LEN - (FRACTIONAL_MAX_LEN - i));
ParseFormatTokens(&DEFAULT_TIME_FRAC_CTX[i]);
}
// Flag that the parser is ready.
TimestampParser::initialized_ = true;
}
/// Splits the format string held by 'dt_ctx' (fmt / fmt_len) into tokens and appends
/// them to dt_ctx->toks, updating has_date_toks, has_time_toks and fmt_out_len as it goes.
///
/// Recognised letters are y, M, d, H, m, s and S; a run of the same letter forms one
/// token. 'T', 'Z' and every non-alphabetic character are recorded as one-character
/// separators. Once a time token has been seen, a trailing "+hh:mm", "+hhmm" or "+hh"
/// (or the '-' equivalents) is recorded as a single timezone-offset token.
///
/// Returns false if the format contains a digit, an unsupported letter, a month group
/// longer than three characters, a token that directly follows a single-character
/// (variable-length) token, or no date/time token at all. On failure 'dt_ctx' may be
/// left partially populated.
bool TimestampParser::ParseFormatTokens(DateTimeFormatContext* dt_ctx) {
  DCHECK(dt_ctx != NULL);
  DCHECK(dt_ctx->fmt != NULL);
  DCHECK(dt_ctx->fmt_len > 0);
  DCHECK(dt_ctx->toks.size() == 0);
  const char* str_begin = dt_ctx->fmt;
  const char* str_end = str_begin + dt_ctx->fmt_len;
  const char* str = str_begin;
  // Parse the tokens from the format string
  while (str < str_end) {
    // The <cctype> classifiers have undefined behaviour for negative arguments, which
    // is what a plain 'char' holding a byte >= 0x80 may convert to. Always classify the
    // value as an unsigned char.
    const unsigned char curr_chr = static_cast<unsigned char>(*str);
    if (isdigit(curr_chr)) return false;
    // Ignore T|Z|non aA-zZ chars but track them as separators (required for printing).
    if ((*str == 'T') || (*str == 'Z') || (!isalpha(curr_chr))) {
      if (dt_ctx->has_time_toks && IsValidTZOffset(str, str_end)) {
        // TZ offset must come at the end of the format.
        dt_ctx->toks.push_back(DateTimeFormatToken(TZ_OFFSET, str - str_begin,
            str_end - str, str));
        break;
      }
      dt_ctx->toks.push_back(DateTimeFormatToken(SEPARATOR, str - str_begin, 1, str));
      ++str;
      continue;
    }
    // Not a separator, verify that the previous token is either a separator or has
    // length >1, i.e., it is not a variable length token.
    if (!dt_ctx->toks.empty()) {
      const DateTimeFormatToken& prev = dt_ctx->toks.back();
      if (UNLIKELY(prev.type != SEPARATOR && prev.len == 1)) return false;
    }
    DateTimeFormatTokenType tok_type = UNKNOWN;
    switch (*str) {
      case 'y': tok_type = YEAR; break;
      case 'M': tok_type = MONTH_IN_YEAR; break;
      case 'd': tok_type = DAY_IN_MONTH; break;
      case 'H': tok_type = HOUR_IN_DAY; break;
      case 'm': tok_type = MINUTE_IN_HOUR; break;
      case 's': tok_type = SECOND_IN_MINUTE; break;
      case 'S': tok_type = FRACTION; break;
      // Error on aA-zZ reserved characters that are not used yet.
      default: return false;
    }
    // Token types that compare below HOUR_IN_DAY are treated as date tokens, the rest
    // as time tokens.
    dt_ctx->has_date_toks |= tok_type < HOUR_IN_DAY;
    dt_ctx->has_time_toks |= tok_type >= HOUR_IN_DAY;
    // Get the token group length
    int tok_len = 1;
    char tok_chr = *str;
    const char* curr_tok_chr = str + 1;
    while (curr_tok_chr < str_end) {
      if (*curr_tok_chr != tok_chr) break;
      ++tok_len;
      ++curr_tok_chr;
    }
    if (tok_type == MONTH_IN_YEAR) {
      if (UNLIKELY(tok_len > 3)) return false;
      // "MMM" selects the short literal month name instead of the month number.
      if (tok_len == 3) tok_type = MONTH_IN_YEAR_SLT;
    }
    // In an output scenario, fmt_out_len is used to determine the print buffer size.
    // If the format uses short token groups e.g. yyyy-MM-d, there must be enough
    // room in the buffer for wider values e.g. 2013-12-16.
    if (tok_len == 1) ++dt_ctx->fmt_out_len;
    DateTimeFormatToken tok(tok_type, str - str_begin, tok_len, str);
    str += tok.len;
    dt_ctx->toks.push_back(tok);
  }
  return dt_ctx->has_date_toks || dt_ctx->has_time_toks;
}
/// Parses 'str' (of 'len' bytes, not necessarily NUL-terminated) using one of the
/// built-in default formats, chosen from the shape and length of the input:
///   yyyy-MM-dd, yyyy-MM-dd HH:mm:ss[.S{1,9}], yyyy-MM-ddTHH:mm:ss[.S{1,9}],
///   HH:mm:ss[.S{1,9}]
/// Leading and trailing whitespace is ignored. A trailing 'Z' is dropped; otherwise,
/// for input that looks like a timestamp, everything from the first '+' or '-' found
/// after the leading DEFAULT_TIME_FMT_LEN characters is dropped as a timezone offset.
/// Input longer than the longest default format is truncated to that length.
///
/// On success returns true and stores the result in '*d' and '*t'. On failure returns
/// false, '*d' is set to a default-constructed date and '*t' to not_a_date_time.
bool TimestampParser::Parse(const char* str, int len, boost::gregorian::date* d,
    boost::posix_time::time_duration* t) {
  DCHECK(TimestampParser::initialized_);
  DCHECK(d != NULL);
  DCHECK(t != NULL);
  if (UNLIKELY(str == NULL || len <= 0)) {
    *d = boost::gregorian::date();
    *t = boost::posix_time::time_duration(boost::posix_time::not_a_date_time);
    return false;
  }
  // Remove leading white space. The argument is converted to unsigned char first:
  // isspace() has undefined behaviour for negative values, which is what a plain
  // 'char' holding a byte >= 0x80 may convert to.
  while (len > 0 && isspace(static_cast<unsigned char>(*str))) {
    ++str;
    --len;
  }
  // Strip the trailing blanks.
  while (len > 0 && isspace(static_cast<unsigned char>(str[len - 1]))) --len;
  // Strip if there is a 'Z' suffix
  if (len > 0 && str[len - 1] == 'Z') {
    --len;
  } else if (len > DEFAULT_TIME_FMT_LEN && (str[4] == '-' || str[2] == ':')) {
    // Strip timezone offset if it seems like a valid timestamp string.
    int curr_pos = DEFAULT_TIME_FMT_LEN;
    // Timezone offset will be at least two bytes long, no need to check last
    // two bytes.
    while (curr_pos < len - 2) {
      if (str[curr_pos] == '+' || str[curr_pos] == '-') {
        len = curr_pos;
        break;
      }
      ++curr_pos;
    }
  }
  // Only process what we have to.
  if (len > DEFAULT_DATE_TIME_FMT_LEN) len = DEFAULT_DATE_TIME_FMT_LEN;
  // Determine the default formatting context that's required for parsing.
  DateTimeFormatContext* dt_ctx = NULL;
  if (LIKELY(len >= DEFAULT_TIME_FMT_LEN)) {
    // This string starts with a date component
    if (str[4] == '-') {
      switch (len) {
        case DEFAULT_DATE_FMT_LEN: {
          dt_ctx = &DEFAULT_DATE_CTX;
          break;
        }
        case DEFAULT_SHORT_DATE_TIME_FMT_LEN: {
          // The character between the date and the time selects plain vs. ISO layout;
          // anything else leaves dt_ctx unset and the parse fails below.
          switch (str[10]) {
            case ' ': dt_ctx = &DEFAULT_SHORT_DATE_TIME_CTX; break;
            case 'T': dt_ctx = &DEFAULT_SHORT_ISO_DATE_TIME_CTX; break;
          }
          break;
        }
        case DEFAULT_DATE_TIME_FMT_LEN: {
          switch (str[10]) {
            case ' ': dt_ctx = &DEFAULT_DATE_TIME_CTX[9]; break;
            case 'T': dt_ctx = &DEFAULT_ISO_DATE_TIME_CTX[9]; break;
          }
          break;
        }
        default: {
          // There is likely a fractional component that's below the expected 9 chars.
          // We will need to work out which default context to use that corresponds to
          // the fractional length in the string.
          if (LIKELY(len > DEFAULT_SHORT_DATE_TIME_FMT_LEN)) {
            switch (str[10]) {
              case ' ': {
                dt_ctx =
                    &DEFAULT_DATE_TIME_CTX[len - DEFAULT_SHORT_DATE_TIME_FMT_LEN - 1];
                break;
              }
              case 'T': {
                dt_ctx = &DEFAULT_ISO_DATE_TIME_CTX
                    [len - DEFAULT_SHORT_DATE_TIME_FMT_LEN - 1];
                break;
              }
            }
          }
          break;
        }
      }
    } else if (str[2] == ':') {
      // Time-only input: cap the length at the longest fractional time format.
      if (len > DEFAULT_TIME_FRAC_FMT_LEN) len = DEFAULT_TIME_FRAC_FMT_LEN;
      if (len > DEFAULT_TIME_FMT_LEN && str[8] == '.') {
        dt_ctx = &DEFAULT_TIME_FRAC_CTX[len - DEFAULT_TIME_FMT_LEN - 1];
      } else {
        dt_ctx = &DEFAULT_TIME_CTX;
      }
    }
  }
  if (LIKELY(dt_ctx != NULL)) {
    return Parse(str, len, *dt_ctx, d, t);
  } else {
    *d = boost::gregorian::date();
    *t = boost::posix_time::time_duration(boost::posix_time::not_a_date_time);
    return false;
  }
}
bool TimestampParser::Parse(const char* str, int len, const DateTimeFormatContext& dt_ctx,
date* d, time_duration* t) {
DCHECK(TimestampParser::initialized_);
DCHECK(dt_ctx.toks.size() > 0);
DCHECK(d != NULL);
DCHECK(t != NULL);
DateTimeParseResult dt_result;
int day_offset = 0;
if (UNLIKELY(str == NULL || len <= 0 ||
!ParseDateTime(str, len, dt_ctx, &dt_result))) {
*d = date();
*t = time_duration(not_a_date_time);
return false;
}
if (dt_ctx.has_time_toks) {
*t = time_duration(dt_result.hour, dt_result.minute,
dt_result.second, dt_result.fraction);
*t -= dt_result.tz_offset;
if (t->is_negative()) {
*t += hours(24);
day_offset = -1;
} else if (t->hours() >= 24) {
*t -= hours(24);
day_offset = 1;
}
} else {
*t = time_duration(0, 0, 0, 0);
}
if (dt_ctx.has_date_toks) {
bool is_valid_date = true;
try {
DCHECK(-1 <= day_offset && day_offset <= 1);
if ((dt_result.year == 1400 && dt_result.month == 1 && dt_result.day == 1 &&
day_offset == -1) ||
(dt_result.year == 9999 && dt_result.month == 12 && dt_result.day == 31 &&
day_offset == 1)) {
// Have to check lower/upper bound explicitly.
// Tried date::is_not_a_date_time() but it doesn't complain value is out of range
// for "'1400-01-01' - 1 day" and "'9999-12-31' + 1 day".
is_valid_date = false;
} else {
*d = date(dt_result.year, dt_result.month, dt_result.day);
*d += date_duration(day_offset);
}
} catch (boost::exception&) {
is_valid_date = false;
}
if (!is_valid_date) {
VLOG_ROW << "Invalid date: " << dt_result.year << "-" << dt_result.month << "-"
<< dt_result.day;
*d = date();
*t = time_duration(not_a_date_time);
return false;
}
} else {
*d = date();
}
return true;
}
/// Renders the date 'd' and time 't' into 'buff' following the tokens of 'dt_ctx' and
/// NUL-terminates the result.
///
/// 'len' is treated as the capacity of 'buff' in bytes, including the terminating NUL
/// (the function DCHECKs len > dt_ctx.fmt_out_len). Numeric tokens are zero-padded to
/// the token's width; a year token of three or fewer characters prints only the last
/// two digits of the year; a fraction token shorter than nine characters prints the
/// leading digits of the nanosecond value. Timezone-offset tokens produce no output.
///
/// Returns the number of characters written, not counting the NUL, or -1 if a required
/// date/time value is special (e.g. not_a_date_time) or the output does not fit in
/// 'len' bytes. When -1 is returned because of lack of space, 'buff' may hold a
/// partial result.
int TimestampParser::Format(const DateTimeFormatContext& dt_ctx,
    const boost::gregorian::date& d, const boost::posix_time::time_duration& t,
    int len, char* buff) {
  DCHECK(TimestampParser::initialized_);
  DCHECK(dt_ctx.toks.size() > 0);
  DCHECK(len > dt_ctx.fmt_out_len);
  DCHECK(buff != NULL);
  if (dt_ctx.has_date_toks && d.is_special()) return -1;
  if (dt_ctx.has_time_toks && t.is_special()) return -1;
  // There must be room for at least the terminating NUL.
  if (UNLIKELY(len <= 0)) return -1;
  char* str = buff;
  for (const DateTimeFormatToken& tok: dt_ctx.toks) {
    int32_t num_val = -1;
    const char* str_val = NULL;
    int str_val_len = 0;
    switch (tok.type) {
      case YEAR: {
        num_val = d.year();
        if (tok.len <= 3) num_val %= 100;
        break;
      }
      case MONTH_IN_YEAR: num_val = d.month().as_number(); break;
      case MONTH_IN_YEAR_SLT: {
        str_val = d.month().as_short_string();
        str_val_len = 3;
        break;
      }
      case DAY_IN_MONTH: num_val = d.day(); break;
      case HOUR_IN_DAY: num_val = t.hours(); break;
      case MINUTE_IN_HOUR: num_val = t.minutes(); break;
      case SECOND_IN_MINUTE: num_val = t.seconds(); break;
      case FRACTION: {
        num_val = t.fractional_seconds();
        // Drop trailing digits so that only the first tok.len of nine are printed.
        if (num_val > 0) for (int j = tok.len; j < 9; ++j) num_val /= 10;
        break;
      }
      case SEPARATOR: {
        str_val = tok.val;
        str_val_len = tok.len;
        break;
      }
      case TZ_OFFSET: {
        // Nothing is printed for a timezone offset token.
        break;
      }
      default: DCHECK(false) << "Unknown date/time format token";
    }
    // Bytes still available in 'buff', one of which is reserved for the final NUL.
    // The checks below keep this value >= 1 for every iteration.
    const int remaining = len - static_cast<int>(str - buff);
    if (num_val > -1) {
      // Bounded write: a value wider than its token (e.g. an hour count >= 100) must
      // not run past the end of the caller's buffer.
      const int written =
          snprintf(str, static_cast<size_t>(remaining), "%0*d", tok.len, num_val);
      if (UNLIKELY(written < 0 || written >= remaining)) return -1;
      str += written;
    } else if (str_val != NULL && str_val_len > 0) {
      if (UNLIKELY(str_val_len >= remaining)) return -1;
      memcpy(str, str_val, str_val_len);
      str += str_val_len;
    }
    // Otherwise there is nothing to emit (TZ_OFFSET, or a negative numeric value).
    // memcpy() must not be called with a null source pointer, even for zero bytes.
  }
  /// Terminate the string
  *str = '\0';
  return str - buff;
}
/// Walks the tokens of 'dt_ctx' over the input 'str' (of 'str_len' bytes) and stores
/// the value of every date/time token in '*dt_result'.
///
/// Separators must match the format character exactly. Single-character format tokens
/// are variable length: they consume all consecutive digits in the input, and the
/// positions of all following tokens are shifted accordingly. Fractional seconds are
/// scaled up to nine digits. A timezone offset must be "+hh:mm", "+hhmm" or "+hh" (or
/// start with '-'), matching the length used in the format.
///
/// Returns false if the input is NULL, shorter than the format, ends before a token
/// that the format requires, or if any token fails to parse or is out of range. Fields
/// of '*dt_result' parsed before the failure keep their values.
bool TimestampParser::ParseDateTime(const char* str, int str_len,
    const DateTimeFormatContext& dt_ctx, DateTimeParseResult* dt_result) {
  DCHECK(dt_ctx.fmt_len > 0);
  DCHECK(dt_ctx.toks.size() > 0);
  DCHECK(dt_result != NULL);
  if (str_len <= 0 || str_len < dt_ctx.fmt_len || str == NULL) return false;
  StringParser::ParseResult status;
  const char* str_end = str + str_len;
  // Keep track of the number of characters we need to shift token positions by.
  // Variable-length tokens will result in values > 0;
  int shift_len = 0;
  for (const DateTimeFormatToken& tok: dt_ctx.toks) {
    // str_len >= fmt_len only guarantees that unshifted tokens are in range. Once a
    // variable-length token has consumed extra digits, later tokens may start at or
    // beyond the end of the input (e.g. format "d/M" with input "123"), so every token
    // is checked against the input length before it is read.
    const int tok_offset = tok.pos + shift_len;
    if (UNLIKELY(tok_offset + tok.len > str_len)) return false;
    const char* tok_val = str + tok_offset;
    if (tok.type == SEPARATOR) {
      if (UNLIKELY(*tok_val != *tok.val)) return false;
      continue;
    }
    int tok_len = tok.len;
    // In case of single-character tokens we scan ahead to the next separator.
    if (UNLIKELY(tok_len == 1)) {
      // isdigit() is only defined for unsigned char values (and EOF), hence the cast.
      while ((tok_val + tok_len < str_end) &&
          isdigit(static_cast<unsigned char>(*(tok_val + tok_len)))) {
        ++tok_len;
        ++shift_len;
      }
    }
    switch (tok.type) {
      case YEAR: {
        dt_result->year = StringParser::StringToInt<int>(tok_val, tok_len, &status);
        if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
        if (UNLIKELY(dt_result->year < 1 || dt_result->year > 9999)) return false;
        // Short year values are interpreted as years in the 2000s.
        // NOTE(review): the '< 99' bound leaves a parsed value of 99 unshifted --
        // confirm whether '<= 99' was intended.
        if (tok_len < 4 && dt_result->year < 99) dt_result->year += 2000;
        break;
      }
      case MONTH_IN_YEAR: {
        dt_result->month = StringParser::StringToInt<int>(tok_val, tok_len, &status);
        if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
        if (UNLIKELY(dt_result->month < 1 || dt_result->month > 12)) return false;
        break;
      }
      case MONTH_IN_YEAR_SLT: {
        // Lower-case the month name and look it up in the table built by Init().
        char raw_buff[tok.len];
        std::transform(tok_val, tok_val + tok.len, raw_buff, [](char c) {
          // tolower() is only defined for unsigned char values (and EOF).
          return static_cast<char>(::tolower(static_cast<unsigned char>(c)));
        });
        StringValue buff(raw_buff, tok.len);
        boost::unordered_map<StringValue, int>::const_iterator iter =
            REV_MONTH_INDEX.find(buff);
        if (UNLIKELY(iter == REV_MONTH_INDEX.end())) return false;
        dt_result->month = iter->second;
        break;
      }
      case DAY_IN_MONTH: {
        dt_result->day = StringParser::StringToInt<int>(tok_val, tok_len, &status);
        if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
        // TODO: Validate that the value of day is correct for the given month.
        if (UNLIKELY(dt_result->day < 1 || dt_result->day > 31)) return false;
        break;
      }
      case HOUR_IN_DAY: {
        dt_result->hour = StringParser::StringToInt<int>(tok_val, tok_len, &status);
        if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
        if (UNLIKELY(dt_result->hour < 0 || dt_result->hour > 23)) return false;
        break;
      }
      case MINUTE_IN_HOUR: {
        dt_result->minute = StringParser::StringToInt<int>(tok_val, tok_len, &status);
        if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
        if (UNLIKELY(dt_result->minute < 0 || dt_result->minute > 59)) return false;
        break;
      }
      case SECOND_IN_MINUTE: {
        dt_result->second = StringParser::StringToInt<int>(tok_val, tok_len, &status);
        if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
        if (UNLIKELY(dt_result->second < 0 || dt_result->second > 59)) return false;
        break;
      }
      case FRACTION: {
        dt_result->fraction =
            StringParser::StringToInt<int32_t>(tok_val, tok_len, &status);
        if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
        // A user may specify a time of 04:30:22.1238, the parser will return 1238 for
        // the fractional portion. This does not represent the intended value of
        // 123800000, therefore the number must be scaled up.
        for (int i = tok_len; i < 9; ++i) dt_result->fraction *= 10;
        break;
      }
      case TZ_OFFSET: {
        if (tok_val[0] != '+' && tok_val[0] != '-') return false;
        int sign = tok_val[0] == '-' ? -1 : 1;
        int minute = 0;
        int hour = StringParser::StringToInt<int>(tok_val + 1, 2, &status);
        if (UNLIKELY(StringParser::PARSE_SUCCESS != status ||
            hour < 0 || hour > 23)) {
          return false;
        }
        switch (tok_len) {
          case 6: {
            // +hh:mm
            minute = StringParser::StringToInt<int>(tok_val + 4, 2, &status);
            break;
          }
          case 5: {
            // +hhmm
            minute = StringParser::StringToInt<int>(tok_val + 3, 2, &status);
            break;
          }
          case 3: {
            // +hh
            break;
          }
          default: {
            // Invalid timezone offset length.
            return false;
          }
        }
        if (UNLIKELY(StringParser::PARSE_SUCCESS != status ||
            minute < 0 || minute > 59)) {
          return false;
        }
        dt_result->tz_offset = boost::posix_time::time_duration(sign * hour,
            sign * minute, 0, 0);
        break;
      }
      default: DCHECK(false) << "Unknown date/time format token";
    }
  }
  return true;
}
/// Tells whether the format text in [str_begin, str_end) is a timezone offset
/// specifier: a '+' or '-' sign followed by exactly "hh:mm", "hhmm" or "hh".
/// 'str_begin' must point at a readable character.
bool TimestampParser::IsValidTZOffset(const char* str_begin, const char* str_end) {
  const char sign = *str_begin;
  if (sign != '+' && sign != '-') return false;

  // Everything after the sign has to be one of the three accepted spellings.
  const char* spec = str_begin + 1;
  const auto spec_len = str_end - spec;
  if (spec_len == 5) return strncmp(spec, "hh:mm", 5) == 0;
  if (spec_len == 4) return strncmp(spec, "hhmm", 4) == 0;
  if (spec_len == 2) return strncmp(spec, "hh", 2) == 0;
  return false;
}
}