// blob: 9c6bac06069f3f69205e368d58f10da7b254d617 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "runtime/timestamp-parse-util.h"
#include "common/names.h"
#include "runtime/datetime-iso-sql-format-parser.h"
#include "runtime/datetime-simple-date-format-parser.h"
#include "runtime/date-value.h"
#include "runtime/runtime-state.h"
#include "runtime/string-value.inline.h"
#include "udf/udf-internal.h"
#include "util/string-parser.h"
#include "cctype"
using boost::gregorian::date;
using boost::gregorian::date_duration;
using boost::gregorian::gregorian_calendar;
using boost::posix_time::hours;
using boost::posix_time::not_a_date_time;
using boost::posix_time::ptime;
using boost::posix_time::time_duration;
namespace impala {
using namespace datetime_parse_util;
// Shared failure path for the parse functions in this file. Overwrites '*d' with a
// default-constructed date and '*t' with a not_a_date_time duration, then hands back
// the 'false' value the caller is expected to return. 'd' and 't' must be non-NULL.
static bool IndicateTimestampParseFailure(date* d, time_duration* t) {
  DCHECK(d != nullptr);
  DCHECK(t != nullptr);
  *t = time_duration(not_a_date_time);
  *d = date();
  return false;
}
// Parses the 'len' bytes at 'str' as a timestamp without a caller-supplied format.
// The input is first normalized: surrounding whitespace is dropped, then either a
// trailing 'Z' or (for inputs that look like a timestamp) a trailing timezone offset
// is cut off. Parsing is attempted with one of the default format contexts; if none
// matches, a format context is tokenized from the input string itself.
// On success '*d' and '*t' receive the parsed date and time and true is returned.
// On any failure both outputs are reset via IndicateTimestampParseFailure() and false
// is returned. 'd' and 't' must be non-NULL; a NULL 'str' is treated as a failure.
bool TimestampParser::ParseSimpleDateFormat(const char* str, int len,
    boost::gregorian::date* d, boost::posix_time::time_duration* t) {
  DCHECK(d != nullptr);
  DCHECK(t != nullptr);
  if (UNLIKELY(str == nullptr)) return IndicateTimestampParseFailure(d, t);
  int trimmed_len = len;
  // Remove leading white space. The <cctype> classifiers have undefined behavior for
  // negative arguments other than EOF, and plain 'char' may be signed, so every byte
  // is converted to 'unsigned char' before being classified.
  while (trimmed_len > 0 && isspace(static_cast<unsigned char>(*str))) {
    ++str;
    --trimmed_len;
  }
  // Strip the trailing blanks.
  while (trimmed_len > 0
      && isspace(static_cast<unsigned char>(str[trimmed_len - 1]))) {
    --trimmed_len;
  }
  // Strip if there is a 'Z' suffix
  if (trimmed_len > 0 && str[trimmed_len - 1] == 'Z') {
    --trimmed_len;
  } else if (trimmed_len > SimpleDateFormatTokenizer::DEFAULT_TIME_FMT_LEN &&
      (str[4] == '-' || str[2] == ':')) {
    // Strip timezone offset if it seems like a valid timestamp string.
    int curr_pos = SimpleDateFormatTokenizer::DEFAULT_TIME_FMT_LEN;
    // Timezone offset will be at least two bytes long, no need to check last
    // two bytes.
    while (curr_pos < trimmed_len - 2) {
      if (str[curr_pos] == '+' || str[curr_pos] == '-') {
        trimmed_len = curr_pos;
        break;
      }
      ++curr_pos;
    }
  }
  if (UNLIKELY(trimmed_len <= 0)) return IndicateTimestampParseFailure(d, t);
  // Determine the length of relevant input, if we're using one of the default formats.
  int default_fmt_len = min(trimmed_len,
      SimpleDateFormatTokenizer::DEFAULT_DATE_TIME_FMT_LEN);
  // Determine the default formatting context that's required for parsing.
  const DateTimeFormatContext* dt_ctx =
      SimpleDateFormatTokenizer::GetDefaultFormatContext(str, default_fmt_len, true,
          true);
  if (dt_ctx != nullptr) {
    return ParseSimpleDateFormat(str, default_fmt_len, *dt_ctx, d, t);
  }
  // Generating context lazily as a fall back if default formats fail.
  // TokenizeByStr() does not require a template format string.
  DateTimeFormatContext lazy_ctx(str, trimmed_len);
  if (!SimpleDateFormatTokenizer::TokenizeByStr(&lazy_ctx)) {
    return IndicateTimestampParseFailure(d, t);
  }
  dt_ctx = &lazy_ctx;
  return ParseSimpleDateFormat(str, trimmed_len, *dt_ctx, d, t);
}
// Maps the parsed two-digit year onto a full year relative to the century break held
// in 'dt_ctx' and returns the resulting date shifted by 'day_offset' days. 't' is the
// time of day used when comparing the candidate against the century break.
// Constructing the returned date throws if the month/day is not valid in the chosen
// year; the caller is expected to handle that.
date TimestampParser::RealignYear(const DateTimeParseResult& dt_result,
    const DateTimeFormatContext& dt_ctx, int day_offset, const time_duration& t) {
  DCHECK(!dt_ctx.century_break_ptime.is_special());
  // With the century starting at AABB and the parsed year being YY, the candidate
  // year is AAYY.
  int year = dt_result.year + (dt_ctx.century_break_ptime.date().year() / 100) * 100;

  // Feb 29 may not exist in the candidate year although it exists 100 years later.
  // For the purpose of the comparison below, stand in with Mar 1 in that case, which
  // keeps the behavior closer to Hive.
  const bool leap_day_missing = dt_result.month == 2 && dt_result.day == 29 &&
      !gregorian_calendar::is_leap_year(year);
  date probe = leap_day_missing ?
      date(year, 3, 1) : date(year, dt_result.month, dt_result.day);
  probe += date_duration(day_offset);

  // A candidate that falls before the century break belongs to the next century.
  // E.g. with a break in 1937 and a candidate year of 1936 the result is 2036.
  if (ptime(probe, t) < dt_ctx.century_break_ptime) year += 100;
  return date(year, dt_result.month, dt_result.day) + date_duration(day_offset);
}
// Subtracts 'tz_offset' from '*t' in place. If the result is negative, 24 hours are
// added back and -1 is returned; if it is 24 hours or more, 24 hours are subtracted
// and 1 is returned. Otherwise '*t' is left as computed and 0 is returned. Only a
// single 24-hour correction is applied.
int TimestampParser::AdjustWithTimezone(time_duration* t,
    const time_duration& tz_offset) {
  int day_shift = 0;
  *t -= tz_offset;
  if (t->is_negative()) {
    day_shift = -1;
    *t += hours(24);
  } else if (t->hours() >= 24) {
    day_shift = 1;
    *t -= hours(24);
  }
  return day_shift;
}
// Converts the fields of 'dt_result' into '*d' and '*t'.
// - Time: built from hour/minute/second/fraction and adjusted by the parsed timezone
//   offset when the format has time tokens, otherwise set to midnight.
// - Date: built from year/month/day (re-aligned to the century break if requested)
//   plus the day shift produced by the timezone adjustment when the format has date
//   tokens, otherwise set to a default-constructed date.
// Returns false if constructing or validating the date raised a boost exception,
// true otherwise. '*t' has already been written when false is returned.
bool TimestampParser::PopulateParseResult(const DateTimeFormatContext& dt_ctx,
    const DateTimeParseResult& dt_result, date* d, time_duration* t) {
  int day_offset = 0;
  if (!dt_ctx.has_time_toks) {
    *t = time_duration(0, 0, 0, 0);
  } else {
    *t = time_duration(dt_result.hour, dt_result.minute,
        dt_result.second, dt_result.fraction);
    day_offset = AdjustWithTimezone(t, dt_result.tz_offset);
  }

  if (!dt_ctx.has_date_toks) {
    *d = date();
    return true;
  }

  try {
    DCHECK(-1 <= day_offset && day_offset <= 1);
    *d = dt_result.realign_year ?
        RealignYear(dt_result, dt_ctx, day_offset, *t) :
        date(dt_result.year, dt_result.month, dt_result.day)
            + date_duration(day_offset);
    // operator + (date, date_duration) does not throw even when the sum is outside of
    // the supported year range [1400, 9999], so the bounds are checked explicitly.
    if (d->year() < 1400 || d->year() > 9999) {
      // year() itself throws for an out-of-range date, so this branch only spells out
      // the intended check and is never actually taken.
      DCHECK(false);
    }
  } catch (boost::exception&) {
    VLOG_ROW << "Invalid date: " << dt_result.year << "-" << dt_result.month << "-"
             << dt_result.day;
    return false;
  }
  return true;
}
// Parses the 'len' bytes at 'str' with SimpleDateFormatParser using the tokens of
// 'dt_ctx' and stores the outcome in '*d' and '*t'. Returns true on success. A NULL
// or empty input, a parse failure, or a failure to build the date/time values resets
// both outputs via IndicateTimestampParseFailure() and returns false.
// 'd' and 't' must be non-NULL and 'dt_ctx' must contain at least one token.
bool TimestampParser::ParseSimpleDateFormat(const char* str, int len,
    const DateTimeFormatContext& dt_ctx, date* d, time_duration* t) {
  DCHECK(dt_ctx.toks.size() > 0);
  DCHECK(d != nullptr);
  DCHECK(t != nullptr);
  DateTimeParseResult dt_result;
  const bool has_input = str != nullptr && len > 0;
  if (UNLIKELY(!has_input ||
      !SimpleDateFormatParser::ParseDateTime(str, len, dt_ctx, &dt_result))) {
    return IndicateTimestampParseFailure(d, t);
  }
  if (PopulateParseResult(dt_ctx, dt_result, d, t)) return true;
  return IndicateTimestampParseFailure(d, t);
}
// Parses the 'len' bytes at 'str' with IsoSqlFormatParser using the tokens of
// 'dt_ctx' and stores the outcome in '*d' and '*t'. Returns true on success. A NULL
// or empty input, a parse failure, or a failure to build the date/time values resets
// both outputs via IndicateTimestampParseFailure() and returns false.
// 'd' and 't' must be non-NULL and 'dt_ctx' must contain at least one token.
bool TimestampParser::ParseIsoSqlFormat(const char* str, int len,
    const DateTimeFormatContext& dt_ctx, date* d, time_duration* t) {
  DCHECK(dt_ctx.toks.size() > 0);
  DCHECK(d != nullptr);
  DCHECK(t != nullptr);
  const bool has_input = str != nullptr && len > 0;
  if (UNLIKELY(!has_input)) return IndicateTimestampParseFailure(d, t);

  DateTimeParseResult dt_result;
  const bool parsed = IsoSqlFormatParser::ParseDateTime(str, len, dt_ctx, &dt_result);
  if (parsed && PopulateParseResult(dt_ctx, dt_result, d, t)) return true;
  return IndicateTimestampParseFailure(d, t);
}
// Renders date 'd' and time of day 't' as text by walking the tokens of 'dt_ctx' in
// order. Returns an empty string if the format contains date tokens while 'd' is
// special, or contains time tokens while 't' is special.
// Tokens that produce text append it to the output right away. Tokens that produce a
// number store it in 'numeric'; it is appended at the bottom of the loop, left-padded
// with zeros to the token length unless the token carries the FM modifier.
string TimestampParser::Format(const DateTimeFormatContext& dt_ctx, const date& d,
    const time_duration& t) {
  DCHECK(dt_ctx.toks.size() > 0);
  const bool date_unusable = dt_ctx.has_date_toks && d.is_special();
  const bool time_unusable = dt_ctx.has_time_toks && t.is_special();
  if (date_unusable || time_unusable) return "";

  string out;
  out.reserve(dt_ctx.fmt_out_len);
  for (const DateTimeFormatToken& tok: dt_ctx.toks) {
    // Stays at -1 for tokens that do not produce a numeric value.
    int32_t numeric = -1;
    switch (tok.type) {
      case YEAR:
      case ROUND_YEAR:
        numeric = AdjustYearToLength(d.year(), tok.len);
        break;
      case QUARTER_OF_YEAR:
        numeric = GetQuarter(d.month());
        break;
      case MONTH_IN_YEAR:
        numeric = d.month().as_number();
        break;
      case MONTH_NAME:
      case MONTH_NAME_SHORT:
        out.append(FormatMonthName(d.month().as_number(), tok));
        break;
      case WEEK_OF_YEAR:
        numeric = GetWeekOfYear(d.year(), d.month(), d.day());
        break;
      case WEEK_OF_MONTH:
        numeric = GetWeekOfMonth(d.day());
        break;
      case DAY_OF_WEEK:
        // Output is in [1-7]: 1 stands for Sunday, 2 for Monday, and so on.
        numeric = d.day_of_week() + 1;
        break;
      case DAY_IN_MONTH:
        numeric = d.day();
        break;
      case DAY_IN_YEAR:
        numeric = GetDayInYear(d.year(), d.month(), d.day());
        break;
      case DAY_NAME:
      case DAY_NAME_SHORT:
        out.append(FormatDayName(d.day_of_week() + 1, tok));
        break;
      case HOUR_IN_DAY:
        numeric = t.hours();
        break;
      case HOUR_IN_HALF_DAY: {
        // 12-hour clock: hour 0 is shown as 12, hours 13..23 as 1..11.
        numeric = t.hours();
        if (numeric == 0) {
          numeric = 12;
        } else if (numeric > 12) {
          numeric -= 12;
        }
        break;
      }
      case MERIDIEM_INDICATOR: {
        // A token of length 2 selects the short indicator texts, any other length
        // the long ones.
        const bool short_form = (tok.len == 2);
        const MERIDIEM_INDICATOR_TEXT* text;
        if (t.hours() >= 12) {
          text = short_form ? &PM : &PM_LONG;
        } else {
          text = short_form ? &AM : &AM_LONG;
        }
        // The case of the first character of the token picks which member is used.
        out.append((isupper(*tok.val)) ? text->first : text->second, tok.len);
        break;
      }
      case MINUTE_IN_HOUR:
        numeric = t.minutes();
        break;
      case SECOND_IN_MINUTE:
        numeric = t.seconds();
        break;
      case SECOND_IN_DAY:
        numeric = t.hours() * 3600 + t.minutes() * 60 + t.seconds();
        break;
      case FRACTION: {
        numeric = t.fractional_seconds();
        // Drop one low-order digit for every position the token is shorter than 9.
        if (numeric > 0) {
          for (int width = tok.len; width < 9; ++width) numeric /= 10;
        }
        break;
      }
      case SEPARATOR:
      case ISO8601_TIME_INDICATOR:
      case ISO8601_ZULU_INDICATOR:
        // Copied to the output as they appear in the format.
        out.append(tok.val, tok.len);
        break;
      case TZ_OFFSET:
        // Nothing is written for timezone offset tokens.
        break;
      case TEXT:
        out.append(FormatTextToken(tok));
        break;
      case ISO8601_WEEK_NUMBERING_YEAR:
        numeric = AdjustYearToLength(GetIso8601WeekNumberingYear(d), tok.len);
        break;
      case ISO8601_WEEK_OF_YEAR:
        numeric = d.week_number();
        break;
      case ISO8601_DAY_OF_WEEK: {
        // day_of_week() yields 0 for Sunday, 1 for Monday, ..., 6 for Saturday, while
        // the output has to be 1 for Monday through 7 for Sunday.
        numeric = d.day_of_week();
        if (numeric == 0) numeric = 7;
        break;
      }
      default: DCHECK(false) << "Unknown date/time format token";
    }
    if (numeric < 0) continue;

    string digits = std::to_string(numeric);
    if (!tok.fm_modifier && digits.length() < tok.len) {
      digits.insert(0, tok.len - digits.length(), '0');
    }
    out.append(digits);
  }
  return out;
}
// Returns the ISO 8601 week-numbering year of 'd'. The date is converted to a day
// count relative to 1970-01-01 and the computation is delegated to DateValue.
// 'd' must not be special and its year must be within [1400, 9999].
int TimestampParser::GetIso8601WeekNumberingYear(const boost::gregorian::date& d) {
  DCHECK(!d.is_special());
  DCHECK(1400 <= d.year() && d.year() <= 9999);
  static const boost::gregorian::date epoch(1970, 1, 1);
  DateValue dv((d - epoch).days());
  DCHECK(dv.IsValid());
  const int iso_year = dv.Iso8601WeekNumberingYear();
  // 1400.01.01 is a Wednesday and 9999.12.31 is a Friday, so the week-numbering year
  // cannot leave the [1400, 9999] range.
  DCHECK(1400 <= iso_year && iso_year <= 9999);
  return iso_year;
}
}