blob: 70c0eb4ec484882bd557754e80839085e5448026 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <cstring>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>

#include <boost/date_time/posix_time/ptime.hpp>

#include "gutil/macros.h"
#include "runtime/timestamp-value.h"
#include "udf/udf.h"
namespace impala {
using impala_udf::FunctionContext;
using impala_udf::StringVal;
// Impala provides multiple algorithms to parse datetime formats:
// - SimpleDateFormat: This is the one that is traditionally used with functions such
// as to_timestamp() and from_timestamp().
// - ISO SQL:2016 compliant datetime pattern matching. CAST(..FORMAT..) comes with
// support for this pattern only.
// This is a collection of the logic that is shared between the 2 types of pattern
// matching including result codes, error reporting, format token types etc.
namespace datetime_parse_util {
// Number of days in each month of a non-leap year, indexed from January (index 0) to
// December (index 11). February is listed with 28 days.
const int MONTH_LENGTHS[12] = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
// Upper bound on the length of a fractional second component.
// NOTE(review): 9 presumably corresponds to nanosecond precision -- confirm.
const int FRACTIONAL_SECOND_MAX_LENGTH = 9;
// Describes ranges for months in a non-leap year expressed as number of days since
// January 1. The vector has 13 entries: entry i (0-based) is the number of days in the
// year that precede the (i + 1)-th month, so entry 0 is 0 for January, and the last
// entry is the total number of days in the year (365).
const std::vector<int> MONTH_RANGES = {
0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 };
// Describes ranges for months in a leap year expressed as number of days since January 1.
// The vector has 13 entries: entry i (0-based) is the number of days in the year that
// precede the (i + 1)-th month, so entry 0 is 0 for January, and the last entry is the
// total number of days in the year (366). From March onwards every entry is one larger
// than in a non-leap year because February has 29 days.
const std::vector<int> LEAP_YEAR_MONTH_RANGES = {
0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366 };
// Contains all the possible result codes that can come from parsing a datetime format
// pattern. This is an unscoped enum, so the enumerators are visible directly in the
// enclosing namespace. ReportBadFormat() receives one of these values as its
// 'error_type' argument.
enum FormatTokenizationResult {
// First enumerator, therefore its numeric value is 0.
// NOTE(review): presumably the only value that does not denote a failure -- confirm.
SUCCESS,
GENERAL_ERROR,
DUPLICATE_FORMAT,
YEAR_WITH_ROUNDED_YEAR_ERROR,
CONFLICTING_YEAR_TOKENS_ERROR,
DAY_OF_YEAR_TOKEN_CONFLICT,
CONFLICTING_HOUR_TOKENS_ERROR,
CONFLICTING_MERIDIEM_TOKENS_ERROR,
MERIDIEM_CONFLICTS_WITH_HOUR_ERROR,
MISSING_HOUR_TOKEN_ERROR,
SECOND_IN_DAY_CONFLICT,
TOO_LONG_FORMAT_ERROR,
TIMEZONE_OFFSET_NOT_ALLOWED_ERROR,
MISSING_TZH_TOKEN_ERROR,
DATE_WITH_TIME_ERROR,
CONFLICTING_FRACTIONAL_SECOND_TOKENS_ERROR,
TEXT_TOKEN_NOT_CLOSED,
NO_DATETIME_TOKENS_ERROR,
MISPLACED_FX_MODIFIER_ERROR
};
/// Holds all the token types that serve as building blocks for datetime format patterns.
/// UNKNOWN is explicitly 0; the remaining enumerators take consecutive values in
/// declaration order. This is an unscoped enum, so the enumerators are visible directly
/// in the enclosing namespace.
enum DateTimeFormatTokenType {
UNKNOWN = 0,
SEPARATOR,
YEAR,
ROUND_YEAR,
MONTH_IN_YEAR,
/// NOTE(review): the meaning of the 'SLT' suffix is not explained in this header --
/// confirm against the tokenizer that produces this type.
MONTH_IN_YEAR_SLT,
DAY_IN_MONTH,
DAY_IN_YEAR,
HOUR_IN_DAY,
HOUR_IN_HALF_DAY,
MINUTE_IN_HOUR,
SECOND_IN_DAY,
SECOND_IN_MINUTE,
FRACTION,
TZ_OFFSET,
TIMEZONE_HOUR,
TIMEZONE_MIN,
MERIDIEM_INDICATOR,
ISO8601_TIME_INDICATOR,
ISO8601_ZULU_INDICATOR,
TEXT,
/// Token types for the FM and FX modifiers. See DateTimeFormatToken::fm_modifier and
/// DateTimeFormatContext::fx_modifier for the flags that record their effect.
FM_MODIFIER,
FX_MODIFIER
};
/// Indicates whether the cast is a 'datetime to string' or a 'string to datetime' cast.
enum CastDirection {
/// A string type to datetime type cast.
PARSE,
/// A datetime type to string type cast.
FORMAT
};
/// Two spellings of one meridiem indicator: 'first' holds the upper-case form and
/// 'second' holds the lower-case form.
using MERIDIEM_INDICATOR_TEXT = std::pair<const char*, const char*>;
/// Ante meridiem indicator, without and with dots.
const MERIDIEM_INDICATOR_TEXT AM{"AM", "am"};
const MERIDIEM_INDICATOR_TEXT AM_LONG{"A.M.", "a.m."};
/// Post meridiem indicator, without and with dots.
const MERIDIEM_INDICATOR_TEXT PM{"PM", "pm"};
const MERIDIEM_INDICATOR_TEXT PM_LONG{"P.M.", "p.m."};
/// Describes a single token that was found in a datetime format.
struct DateTimeFormatToken {
  /// Which kind of datetime format token this is.
  DateTimeFormatTokenType type;
  /// Position at which this token is supposed to start in the datetime string that is
  /// going to be parsed.
  int pos;
  /// Length of the token.
  int len;
  /// Points at the first character of this token inside the format string.
  const char* val;
  /// Set when the FM modifier is active for this token, in which case it overrides the
  /// FX modifier that is active for the whole format. Starts out as false.
  bool fm_modifier = false;
  /// Set when this is a text token surrounded by escaped double quotes, which makes the
  /// content of the token double-escaped. Starts out as false.
  bool is_double_escaped = false;

  /// Stores the four given values as-is; both boolean flags keep their false default.
  DateTimeFormatToken(DateTimeFormatTokenType type, int pos, int len, const char* val)
    : type(type), pos(pos), len(len), val(val) {}
};
/// Holds metadata about the datetime format. In the format parsing process the members of
/// this struct are populated gradually as the process advances. After the parsing process
/// this holds the found format tokens alongside with auxiliary information such as
/// whether the input format contains date or time tokens or both.
struct DateTimeFormatContext {
  /// Pointer to the beginning of the format string.
  const char* fmt;
  /// Length of the format string.
  int fmt_len;
  /// Expected length of the output of a 'datetime to string' cast. This usually equals to
  /// the length of the input format string. However, there are some edge cases where this
  /// is not true:
  /// - SimpleDateFormat parsing on '2019-11-10' as input and 'yyyy-d-m' as format
  ///   produces output that is longer than the format string.
  /// - ISO SQL parsing has token types where the output length is different from the
  ///   token length like: 'MONTH', 'DAY', 'HH12', 'HH24', FF1, FF2, FF4, etc.
  int fmt_out_len;
  /// Vector of tokens found in the format string.
  std::vector<DateTimeFormatToken> toks;
  /// True if the format contains date tokens.
  bool has_date_toks;
  /// True if the format contains time tokens.
  bool has_time_toks;
  /// True if the format contains an FX modifier effective for all the tokens.
  bool fx_modifier;
  /// Used for casting with SimpleDateFormat to handle rounded year. Make sure you call
  /// SetCenturyBreakAndCurrentTime() before using this member.
  boost::posix_time::ptime century_break_ptime;
  /// Used for round year and less than 4-digit year calculation in ISO:SQL:2016 parsing.
  /// Make sure you call SetCenturyBreakAndCurrentTime() before using this member. Not
  /// owned by this object.
  const TimestampValue* current_time;

  /// Creates a context without a format string; equivalent to Reset(nullptr).
  DateTimeFormatContext() {
    Reset(nullptr);
  }

  /// Creates a context for the null-terminated format string 'fmt' ('fmt' may be
  /// nullptr). Deliberately left non-explicit so that existing callers relying on the
  /// implicit conversion from 'const char*' keep compiling.
  DateTimeFormatContext(const char* fmt) {
    Reset(fmt);
  }

  /// Creates a context for the format string that starts at 'fmt' and is 'fmt_len'
  /// characters long.
  DateTimeFormatContext(const char* fmt, int fmt_len) {
    Reset(fmt, fmt_len);
  }

  /// Set the century break for parsing 1 or 2-digit year format. When parsing 1 or
  /// 2-digit year, the year should be in the interval [now - 80 years, now + 20 years),
  /// according to Hive. Also sets the current time that is used for round year
  /// calculation in ISO:SQL:2016 parsing.
  void SetCenturyBreakAndCurrentTime(const TimestampValue& now);

  /// Initializes all the members of this object.
  void Reset(const char* fmt, int fmt_len);

  /// Convenience overload for a null-terminated format string. A nullptr 'fmt' is
  /// treated as a format of length 0. The length is stored as an int, so the size_t
  /// returned by strlen() is narrowed explicitly rather than through a silent implicit
  /// conversion.
  void Reset(const char* fmt) {
    Reset(fmt, (fmt == nullptr) ? 0 : static_cast<int>(std::strlen(fmt)));
  }
};
/// Stores the results of parsing a date/time string. Every member has a default member
/// initializer, so a default-constructed instance is fully initialized.
struct DateTimeParseResult {
/// Defaults to -1.
/// NOTE(review): -1 presumably marks that no year has been parsed yet -- confirm.
int year = -1;
/// Month, day, hour, minute and second components. Each defaults to 0.
int month = 0;
int day = 0;
int hour = 0;
int minute = 0;
int second = 0;
/// Fractional second component. Defaults to 0.
/// NOTE(review): the unit is not visible in this header (presumably nanoseconds, given
/// FRACTIONAL_SECOND_MAX_LENGTH is 9) -- confirm against ParseFractionToken().
int32_t fraction = 0;
/// Time zone offset. Defaults to a zero duration (0 hours, 0 minutes, 0 seconds,
/// 0 fractional seconds).
boost::posix_time::time_duration tz_offset =
boost::posix_time::time_duration(0, 0, 0, 0);
/// Defaults to false.
/// NOTE(review): presumably requests a later adjustment of a short (1 or 2-digit) year
/// relative to the century break -- confirm against the parser implementations.
bool realign_year = false;
};
/// This function is used to indicate an error or warning when the input format
/// tokenization fails for some reason. Constructs an error message based on 'error_type'
/// and pushes it to 'context'. Depending on 'is_error' the message can be an error or
/// warning.
/// - 'context': receives the constructed message.
/// - 'error_type': the tokenization result that selects the message.
/// - 'format': NOTE(review): presumably the format string that failed tokenization --
///   confirm against the definition.
/// - 'is_error': true to report an error, false to report a warning.
void ReportBadFormat(FunctionContext* context, FormatTokenizationResult error_type,
const StringVal& format, bool is_error);
/// NOTE(review): undocumented in the original header; the description below is inferred
/// from the signature and must be confirmed against the definition.
/// Presumably parses the 'token_len' characters starting at 'token' as an integer,
/// checks it against the bounds 'min' and 'max', and stores the value in 'result'.
/// The return value is marked WARN_UNUSED_RESULT, so callers are expected to check it.
bool ParseAndValidate(const char* token, int token_len, int min, int max,
int* result) WARN_UNUSED_RESULT;
/// NOTE(review): undocumented in the original header; the description below is inferred
/// from the signature and must be confirmed against the definition.
/// Presumably parses the 'token_len' characters starting at 'token' as a fractional
/// second and stores it in 'result' (likely in DateTimeParseResult::fraction).
/// The return value is marked WARN_UNUSED_RESULT, so callers are expected to check it.
bool ParseFractionToken(const char* token, int token_len,
DateTimeParseResult* result) WARN_UNUSED_RESULT;
/// Returns true if 'year' is a leap year: it is divisible by 4 and, when it is also
/// divisible by 100, it must be divisible by 400 as well.
inline bool IsLeapYear(int year) {
  // Years not divisible by 4 are never leap years.
  if (year % 4 != 0) return false;
  // Divisible by 4 but not a century year: always a leap year.
  if (year % 100 != 0) return true;
  // Century years are leap years only when divisible by 400.
  return year % 400 == 0;
}
/// Given the year, month and the day in month calculates the day in year using
/// MONTH_LENGTHS.
/// NOTE(review): whether 'month' and the returned day in year are 0- or 1-based, and how
/// invalid input is handled, is not visible in this header -- confirm against the
/// definition. 'year' is presumably needed to account for February 29 in leap years,
/// since MONTH_LENGTHS describes a non-leap year.
int GetDayInYear(int year, int month, int day_in_month);
/// Gets a year and the number of days passed since 1st of January that year. Calculates
/// the month and the day of that year. Returns false if any of the in parameters are
/// invalid e.g. if calling this function with a non-leap year and 'days_since_jan1' is
/// 365. Returns true on success.
/// 'days_since_jan1' is zero-based: 0 refers to January 1 itself, which is why 365 is
/// out of range for a non-leap year. The results are written to the out parameters
/// 'month' and 'day'.
/// NOTE(review): whether 'month' and 'day' are written when false is returned is not
/// visible here -- confirm against the definition.
bool GetMonthAndDayFromDaysSinceJan1(int year, int days_since_jan1, int* month, int* day)
WARN_UNUSED_RESULT;
// Receives a text token and gives its string formatted representation. This is used in
// a string to datetime conversion path.
// NOTE(review): producing a formatted string would normally belong to the datetime to
// string direction (CastDirection::FORMAT); confirm which conversion path actually
// calls this and correct the sentence above if needed.
// 'tok' is presumably expected to be of type TEXT -- confirm against the definition.
std::string FormatTextToken(const DateTimeFormatToken& tok);
}
}