| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #pragma once |
| |
#include <cstring>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include <boost/date_time/posix_time/ptime.hpp>

#include "exprs/timestamp-functions.h"
#include "gutil/macros.h"
#include "runtime/timestamp-value.h"
#include "udf/udf.h"
| |
| namespace impala { |
| |
| using impala_udf::FunctionContext; |
| using impala_udf::StringVal; |
| |
| /// Impala provides multiple algorithms to parse datetime formats: |
| /// - SimpleDateFormat: This is the one that is traditionally used with functions such |
| /// as to_timestamp() and from_timestamp(). |
| /// - ISO SQL:2016 compliant datetime pattern matching. CAST(..FORMAT..) comes with |
| /// support for this pattern only. |
| /// This is a collection of the logic that is shared between the 2 types of pattern |
| /// matching including result codes, error reporting, format token types etc. |
| namespace datetime_parse_util { |
/// Maximum length of a fractional second part. NOTE(review): presumably counted in
/// digits, in which case 9 digits means nanosecond precision -- confirm against users.
constexpr int FRACTIONAL_SECOND_MAX_LENGTH = 9;
| |
/// Month boundaries of a non-leap year, expressed as the number of days elapsed since
/// January 1. Entry 'i' holds the number of days that precede month 'i + 1'; the last
/// entry is the length of the whole year.
const std::vector<int> MONTH_RANGES{
    0,    // start of January
    31,   // start of February
    59,   // start of March
    90,   // start of April
    120,  // start of May
    151,  // start of June
    181,  // start of July
    212,  // start of August
    243,  // start of September
    273,  // start of October
    304,  // start of November
    334,  // start of December
    365   // end of the year
};
| |
/// Month boundaries of a leap year, expressed as the number of days elapsed since
/// January 1. Entry 'i' holds the number of days that precede month 'i + 1'; the last
/// entry is the length of the whole year.
const std::vector<int> LEAP_YEAR_MONTH_RANGES{
    0,    // start of January
    31,   // start of February
    60,   // start of March
    91,   // start of April
    121,  // start of May
    152,  // start of June
    182,  // start of July
    213,  // start of August
    244,  // start of September
    274,  // start of October
    305,  // start of November
    335,  // start of December
    366   // end of the year
};
| |
/// Lookup table keyed by the lowercase 3-letter prefix of a month name. Each value is a
/// pair of:
///  - the remaining letters of the full month name (empty for "may"), which lets a
///    string to datetime conversion verify that a full month name was spelled correctly;
///  - the ordinal number of the month (1 to 12), which can be used as the result of the
///    conversion.
/// The key alone is enough to uniquely identify the month.
const std::unordered_map<std::string, std::pair<std::string, int>>
    MONTH_PREFIX_TO_SUFFIX{
        {"jan", {"uary", 1}},    // january
        {"feb", {"ruary", 2}},   // february
        {"mar", {"ch", 3}},      // march
        {"apr", {"il", 4}},      // april
        {"may", {"", 5}},        // may
        {"jun", {"e", 6}},       // june
        {"jul", {"y", 7}},       // july
        {"aug", {"ust", 8}},     // august
        {"sep", {"tember", 9}},  // september
        {"oct", {"ober", 10}},   // october
        {"nov", {"ember", 11}},  // november
        {"dec", {"ember", 12}},  // december
    };
| |
/// Lookup table keyed by the lowercase 3-letter prefix of a day name, built the same way
/// as 'MONTH_PREFIX_TO_SUFFIX'. Each value is a pair of the remaining letters of the
/// full day name and the ordinal number of the day, where Monday is 1 and Sunday is 7.
const std::unordered_map<std::string, std::pair<std::string, int>>
    DAY_PREFIX_TO_SUFFIX{
        {"mon", {"day", 1}},     // monday
        {"tue", {"sday", 2}},    // tuesday
        {"wed", {"nesday", 3}},  // wednesday
        {"thu", {"rsday", 4}},   // thursday
        {"fri", {"day", 5}},     // friday
        {"sat", {"urday", 6}},   // saturday
        {"sun", {"day", 7}},     // sunday
    };
| |
/// Number of characters in an abbreviated month name such as 'JAN' or 'FEB'.
constexpr int SHORT_MONTH_NAME_LENGTH = 3;
/// Number of characters in the longest full month name, 'SEPTEMBER'.
constexpr int MAX_MONTH_NAME_LENGTH = 9;

/// Number of characters in an abbreviated day name such as 'MON' or 'TUE'.
constexpr int SHORT_DAY_NAME_LENGTH = 3;
/// Number of characters in the longest full day name, 'WEDNESDAY'.
constexpr int MAX_DAY_NAME_LENGTH = 9;
| |
/// Result codes that parsing (tokenizing) a datetime format pattern can produce.
/// Everything other than SUCCESS identifies a specific reason for rejecting the pattern.
/// The enumerators are numbered consecutively starting from 0; keep the order stable.
enum FormatTokenizationResult {
  SUCCESS = 0,
  GENERAL_ERROR,
  DUPLICATE_FORMAT,

  // Codes related to year, month and day tokens.
  YEAR_WITH_ROUNDED_YEAR_ERROR,
  CONFLICTING_YEAR_TOKENS_ERROR,
  CONFLICTING_MONTH_TOKENS_ERROR,
  DAY_OF_YEAR_TOKEN_CONFLICT,

  // Codes related to hour, meridiem indicator and second tokens.
  CONFLICTING_HOUR_TOKENS_ERROR,
  CONFLICTING_MERIDIEM_TOKENS_ERROR,
  MERIDIEM_CONFLICTS_WITH_HOUR_ERROR,
  MISSING_HOUR_TOKEN_ERROR,
  SECOND_IN_DAY_CONFLICT,

  TOO_LONG_FORMAT_ERROR,

  // Codes related to timezone offset tokens.
  TIMEZONE_OFFSET_NOT_ALLOWED_ERROR,
  MISSING_TZH_TOKEN_ERROR,

  DATE_WITH_TIME_ERROR,
  CONFLICTING_FRACTIONAL_SECOND_TOKENS_ERROR,
  TEXT_TOKEN_NOT_CLOSED,
  NO_DATETIME_TOKENS_ERROR,
  MISPLACED_FX_MODIFIER_ERROR,

  // Codes for tokens that are rejected when parsing.
  QUARTER_NOT_ALLOWED_FOR_PARSING,
  DAY_OF_WEEK_NOT_ALLOWED_FOR_PARSING,
  DAY_NAME_NOT_ALLOWED_FOR_PARSING,
  WEEK_NUMBER_NOT_ALLOWED_FOR_PARSING,

  CONFLICTING_DAY_OF_WEEK_TOKENS_ERROR,
  MISSING_ISO8601_WEEK_BASED_TOKEN_ERROR,
  CONFLICTING_DATE_TOKENS_ERROR
};
| |
/// All the token types that a datetime format pattern can be built from. The
/// enumerators are numbered consecutively starting from 0; keep the order stable.
enum DateTimeFormatTokenType {
  UNKNOWN = 0,
  SEPARATOR,

  // Date tokens.
  YEAR,
  ROUND_YEAR,
  MONTH_IN_YEAR,
  DAY_IN_MONTH,
  DAY_IN_YEAR,

  // Time tokens.
  HOUR_IN_DAY,
  HOUR_IN_HALF_DAY,
  MINUTE_IN_HOUR,
  SECOND_IN_DAY,
  SECOND_IN_MINUTE,
  FRACTION,

  // Timezone tokens.
  TZ_OFFSET,
  TIMEZONE_HOUR,
  TIMEZONE_MIN,

  MERIDIEM_INDICATOR,
  ISO8601_TIME_INDICATOR,
  ISO8601_ZULU_INDICATOR,
  TEXT,

  // Modifiers.
  FM_MODIFIER,
  FX_MODIFIER,

  // Month and day names.
  MONTH_NAME,
  MONTH_NAME_SHORT,
  DAY_NAME,
  DAY_NAME_SHORT,

  DAY_OF_WEEK,
  QUARTER_OF_YEAR,
  WEEK_OF_YEAR,
  WEEK_OF_MONTH,

  // ISO 8601 week-based tokens.
  ISO8601_WEEK_NUMBERING_YEAR,
  ISO8601_WEEK_OF_YEAR,
  ISO8601_DAY_OF_WEEK
};
| |
/// Direction of a cast between string and datetime types.
enum CastDirection {
  /// 'string to datetime' cast: a string type is cast to a datetime type.
  PARSE = 0,
  /// 'datetime to string' cast: a datetime type is cast to a string type.
  FORMAT
};
| |
/// Textual forms of a meridiem indicator: 'first' is the uppercase spelling and 'second'
/// is the lowercase spelling.
using MERIDIEM_INDICATOR_TEXT = std::pair<const char*, const char*>;

/// Meridiem indicators without dots.
const MERIDIEM_INDICATOR_TEXT AM{"AM", "am"};
const MERIDIEM_INDICATOR_TEXT PM{"PM", "pm"};

/// Meridiem indicators with dots.
const MERIDIEM_INDICATOR_TEXT AM_LONG{"A.M.", "a.m."};
const MERIDIEM_INDICATOR_TEXT PM_LONG{"P.M.", "p.m."};
| |
| /// Stores metadata about a token within a datetime format. |
| struct DateTimeFormatToken { |
| /// Indicates the type of datetime format token. |
| DateTimeFormatTokenType type; |
| /// The position of where this token is supposed to start in the datetime string |
| /// to be parsed. |
| int pos; |
| /// The length of the token. |
| int len; |
| /// A pointer to the beginning of this token in the format string. |
| const char* val; |
| /// True if FM modifier is active for this token. This overrides the FX modifier active |
| /// for the whole format. |
| bool fm_modifier; |
| |
| /// True if this is a text token that is surrounded by escaped double quotes making the |
| /// content of the token double-escaped. |
| bool is_double_escaped; |
| |
| DateTimeFormatToken(DateTimeFormatTokenType type, int pos, int len, const char* val) |
| : type(type), pos(pos), len(len), val(val), fm_modifier(false), |
| is_double_escaped(false) { |
| } |
| }; |
| |
| /// Holds metadata about the datetime format. In the format parsing process the members of |
| /// this struct are populated gradually as the process advances. After the parsing process |
| /// this holds the found format tokens alongside with auxiliary information such as |
| /// whether the input format contains date or time tokens or both. |
| struct DateTimeFormatContext { |
| /// Pointer to the beginning of the format string. |
| const char* fmt; |
| /// Length of the format string. |
| int fmt_len; |
| /// Expected length of the output of a 'datetime to string' cast. This usually equals to |
| /// the length of the input format string. However, there are some edge cases where this |
| /// is not true: |
| /// - SimpleDateFormat parsing on '2019-11-10' as input and 'yyyy-d-m' as format |
| /// produces output that is longer than the format string. |
| /// - ISO SQL parsing has token types where the output length is different from the |
| /// token length like: 'MONTH', 'DAY', 'HH12', 'HH24', FF1, FF2, FF4, etc. |
| int fmt_out_len; |
| /// Vector of tokens found in the format string. |
| std::vector<DateTimeFormatToken> toks; |
| bool has_date_toks; |
| bool has_time_toks; |
| |
| /// True if the format contains an FX modifier effective for all the tokens. |
| bool fx_modifier; |
| |
| /// Used for casting with SimpleDateFormat to handle rounded year. Make sure you call |
| /// SetCenturyBreakAndCurrentTime() before using this member. |
| boost::posix_time::ptime century_break_ptime; |
| /// Used for round year and less than 4-digit year calculation in ISO:SQL:2016 parsing. |
| /// Make sure you call SetCenturyBreakAndCurrentTime() before using this member. Not |
| /// owned by this object. |
| const TimestampValue* current_time; |
| |
| DateTimeFormatContext() { |
| Reset(nullptr); |
| } |
| |
| DateTimeFormatContext(const char* fmt) { |
| Reset(fmt); |
| } |
| |
| DateTimeFormatContext(const char* fmt, int fmt_len) { |
| Reset(fmt, fmt_len); |
| } |
| |
| /// Set the century break for parsing 1 or 2-digit year format. When parsing 1 or |
| /// 2-digit year, the year should be in the interval [now - 80 years, now + 20 years), |
| /// according to Hive. Also sets the current time that is used for round year |
| /// calculation in ISO:SQL:2016 parsing. |
| void SetCenturyBreakAndCurrentTime(const TimestampValue& now); |
| |
| /// Initializes all the members of this object. |
| void Reset(const char* fmt, int fmt_len); |
| |
| void Reset(const char* fmt) { |
| Reset(fmt, (fmt == nullptr) ? 0 : strlen(fmt)); |
| } |
| }; |
| |
| /// Stores the results of parsing a date/time string. |
| struct DateTimeParseResult { |
| int year = -1; |
| int month = 0; |
| int day = 0; |
| int hour = 0; |
| int minute = 0; |
| int second = 0; |
| int32_t fraction = 0; |
| boost::posix_time::time_duration tz_offset = |
| boost::posix_time::time_duration(0, 0, 0, 0); |
| bool realign_year = false; |
| }; |
| |
| /// This function is used to indicate an error or warning when the input format |
| /// tokenization fails for some reason. Constructs an error message based on 'error_type' |
| /// and pushes it to 'context'. Depending on 'is_error' the message can be an error or |
| /// warning. |
| void ReportBadFormat(FunctionContext* context, FormatTokenizationResult error_type, |
| const StringVal& format, bool is_error); |
| |
| bool ParseAndValidate(const char* token, int token_len, int min, int max, |
| int* result) WARN_UNUSED_RESULT; |
| |
/// Calculates the quarter of the year that the given 'month' belongs to.
int GetQuarter(int month);
| |
| bool ParseFractionToken(const char* token, int token_len, |
| DateTimeParseResult* result) WARN_UNUSED_RESULT; |
| |
| /// Gets a month name token (either full or short name) and converts it to the ordinal |
| /// number of month between 1 and 12. Make sure 'tok.type' is either MONTH_NAME or |
| /// MONTH_NAME_SHORT. Result is stored in 'month'. Returns false if the given month name |
| /// is invalid. 'fx_modifier' indicates if there is an active FX modifier on the whole |
| /// format. |
| /// If the month part of the input is not followed by a separator then the end of the |
| /// month part is found using MONTH_PREFIX_TO_SUFFIX. First, the 3 letter prefix of the |
| /// month name identifies a particular month and then checks if the rest of the month |
| /// name matches. If it does then '*token_end' is adjusted to point to the character |
| /// right after the end of the month part. |
| bool ParseMonthNameToken(const DateTimeFormatToken& tok, const char* token_start, |
| const char** token_end, bool fx_modifier, int* month) |
| WARN_UNUSED_RESULT; |
| |
| /// Gets a day name token (either full or short name) and converts it to the ordinal |
| /// number of day between 1 and 7. Make sure 'tok.type' is either DAY_NAME or |
| /// DAY_NAME_SHORT. |
| /// Result is stored in 'day'. Returns false if the given day name is invalid. |
| /// 'fx_modifier' indicates if there is an active FX modifier on the whole format. |
| /// If the day part of the input is not followed by a separator then the end of the day |
| /// part is found using DAY_PREFIX_TO_SUFFIX. First, the 3 letter prefix of the day name |
| /// identifies a particular day and then checks if the rest of the day name matches. If it |
| /// does then '*token_end' is adjusted to point to the character right after the end of |
| /// the day part. |
| bool ParseDayNameToken(const DateTimeFormatToken& tok, const char* token_start, |
| const char** token_end, bool fx_modifier, int* day) |
| WARN_UNUSED_RESULT; |
| |
/// Returns true if 'year' is a leap year: it is divisible by 4 and either not divisible
/// by 100 or divisible by 400.
inline bool IsLeapYear(int year) {
  // Years not divisible by 4 are never leap years.
  if (year % 4 != 0) return false;
  // Divisible by 4 but not a century year: always a leap year.
  if (year % 100 != 0) return true;
  // Century years are leap years only when divisible by 400.
  return year % 400 == 0;
}
| |
/// Calculates the day in year from 'year', 'month' and 'day_in_month'.
int GetDayInYear(
    int year, int month, int day_in_month);
| |
| /// Gets a year and the number of days passed since 1st of January that year. Calculates |
| /// the month and the day of that year. Returns false if any of the in parameters are |
| /// invalid e.g. if calling this function with a non-leap year and 'days_since_jan1' is |
| /// 365. Returns true on success. |
| bool GetMonthAndDayFromDaysSinceJan1(int year, int days_since_jan1, int* month, int* day) |
| WARN_UNUSED_RESULT; |
| |
| // Receives a text token and gives its string formatted representation. This is used in |
| // a string to datetime conversion path. |
| std::string FormatTextToken(const DateTimeFormatToken& tok); |
| |
| /// Taking 'num_of_month' this function provides the name of the month. Based on the |
| /// casing of the month format token in 'tok' this can format the results in 3 cases: |
| /// Capitalized, full lowercase and full uppercase. E.g. "March", "march" and "MARCH". |
| const std::string& FormatMonthName(int num_of_month, const DateTimeFormatToken& tok); |
| |
| /// Gets 'day' as a number between 1 and 7 that represents the day of week where Sunday |
| /// is 1 and returns the name of the day. Based on the casing of the day format token in |
| /// 'tok' this can format the results in 3 cases: Capitalized, full lowercase and full |
| /// uppercase. E.g. "Monday", "monday" and "MONDAY". |
| const std::string& FormatDayName(int day, const DateTimeFormatToken& tok); |
| |
| /// Returns how the output of a month or day token should be formatted. Make sure to |
| /// call this when 'tok.type' is any of the month or day name tokens. |
| TimestampFunctions::TextCase GetOutputCase(const DateTimeFormatToken& tok); |
| |
/// Calculates the week in year from 'year', 'month' and 'day' (the day in month). The
/// first week of the year starts on January 1.
int GetWeekOfYear(
    int year, int month, int day);
| |
/// Calculates the week in month from 'day', the day of month. The first week of the
/// month starts on the first day of the month.
int GetWeekOfMonth(
    int day);
| |
/// Adjusts 'year' to 'len' digits and returns the result.
/// Example: AdjustYearToLength(1789, 3) returns 789.
int AdjustYearToLength(
    int year, int len);
| } |
| |
| } |