blob: 3247ab0a1b96a1d0ae3fd8f91c66d9a35d323d2e [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "runtime/datetime-parser-common.h"
#include "gutil/macros.h"
namespace impala {
namespace datetime_parse_util {
/// This class is responsible for parsing an input string into a timestamp using
/// ISO:SQL:2016 datetime format patterns. The list of tokens for the parsing comes as a
/// constructor parameter in 'dt_ctx' pre-processed by IsoSqlFomratTokenizer.
/// For more details of the parsing logic please see the design document attached to
/// IMPALA-4018.
class IsoSqlFormatParser {
/// Given a list of format tokens in 'dt_ctx' runs through 'input_str' and parses it
/// into 'result'. Return value indicates if the parsing was successful.
/// The caller has to make sure that 'dt_ctx.fmt' is a null-terminated string.
static bool ParseDateTime(const char* input_str, int input_len,
const DateTimeFormatContext& dt_ctx, DateTimeParseResult* result)
// Struct used for capturing ISO 8601 week-based tokens during parsing.
struct Iso8601WeekBasedDateParseResult {
int year = -1;
int week_of_year = -1;
int day_of_week = -1;
bool IsSet() const {
return (year != -1 || week_of_year != -1 || day_of_week != -1);
/// Calcuates year/month/day date values corresponding to the ISO 8601 week-based
/// date values stored in this instance.
/// If the ISO 8601 week-based date values are valid, the resulting year/month/day
/// values are copied to 'result' and true is returned.
/// Otherwise, 'result' is left intact and false is returned.
bool ExtractYearMonthDay(DateTimeParseResult* result) const WARN_UNUSED_RESULT;
/// 'input_str' points to a location in the input string where the parsing stands now.
/// Given 'tok' as the next token in the list of tokens created by the tokenizer this
/// function finds the end of the next token.
/// 'input_len' is used for stopping when we reach the end of the input string.
/// Returns a pointer pointing one position after the last character of the found token.
/// If can't identify the next token then returns nullptr.
/// If a MONTH_NAME token is not followed by a separator then the end of the month name
/// in the input can't be found by this function. In this case MAX_MONTH_NAME_LENGTH is
/// expected as the lenght of the month token and later on ParseMonthNameToken() will
/// adjust the end of the token.
static const char* FindEndOfToken(const char* input_str, int input_len,
const DateTimeFormatToken& tok, bool fx_provided) WARN_UNUSED_RESULT;
/// Has to call this function when 'input_str' points to the fist character of a
/// meridiem indicator. Identifies the last position of a meridiem indicator and returns
/// a pointer to the next position after this. If no meridiem indicator is found
/// starting from 'input_str' then returns nullptr.
static const char* ParseMeridiemIndicatorFromInput(const char* input_str,
int input_len);
/// Takes 'year' that is shorter than 4 digits and prefixes it with digits from
/// 'current_year'. Returns the resulting 4-digit year.
static int PrefixYearFromCurrentYear(int year, int year_token_len, int current_year);
/// This function takes a 2-digit 'year' and constructs a 4-digit year from it based on
/// the 'current_year'. Returns the resulting year.
static int RoundYearFromCurrentYear(int year, int current_year);
/// Returns the ISO 8601 week numbering year corresponding to 'now' date.
static int GetIso8601WeekNumberingYear(const TimestampValue* now);
/// Gets a pointer to the current char in the input string and an index to the current
/// token being processed within 'dt_ctx->toks'. Advances these pointers to the end of
/// the current separator sequence no matter how long these sequences are. It's expected
/// that '**tok' is of type SEPARATOR. Returns false if '**current_pos' is not a
/// separator or if either the input ends while having remaining items in 'dt_ctx->toks'
/// or the other way around.
static bool ProcessSeparatorSequence(const char** current_pos, const char* end_pos,
const DateTimeFormatContext& dt_ctx, int* dt_ctx_it);
// Gets the next character starting from '*format' that can be used for input
// matching. Takes care of the escaping backslashes regardless if the text token inside
// the format is itself double escped or not. Returns the next character in a form
// expected in the input. If '*format' points at the beginning of an escape sequence,
// '*format' is moved to the last character of the escape sequence. Otherwise,
// '*format' is not changed. E.g. If the text token is "\"abc" then this returns '"'
// after skipping the backslash and moves '*format' to '"'.
static char GetNextCharFromTextToken(const char** format,
const DateTimeFormatToken* tok);