blob: 9d973aa3765380dff488b00d0fb69955043eccbd [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "runtime/datetime-parser-common.h"
#include <string>
#include <unordered_set>
#include <unordered_map>
#include "gutil/macros.h"
namespace impala {
namespace datetime_parse_util {
/// This class is responsible for splitting a datetime format string into tokens following
/// the ISO:SQL:2016 standard. For acceptable tokens please see the design document
/// attached to IMPALA-4018. You can also get a good impression of the available tokens by
/// checking VALID_TOKENS member and IsSeparator() function of this class.
class IsoSqlFormatTokenizer {
IsoSqlFormatTokenizer(DateTimeFormatContext* dt_ctx, CastDirection cast_mode,
bool time_toks) : dt_ctx_(dt_ctx), cast_mode_(cast_mode),
accept_time_toks_(time_toks), fm_modifier_active_(false) {}
void Reset(DateTimeFormatContext* dt_ctx, CastDirection cast_mode, bool time_toks) {
dt_ctx_ = dt_ctx;
cast_mode_ = cast_mode;
accept_time_toks_ = time_toks;
/// Performs parsing of 'dt_ctx_.fmt' format string. During the process this populates
/// 'dt_ctx_' with metadata about the format string such as tokens found in the string.
FormatTokenizationResult Tokenize();
/// Returns true if '*current_pos' points to a valid separator. If
/// 'read_escaped_single_quotes' is true then escaped single quotes are also taken as
/// valid separators. In this case '*current_pos' is advanced from the escaping
/// backslash to the single quote.
static bool IsSeparator(const char** current_pos, const char* str_end,
bool read_escaped_single_quotes = true);
/// Stores metadata about a specific token type.
struct TokenItem {
TokenItem(DateTimeFormatTokenType t, bool dt, bool tt) :
type(t), date_token(dt), time_token(tt) {}
DateTimeFormatTokenType type;
bool date_token;
bool time_token;
/// This holds all the valid tokens accepted by this parser. When a format string is
/// being parsed the substrings are checked against this member to decide if they are
/// valid tokens or not.
/// Note, token matching is case-insensitive even though this member contains
/// upper-case names only.
static const std::unordered_map<std::string, TokenItem> VALID_TOKENS;
/// Keeps the maximum token lengths for tokens where the token length doesn't equal to
/// the length of the datetime pattern. E.g. "HH12" is 4 chars long, however expects
/// maximum 2 digits as "10".
static const std::unordered_map<std::string, int> SPECIAL_LENGTHS;
/// This has to be in line with the longest string in VALID_TOKENS;
static const unsigned MAX_TOKEN_SIZE;
/// To be on the safe side this introduces a max length limit for the input format
/// strings.
static const int MAX_FORMAT_LENGTH;
/// When parsing is in progress this contains the format tokens that we have found in
/// the input format string so far.
std::unordered_set<std::string> used_tokens_;
/// The context that is used for the input of the parsing. It is also populated during
/// the parsing process. Not owned by this class.
/// Note, that 'dt_ctx_->fmt' is a null-terminated string so it's safe to use string
/// functions that make this assumption.
DateTimeFormatContext* dt_ctx_;
/// Decides whether this is a 'datetime to string' or a 'string to datetime' cast.
CastDirection cast_mode_;
bool accept_time_toks_;
/// True when the FM modifier has to be applied to the following non-separator token.
/// It is set back to false once applied on a token.
bool fm_modifier_active_;
/// Iterates through all the consecutive separator characters from a given pointer
/// 'current' and saves them to 'dt_ctx_'.
void ProcessSeparators(const char** current);
/// Identifies the next token using VALID_TOKENS and saves it to 'dt_ctx_'. Finds the
/// longest possible match.
FormatTokenizationResult ProcessNextToken(const char** current_pos);
/// Checks if the token has special length in 'SPECIAL_LENGTHS'. If finds a special
/// length then returns that, otherwise returns the length of 'token'.
int GetMaxTokenLength(const std::string& token) const;
/// Checks if 'token' is present in 'used_tokens_';
bool IsUsedToken(const std::string& token) const;
/// Checks if any of the meridiem indicators are present in 'used_tokens_'.
bool IsMeridiemIndicatorProvided() const;
/// Checks if the end product of the parsing contains format tokens that collide with
/// each other like YYYY and RR.
FormatTokenizationResult CheckIncompatibilities() const;
/// Checks if '*current_pos' points to an FX modifier and advances '*current_pos' after
/// the FX modifier. Sets 'dt_ctx_->fx_modifier' to true if '*current_pos' points to an
/// FX modifier. Call this when '*current_pos' points to the first character of the
/// format string.
void ProcessFXModifier(const char** current_pos);
/// Returns true if 'current_pos' points to either a double quote or to a backslash
/// that is followed by a double quote.
static bool IsStartOfTextToken(const char* current_pos);
/// Finds the end of the text token and saves metadata about the token into 'dt_ctx_'.
/// This has to be called when '*current_pos' points to the beginning of a text token
/// or in other words if IsStartOfTextToken(*current_pos) is true. As a side effect
/// 'current_pos' is advanced right after the closing double qoute of the text token.
FormatTokenizationResult ProcessTextToken(const char** current_pos,
const char* str_begin, const char* str_end);
// Starting from 'str_start' finds the closing quotation mark of the text token even
// if it's escaped. 'str_start' has to point to the first character of the text token
// right after the opening quote. Returns a pointer pointing right after the closing
// double quote of the text token or nullptr if the text token is unclosed.
static const char* FindEndOfTextToken(const char* str_start, const char* str_end,
bool is_escaped);