// blob: 740b27ca18afed22582b7e78377d0515988b789c [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "runtime/datetime-parser-common.h"
#include <boost/unordered_map.hpp>
#include "runtime/string-value.h"
namespace impala {
/// The functionality here covers SimpleDateFormat pattern handling in Impala.
/// Adds support for dealing with custom date/time formats in Impala. The following
/// date/time tokens are supported:
/// y - Year
/// M - Month
/// d - Day
/// H - Hour
/// m - Minute
/// s - Second
/// S - Fractional second
///
/// TimeZone offset formats (Must be at the end of format string):
/// +/-hh:mm
/// +/-hhmm
/// +/-hh
///
///
/// The token names and usage have been modeled after the SimpleDateFormat class used in
/// Java, with only the above list of tokens being supported. All fields will consume
/// variable length inputs when parsing an input string and must therefore use separators
/// to specify the boundaries of the fields, with the exception of TimeZone values, which
/// have to be of fixed width. Repeating tokens can be used to specify fields of exact
/// width, e.g. in yy-MM both fields must be of exactly length two. When using fixed width
/// fields values must be zero-padded and output values will be zero padded during
/// formatting. There is one exception to this: a month field of length 3 will specify
/// literal month names instead of zero padding, i.e., yyyy-MMM-dd will parse from and
/// format to strings like 2013-Nov-21. When using fields of fixed width the separators
/// can be omitted.
///
///
/// Formatting character groups can appear in any order, along with any separators. The
/// only exception is the TimeZone offset, which must be at the end of the format string.
/// e.g.
/// yyyy/MM/dd
/// dd-MMM-yy
/// (dd)(MM)(yyyy) HH:mm:ss
/// yyyy-MM-dd HH:mm:ss+hh:mm
/// ..etc..
///
/// The following features are not supported:
/// Long literal months e.g. MMMM
/// Nested strings e.g. "Year: " yyyy "Month: " MM "Day: " dd
/// Lazy formatting
namespace datetime_parse_util {
/// Static-only helper that turns SimpleDateFormat-style patterns, or date/time strings
/// that follow one of the default formats, into the tokens held by a
/// DateTimeFormatContext. Every member is static; only declarations appear in this
/// header, the definitions live elsewhere.
class SimpleDateFormatTokenizer {
public:
/// Constants to hold default format lengths. They are declared without an in-class
/// initializer, so their values come from the out-of-line definitions.
static const int DEFAULT_DATE_FMT_LEN;
static const int DEFAULT_TIME_FMT_LEN;
static const int DEFAULT_TIME_FRAC_FMT_LEN;
static const int DEFAULT_SHORT_DATE_TIME_FMT_LEN;
static const int DEFAULT_DATE_TIME_FMT_LEN;
/// Parse the date/time format into tokens and place them in the context.
/// dt_ctx -- output date/time format context
/// accept_time_toks -- if true, time tokens are accepted. Otherwise time tokens are
/// rejected. Defaults to true.
/// Return true if the parse was successful.
/// NOTE(review): the format string is not a parameter here; presumably it is read from
/// dt_ctx -- confirm against the definition.
static bool Tokenize(DateTimeFormatContext* dt_ctx,
bool accept_time_toks = true);
/// Parse the date/time string to generate the DateTimeFormatToken required by
/// DateTimeFormatContext. Similar to Tokenize() this function will take the string
/// and length, then heuristically determine whether the value contains date tokens,
/// time tokens, or both. Unlike Tokenize(), it does not require the template format
/// string.
/// dt_ctx -- date/time format context (must contain valid tokens)
/// accept_time_toks -- if true, time tokens are accepted, otherwise time tokens are
/// rejected. Defaults to true.
/// accept_time_toks_only -- if true, time tokens w/o date tokens are accepted.
/// Otherwise, they are rejected. Defaults to true.
/// Return true if the date/time was successfully parsed.
/// NOTE(review): the description mentions "the string and length", but no such
/// parameters are declared; presumably the input is carried by dt_ctx -- confirm
/// against the definition.
static bool TokenizeByStr(DateTimeFormatContext* dt_ctx,
bool accept_time_toks = true, bool accept_time_toks_only = true);
/// Parse date/time string to find the corresponding default date/time format context.
/// The string must adhere to a default date/time format.
/// str -- valid pointer to the string to parse.
/// len -- length of the string to parse (must be > 0)
/// accept_time_toks -- if true, time tokens are accepted. Otherwise time tokens are
/// rejected.
/// accept_time_toks_only -- if true, time tokens without date tokens are accepted.
/// Otherwise, they are rejected.
/// Return the corresponding default format context if parsing succeeded, or nullptr
/// otherwise. Unlike TokenizeByStr(), neither flag has a default value here.
/// NOTE(review): the returned pointer presumably refers to one of the static default
/// contexts declared below, so callers should not free it -- confirm.
static const DateTimeFormatContext* GetDefaultFormatContext(const char* str, int len,
bool accept_time_toks, bool accept_time_toks_only);
/// Initialize the default format contexts. This *must* be called before using
/// GetDefaultFormatContext().
static void InitCtx();
private:
/// Used to indicate if the state has been initialized.
static bool initialized;
/// Pseudo-constant default date/time contexts. Backwards compatibility is provided on
/// variable length fractional components by defining a format context for each expected
/// length (0 - 9), which is why the array-typed contexts below have 10 entries each.
/// This logic will be refactored when the parser supports lazy tokens.
static DateTimeFormatContext DEFAULT_SHORT_DATE_TIME_CTX;
static DateTimeFormatContext DEFAULT_SHORT_ISO_DATE_TIME_CTX;
static DateTimeFormatContext DEFAULT_DATE_CTX;
static DateTimeFormatContext DEFAULT_TIME_CTX;
static DateTimeFormatContext DEFAULT_DATE_TIME_CTX[10];
static DateTimeFormatContext DEFAULT_ISO_DATE_TIME_CTX[10];
static DateTimeFormatContext DEFAULT_TIME_FRAC_CTX[10];
/// Checks if str_begin points to the beginning of a valid timezone offset.
/// str_begin -- pointer to the candidate start of the offset
/// str_end -- presumably the pointer to the end of the string, as for the helpers
/// below -- TODO confirm
/// Return true if a valid timezone offset starts at str_begin.
/// NOTE(review): the accepted shapes are presumably the +/-hh:mm, +/-hhmm and +/-hh
/// forms listed in the namespace comment -- confirm against the definition.
static bool IsValidTZOffset(const char* str_begin, const char* str_end);
/// Parse out the next digit token from the date/time string by checking for contiguous
/// digit characters and return a pointer to the end of that token.
/// str -- pointer to the string to be parsed
/// str_end -- the pointer to the end of the string to be parsed
/// Returns the pointer within the string to the end of the valid digit token.
static const char* ParseDigitToken(const char* str, const char* str_end);
/// Parse out the next separator token from the date/time string against an expected
/// character.
/// str -- pointer to the string to be parsed
/// str_end -- the pointer to the end of the string to be parsed
/// sep -- the separator char to compare the token to
/// Returns the pointer within the string to the end of the valid separator token.
static const char* ParseSeparatorToken(const char* str, const char* str_end,
const char sep);
};
/// Static-only helper that parses a date/time string according to an already tokenized
/// DateTimeFormatContext. Only declarations appear in this header; the definitions live
/// elsewhere.
class SimpleDateFormatParser {
public:
/// Parse a date/time string. The data must adhere to the format, otherwise it will be
/// rejected i.e. no missing tokens.
/// Does only a basic validation on the parsed date/time values. The caller is
/// responsible for implementing rigid data validation and range-check.
/// str -- valid pointer to the string to parse
/// len -- length of the string to parse (must be > 0)
/// dt_ctx -- date/time format context (must contain valid tokens)
/// dt_result -- the struct where the results of the parsing will be placed
/// Return true if the date/time was successfully parsed.
static bool ParseDateTime(const char* str, int len,
const DateTimeFormatContext& dt_ctx, DateTimeParseResult* dt_result);
/// Initializes REV_MONTH_INDEX.
/// NOTE(review): presumably this must run before ParseDateTime() handles literal month
/// names, mirroring the tokenizer's InitCtx() requirement -- confirm.
static void InitCtx();
private:
/// Used to indicate if the state has been initialized.
static bool initialized;
/// Lazily initialized pseudo-constant hashmap for mapping month names to an index.
/// Keyed by StringValue; populated by InitCtx().
static boost::unordered_map<StringValue, int> REV_MONTH_INDEX;
};
}
}