// be/src/runtime/datetime-simple-date-format-parser.h - impala - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #pragma once

 #include "runtime/datetime-parser-common.h"

 #include <boost/unordered_map.hpp>

 #include "runtime/string-value.h"

 namespace impala {

 /// The functionality here covers SimpleDateFormat pattern handling in Impala.
 /// Adds support for dealing with custom date/time formats in Impala. The following
 /// date/time tokens are supported:
 ///   y - Year
 ///   M - Month
 ///   d - Day
 ///   H - Hour
 ///   m - Minute
 ///   s - Second
 ///   S - Fractional second
 ///
 ///   TimeZone offset formats (Must be at the end of format string):
 ///   +/-hh:mm
 ///   +/-hhmm
 ///   +/-hh
 ///
 ///
 /// The token names and usage have been modeled after the SimpleDateFormat class used in
 /// Java, with only the above list of tokens being supported. All fields will consume
 /// variable length inputs when parsing an input string and must therefore use separators
 /// to specify the boundaries of the fields, with the exception of TimeZone values, which
 /// have to be of fixed width. Repeating tokens can be used to specify fields of exact
 /// width, e.g. in yy-MM both fields must be of exactly length two. When using fixed width
 /// fields values must be zero-padded and output values will be zero padded during
 /// formatting. There is one exception to this: a month field of length 3 will specify
 /// literal month names instead of zero padding, i.e., yyyy-MMM-dd will parse from and
 /// format to strings like 2013-Nov-21. When using fields of fixed width the separators
 /// can be omitted.
 ///
 ///
 /// Formatting character groups can appear in any order along with any separators,
 /// except for the TimeZone offset, which must be at the end of the format string.
 /// e.g.
 ///   yyyy/MM/dd
 ///   dd-MMM-yy
 ///   (dd)(MM)(yyyy) HH:mm:ss
 ///   yyyy-MM-dd HH:mm:ss+hh:mm
 /// ..etc..
 ///
 /// The following features are not supported:
 ///   Long literal months e.g. MMMM
 ///   Nested strings e.g. "Year: " yyyy "Month: " MM "Day: " dd
 ///   Lazy formatting
 namespace datetime_parse_util {

 /// Static helpers that turn SimpleDateFormat patterns and date/time strings into the
 /// tokens held by a DateTimeFormatContext, and that hold the default format contexts.
 class SimpleDateFormatTokenizer {
 public:
   /// Constants to hold default format lengths. They are only declared here; their
   /// values are defined out of line.
   static const int DEFAULT_DATE_FMT_LEN;
   static const int DEFAULT_TIME_FMT_LEN;
   static const int DEFAULT_TIME_FRAC_FMT_LEN;
   static const int DEFAULT_SHORT_DATE_TIME_FMT_LEN;
   static const int DEFAULT_DATE_TIME_FMT_LEN;

   /// Parse the date/time format into tokens and place them in the context.
   /// dt_ctx -- output date/time format context
   /// accept_time_toks -- if true (the default), time tokens are accepted. Otherwise
   /// time tokens are rejected.
   /// Return true if the parse was successful.
   static bool Tokenize(DateTimeFormatContext* dt_ctx,
       bool accept_time_toks = true);

   /// Parse the date/time string to generate the DateTimeFormatToken required by
   /// DateTimeFormatContext. Similar to Tokenize(), but instead of relying on a template
   /// format string it heuristically determines whether the value contains date tokens,
   /// time tokens, or both.
   /// NOTE(review): only dt_ctx is passed in, so the string and its length are presumably
   /// read from dt_ctx -- confirm against DateTimeFormatContext.
   /// dt_ctx -- date/time format context (must contain valid tokens)
   /// accept_time_toks -- if true (the default), time tokens are accepted. Otherwise
   /// time tokens are rejected.
   /// accept_time_toks_only -- if true (the default), time tokens w/o date tokens are
   /// accepted. Otherwise, they are rejected.
   /// Return true if the date/time was successfully parsed.
   static bool TokenizeByStr(DateTimeFormatContext* dt_ctx,
       bool accept_time_toks = true, bool accept_time_toks_only = true);

   /// Parse date/time string to find the corresponding default date/time format context.
   /// The string must adhere to a default date/time format.
   /// str -- valid pointer to the string to parse.
   /// len -- length of the string to parse (must be > 0)
   /// accept_time_toks -- if true, time tokens are accepted. Otherwise time tokens are
   /// rejected.
   /// accept_time_toks_only -- if true, time tokens without date tokens are accepted.
   /// Otherwise, they are rejected.
   /// Return the corresponding default format context if parsing succeeded, or nullptr
   /// otherwise.
   static const DateTimeFormatContext* GetDefaultFormatContext(const char* str, int len,
       bool accept_time_toks, bool accept_time_toks_only);

   /// Initialize the default format contexts. This *must* be called before using
   /// GetDefaultFormatContext().
   static void InitCtx();

 private:
   /// Used to indicate if the state has been initialized.
   static bool initialized;

   /// Pseudo-constant default date/time contexts. Backwards compatibility is provided on
   /// variable length fractional components by defining a format context for each expected
   /// length (0 - 9). This logic will be refactored when the parser supports lazy tokens.
   static DateTimeFormatContext DEFAULT_SHORT_DATE_TIME_CTX;
   static DateTimeFormatContext DEFAULT_SHORT_ISO_DATE_TIME_CTX;
   static DateTimeFormatContext DEFAULT_DATE_CTX;
   static DateTimeFormatContext DEFAULT_TIME_CTX;
   static DateTimeFormatContext DEFAULT_DATE_TIME_CTX[10];
   static DateTimeFormatContext DEFAULT_ISO_DATE_TIME_CTX[10];
   static DateTimeFormatContext DEFAULT_TIME_FRAC_CTX[10];

   /// Checks if str_begin points to the beginning of a valid timezone offset.
   /// str_begin -- pointer to the first character of the candidate offset
   /// str_end -- presumably the end of the string being examined (TODO confirm)
   /// Returns true if a valid timezone offset starts at str_begin.
   static bool IsValidTZOffset(const char* str_begin, const char* str_end);

   /// Parse out the next digit token from the date/time string by checking for contiguous
   /// digit characters and return a pointer to the end of that token.
   /// str -- pointer to the string to be parsed
   /// str_end -- the pointer to the end of the string to be parsed
   /// Returns the pointer within the string to the end of the valid digit token.
   static const char* ParseDigitToken(const char* str, const char* str_end);

   /// Parse out the next separator token from the date/time string against an expected
   /// character.
   /// str -- pointer to the string to be parsed
   /// str_end -- the pointer to the end of the string to be parsed
   /// sep -- the separator char to compare the token to
   /// Returns the pointer within the string to the end of the valid separator token.
   static const char* ParseSeparatorToken(const char* str, const char* str_end,
       char sep);
 };

 /// Static entry points for parsing date/time strings against a SimpleDateFormat context
 /// that already contains valid tokens.
 class SimpleDateFormatParser {
 public:
   /// Initializes REV_MONTH_INDEX.
   static void InitCtx();

   /// Parse a date/time string. The data must adhere to the format (i.e. no missing
   /// tokens), otherwise it will be rejected.
   /// Only a basic validation is done on the parsed date/time values; the caller is
   /// responsible for implementing rigid data validation and range-check.
   /// str -- valid pointer to the string to parse
   /// len -- length of the string to parse (must be > 0)
   /// dt_ctx -- date/time format context (must contain valid tokens)
   /// dt_result -- the struct where the results of the parsing will be placed
   /// Return true if the date/time was successfully parsed.
   static bool ParseDateTime(
       const char* str,
       int len,
       const DateTimeFormatContext& dt_ctx,
       DateTimeParseResult* dt_result);

 private:
   /// Pseudo-constant hashmap for mapping month names to an index. It is filled in by
   /// InitCtx().
   static boost::unordered_map<StringValue, int> REV_MONTH_INDEX;

   /// Used to indicate if the state has been initialized.
   static bool initialized;
 };

 }

 }
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#pragma once

	#include "runtime/datetime-parser-common.h"

	#include <boost/unordered_map.hpp>

	#include "runtime/string-value.h"

	namespace impala {

	/// The functionality here covers SimpleDateFormat pattern handling in Impala.
	/// Adds support for dealing with custom date/time formats in Impala. The following
	/// date/time tokens are supported:
	/// y - Year
	/// M - Month
	/// d - Day
	/// H - Hour
	/// m - Minute
	/// s - Second
	/// S - Fractional second
	///
	/// TimeZone offset formats (Must be at the end of format string):
	/// +/-hh:mm
	/// +/-hhmm
	/// +/-hh
	///
	///
	/// The token names and usage have been modeled after the SimpleDateFormat class used in
	/// Java, with only the above list of tokens being supported. All fields will consume
	/// variable length inputs when parsing an input string and must therefore use separators
	/// to specify the boundaries of the fields, with the exception of TimeZone values, which
	/// have to be of fixed width. Repeating tokens can be used to specify fields of exact
	/// width, e.g. in yy-MM both fields must be of exactly length two. When using fixed width
	/// fields values must be zero-padded and output values will be zero padded during
	/// formatting. There is one exception to this: a month field of length 3 will specify
	/// literal month names instead of zero padding, i.e., yyyy-MMM-dd will parse from and
	/// format to strings like 2013-Nov-21. When using fields of fixed width the separators
	/// can be omitted.
	///
	///
	/// Formatting character groups can appear in any order along with any separators,
	/// except for the TimeZone offset, which must be at the end of the format string.
	/// e.g.
	/// yyyy/MM/dd
	/// dd-MMM-yy
	/// (dd)(MM)(yyyy) HH:mm:ss
	/// yyyy-MM-dd HH:mm:ss+hh:mm
	/// ..etc..
	///
	/// The following features are not supported:
	/// Long literal months e.g. MMMM
	/// Nested strings e.g. "Year: " yyyy "Month: " MM "Day: " dd
	/// Lazy formatting
	namespace datetime_parse_util {

	/// Static helpers that turn SimpleDateFormat patterns and date/time strings into the
	/// tokens held by a DateTimeFormatContext, and that hold the default format contexts.
	class SimpleDateFormatTokenizer {
	public:
	  /// Constants to hold default format lengths. They are only declared here; their
	  /// values are defined out of line.
	  static const int DEFAULT_DATE_FMT_LEN;
	  static const int DEFAULT_TIME_FMT_LEN;
	  static const int DEFAULT_TIME_FRAC_FMT_LEN;
	  static const int DEFAULT_SHORT_DATE_TIME_FMT_LEN;
	  static const int DEFAULT_DATE_TIME_FMT_LEN;

	  /// Parse the date/time format into tokens and place them in the context.
	  /// dt_ctx -- output date/time format context
	  /// accept_time_toks -- if true (the default), time tokens are accepted. Otherwise
	  /// time tokens are rejected.
	  /// Return true if the parse was successful.
	  static bool Tokenize(DateTimeFormatContext* dt_ctx,
	      bool accept_time_toks = true);

	  /// Parse the date/time string to generate the DateTimeFormatToken required by
	  /// DateTimeFormatContext. Similar to Tokenize(), but instead of relying on a
	  /// template format string it heuristically determines whether the value contains
	  /// date tokens, time tokens, or both.
	  /// NOTE(review): only dt_ctx is passed in, so the string and its length are
	  /// presumably read from dt_ctx -- confirm against DateTimeFormatContext.
	  /// dt_ctx -- date/time format context (must contain valid tokens)
	  /// accept_time_toks -- if true (the default), time tokens are accepted. Otherwise
	  /// time tokens are rejected.
	  /// accept_time_toks_only -- if true (the default), time tokens w/o date tokens are
	  /// accepted. Otherwise, they are rejected.
	  /// Return true if the date/time was successfully parsed.
	  static bool TokenizeByStr(DateTimeFormatContext* dt_ctx,
	      bool accept_time_toks = true, bool accept_time_toks_only = true);

	  /// Parse date/time string to find the corresponding default date/time format
	  /// context. The string must adhere to a default date/time format.
	  /// str -- valid pointer to the string to parse.
	  /// len -- length of the string to parse (must be > 0)
	  /// accept_time_toks -- if true, time tokens are accepted. Otherwise time tokens are
	  /// rejected.
	  /// accept_time_toks_only -- if true, time tokens without date tokens are accepted.
	  /// Otherwise, they are rejected.
	  /// Return the corresponding default format context if parsing succeeded, or nullptr
	  /// otherwise.
	  static const DateTimeFormatContext* GetDefaultFormatContext(const char* str, int len,
	      bool accept_time_toks, bool accept_time_toks_only);

	  /// Initialize the default format contexts. This must be called before using
	  /// GetDefaultFormatContext().
	  static void InitCtx();

	private:
	  /// Used to indicate if the state has been initialized.
	  static bool initialized;

	  /// Pseudo-constant default date/time contexts. Backwards compatibility is provided
	  /// on variable length fractional components by defining a format context for each
	  /// expected length (0 - 9). This logic will be refactored when the parser supports
	  /// lazy tokens.
	  static DateTimeFormatContext DEFAULT_SHORT_DATE_TIME_CTX;
	  static DateTimeFormatContext DEFAULT_SHORT_ISO_DATE_TIME_CTX;
	  static DateTimeFormatContext DEFAULT_DATE_CTX;
	  static DateTimeFormatContext DEFAULT_TIME_CTX;
	  static DateTimeFormatContext DEFAULT_DATE_TIME_CTX[10];
	  static DateTimeFormatContext DEFAULT_ISO_DATE_TIME_CTX[10];
	  static DateTimeFormatContext DEFAULT_TIME_FRAC_CTX[10];

	  /// Checks if str_begin points to the beginning of a valid timezone offset.
	  /// str_begin -- pointer to the first character of the candidate offset
	  /// str_end -- presumably the end of the string being examined (TODO confirm)
	  /// Returns true if a valid timezone offset starts at str_begin.
	  static bool IsValidTZOffset(const char* str_begin, const char* str_end);

	  /// Parse out the next digit token from the date/time string by checking for
	  /// contiguous digit characters and return a pointer to the end of that token.
	  /// str -- pointer to the string to be parsed
	  /// str_end -- the pointer to the end of the string to be parsed
	  /// Returns the pointer within the string to the end of the valid digit token.
	  static const char* ParseDigitToken(const char* str, const char* str_end);

	  /// Parse out the next separator token from the date/time string against an expected
	  /// character.
	  /// str -- pointer to the string to be parsed
	  /// str_end -- the pointer to the end of the string to be parsed
	  /// sep -- the separator char to compare the token to
	  /// Returns the pointer within the string to the end of the valid separator token.
	  static const char* ParseSeparatorToken(const char* str, const char* str_end,
	      char sep);
	};

	/// Static entry points for parsing date/time strings against a SimpleDateFormat
	/// context that already contains valid tokens.
	class SimpleDateFormatParser {
	public:
	  /// Initializes REV_MONTH_INDEX.
	  static void InitCtx();

	  /// Parse a date/time string. The data must adhere to the format (i.e. no missing
	  /// tokens), otherwise it will be rejected.
	  /// Only a basic validation is done on the parsed date/time values; the caller is
	  /// responsible for implementing rigid data validation and range-check.
	  /// str -- valid pointer to the string to parse
	  /// len -- length of the string to parse (must be > 0)
	  /// dt_ctx -- date/time format context (must contain valid tokens)
	  /// dt_result -- the struct where the results of the parsing will be placed
	  /// Return true if the date/time was successfully parsed.
	  static bool ParseDateTime(
	      const char* str,
	      int len,
	      const DateTimeFormatContext& dt_ctx,
	      DateTimeParseResult* dt_result);

	private:
	  /// Pseudo-constant hashmap for mapping month names to an index. It is filled in by
	  /// InitCtx().
	  static boost::unordered_map<StringValue, int> REV_MONTH_INDEX;

	  /// Used to indicate if the state has been initialized.
	  static bool initialized;
	};

	}

	}