be/src/runtime/datetime-iso-sql-format-parser.h - impala - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #pragma once

 #include "runtime/datetime-parser-common.h"

 #include "gutil/macros.h"

 namespace impala {

 namespace datetime_parse_util {

 /// This class is responsible for parsing an input string into a timestamp using
 /// ISO:SQL:2016 datetime format patterns. The list of tokens for the parsing comes as a
 /// constructor parameter in 'dt_ctx' pre-processed by IsoSqlFomratTokenizer.
 /// For more details of the parsing logic please see the design document attached to
 /// IMPALA-4018.
 class IsoSqlFormatParser {
 public:
   /// Given a list of format tokens in 'dt_ctx' runs through 'input_str' and parses it
   /// into 'result'. Return value indicates if the parsing was successful.
   /// The caller has to make sure that 'dt_ctx.fmt' is a null-terminated string.
   static bool ParseDateTime(const char* input_str, int input_len,
       const DateTimeFormatContext& dt_ctx, DateTimeParseResult* result)
       WARN_UNUSED_RESULT;

 private:
   // Struct used for capturing ISO 8601 week-based tokens during parsing.
   struct Iso8601WeekBasedDateParseResult {
     int year = -1;
     int week_of_year = -1;
     int day_of_week = -1;

     bool IsSet() const {
       return (year != -1 || week_of_year != -1 || day_of_week != -1);
     }

     /// Calcuates year/month/day date values corresponding to the ISO 8601 week-based
     /// date values stored in this instance.
     /// If the ISO 8601 week-based date values are valid, the resulting year/month/day
     /// values are copied to 'result' and true is returned.
     /// Otherwise, 'result' is left intact and false is returned.
     bool ExtractYearMonthDay(DateTimeParseResult* result) const WARN_UNUSED_RESULT;
   };

   /// 'input_str' points to a location in the input string where the parsing stands now.
   /// Given 'tok' as the next token in the list of tokens created by the tokenizer this
   /// function finds the end of the next token.
   /// 'input_len' is used for stopping when we reach the end of the input string.
   /// Returns a pointer pointing one position after the last character of the found token.
   /// If can't identify the next token then returns nullptr.
   /// If a MONTH_NAME token is not followed by a separator then the end of the month name
   /// in the input can't be found by this function. In this case MAX_MONTH_NAME_LENGTH is
   /// expected as the lenght of the month token and later on ParseMonthNameToken() will
   /// adjust the end of the token.
   static const char* FindEndOfToken(const char* input_str, int input_len,
       const DateTimeFormatToken& tok, bool fx_provided) WARN_UNUSED_RESULT;

   /// Has to call this function when 'input_str' points to the fist character of a
   /// meridiem indicator. Identifies the last position of a meridiem indicator and returns
   /// a pointer to the next position after this. If no meridiem indicator is found
   /// starting from 'input_str' then returns nullptr.
   static const char* ParseMeridiemIndicatorFromInput(const char* input_str,
       int input_len);

   /// Takes 'year' that is shorter than 4 digits and prefixes it with digits from
   /// 'current_year'. Returns the resulting 4-digit year.
   static int PrefixYearFromCurrentYear(int year, int year_token_len, int current_year);

   /// This function takes a 2-digit 'year' and constructs a 4-digit year from it based on
   /// the 'current_year'. Returns the resulting year.
   static int RoundYearFromCurrentYear(int year, int current_year);

   /// Returns the ISO 8601 week numbering year corresponding to 'now' date.
   static int GetIso8601WeekNumberingYear(const TimestampValue* now);

   /// Gets a pointer to the current char in the input string and an index to the current
   /// token being processed within 'dt_ctx->toks'. Advances these pointers to the end of
   /// the current separator sequence no matter how long these sequences are. It's expected
   /// that '**tok' is of type SEPARATOR. Returns false if '**current_pos' is not a
   /// separator or if either the input ends while having remaining items in 'dt_ctx->toks'
   /// or the other way around.
   static bool ProcessSeparatorSequence(const char** current_pos, const char* end_pos,
       const DateTimeFormatContext& dt_ctx, int* dt_ctx_it);

   // Gets the next character starting from '*format' that can be used for input
   // matching. Takes care of the escaping backslashes regardless if the text token inside
   // the format is itself double escped or not. Returns the next character in a form
   // expected in the input. If '*format' points at the beginning of an escape sequence,
   // '*format' is moved to the last character of the escape sequence. Otherwise,
   // '*format' is not changed. E.g. If the text token is "\"abc" then this returns '"'
   // after skipping the backslash and moves '*format' to '"'.
   static char GetNextCharFromTextToken(const char** format,
       const DateTimeFormatToken* tok);
 };

 }
 }
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#pragma once

	#include "runtime/datetime-parser-common.h"

	#include "gutil/macros.h"

	namespace impala {

	namespace datetime_parse_util {

	/// This class is responsible for parsing an input string into a timestamp using
	/// ISO:SQL:2016 datetime format patterns. The list of tokens for the parsing comes as a
	/// constructor parameter in 'dt_ctx' pre-processed by IsoSqlFomratTokenizer.
	/// For more details of the parsing logic please see the design document attached to
	/// IMPALA-4018.
	class IsoSqlFormatParser {
	public:
	/// Given a list of format tokens in 'dt_ctx' runs through 'input_str' and parses it
	/// into 'result'. Return value indicates if the parsing was successful.
	/// The caller has to make sure that 'dt_ctx.fmt' is a null-terminated string.
	static bool ParseDateTime(const char* input_str, int input_len,
	const DateTimeFormatContext& dt_ctx, DateTimeParseResult* result)
	WARN_UNUSED_RESULT;

	private:
	// Struct used for capturing ISO 8601 week-based tokens during parsing.
	struct Iso8601WeekBasedDateParseResult {
	int year = -1;
	int week_of_year = -1;
	int day_of_week = -1;

	bool IsSet() const {
	return (year != -1 \|\| week_of_year != -1 \|\| day_of_week != -1);
	}

	/// Calcuates year/month/day date values corresponding to the ISO 8601 week-based
	/// date values stored in this instance.
	/// If the ISO 8601 week-based date values are valid, the resulting year/month/day
	/// values are copied to 'result' and true is returned.
	/// Otherwise, 'result' is left intact and false is returned.
	bool ExtractYearMonthDay(DateTimeParseResult* result) const WARN_UNUSED_RESULT;
	};

	/// 'input_str' points to a location in the input string where the parsing stands now.
	/// Given 'tok' as the next token in the list of tokens created by the tokenizer this
	/// function finds the end of the next token.
	/// 'input_len' is used for stopping when we reach the end of the input string.
	/// Returns a pointer pointing one position after the last character of the found token.
	/// If can't identify the next token then returns nullptr.
	/// If a MONTH_NAME token is not followed by a separator then the end of the month name
	/// in the input can't be found by this function. In this case MAX_MONTH_NAME_LENGTH is
	/// expected as the lenght of the month token and later on ParseMonthNameToken() will
	/// adjust the end of the token.
	static const char* FindEndOfToken(const char* input_str, int input_len,
	const DateTimeFormatToken& tok, bool fx_provided) WARN_UNUSED_RESULT;

	/// Has to call this function when 'input_str' points to the fist character of a
	/// meridiem indicator. Identifies the last position of a meridiem indicator and returns
	/// a pointer to the next position after this. If no meridiem indicator is found
	/// starting from 'input_str' then returns nullptr.
	static const char* ParseMeridiemIndicatorFromInput(const char* input_str,
	int input_len);

	/// Takes 'year' that is shorter than 4 digits and prefixes it with digits from
	/// 'current_year'. Returns the resulting 4-digit year.
	static int PrefixYearFromCurrentYear(int year, int year_token_len, int current_year);

	/// This function takes a 2-digit 'year' and constructs a 4-digit year from it based on
	/// the 'current_year'. Returns the resulting year.
	static int RoundYearFromCurrentYear(int year, int current_year);

	/// Returns the ISO 8601 week numbering year corresponding to 'now' date.
	static int GetIso8601WeekNumberingYear(const TimestampValue* now);

	/// Gets a pointer to the current char in the input string and an index to the current
	/// token being processed within 'dt_ctx->toks'. Advances these pointers to the end of
	/// the current separator sequence no matter how long these sequences are. It's expected
	/// that 'tok' is of type SEPARATOR. Returns false if 'current_pos' is not a
	/// separator or if either the input ends while having remaining items in 'dt_ctx->toks'
	/// or the other way around.
	static bool ProcessSeparatorSequence(const char** current_pos, const char* end_pos,
	const DateTimeFormatContext& dt_ctx, int* dt_ctx_it);

	// Gets the next character starting from '*format' that can be used for input
	// matching. Takes care of the escaping backslashes regardless if the text token inside
	// the format is itself double escped or not. Returns the next character in a form
	// expected in the input. If '*format' points at the beginning of an escape sequence,
	// '*format' is moved to the last character of the escape sequence. Otherwise,
	// '*format' is not changed. E.g. If the text token is "\"abc" then this returns '"'
	// after skipping the backslash and moves '*format' to '"'.
	static char GetNextCharFromTextToken(const char** format,
	const DateTimeFormatToken* tok);
	};

	}
	}