// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include "runtime/datetime-simple-date-format-parser.h"

#include <algorithm>

#include "cctz/civil_time.h"
#include "common/names.h"
#include "runtime/string-value.h"
#include "runtime/string-value.inline.h"
#include "util/string-parser.h"

using boost::unordered_map;
using boost::posix_time::time_duration;

namespace impala {

namespace datetime_parse_util {

bool SimpleDateFormatTokenizer::initialized = false;

const int SimpleDateFormatTokenizer::DEFAULT_DATE_FMT_LEN = 10;
const int SimpleDateFormatTokenizer::DEFAULT_TIME_FMT_LEN = 8;
const int SimpleDateFormatTokenizer::DEFAULT_TIME_FRAC_FMT_LEN = 18;
const int SimpleDateFormatTokenizer::DEFAULT_SHORT_DATE_TIME_FMT_LEN = 19;
const int SimpleDateFormatTokenizer::DEFAULT_DATE_TIME_FMT_LEN = 29;

DateTimeFormatContext SimpleDateFormatTokenizer::DEFAULT_SHORT_DATE_TIME_CTX;
DateTimeFormatContext SimpleDateFormatTokenizer::DEFAULT_SHORT_ISO_DATE_TIME_CTX;
DateTimeFormatContext SimpleDateFormatTokenizer::DEFAULT_DATE_CTX;
DateTimeFormatContext SimpleDateFormatTokenizer::DEFAULT_TIME_CTX;
DateTimeFormatContext SimpleDateFormatTokenizer::DEFAULT_DATE_TIME_CTX[10];
DateTimeFormatContext SimpleDateFormatTokenizer::DEFAULT_ISO_DATE_TIME_CTX[10];
DateTimeFormatContext SimpleDateFormatTokenizer::DEFAULT_TIME_FRAC_CTX[10];

void SimpleDateFormatTokenizer::InitCtx() {
  if (initialized) return;

  // Setup the default date/time context yyyy-MM-dd HH:mm:ss.SSSSSSSSS
  const char* DATE_TIME_CTX_FMT = "yyyy-MM-dd HH:mm:ss.SSSSSSSSS";
  const int FRACTIONAL_MAX_LEN = 9;
  for (int i = FRACTIONAL_MAX_LEN; i >= 0; --i) {
    DEFAULT_DATE_TIME_CTX[i].Reset(DATE_TIME_CTX_FMT,
        DEFAULT_DATE_TIME_FMT_LEN - (FRACTIONAL_MAX_LEN - i));
    Tokenize(&DEFAULT_DATE_TIME_CTX[i]);
  }

  // Setup the default ISO date/time context yyyy-MM-ddTHH:mm:ss.SSSSSSSSS
  for (int i = FRACTIONAL_MAX_LEN; i >= 0; --i) {
    DEFAULT_ISO_DATE_TIME_CTX[i].Reset("yyyy-MM-ddTHH:mm:ss.SSSSSSSSS",
        DEFAULT_DATE_TIME_FMT_LEN - (FRACTIONAL_MAX_LEN - i));
    Tokenize(&DEFAULT_ISO_DATE_TIME_CTX[i]);
  }

  // Setup the short default date/time context yyyy-MM-dd HH:mm:ss
  DEFAULT_SHORT_DATE_TIME_CTX.Reset("yyyy-MM-dd HH:mm:ss");
  Tokenize(&DEFAULT_SHORT_DATE_TIME_CTX);

  // Setup the short default ISO date/time context yyyy-MM-ddTHH:mm:ss
  DEFAULT_SHORT_ISO_DATE_TIME_CTX.Reset("yyyy-MM-ddTHH:mm:ss");
  Tokenize(&DEFAULT_SHORT_ISO_DATE_TIME_CTX);

  // Setup the default short date context yyyy-MM-dd
  DEFAULT_DATE_CTX.Reset("yyyy-MM-dd");
  Tokenize(&DEFAULT_DATE_CTX);

  // Setup the default short time context HH:mm:ss
  DEFAULT_TIME_CTX.Reset("HH:mm:ss");
  Tokenize(&DEFAULT_TIME_CTX);

  // Setup the default short time context with fractional seconds HH:mm:ss.SSSSSSSSS
  for (int i = FRACTIONAL_MAX_LEN; i >= 0; --i) {
    DEFAULT_TIME_FRAC_CTX[i].Reset(DATE_TIME_CTX_FMT + 11,
        DEFAULT_TIME_FRAC_FMT_LEN - (FRACTIONAL_MAX_LEN - i));
    Tokenize(&DEFAULT_TIME_FRAC_CTX[i]);
  }

  // Flag that the parser is ready.
  initialized = true;
}

bool SimpleDateFormatTokenizer::IsValidTZOffset(const char* str_begin,
    const char* str_end) {
  if (*str_begin == '+' || *str_begin == '-') {
    ++str_begin;
    switch (str_end - str_begin) {
      case 5:   // hh:mm
        return strncmp(str_begin, "hh:mm", 5) == 0;
      case 4:   // hhmm
        return strncmp(str_begin, "hhmm", 4) == 0;
      case 2:   // hh
        return strncmp(str_begin, "hh", 2) == 0;
      default:
        break;
    }
  }
  return false;
}

bool SimpleDateFormatTokenizer::Tokenize(DateTimeFormatContext* dt_ctx,
    bool accept_time_toks) {
  DCHECK(dt_ctx != NULL);
  DCHECK(dt_ctx->fmt != NULL);
  DCHECK(dt_ctx->fmt_len > 0);
  DCHECK(dt_ctx->toks.size() == 0);
  const char* str_begin = dt_ctx->fmt;
  const char* str_end = str_begin + dt_ctx->fmt_len;
  const char* str = str_begin;
  // Parse the tokens from the format string
  while (str < str_end) {
    if (isdigit(*str)) return false;

    // If time tokens are accepted, track T|Z as separators.
    if (*str == 'T' || *str == 'Z') {
      if (!accept_time_toks) return false;
      dt_ctx->toks.push_back(DateTimeFormatToken(SEPARATOR, str - str_begin, 1, str));
      ++str;
      continue;
    }

    // A non-alphanumerical char could be the first char of a timezone-offset token.
    // If it is not the beginning of a time-zone offset token, track it as a separator.
    if (!isalpha(*str)) {
      if (dt_ctx->has_time_toks && IsValidTZOffset(str, str_end)) {
        // TZ offset must come at the end of the format.
        dt_ctx->toks.push_back(DateTimeFormatToken(TZ_OFFSET, str - str_begin,
            str_end - str, str));
        break;
      } else {
        dt_ctx->toks.push_back(DateTimeFormatToken(SEPARATOR, str - str_begin, 1, str));
        ++str;
        continue;
      }
    }

    // Not a separator, verify that the previous token is either a separator or has
    // length >1, i.e., it is not a variable length token.
    if (!dt_ctx->toks.empty()) {
      const DateTimeFormatToken& prev = dt_ctx->toks.back();
      if (UNLIKELY(prev.type != SEPARATOR && prev.len == 1)) return false;
    }
    DateTimeFormatTokenType tok_type = UNKNOWN;
    switch (*str) {
      case 'y': tok_type = YEAR; break;
      case 'M': tok_type = MONTH_IN_YEAR; break;
      case 'd': tok_type = DAY_IN_MONTH; break;
      case 'H': tok_type = HOUR_IN_DAY; break;
      case 'm': tok_type = MINUTE_IN_HOUR; break;
      case 's': tok_type = SECOND_IN_MINUTE; break;
      case 'S': tok_type = FRACTION; break;
      // Error on aA-zZ reserved characters that are not used yet.
      default: return false;
    }
    dt_ctx->has_date_toks |= tok_type < HOUR_IN_DAY;
    dt_ctx->has_time_toks |= tok_type >= HOUR_IN_DAY;
    if (!accept_time_toks && dt_ctx->has_time_toks) return false;

    // Get the token length
    int tok_len = 1;
    char tok_chr = *str;
    const char* curr_tok_chr = str + 1;
    while (curr_tok_chr < str_end) {
      if (*curr_tok_chr != tok_chr) break;
      ++tok_len;
      ++curr_tok_chr;
    }
    if (tok_type == MONTH_IN_YEAR) {
      if (UNLIKELY(tok_len > 3)) return false;
      if (tok_len == 3) tok_type = MONTH_NAME_SHORT;
    }
    // In an output scenario, fmt_out_len is used to determine the print buffer size.
    // If the format uses short tokens e.g. yyyy-MM-d, there must to be enough room in
    // the buffer for wider values e.g. 2013-12-16.
    if (tok_len == 1) ++dt_ctx->fmt_out_len;
    DateTimeFormatToken tok(tok_type, str - str_begin, tok_len, str);
    str += tok.len;
    dt_ctx->toks.push_back(tok);
  }
  return dt_ctx->has_date_toks || dt_ctx->has_time_toks;
}

const char* SimpleDateFormatTokenizer::ParseDigitToken(const char* str,
    const char* str_end) {
  const char* tok_end = str;
  while (tok_end < str_end) {
    if (!isdigit(*tok_end)) return tok_end;
    ++tok_end;
  }
  return tok_end;
}

const char* SimpleDateFormatTokenizer::ParseSeparatorToken(const char* str,
    const char* str_end, const char sep) {
  const char* tok_end = str;
  while (tok_end < str_end) {
    if (*tok_end != sep) return tok_end;
    ++tok_end;
  }
  return tok_end;
}

bool SimpleDateFormatTokenizer::TokenizeByStr( DateTimeFormatContext* dt_ctx,
    bool accept_time_toks, bool accept_time_toks_only) {
  DCHECK(dt_ctx != NULL);
  DCHECK(dt_ctx->fmt != NULL);
  DCHECK_GT(dt_ctx->fmt_len, 0);
  DCHECK_EQ(dt_ctx->toks.size(), 0);
  const char* str_begin = dt_ctx->fmt;
  const char* str_end = str_begin + dt_ctx->fmt_len;
  const char* str = str_begin;
  const char* tok_end;

  // Parse the 4-digit year
  tok_end = ParseDigitToken(str, str_end);
  if (tok_end - str == 4) {
    dt_ctx->toks.push_back(
        DateTimeFormatToken(YEAR, str - str_begin, tok_end - str, str));
    str = tok_end;

    // Check for the date separator '-'
    tok_end = ParseSeparatorToken(str, str_end, '-');
    if (tok_end - str != 1) return false;
    dt_ctx->toks.push_back(
        DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str));
    str = tok_end;

    // Parse the 1 or 2 digit month.
    tok_end = ParseDigitToken(str, str_end);
    if (tok_end - str != 1 && tok_end - str != 2) return false;
    dt_ctx->toks.push_back(
        DateTimeFormatToken(MONTH_IN_YEAR, str - str_begin, tok_end - str, str));
    str = tok_end;

    // Check for the date separator '-'
    tok_end = ParseSeparatorToken(str, str_end, '-');
    if (tok_end - str != 1) return false;
    dt_ctx->toks.push_back(
        DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str));
    str = tok_end;

    // Parse the 1 or 2 digit day in month
    tok_end = ParseDigitToken(str, str_end);
    if (tok_end - str != 1 && tok_end - str != 2) return false;
    dt_ctx->toks.push_back(
        DateTimeFormatToken(DAY_IN_MONTH, str - str_begin, tok_end - str, str));
    str = tok_end;
    dt_ctx->has_date_toks = true;

    // If the string ends here, we only have a date component
    if (str == str_end) return true;
    // If time tokens are not accepted, string should have ended here.
    if (!accept_time_toks) return false;

    // Check for the space between date and time component
    if (*str != ' ' && *str != 'T') return false;
    char sep = *str;
    tok_end = ParseSeparatorToken(str, str_end, sep);
    if (tok_end - str < 1) return false;
    // IMPALA-6641: Multiple spaces are okay, 'T' separator must be single
    if (sep == 'T' && tok_end - str > 1) return false;
    dt_ctx->toks.push_back(
        DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str));
    str = tok_end;

    // Invalid format if date-time separator is not followed by more digits
    if (str > str_end) return false;
    tok_end = ParseDigitToken(str, str_end);
  }

  // If time tokens are not accepted, no need to proceed.
  if (!accept_time_toks) return false;
  // If no date tokens were found and time tokens on their own are not allowed, return
  // false.
  if (!dt_ctx->has_date_toks && !accept_time_toks_only) return false;

  // Parse the 1 or 2 digit hour
  if (tok_end - str != 1 && tok_end - str != 2) return false;
  dt_ctx->toks.push_back(
      DateTimeFormatToken(HOUR_IN_DAY, str - str_begin, tok_end - str, str));
  str = tok_end;

  // Check for the time component separator ':'
  tok_end = ParseSeparatorToken(str, str_end, ':');
  if (tok_end - str != 1) return false;
  dt_ctx->toks.push_back(
      DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str));
  str = tok_end;

  // Parse the 1 or 2 digit minute
  tok_end = ParseDigitToken(str, str_end);
  if (tok_end - str != 1 && tok_end - str != 2) return false;
  dt_ctx->toks.push_back(
      DateTimeFormatToken(MINUTE_IN_HOUR, str - str_begin, tok_end - str, str));
  str = tok_end;

  // Check for the time component separator ':'
  tok_end = ParseSeparatorToken(str, str_end, ':');
  if (tok_end - str != 1) return false;
  dt_ctx->toks.push_back(
      DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str));
  str = tok_end;

  // Parse the 1 or 2 digit second
  tok_end = ParseDigitToken(str, str_end);
  if (tok_end - str != 1 && tok_end - str != 2) return false;
  dt_ctx->toks.push_back(
      DateTimeFormatToken(SECOND_IN_MINUTE, str - str_begin, tok_end - str, str));
  str = tok_end;
  dt_ctx->has_time_toks = true;

  // There is more to parse, there maybe a fractional component.
  if (str < str_end) {
    tok_end = ParseSeparatorToken(str, str_end, '.');
    if (tok_end - str != 1) return false;
    dt_ctx->toks.push_back(
        DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str));
    str = tok_end;

    // Invalid format when there is no fractional component following '.'
    if (str > str_end) return false;

    // Parse the fractional component.
    // Like the non-lazy path, this will parse up to 9 fractional digits
    tok_end = ParseDigitToken(str, str_end);
    int num_digits = std::min<int>(9, tok_end - str);
    if (num_digits == 0) return false;
    dt_ctx->toks.push_back(
        DateTimeFormatToken(FRACTION, str - str_begin, num_digits, str));
    str = tok_end;

    // Invalid format if there is more to parse after the fractional component
    if (str < str_end) return false;
  }
  return true;
}

const DateTimeFormatContext* SimpleDateFormatTokenizer::GetDefaultFormatContext(
    const char* str, int len, bool accept_time_toks, bool accept_time_toks_only) {
  DCHECK(initialized);
  DCHECK(str != nullptr);
  DCHECK(len > 0);

  if (LIKELY(len >= DEFAULT_TIME_FMT_LEN)) {
    // Check if this string starts with a date component
    if (str[4] == '-' && str[7] == '-') {
      // Do we have a date component only?
      if (len == DEFAULT_DATE_FMT_LEN) {
        return &DEFAULT_DATE_CTX;
      }

      // We have a time component as well. Do we accept it?
      if (!accept_time_toks) return nullptr;

      switch (len) {
        case DEFAULT_SHORT_DATE_TIME_FMT_LEN: {
          if (LIKELY(str[13] == ':')) {
            switch (str[10]) {
              case ' ':
                return &DEFAULT_SHORT_DATE_TIME_CTX;
              case 'T':
                return &DEFAULT_SHORT_ISO_DATE_TIME_CTX;
            }
          }
          break;
        }
        case DEFAULT_DATE_TIME_FMT_LEN: {
          if (LIKELY(str[13] == ':')) {
            switch (str[10]) {
              case ' ':
                return &DEFAULT_DATE_TIME_CTX[9];
              case 'T':
                return &DEFAULT_ISO_DATE_TIME_CTX[9];
            }
          }
          break;
        }
        default: {
          // There is likely a fractional component that's below the expected 9 chars.
          // We will need to work out which default context to use that corresponds to
          // the fractional length in the string.
          if (LIKELY(len > DEFAULT_SHORT_DATE_TIME_FMT_LEN)
              && LIKELY(str[19] == '.') && LIKELY(str[13] == ':')) {
            switch (str[10]) {
              case ' ': {
                return &DEFAULT_DATE_TIME_CTX[len - DEFAULT_SHORT_DATE_TIME_FMT_LEN - 1];
              }
              case 'T': {
                return &DEFAULT_ISO_DATE_TIME_CTX
                    [len - DEFAULT_SHORT_DATE_TIME_FMT_LEN - 1];
              }
            }
          }
          break;
        }
      }
    } else {
      // 'str' string does not start with a date component.
      // Do we accept time component only?
      if (!accept_time_toks || !accept_time_toks_only) return nullptr;

      // Parse time component.
      if (str[2] == ':' && str[5] == ':' && isdigit(str[7])) {
        len = min(len, DEFAULT_TIME_FRAC_FMT_LEN);
        if (len > DEFAULT_TIME_FMT_LEN && str[8] == '.') {
          return &DEFAULT_TIME_FRAC_CTX[len - DEFAULT_TIME_FMT_LEN - 1];
        } else {
          return &DEFAULT_TIME_CTX;
        }
      }
    }
  }

  return nullptr;
}

bool SimpleDateFormatParser::ParseDateTime(const char* str, int str_len,
    const DateTimeFormatContext& dt_ctx, DateTimeParseResult* dt_result) {
  DCHECK(dt_ctx.fmt_len > 0);
  DCHECK(dt_ctx.toks.size() > 0);
  DCHECK(dt_result != NULL);
  if (str_len <= 0 || str_len < dt_ctx.fmt_len || str == NULL) return false;
  StringParser::ParseResult status;
  // Keep track of the number of characters we need to shift token positions by.
  // Variable-length tokens will result in values > 0;
  int shift_len = 0;
  for (const DateTimeFormatToken& tok: dt_ctx.toks) {
    const char* tok_val = str + tok.pos + shift_len;
    if (tok.type == SEPARATOR) {
      if (UNLIKELY(*tok_val != *tok.val)) return false;
      continue;
    }
    int tok_len = tok.len;
    const char* str_end = str + str_len;
    // In case of single-character tokens we scan ahead to the next separator.
    if (UNLIKELY(tok_len == 1)) {
      while ((tok_val + tok_len < str_end) && isdigit(*(tok_val + tok_len))) {
        ++tok_len;
        ++shift_len;
      }
    }
    switch (tok.type) {
      case YEAR: {
        if (!ParseAndValidate(tok_val, tok_len, 0, 9999, &dt_result->year)) return false;
        // Year in "Y" and "YY" format should be in the interval
        // [current time - 80 years, current time + 20 years)
        if (tok_len <= 2) dt_result->realign_year = true;
        break;
      }
      case MONTH_IN_YEAR: {
        if (!ParseAndValidate(tok_val, tok_len, 1, 12, &dt_result->month)) return false;
        break;
      }
      case MONTH_NAME_SHORT: {
        const char* tok_end = tok_val + tok_len;
        if (!ParseMonthNameToken(tok, tok_val, &tok_end, dt_ctx.fx_modifier,
            &dt_result->month)) {
          return false;
        }
        break;
      }
      case DAY_IN_MONTH: {
        if (!ParseAndValidate(tok_val, tok_len, 1, 31, &dt_result->day)) return false;
        break;
      }
      case HOUR_IN_DAY: {
        if (!ParseAndValidate(tok_val, tok_len, 0, 23, &dt_result->hour)) return false;
        break;
      }
      case MINUTE_IN_HOUR: {
        if (!ParseAndValidate(tok_val, tok_len, 0, 59, &dt_result->minute)) return false;
        break;
      }
      case SECOND_IN_MINUTE: {
        if (!ParseAndValidate(tok_val, tok_len, 0, 59, &dt_result->second)) return false;
        break;
      }
      case FRACTION: {
        if (!ParseFractionToken(tok_val, tok_len, dt_result)) return false;
        break;
      }
      case TZ_OFFSET: {
        if (tok_val[0] != '+' && tok_val[0] != '-') return false;
        int sign = tok_val[0] == '-' ? -1 : 1;
        int minute = 0;
        int hour = StringParser::StringToInt<int>(tok_val + 1, 2, &status);
        if (UNLIKELY(StringParser::PARSE_SUCCESS != status ||
            hour < 0 || hour > 23)) {
          return false;
        }
        switch (tok_len) {
          case 6: {
            // +hh:mm
            minute = StringParser::StringToInt<int>(tok_val + 4, 2, &status);
            break;
          }
          case 5: {
            // +hh:mm
            minute = StringParser::StringToInt<int>(tok_val + 3, 2, &status);
            break;
          }
          case 3: {
            // +hh
            break;
          }
          default: {
            // Invalid timezone offset length.
            return false;
          }
        }
        if (UNLIKELY(StringParser::PARSE_SUCCESS != status ||
            minute < 0 || minute > 59)) {
          return false;
        }
        dt_result->tz_offset = time_duration(sign * hour, sign * minute, 0, 0);
        break;
      }
      default: DCHECK(false) << "Unknown date/time format token";
    }
  }
  return true;
}

} // namespace datetime_parse_util

} // namespace impala
