// blob: 5813e940d2a15ec2610ca2ecf54bb801b0fc64a6 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "ExcelReadHelpers.h"
#include <IO/PeekableReadBuffer.h>
#include <IO/ReadBuffer.h>
#include <IO/parseDateTimeBestEffort.h>
#include <Common/DateLUTImpl.h>
/// Declarations of error codes in DB::ErrorCodes. They are `extern`, i.e. the
/// definitions live in another translation unit.
namespace DB
{
namespace ErrorCodes
{
/// NOTE(review): not referenced by the code visible in this file — confirm it is still needed.
extern const int CANNOT_PARSE_DATETIME;
}
}
namespace local_engine
{
/// Reads a date from `buf` into `date` by delegating to readDateTextWithExcel.
/// The US-style layouts are enabled only when settings.date_time_input_format is BestEffortUS.
/// Returns whatever readDateTextWithExcel returns.
bool readDateText(LocalDate & date, DB::ReadBuffer & buf, const DB::FormatSettings & settings)
{
    using InputFormat = DB::FormatSettings::DateTimeInputFormat;

    const bool us_style_enabled = (InputFormat::BestEffortUS == settings.date_time_input_format);
    return readDateTextWithExcel(date, buf, us_style_enabled, settings);
}
/// Reads a DateTime64 value from `buf` into `x`.
///
/// The extra layouts handled by readDatetime64TextWithExcel are tried first on a peekable
/// wrapper around `buf`. If that attempt fails, the wrapper is rewound to the checkpoint taken
/// before the attempt and the reader selected by settings.date_time_input_format is used.
///
/// Returns true when one of the readers succeeded, false otherwise (including the case where
/// date_time_input_format matches none of the handled enumerators).
bool readDateTime64Text(
    DB::DateTime64 & x,
    DB::ReadBuffer & buf,
    const DB::FormatSettings & settings,
    const DateLUTImpl & time_zone,
    const DateLUTImpl & utc_time_zone,
    bool quote)
{
    using InputFormat = DB::FormatSettings::DateTimeInputFormat;

    // scale is set 6, maybe a bug
    const int scale = 6;

    DB::PeekableReadBuffer peekable(buf);
    DB::PeekableReadBufferCheckpoint checkpoint{peekable, false};

    /// Fast path: one of the Excel-style layouts matched.
    if (readDatetime64TextWithExcel(x, scale, peekable, time_zone, settings.csv, quote))
        return true;

    /// The failed attempt may have consumed input; start over from the checkpoint.
    peekable.rollbackToCheckpoint();

    switch (settings.date_time_input_format)
    {
        case InputFormat::Basic:
            return tryReadDateTime64Text(x, scale, peekable, time_zone);
        case InputFormat::BestEffort:
            return tryParseDateTime64BestEffort(x, scale, peekable, time_zone, utc_time_zone);
        case InputFormat::BestEffortUS:
            return tryParseDateTime64BestEffortUS(x, scale, peekable, time_zone, utc_time_zone);
    }

    return false;
}
/// Tries to parse a date-time value from `buf` in one of the additional layouts listed below
/// and stores the result in `datetime64`.
///
/// Supported layouts (the quotes around T and Z are literal characters in the input):
///   yyyy-MM-dd HH:mm:ss,SSS
///   yyyy-MM-dd'T'HH:mm:ss
///   yyyy-MM-dd'T'HH:mm:ss.SSS
///   yyyy-MM-dd'T'HH:mm:ss'Z'
///   yyyy-MM-dd'T'HH:mm:ss.SSS'Z'
/// A date immediately followed by the CSV delimiter is accepted as midnight of that date.
/// The whole value is expected to be in the buffer.
///
/// @param datetime64 receives the parsed value on success.
/// @param scale      number of fractional-second digits of the result.
/// @param buf        input; its position is NOT restored on failure, the caller has to rewind.
/// @param time_zone  time zone used to build the timestamp from the calendar components.
/// @param settings   CSV settings; only `delimiter` is consulted.
/// @param quote      whether the value is quoted; affects whether ',' may start the fraction.
/// @return true on success; false when the input does not match, so that the caller can fall
///         back to the ClickHouse readers.
bool readDatetime64TextWithExcel(
    DB::DateTime64 & datetime64,
    UInt32 scale,
    DB::ReadBuffer & buf,
    const DateLUTImpl & time_zone,
    const DB::FormatSettings::CSV & settings,
    bool quote)
{
    /// Validates the calendar date and assembles the final DateTime64 value.
    auto quick_return
        = [&time_zone, &scale, &datetime64](
              UInt16 year, UInt8 month, UInt8 day, UInt8 hour, UInt8 minute, UInt8 second, DB::DateTime64::NativeType fractional) -> bool
    {
        if (!day)
            day = 1;

        if (!checkDate(year, month, day))
            return false;

        time_t datetime = time_zone.makeDateTime(year, month, day, hour, minute, second);
        return DB::DecimalUtils::tryGetDecimalFromComponents<DB::DateTime64>(datetime, fractional, scale, datetime64);
    };

    /// Reads a run of digits and accepts it only if it is one or two digits long.
    auto read_one_or_two_digits = [&buf](UInt8 & value) -> bool
    {
        char digits[std::numeric_limits<UInt64>::digits10];
        const size_t num_digits = readDigits(digits, sizeof(digits), buf);

        if (num_digits == 1)
            readDecimalNumber<1>(value, digits);
        else if (num_digits == 2)
            readDecimalNumber<2>(value, digits);
        else
            return false;

        return true;
    };

    UInt16 year = 0;
    UInt8 month = 0;
    UInt8 day = 0;
    UInt8 hour = 0;
    UInt8 minute = 0;
    UInt8 second = 0;
    DB::DateTime64::NativeType fractional = 0;

    /// Year: exactly four digits.
    char year_digits[std::numeric_limits<UInt64>::digits10];
    const size_t num_year_digits = readDigits(year_digits, sizeof(year_digits), buf);
    if (num_year_digits != 4)
        return false;
    readDecimalNumber<4>(year, year_digits);

    /// Every dereference of buf.position() below is guarded by eof(): the value may end at any
    /// point and reading the current character of an exhausted buffer is out of bounds.
    if (buf.eof())
        return false;
    const char delimiter_after_year = *buf.position();
    ++buf.position();

    if (!read_one_or_two_digits(month))
        return false;

    /// The delimiter between month and day must be the same character as after the year.
    if (buf.eof() || *buf.position() != delimiter_after_year)
        return false;
    ++buf.position();

    if (!read_one_or_two_digits(day))
        return false;

    /// A date that ends exactly at the end of the input is left to the fallback readers.
    if (buf.eof())
        return false;

    const char delimiter_after_day = *buf.position();

    /// Date only, followed by the CSV delimiter (which is not consumed).
    if (delimiter_after_day == settings.delimiter)
        return quick_return(year, month, day, hour, minute, second, fractional);

    if (delimiter_after_day != ' ' && delimiter_after_day != '\'')
        return false;
    ++buf.position();

    /// 'T': the character that follows T must repeat the character that preceded it.
    if (!buf.eof() && *buf.position() == 'T')
    {
        ++buf.position();
        if (buf.eof() || delimiter_after_day != *buf.position())
            return false;
        ++buf.position();
    }

    /// HH:mm:ss
    if (!readNumber<2>(buf, hour))
        return false;

    if (buf.eof() || *buf.position() != ':')
        return false;
    ++buf.position();

    if (!readNumber<2>(buf, minute))
        return false;

    if (buf.eof() || *buf.position() != ':')
        return false;
    ++buf.position();

    if (!readNumber<2>(buf, second))
        return false;

    /// .SSS'Z'
    /// ',' may introduce the fraction only when it cannot be mistaken for the field delimiter:
    /// either the value is quoted and the delimiter is ',', or the value is unquoted and the
    /// delimiter is something other than ','.
    const bool allow_comma = (settings.delimiter == ',' && quote) || (!quote && settings.delimiter != ',');
    if (!buf.eof() && (*buf.position() == '.' || (allow_comma && *buf.position() == ',')))
    {
        ++buf.position();

        /// Read digits, up to 'scale' positions; missing positions are filled with zeros so
        /// that `fractional` always carries exactly `scale` digits.
        for (size_t i = 0; i < scale; ++i)
        {
            if (!buf.eof() && isNumericASCII(*buf.position()))
            {
                fractional *= 10;
                fractional += *buf.position() - '0';
                ++buf.position();
            }
            else
                fractional *= 10;
        }
    }

    /// Skip a trailing literal 'Z' (three characters including the quotes) when it is present
    /// and fully available in the current buffer.
    if (!buf.eof() && buf.position() + 3 <= buf.buffer().end())
    {
        if (buf.position()[0] == '\'' && buf.position()[1] == 'Z' && buf.position()[2] == '\'')
            buf.position() = buf.position() + 3;
    }

    return quick_return(year, month, day, hour, minute, second, fractional);
}
/// Tries to parse a date from `buf` in one of the additional layouts below and stores it in `date`.
///
/// Supported layouts:
///   yyyyMMdd, yyyyMM
///   yyyy-MM, yyyy/MM, yyyy.M, yyyy.M.d   (the delimiter can be an arbitrary non-digit character,
///                                          but must be the same on both sides of the month)
///   MM/dd/yyyy, M.d.yyyy                  (only when `is_us_style` is true)
/// When the layout carries no day (or the day is 0), the day defaults to 1.
/// The whole value is expected to be in the buffer.
///
/// @param date        receives the parsed date on success; the components are not range-checked here.
/// @param buf         input; its position is not restored on failure.
/// @param is_us_style enables the month-first layouts.
/// @param settings    format settings; only `csv.delimiter` is consulted (US-style branch).
/// @return true when a supported layout was recognised, false otherwise.
inline bool readDateTextWithExcel(LocalDate & date, DB::ReadBuffer & buf, bool is_us_style, const DB::FormatSettings & settings)
{
    if (buf.eof())
        return false;

    /// Reads a run of digits and accepts it only if it is one or two digits long.
    auto read_one_or_two_digits = [&buf](UInt8 & value) -> bool
    {
        char digits[std::numeric_limits<UInt64>::digits10];
        const size_t num_digits = readDigits(digits, sizeof(digits), buf);

        if (num_digits == 1)
            readDecimalNumber<1>(value, digits);
        else if (num_digits == 2)
            readDecimalNumber<2>(value, digits);
        else
            return false;

        return true;
    };

    UInt16 year = 0;
    UInt8 month = 0;
    UInt8 day = 0;

    char first_digits[std::numeric_limits<UInt64>::digits10];
    const size_t num_first_digits = readDigits(first_digits, sizeof(first_digits), buf);

    if (num_first_digits == 8) // yyyyMMdd
    {
        readDecimalNumber<4>(year, first_digits);
        readDecimalNumber<2>(month, first_digits + 4);
        readDecimalNumber<2>(day, first_digits + 6);
    }
    else if (num_first_digits == 6) // yyyyMM
    {
        readDecimalNumber<4>(year, first_digits);
        readDecimalNumber<2>(month, first_digits + 4);
    }
    else if (num_first_digits == 4) // yyyy-MM, yyyy/MM, yyyy.M, yyyy.M.d
    {
        readDecimalNumber<4>(year, first_digits);

        /// A bare year at the end of the input is not a supported layout; without this check
        /// the delimiter would be read past the end of the buffer.
        if (buf.eof())
            return false;

        const char delimiter_after_year = *buf.position();
        ++buf.position();

        /// incorrect: yyyy-MMM
        if (!read_one_or_two_digits(month))
            return false;

        /// Optional day, introduced by the same delimiter that followed the year.
        if (!buf.eof() && *buf.position() == delimiter_after_year)
        {
            ++buf.position();

            /// incorrect: yyyy-MM-ddd
            if (!read_one_or_two_digits(day))
                return false;
        }
    }
    else if (is_us_style)
    {
        /// MM/dd/yyyy, M.d.yyyy: 01/01/2023 or 1/1/2023
        if (num_first_digits != 1 && num_first_digits != 2)
            return false;

        if (num_first_digits == 1)
            readDecimalNumber<1>(month, first_digits);
        else
            readDecimalNumber<2>(month, first_digits);

        /// The month must be followed by a delimiter; guard against the value ending here.
        if (buf.eof())
            return false;

        const char delimiter_after_month = *buf.position();

        /// A number directly followed by the CSV delimiter is a plain field, not a date.
        if (delimiter_after_month == settings.csv.delimiter)
            return false;
        ++buf.position();

        if (!read_one_or_two_digits(day))
            return false;

        // incorrect: MM/dd-yyyy, MM/dd
        if (buf.eof() || delimiter_after_month != *buf.position())
            return false;
        ++buf.position();

        char year_digits[std::numeric_limits<UInt64>::digits10];
        const size_t num_year_digits = readDigits(year_digits, sizeof(year_digits), buf);
        if (num_year_digits != 4)
            return false;
        readDecimalNumber<4>(year, year_digits);
    }
    else
        return false;

    /// Layouts without a day component denote the first day of the month.
    if (!day)
        day = 1;

    date = LocalDate(year, month, day);
    return true;
}
}