blob: 4f4f53618a0f41254f00d1262a504709d63d9ee5 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <gtest/gtest.h>
#include <rapidjson/document.h>
#include <re2/re2.h>
#include <string>
#include <utility>
#include <vector>
namespace doris {
// Test fixture for the timezone-pattern tests in this file. It holds no
// state, so both per-test hooks are deliberately empty.
class EsScrollParserTest : public testing::Test {
public:
    // No per-test setup is required.
    void SetUp() override {}

    // No per-test cleanup is required.
    void TearDown() override {}
};
// Test timezone pattern matching for ES datetime parsing fix.
//
// Each datetime below ends in a timezone designator: either "Z" or a signed
// offset written as HHMM or HH:MM. The pattern must find it, and the captured
// text must be either "Z" or a signed offset of 5 or 6 characters.
TEST_F(EsScrollParserTest, TestTimezonePatternMatching) {
    RE2 time_zone_pattern(R"([+-]\d{2}:?\d{2}|Z)");
    std::vector<std::string> valid_timezone_formats = {
            "2025-05-23T20:56:52.052+0900",  "2025-05-23T20:56:52.052-0500",
            "2025-05-23T20:56:52.052+08:00", "2025-05-23T20:56:52.052-04:30",
            "2025-05-23T20:56:52.052Z",      "2022-08-08T12:10:10.151Z",
            "2022-08-08T12:10:10+0900",      "2022-08-08T12:10:10-0500"};
    for (const auto& datetime_str : valid_timezone_formats) {
        re2::StringPiece timezone_value;
        // nsubmatch == 1 requests only submatch 0, i.e. the whole matched text.
        bool matched = time_zone_pattern.Match(datetime_str, 0, datetime_str.size(),
                                               RE2::UNANCHORED, &timezone_value, 1);
        EXPECT_TRUE(matched) << "Failed to match timezone in: " << datetime_str;
        if (!matched) {
            // Nothing was captured. Inspecting the empty capture would only
            // stack follow-on failures on top of the one reported above.
            continue;
        }
        std::string timezone = timezone_value.as_string();
        EXPECT_FALSE(timezone.empty()) << "Empty timezone captured from: " << datetime_str;
        if (timezone == "Z") {
            // Bare UTC designator: there is no sign or length to validate.
            continue;
        }
        EXPECT_TRUE(timezone[0] == '+' || timezone[0] == '-')
                << "Invalid timezone sign in: " << timezone;
        // Valid timezone lengths: 5 for +0900, 6 for +08:00
        EXPECT_TRUE(timezone.length() == 5 || timezone.length() == 6)
                << "Invalid timezone length in: " << timezone
                << " (length: " << timezone.length() << ")";
    }
}
// Datetimes with a missing or malformed timezone suffix must not match the
// pattern at all.
//
// The pattern cannot match an empty string, so "matched with an empty
// capture" can never happen. The expectation is therefore stated directly as
// "no match" rather than being hidden behind an `if (matched)` guard.
TEST_F(EsScrollParserTest, TestInvalidTimezonePatterns) {
    RE2 time_zone_pattern(R"([+-]\d{2}:?\d{2}|Z)");
    std::vector<std::string> invalid_formats = {
            "2025-05-23T20:56:52.052",     // no timezone at all
            "2025-05-23T20:56:52.052+9",   // single-digit offset
            "2025-05-23T20:56:52.052+090", // three-digit offset
            "2025-05-23T20:56:52.052+9:00" // single-digit hour with colon
    };
    for (const auto& datetime_str : invalid_formats) {
        re2::StringPiece timezone_value;
        bool matched = time_zone_pattern.Match(datetime_str, 0, datetime_str.size(),
                                               RE2::UNANCHORED, &timezone_value, 1);
        // The capture is only streamed when the expectation fails, i.e. when
        // a match actually occurred and timezone_value is populated.
        EXPECT_FALSE(matched) << "Should not capture timezone from: " << datetime_str
                              << " (captured: " << timezone_value.as_string() << ")";
    }
}
// Regression check for a single datetime whose offset is written without a
// colon: the pattern must match and capture exactly "+0900".
TEST_F(EsScrollParserTest, TestBugScenarioTimezoneFormat) {
    const std::string input = "2025-05-23T20:56:52.052+0900";
    const RE2 tz_regex(R"([+-]\d{2}:?\d{2}|Z)");

    // Receives submatch 0 (the whole matched text) on success.
    re2::StringPiece capture;
    const bool found =
            tz_regex.Match(input, 0, input.size(), RE2::UNANCHORED, &capture, 1);
    EXPECT_TRUE(found) << "Failed to match the bug scenario format: " << input;

    const std::string captured_tz = capture.as_string();
    EXPECT_EQ(captured_tz, "+0900") << "Incorrect timezone captured: " << captured_tz;
}
// Boundary-value offsets. For each input the test checks whether the pattern
// matches and, when it does, that the captured text is the expected
// designator.
//
// Input, expected capture and expected outcome are kept in one row so the
// three pieces of an entry cannot drift out of step with each other.
TEST_F(EsScrollParserTest, TestEdgeCaseTimezoneFormats) {
    RE2 time_zone_pattern(R"([+-]\d{2}:?\d{2}|Z)");

    struct EdgeCase {
        std::string datetime_str;   // text handed to the pattern
        std::string expected_match; // capture expected when should_match is true
        bool should_match;          // whether the pattern is expected to match
    };
    const std::vector<EdgeCase> edge_cases = {
            {"2025-05-23T20:56:52.052+00:00", "+00:00", true}, // UTC with colon
            {"2025-05-23T20:56:52.052-00:00", "-00:00", true}, // UTC with colon
            {"2025-05-23T20:56:52.052+23:59", "+23:59", true}, // max valid timezone
            {"2025-05-23T20:56:52.052-23:59", "-23:59", true}, // max valid timezone
            // Not a real offset, but the pattern checks shape only, not range.
            {"2025-05-23T20:56:52.052+99:99", "+99:99", true},
            {"2025-05-23T20:56:52.052Z", "Z", true},         // Z (UTC)
            {"2025-05-23T20:56:52.052+0800", "+0800", true}, // no colon
            {"2025-05-23T20:56:52.052", "", false},          // no timezone
    };

    for (const auto& edge_case : edge_cases) {
        const std::string& datetime_str = edge_case.datetime_str;
        const std::string& expected_match = edge_case.expected_match;
        const bool should_match_expected = edge_case.should_match;

        re2::StringPiece timezone_value;
        bool matched = time_zone_pattern.Match(datetime_str, 0, datetime_str.size(),
                                               RE2::UNANCHORED, &timezone_value, 1);
        EXPECT_EQ(matched, should_match_expected)
                << "Edge case test failed for: " << datetime_str
                << " (expected match: " << should_match_expected << ")";
        // Compare the capture only when a match was both expected and found;
        // any mismatch in outcome has already been reported above.
        if (matched && should_match_expected) {
            std::string timezone = timezone_value.as_string();
            EXPECT_EQ(timezone, expected_match)
                    << "Incorrect timezone captured from: " << datetime_str
                    << " (expected: " << expected_match << ", got: " << timezone << ")";
        }
    }
}
// Further inputs mixing second, millisecond and microsecond precision with
// both offset spellings. Every row is expected to match and to capture the
// listed designator.
TEST_F(EsScrollParserTest, TestSpecialTimezoneEdgeCases) {
    // One table row: the input text, the capture it should yield, and whether
    // a match is expected.
    struct Row {
        std::string input;
        std::string want_tz;
        bool want_match;
    };

    const std::vector<Row> rows = {
            {"2025-05-23T20:56:52+0000", "+0000", true},          // +0000 without colon
            {"2025-05-23T20:56:52-0000", "-0000", true},          // -0000 without colon
            {"2025-05-23T20:56:52+12:30", "+12:30", true},        // +12:30 with colon
            {"2025-05-23T20:56:52-12:30", "-12:30", true},        // -12:30 with colon
            {"2025-05-23T20:56:52+1200", "+1200", true},          // +1200 without colon
            {"2025-05-23T20:56:52-1200", "-1200", true},          // -1200 without colon
            {"2025-05-23T20:56:52.000Z", "Z", true},              // Z with milliseconds
            {"2025-05-23T20:56:52.123456+05:30", "+05:30", true}, // microseconds, colon
            {"2025-05-23T20:56:52.123456-05:30", "-05:30", true}, // microseconds, colon
            {"2025-05-23T20:56:52.123456+0530", "+0530", true},   // microseconds, no colon
            {"2025-05-23T20:56:52.123456-0530", "-0530", true},   // microseconds, no colon
            {"2025-05-23T20:56:52+14:00", "+14:00", true},        // +14:00 (valid max timezone)
            {"2025-05-23T20:56:52-12:00", "-12:00", true},        // -12:00 (valid min timezone)
    };

    const RE2 tz_regex(R"([+-]\d{2}:?\d{2}|Z)");

    for (const Row& row : rows) {
        re2::StringPiece capture;
        const bool found =
                tz_regex.Match(row.input, 0, row.input.size(), RE2::UNANCHORED, &capture, 1);

        EXPECT_EQ(found, row.want_match) << "Special case test failed for: " << row.input
                                         << " (expected match: " << row.want_match << ")";

        // The capture is meaningful only when a match was expected and found.
        if (!(found && row.want_match)) {
            continue;
        }
        const std::string got_tz = capture.as_string();
        EXPECT_EQ(got_tz, row.want_tz)
                << "Incorrect timezone captured from: " << row.input
                << " (expected: " << row.want_tz << ", got: " << got_tz << ")";
    }
}
} // namespace doris