blob: 3236c9c68ca6f619fd27d9a08535fffcd2a61383 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <string>
#include "exec/delimited-text-parser.inline.h"
#include "testutil/gtest-util.h"
#include "common/names.h"
namespace impala {
// Runs 'parser' over 'data' and compares the results with the expectations.
//
// The parser is reset and FindFirstInstance() is called on the whole buffer; its
// return value is compared with 'expected_offset'. When that value is -1 nothing
// else is checked. Otherwise the byte just before the returned offset must be
// 'tuple_delim' (or either '\n' or '\r' when 'tuple_delim' is '\n'), and
// ParseFieldLocations() is run on the remainder of the buffer. The tuple and
// field counts it reports are compared with 'expected_num_tuples' and
// 'expected_num_fields', and the Status it returns must be ok().
void Validate(TupleDelimitedTextParser* parser, const string& data,
    int expected_offset, char tuple_delim, int expected_num_tuples,
    int expected_num_fields) {
  // Limit passed to ParseFieldLocations(); also used to size its output buffers.
  const int MAX_TUPLES = 100;

  parser->ParserReset();
  char* data_ptr = const_cast<char*>(data.c_str());
  const int data_len = static_cast<int>(data.size());
  int remaining_len = data_len;
  const int offset = parser->FindFirstInstance(data_ptr, remaining_len);

  EXPECT_EQ(offset, expected_offset) << data;
  if (offset == -1) return;

  // These two checks are fatal for this helper: everything below indexes 'data'
  // with 'offset', so continuing with an out-of-range value would read outside
  // the string.
  ASSERT_GE(offset, 1) << data;
  ASSERT_LT(offset, data_len) << data;

  const char prev_char = data[offset - 1];
  if (tuple_delim != '\n') {
    EXPECT_EQ(prev_char, tuple_delim) << data;
  } else {
    EXPECT_TRUE(prev_char == '\n' || prev_char == '\r')
        << prev_char << '\n' << data;
    // The offset must not land between the '\r' and the '\n' of a "\r\n" pair.
    if (prev_char == '\r') {
      EXPECT_NE(data[offset], '\n');
    }
  }

  data_ptr += offset;
  remaining_len -= offset;

  char* row_end_locs[MAX_TUPLES];
  vector<FieldLocation> field_locations(MAX_TUPLES);
  int num_tuples = 0;
  int num_fields = 0;
  char* next_column_start = nullptr;
  const Status status = parser->ParseFieldLocations(MAX_TUPLES, remaining_len,
      &data_ptr, row_end_locs, field_locations.data(), &num_tuples, &num_fields,
      &next_column_start);
  // Do not let a parser error go unnoticed.
  EXPECT_TRUE(status.ok()) << data;
  EXPECT_EQ(num_tuples, expected_num_tuples) << data;
  EXPECT_EQ(num_fields, expected_num_fields) << data;
}
// Single-column inputs with '|' as the tuple delimiter, parsed with and without
// an escape character ('@'), including inputs that contain NUL bytes.
// Each Validate() call lists: input, expected FindFirstInstance() offset, tuple
// delimiter, expected number of tuples and expected number of fields.
TEST(DelimitedTextParser, Basic) {
  const char TUPLE_DELIM = '|';
  const char FIELD_DELIM = ',';
  const char COLLECTION_DELIM = ',';
  const char ESCAPE_CHAR = '@';

  const int NUM_COLS = 1;

  // Test without escape
  bool is_materialized_col[NUM_COLS];
  for (int i = 0; i < NUM_COLS; ++i) is_materialized_col[i] = true;

  TupleDelimitedTextParser no_escape_parser(NUM_COLS, 0, is_materialized_col,
      TUPLE_DELIM, FIELD_DELIM, COLLECTION_DELIM);
  // Note that only complete tuples "count"
  Validate(&no_escape_parser, "no_delims", -1, TUPLE_DELIM, 0, 0);
  Validate(&no_escape_parser, "abc||abc", 4, TUPLE_DELIM, 1, 1);
  Validate(&no_escape_parser, "|abcd", 1, TUPLE_DELIM, 0, 0);
  Validate(&no_escape_parser, "a|bcd", 2, TUPLE_DELIM, 0, 0);

  // Test with escape char. The expected offsets below skip a '|' that directly
  // follows an odd-length run of '@' and stop after one that follows an
  // even-length run.
  TupleDelimitedTextParser escape_parser(NUM_COLS, 0, is_materialized_col,
      TUPLE_DELIM, FIELD_DELIM, COLLECTION_DELIM,
      ESCAPE_CHAR);
  Validate(&escape_parser, "a@|a|bcd", 5, TUPLE_DELIM, 0, 0);
  Validate(&escape_parser, "a@@|a|bcd", 4, TUPLE_DELIM, 1, 1);
  Validate(&escape_parser, "a@@@|a|bcd", 7, TUPLE_DELIM, 0, 0);
  Validate(&escape_parser, "a@@@@|a|bcd", 6, TUPLE_DELIM, 1, 1);
  Validate(&escape_parser, "a|@@@|a|bcd", 2, TUPLE_DELIM, 1, 1);

  // The parser doesn't support this case.
  // TODO: update test when it is fixed
  // Validate(&escape_parser, "@|no_delims", -1, TUPLE_DELIM, 0, 0);

  // Test null characters. The length is passed explicitly because the literals
  // contain embedded NUL bytes; it must equal the number of bytes in the literal.
  const string str1("\0no_delims", 10);
  const string str2("ab\0||abc", 8);
  const string str3("\0|\0|\0", 5);
  const string str4("abc|\0a|abc", 10);
  const string str5("\0|aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 32);
  Validate(&no_escape_parser, str1, -1, TUPLE_DELIM, 0, 0);
  Validate(&no_escape_parser, str2, 4, TUPLE_DELIM, 1, 1);
  Validate(&no_escape_parser, str3, 2, TUPLE_DELIM, 1, 1);
  Validate(&no_escape_parser, str4, 4, TUPLE_DELIM, 1, 1);
  Validate(&no_escape_parser, str5, 2, TUPLE_DELIM, 0, 0);

  const string str6("\0@|\0|\0", 6);
  // 7 bytes: NUL '@' '@' '|' NUL '|' NUL.
  const string str7("\0@@|\0|\0", 7);
  const string str8("\0@\0@|\0|\0", 8);
  const string str9("\0@||aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 34);
  Validate(&escape_parser, str6, 5, TUPLE_DELIM, 0, 0);
  Validate(&escape_parser, str7, 4, TUPLE_DELIM, 1, 1);
  Validate(&escape_parser, str8, 7, TUPLE_DELIM, 0, 0);
  Validate(&escape_parser, str9, 4, TUPLE_DELIM, 0, 0);
}
// Two-column inputs with ',' as the field delimiter and '|' as the tuple
// delimiter, parsed first without and then with the '@' escape character.
TEST(DelimitedTextParser, Fields) {
  const char TUPLE_DELIM = '|';
  const char FIELD_DELIM = ',';
  const char COLLECTION_DELIM = ',';
  const char ESCAPE_CHAR = '@';

  const int NUM_COLS = 2;

  bool is_materialized_col[NUM_COLS];
  for (bool& materialized : is_materialized_col) materialized = true;

  // One row per Validate() call: input buffer, expected FindFirstInstance()
  // offset, expected tuple count and expected field count.
  struct Case {
    string input;
    int offset;
    int num_tuples;
    int num_fields;
  };

  TupleDelimitedTextParser no_escape_parser(NUM_COLS, 0, is_materialized_col,
      TUPLE_DELIM, FIELD_DELIM, COLLECTION_DELIM);
  const Case no_escape_cases[] = {
      {"a,b|c,d|e,f", 4, 1, 3},
      {"b|c,d|e,f", 2, 1, 3},
      {"a,|c,d|", 3, 1, 2},
      {"a,|c|e", 3, 1, 2},
      // Contains an embedded NUL byte, hence the explicit length.
      {string("a,\0|c,d|e", 9), 4, 1, 2},
  };
  for (const Case& c : no_escape_cases) {
    Validate(&no_escape_parser, c.input, c.offset, TUPLE_DELIM, c.num_tuples,
        c.num_fields);
  }

  TupleDelimitedTextParser escape_parser(NUM_COLS, 0, is_materialized_col,
      TUPLE_DELIM, FIELD_DELIM, COLLECTION_DELIM,
      ESCAPE_CHAR);
  const Case escape_cases[] = {
      {"a,b|c,d|e,f", 4, 1, 3},
      {"a,@|c|e,f", 6, 0, 1},
      {"a|b,c|d@,e", 2, 1, 2},
  };
  for (const Case& c : escape_cases) {
    Validate(&escape_parser, c.input, c.offset, TUPLE_DELIM, c.num_tuples,
        c.num_fields);
  }
}
// Exercises '\n' (with '\r' and "\r\n") and NUL as tuple delimiters, and NUL as
// a field delimiter, on short inputs and on inputs that are at least one
// 128-bit register long.
TEST(DelimitedTextParser, SpecialDelimiters) {
  const char TUPLE_DELIM = '\n'; // implies '\r' and "\r\n" are also delimiters
  const char NUL_DELIM = '\0';
  const int NUM_COLS = 1;
  const int MAX_COLS = 2;

  bool is_materialized_col[MAX_COLS];
  for (bool& materialized : is_materialized_col) materialized = true;

  TupleDelimitedTextParser tuple_delim_parser(NUM_COLS, 0, is_materialized_col,
      TUPLE_DELIM);
  TupleDelimitedTextParser nul_tuple_parser(NUM_COLS, 0, is_materialized_col, NUL_DELIM);
  TupleDelimitedTextParser nul_field_parser(MAX_COLS, 0, is_materialized_col,
      TUPLE_DELIM, NUL_DELIM);

  // One row per Validate() call: input buffer, expected FindFirstInstance()
  // offset, expected tuple count and expected field count.
  struct Case {
    string input;
    int offset;
    int num_tuples;
    int num_fields;
  };

  // Non-SSE case
  const Case short_cases[] = {
      {"A\r\nB", 3, 0, 0},
      {"A\rB", 2, 0, 0},
      {"A\nB", 2, 0, 0},
      {"A\nB\r\nC", 2, 1, 1},
      {"A\r\nB\r\nC", 3, 1, 1},
      {"A\rB\nC\r\nD", 2, 2, 2},
      {"\r\r\n\n", 1, 2, 2},
  };
  for (const Case& c : short_cases) {
    Validate(&tuple_delim_parser, c.input, c.offset, TUPLE_DELIM, c.num_tuples,
        c.num_fields);
  }

  // NUL tuple delimiter; no field delimiter
  const Case nul_cases[] = {
      {string("\0\0\0", 3), 1, 2, 2},
      {string("AAA\0BBB\0", 8), 4, 1, 1},
      {string("\n\0\r\0\r\n\0", 7), 2, 2, 2},
      {string("\n\0\r\0\r\n", 6), 2, 1, 1},
  };
  for (const Case& c : nul_cases) {
    Validate(&nul_tuple_parser, c.input, c.offset, NUL_DELIM, c.num_tuples,
        c.num_fields);
  }

  // SSE case: every input must be exactly one 128-bit register wide.
  const Case sse_cases[] = {
      {"\rAAAAAAAAAAAAAAA", 1, 0, 0},
      {"\nAAAAAAAAAAAAAAA", 1, 0, 0},
      {"\r\nAAAAAAAAAAAAAA", 2, 0, 0},
      {"\r\nAAA\n\r\r\nAAAAAAA", 2, 3, 3},
  };
  for (const Case& c : sse_cases) {
    DCHECK_EQ(c.input.size(), SSEUtil::CHARS_PER_128_BIT_REGISTER);
    Validate(&tuple_delim_parser, c.input, c.offset, TUPLE_DELIM, c.num_tuples,
        c.num_fields);
  }

  // NUL SSE case
  const Case nul_sse_cases[] = {
      {string("AAAAA\0AAAAAAAAAAA\0AAAAAAAAAAAA\0\0", 32), 6, 3, 3},
      {string("AAAAA\0AAAAAAAAAAA\0AAAAAAAAAAAA\0A", 32), 6, 2, 2},
      {string("AAA\0BBBbbbbbbbbbbbbbbbbbbb\0cccc,ddd\0", 36), 4, 2, 2},
      {string("AAA\0BBBbbbbbbbbbbbbbbbbbbb\0cccc,dddd", 36), 4, 1, 1},
  };
  for (const Case& c : nul_sse_cases) {
    Validate(&nul_tuple_parser, c.input, c.offset, NUL_DELIM, c.num_tuples,
        c.num_fields);
  }

  // NUL Field delimiters
  const Case nul_field_cases[] = {
      {string("\na\0b\0c\n", 7), 1, 1, 2},
      {string("aaaa\na\0b\0c\naaaaa\0b\na\0b\0c\n", 25), 5, 3, 6},
  };
  for (const Case& c : nul_field_cases) {
    Validate(&nul_field_parser, c.input, c.offset, TUPLE_DELIM, c.num_tuples,
        c.num_fields);
  }
}
// TODO: expand test for other delimited text parser functions/cases.
// Not all of them work without creating a HdfsScanNode but we can expand
// these tests quite a bit more.
}