| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| |
| #ifndef IMPALA_EXEC_DELIMITED_TEXT_PARSER_H |
| #define IMPALA_EXEC_DELIMITED_TEXT_PARSER_H |
| |
| #include "exec/hdfs-scanner.h" |
| #include "exec/hdfs-scan-node.h" |
| #include "util/sse-util.h" |
| |
| namespace impala { |
| |
| template <bool DELIMITED_TUPLES> |
| class DelimitedTextParser { |
| public: |
| |
| /// The Delimited Text Parser parses text rows that are delimited by specific |
| /// characters: |
| /// tuple_delim: delimits tuples. Only used if DELIMITED_TUPLES is true. |
| /// field_delim: delimits fields |
| /// collection_item_delim: delimits collection items |
| /// escape_char: escape delimiters, make them part of the data. |
| /// |
| /// If the template parameter DELIMITED_TUPLES is false there is no support |
| /// for tuple delimiters and we do not need to search for them. Any value |
| /// may be passed for tuple_delim, as it is ignored. |
| /// |
| /// 'num_cols' is the total number of columns including partition keys. |
| /// |
| /// 'is_materialized_col' should be initialized to an array of length 'num_cols', with |
| /// is_materialized_col[i] = <true if column i should be materialized, false otherwise> |
| /// Owned by caller. |
| /// |
| /// The main method is ParseData which fills in a vector of pointers and lengths to the |
| /// fields. It also can handle an escape character which masks a tuple or field |
| /// delimiter that occurs in the data. |
| DelimitedTextParser( |
| int num_cols, int num_partition_keys, const bool* is_materialized_col, |
| char tuple_delim, char field_delim_ = '\0', char collection_item_delim = '^', |
| char escape_char = '\0'); |
| |
| /// Called to initialize parser at beginning of scan range. |
| void ParserReset(); |
| |
| /// Check if we are at the start of a tuple. |
| bool AtTupleStart() { return column_idx_ == num_partition_keys_; } |
| |
| char escape_char() const { return escape_char_; } |
| |
| /// Parses a byte buffer for the field and tuple breaks. |
| /// This function will write the field start & len to field_locations |
| /// which can then be written out to tuples. |
| /// This function uses SSE ("Intel x86 instruction set extension |
| /// 'Streaming Simd Extension') if the hardware supports SSE4.2 |
| /// instructions. SSE4.2 added string processing instructions that |
| /// allow for processing 16 characters at a time. Otherwise, this |
| /// function walks the file_buffer_ character by character. |
| /// Input Parameters: |
| /// max_tuples: The maximum number of tuples that should be parsed. |
| /// This is used to control how the batching works. |
| /// remaining_len: Length of data remaining in the byte_buffer_pointer. |
| /// byte_buffer_pointer: Pointer to the buffer containing the data to be parsed. |
| /// Output Parameters: |
| /// field_locations: array of pointers to data fields and their lengths |
| /// num_tuples: Number of tuples parsed |
| /// num_fields: Number of materialized fields parsed |
| /// next_column_start: pointer within file_buffer_ where the next field starts |
| /// after the return from the call to ParseData |
| /// Returns an error status if any column exceeds the size limit. |
| /// See AddColumn() for details. |
| Status ParseFieldLocations(int max_tuples, int64_t remaining_len, |
| char** byte_buffer_ptr, char** row_end_locations, |
| FieldLocation* field_locations, |
| int* num_tuples, int* num_fields, char** next_column_start); |
| |
| /// Parse a single tuple from buffer. |
| /// - buffer/len are input parameters for the entire record. |
| /// - on return field_locations will contain the start/len for each materialized |
| /// col. |
| /// - *num_fields returns the number of fields processed. |
| /// This function is used to parse sequence file records which do not need to |
| /// parse for tuple delimiters. Returns an error status if any column exceeds the |
| /// size limit. See AddColumn() for details. |
| /// This function is disabled for non-sequence file parsing. |
| template <bool PROCESS_ESCAPES> |
| Status ParseSingleTuple(int64_t len, char* buffer, FieldLocation* field_locations, |
| int* num_fields); |
| |
| /// FindFirstInstance returns the position after the first non-escaped tuple |
| /// delimiter from the starting offset. |
| /// Used to find the start of a tuple if jumping into the middle of a text file. |
| /// If no tuple delimiter is found within the buffer, return -1; |
| int64_t FindFirstInstance(const char* buffer, int64_t len); |
| |
| /// Will we return the current column to the query? |
| /// Hive allows cols at the end of the table that are not in the schema. We'll |
| /// just ignore those columns |
| bool ReturnCurrentColumn() const { |
| return column_idx_ < num_cols_ && is_materialized_col_[column_idx_]; |
| } |
| |
| /// Fill in columns missing at the end of the tuple. |
| /// 'len' and 'last_column' may contain the length and the pointer to the |
| /// last column on which the file ended without a delimiter. |
| /// Fills in the offsets and lengths in field_locations. |
| /// If parsing stopped on a delimiter and there is no last column then length will be 0. |
| /// Other columns beyond that are filled with 0 length fields. |
| /// 'num_fields' points to an initialized count of fields and will incremented |
| /// by the number fields added. |
| /// 'field_locations' will be updated with the start and length of the fields. |
| /// Returns an error status if 'len' exceeds the size limit specified in AddColumn(). |
| template <bool PROCESS_ESCAPES> |
| Status FillColumns(int64_t len, char** last_column, int* num_fields, |
| impala::FieldLocation* field_locations); |
| |
| /// Return true if we have not seen a tuple delimiter for the current tuple being |
| /// parsed (i.e., the last byte read was not a tuple delimiter). |
| bool HasUnfinishedTuple() { |
| DCHECK(DELIMITED_TUPLES); |
| return unfinished_tuple_; |
| } |
| |
| private: |
| /// Initialize the parser state. |
| void ParserInit(HdfsScanNode* scan_node); |
| |
| /// Helper routine to add a column to the field_locations vector. |
| /// Template parameter: |
| /// PROCESS_ESCAPES -- if true the the column may have escape characters |
| /// and the negative of the len will be stored. |
| /// len: length of the current column. The length of a column must fit in a 32-bit |
| /// signed integer (i.e. <= 2147483647 bytes). If a column is larger than that, |
| /// it will be treated as an error. |
| /// Input/Output: |
| /// next_column_start: Start of the current column, moved to the start of the next. |
| /// num_fields: current number of fields processed, updated to next field. |
| /// Output: |
| /// field_locations: updated with start and length of current field. |
| /// Return an error status if 'len' exceeds the size limit specified above. |
| template <bool PROCESS_ESCAPES> |
| Status AddColumn(int64_t len, char** next_column_start, int* num_fields, |
| FieldLocation* field_locations); |
| |
| /// Helper routine to parse delimited text using SSE instructions. |
| /// Identical arguments as ParseFieldLocations. |
| /// If the template argument, 'PROCESS_ESCAPES' is true, this function will handle |
| /// escapes, otherwise, it will assume the text is unescaped. By using templates, |
| /// we can special case the un-escaped path for better performance. The unescaped |
| /// path is optimized away by the compiler. Returns an error status if the length |
| /// of any column exceeds the size limit. See AddColumn() for details. |
| template <bool PROCESS_ESCAPES> |
| Status ParseSse(int max_tuples, int64_t* remaining_len, |
| char** byte_buffer_ptr, char** row_end_locations_, |
| FieldLocation* field_locations, |
| int* num_tuples, int* num_fields, char** next_column_start); |
| |
| bool IsFieldOrCollectionItemDelimiter(char c) { |
| return (!DELIMITED_TUPLES && c == field_delim_) || |
| (DELIMITED_TUPLES && field_delim_ != tuple_delim_ && c == field_delim_) || |
| (collection_item_delim_ != '\0' && c == collection_item_delim_); |
| } |
| |
| /// SSE(xmm) register containing the tuple search character(s). |
| __m128i xmm_tuple_search_; |
| |
| /// SSE(xmm) register containing the delimiter search character(s). |
| __m128i xmm_delim_search_; |
| |
| /// SSE(xmm) register containing the escape search character. |
| __m128i xmm_escape_search_; |
| |
| /// For each col index [0, num_cols_), true if the column should be materialized. |
| /// Not owned. |
| const bool* is_materialized_col_; |
| |
| /// The number of delimiters contained in xmm_tuple_search_, i.e. its length. |
| int num_tuple_delims_; |
| |
| /// The number of delimiters contained in xmm_delim_search_, i.e. its length. |
| int num_delims_; |
| |
| /// Number of columns in the table (including partition columns) |
| int num_cols_; |
| |
| /// Number of partition columns in the table. |
| int num_partition_keys_; |
| |
| /// Index to keep track of the current column in the current file |
| int column_idx_; |
| |
| /// Used for special processing of \r. |
| /// This will be the offset of the last instance of \r from the end of the |
| /// current buffer being searched unless the last row delimiter was not a \r in which |
| /// case it will be -1. If the last character in a buffer is \r then the value |
| /// will be 0. At the start of processing a new buffer if last_row_delim_offset_ is 0 |
| /// then it is set to be one more than the size of the buffer so that if the buffer |
| /// starts with \n it is processed as \r\n. |
| int32_t last_row_delim_offset_; |
| |
| /// Precomputed masks to process escape characters |
| uint16_t low_mask_[16]; |
| uint16_t high_mask_[16]; |
| |
| /// Character delimiting fields (to become slots). |
| char field_delim_; |
| |
| /// True if this parser should handle escape characters. |
| bool process_escapes_; |
| |
| /// Escape character. Only used if process_escapes_ is true. |
| char escape_char_; |
| |
| /// Character delimiting collection items (to become slots). |
| char collection_item_delim_; |
| |
| /// Character delimiting tuples. Only used if DELIMITED_TUPLES is true. |
| char tuple_delim_; |
| |
| /// Whether or not the current column has an escape character in it |
| /// (and needs to be unescaped) |
| bool current_column_has_escape_; |
| |
| /// Whether or not the previous character was the escape character |
| bool last_char_is_escape_; |
| |
| /// True if the last tuple is unfinished (not ended with tuple delimiter). |
| bool unfinished_tuple_; |
| }; |
| |
| using TupleDelimitedTextParser = DelimitedTextParser<true>; |
| using SequenceDelimitedTextParser = DelimitedTextParser<false>; |
| |
| }// namespace impala |
| #endif// IMPALA_EXEC_DELIMITED_TEXT_PARSER_H |