blob: 36d6ded4bd75910cdc78f8e7ecce985562272420 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "runtime/scoped-buffer.h"
namespace parquet {
class ColumnChunk;
class ColumnIndex;
class OffsetIndex;
}
namespace impala {
class HdfsParquetScanner;
/// Helper class for reading the Parquet page index. It allocates a buffer to hold the
/// raw bytes of the page index. It provides helper methods to deserialize the relevant
/// parts.
class ParquetPageIndex {
public:
ParquetPageIndex(HdfsParquetScanner* scanner);
/// It reads the raw bytes of a page index belonging to a specific row group. It stores
/// the bytes in an internal buffer.
/// It expects that the layout of the page index conforms to the specification, i.e.
/// column indexes come before offset indexes. Otherwise it returns an error. (Impala
/// and Parquet-MR produce conforming page index layouts).
/// It needs to be called before the serialization methods.
Status ReadAll(int row_group_idx);
/// Deserializes a ColumnIndex object for the given column chunk.
Status DeserializeColumnIndex(const parquet::ColumnChunk& col_chunk,
parquet::ColumnIndex* column_index);
/// Deserializes an OffsetIndex object for the given column chunk.
Status DeserializeOffsetIndex(const parquet::ColumnChunk& col_chunk,
parquet::OffsetIndex* offset_index);
/// Determines the column index and offset index ranges for the given row group.
/// Returns true when at least a partial column index and an offset index are found.
/// Returns false when there is absolutely no column index or offset index for the row
/// group.
static bool DeterminePageIndexRangesInRowGroup(
const parquet::RowGroup& row_group, int64_t* column_index_start,
int64_t* column_index_size, int64_t* offset_index_start,
int64_t* offset_index_size);
/// Releases resources held by this object.
void Release() { page_index_buffer_.Release(); }
/// Returns true if the page index buffer is empty.
bool IsEmpty() { return page_index_buffer_.Size() == 0; }
private:
/// The scanner that created this object.
HdfsParquetScanner* scanner_;
/// Buffer to hold the raw bytes of the page index.
ScopedBuffer page_index_buffer_;
/// File offsets and sizes of the page Index.
int64_t column_index_base_offset_;
int64_t column_index_size_;
int64_t offset_index_base_offset_;
int64_t offset_index_size_;
};
}