# blob: cb417a5ef5e1877f7e0bc73a916899d648e7dd92 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import os
import struct
from parquet.ttypes import FileMetaData
from thrift.protocol import TCompactProtocol
from thrift.transport import TTransport
# Despite the name, this is used as the magic marker that get_parquet_metadata()
# expects to find at both the very start and the very end of a Parquet file.
PARQUET_VERSION_NUMBER = 'PAR1'
def parse_int(s):
    """Reinterprets the 4-byte string 's' as a signed little-endian 32-bit integer.

    Args:
        s: Exactly 4 bytes, as read from a file opened in binary mode.

    Returns:
        The decoded integer.

    Raises:
        struct.error: If 's' is not exactly 4 bytes long.
    """
    # '<' pins the byte order to little-endian and the size to the standard 4 bytes.
    # A bare 'i' would use the host's native order and size, which yields wrong
    # values on big-endian machines.
    return struct.unpack('<i', s)[0]
def get_parquet_metadata(filename):
    """Returns a FileMetaData as defined in parquet.thrift.

    The file is expected to start with the magic bytes and to end with
    <serialized metadata> <4-byte metadata length> <magic bytes>.

    Args:
        filename: Path of a local Parquet file.

    Returns:
        The FileMetaData object deserialized from the file footer using the
        Thrift compact protocol.

    Raises:
        AssertionError: If the leading or trailing magic bytes do not match, if
            the file is too small to hold a footer, or if the recorded metadata
            length does not fit inside the file.
    """
    # The file is read in binary mode, so compare against a bytes form of the
    # magic regardless of whether the module constant is a text or byte string.
    magic = PARQUET_VERSION_NUMBER
    if not isinstance(magic, bytes):
        magic = magic.encode('ascii')
    magic_len = len(magic)
    length_field_len = 4
    footer_len = length_field_len + magic_len

    file_size = os.path.getsize(filename)
    # Binary mode: text mode would translate newlines and/or decode the bytes,
    # corrupting both the offsets and the serialized metadata.
    with open(filename, 'rb') as f:
        # Check file starts and ends with magic bytes. Explicit raises are used
        # instead of assert statements so the checks still run under 'python -O'.
        start_magic = f.read(magic_len)
        if start_magic != magic:
            raise AssertionError(
                'Bad magic bytes at start of %s: %r' % (filename, start_magic))
        if file_size < magic_len + footer_len:
            raise AssertionError(
                'File %s is too small (%d bytes) to contain a Parquet footer'
                % (filename, file_size))

        # The metadata length and the trailing magic sit back to back at the end
        # of the file, so fetch both with a single read.
        f.seek(file_size - footer_len)
        footer = f.read(footer_len)
        end_magic = footer[length_field_len:]
        if end_magic != magic:
            raise AssertionError(
                'Bad magic bytes at end of %s: %r' % (filename, end_magic))

        # Read metadata length and make sure the metadata lies between the
        # leading magic and the footer before seeking to it.
        metadata_len = parse_int(footer[:length_field_len])
        max_metadata_len = file_size - footer_len - magic_len
        if metadata_len < 0 or metadata_len > max_metadata_len:
            raise AssertionError(
                'Invalid metadata length %d in %s (at most %d bytes available)'
                % (metadata_len, filename, max_metadata_len))

        # Read metadata
        f.seek(file_size - footer_len - metadata_len)
        serialized_metadata = f.read(metadata_len)

    # Deserialize metadata
    transport = TTransport.TMemoryBuffer(serialized_metadata)
    protocol = TCompactProtocol.TCompactProtocol(transport)
    metadata = FileMetaData()
    metadata.read(protocol)
    return metadata