blob: 4f555872a1550fd3b1750582a4e482250bc21991 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import shutil
import tempfile
import numpy as np
import pandas as pd
import pyarrow as pa
try:
import pyarrow.parquet as pq
except ImportError:
pq = None
class ParquetManifestCreation(object):
    """Benchmark creating a parquet manifest.

    ``setup`` writes a partitioned parquet dataset into a fresh temporary
    directory, ``time_manifest_creation`` builds a ``pq.ParquetManifest``
    over that directory, and ``teardown`` removes the directory again.

    The ``params`` / ``param_names`` / ``setup`` / ``teardown`` / ``time_*``
    layout presumably follows the airspeed velocity (asv) benchmark
    conventions -- confirm against the benchmark runner in use.
    """

    # Number of rows in the generated dataset.
    size = 10 ** 6
    # Directory holding the generated dataset; None while nothing is on disk.
    tmpdir = None

    param_names = ('num_partitions', 'num_threads')
    params = [(10, 100, 1000), (1, 8)]

    def setup(self, num_partitions, num_threads):
        """Write a dataset partitioned on ``num1`` into a new temp directory.

        ``num1`` takes integer values in ``[0, num_partitions)`` and is used
        as the partition column; ``num2`` takes values in ``[0, 1000)``.
        The random state is seeded so every run produces the same data.

        Raises:
            NotImplementedError: if ``pyarrow.parquet`` could not be imported.
        """
        if pq is None:
            raise NotImplementedError("Parquet support not enabled")

        # Pass the label as a prefix so the directory is recognisable;
        # a bare positional argument would be interpreted as the suffix.
        self.tmpdir = tempfile.mkdtemp(prefix='benchmark_parquet')
        try:
            rnd = np.random.RandomState(42)
            num1 = rnd.randint(0, num_partitions, size=self.size)
            num2 = rnd.randint(0, 1000, size=self.size)
            output_df = pd.DataFrame({'num1': num1, 'num2': num2})
            output_table = pa.Table.from_pandas(output_df)
            pq.write_to_dataset(output_table, self.tmpdir,
                                partition_cols=['num1'])
        except BaseException:
            # Do not rely on the harness calling teardown() after a failed
            # setup: remove the partially written directory here, then
            # re-raise the original error unchanged.
            self.teardown(num_partitions, num_threads)
            raise

    def teardown(self, num_partitions, num_threads):
        """Remove the temporary dataset directory, if one exists.

        Safe to call more than once: ``self.tmpdir`` is reset to ``None``
        before the tree is removed, so later calls are no-ops.
        """
        tmpdir, self.tmpdir = self.tmpdir, None
        if tmpdir is not None:
            shutil.rmtree(tmpdir)

    def time_manifest_creation(self, num_partitions, num_threads):
        """Build a ``ParquetManifest`` over the dataset written by ``setup``.

        ``num_threads`` is forwarded as ``metadata_nthreads``.
        """
        pq.ParquetManifest(self.tmpdir, metadata_nthreads=num_threads)