blob: c6bd4e182f66c4c3b709d3df491b8bb6f8baf9ca [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
import argparse
import timeit
from systemds.context import SystemDSContext
# Preparation code passed to timeit as its ``setup`` argument (see main).
# It reads the CSV data named by ``src`` into the pandas DataFrame ``df``:
# every entry of ``src`` is read and concatenated when ``src`` is a directory,
# otherwise ``src`` itself is read as a single CSV file. When ``dtype`` is not
# None the whole frame is cast to it. ``src`` and ``dtype`` are supplied
# through the globals dict given to timeit.
setup = (
    "from systemds.script_building.script import DMLScript\n"
    "import pandas as pd\n"
    "import os\n"
    "if os.path.isdir(src):\n"
    " files = [os.path.join(src, f) for f in os.listdir(src)]\n"
    " df = pd.concat([pd.read_csv(f, header=None) for f in files])\n"
    "else:\n"
    " df = pd.read_csv(src, header=None)\n"
    "if dtype is not None:\n"
    " df = df.astype(dtype)"
)
# Statements measured by timeit (see main): convert the DataFrame ``df``
# built by the setup code via ``ctx.from_pandas``, register the result as the
# input named 'test' on a fresh DMLScript, and execute that script.
run = (
    "frame_from_pandas = ctx.from_pandas(df)\n"
    "script = DMLScript(ctx)\n"
    "script.add_input_from_python('test', frame_from_pandas)\n"
    "script.execute()"
)
# Values accepted by the --dtype command-line option, grouped by family.
# The selected name is handed unchanged to ``DataFrame.astype`` by the
# benchmark's setup code.
dtype_choices = (
    ["double", "float", "long"]
    + ["int8", "int16", "int32", "int64"]
    + ["uint8", "uint16", "uint32", "uint64"]
    + ["float32", "float64"]
    + ["string", "bool"]
)
def main(args):
    """Time loading a pandas DataFrame into SystemDS and print the result.

    The module-level ``setup`` code runs once to build the DataFrame, then
    the module-level ``run`` statements are executed ``args.number`` times
    by ``timeit.timeit``; the value it returns is printed.

    Args:
        args: parsed command-line namespace providing ``src`` (path to a CSV
            file or to a directory of CSV files), ``dtype`` (dtype name to
            cast every column to, or ``None`` to skip the cast) and
            ``number`` (how many times the timed statements are executed).
    """
    ctx = SystemDSContext(logging_level=10, py4j_logging_level=50)
    try:
        # The benchmark snippets are plain strings, so every name they use
        # (src, dtype, ctx) has to be supplied through timeit's globals.
        gvars = {"src": args.src, "dtype": args.dtype, "ctx": ctx}
        print(timeit.timeit(run, setup, globals=gvars, number=args.number))
    finally:
        # Close the context even when the setup or timed code raises (bad
        # path, failed cast, ...), so it is never left open on error.
        ctx.close()
if __name__ == "__main__":
    # Command-line entry point: two positionals (the data source and the
    # repetition count) plus an optional --dtype cast, then run the benchmark.
    cli = argparse.ArgumentParser(
        description="Benchmarks time spent loading data into systemds"
    )
    cli.add_argument("src")
    cli.add_argument("number", type=int, help="number of times to load the data")
    cli.add_argument(
        "--dtype",
        choices=dtype_choices,
        required=False,
        default=None,
        help="optionally cast all columns to one of the dtype choices in pandas",
    )
    main(cli.parse_args())