# blob: 44d813b232227607beb290ecd459517754c8d2ef [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Script to test memory usage while executing a long chain of DataFrame nodes.

Run with mprof:

    pip install memory_profiler
    mprof run test_memory.py
    mprof plot
See https://github.com/apache/hamilton/pull/374 for more details.
"""
from hamilton import driver
from hamilton.ad_hoc_utils import create_temporary_module
from hamilton.function_modifiers import parameterize, source
# Number of nodes in the chain: foo_0 plus the parameterized foo_1 .. foo_(NUM_ITERS - 1).
NUM_ITERS = 100
import numpy as np
import pandas as pd
def foo_0(memory_size: int = 100_000_000) -> pd.DataFrame:
    """Generate a float64 DataFrame whose data occupies roughly ``memory_size`` bytes.

    The frame always has 10**6 rows of random values; the number of columns is
    chosen so that the total data size approaches ``memory_size`` from below.
    A summary of the frame (including deep memory usage) is written to stdout.

    Parameters:
        memory_size (int): Desired size of the DataFrame in bytes. Default is
            100_000_000 (100 MB). Each column costs 8_000_000 bytes, so smaller
            values produce a DataFrame with no columns.

    Returns:
        pd.DataFrame: Generated DataFrame with columns ``col_0``, ``col_1``, ...
        and approximate memory usage of ``memory_size`` bytes.
    """
    # Number of rows in the DataFrame
    num_rows = 10**6
    # Each float64 value takes 8 bytes, so one full column costs 8 * num_rows bytes.
    bytes_per_col = 8 * num_rows
    # Round down so the frame never exceeds the requested size.
    num_cols = int(memory_size // bytes_per_col)
    # Create a DataFrame with random data
    data = {f"col_{i}": np.random.random(num_rows) for i in range(num_cols)}
    df = pd.DataFrame(data)
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly; wrapping it in print() would emit a stray "None" line.
    df.info(memory_usage="deep")
    return df
# Running tally of how many times foo_i has been invoked in this process.
count = 0


@parameterize(
    **{
        f"foo_{step}": {"foo_i_minus_one": source(f"foo_{step - 1}")}
        for step in range(1, NUM_ITERS)
    }
)
def foo_i(foo_i_minus_one: pd.DataFrame) -> pd.DataFrame:
    """Scale the upstream DataFrame by 1.01 and report the invocation number.

    Parameters:
        foo_i_minus_one (pd.DataFrame): Output of the previous node in the chain.

    Returns:
        pd.DataFrame: The input with every value multiplied by 1.01.
    """
    global count
    # The printed label is derived from the call counter, not from the node name.
    count = count + 1
    print(f"foo_{count}")
    scaled = foo_i_minus_one * 1.01
    return scaled
if __name__ == "__main__":
    # Bundle the two node functions into a throwaway module for the driver.
    mod = create_temporary_module(foo_i, foo_0)
    builder = driver.Builder().with_modules(mod)
    dr = builder.build()
    # Request only the last node of the chain; everything upstream runs as a dependency.
    final_node = f"foo_{NUM_ITERS - 1}"
    output = dr.execute([final_node], inputs={"memory_size": 100_000_000})