scripts/test_memory.py - hamilton - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 """
 Script to test memory.
 Run with mprof:
     pip install memory_profiler
     mprof run test_memory.py
     mprof plot
 See https://github.com/apache/hamilton/pull/374 for more details.
 """

 from hamilton import driver
 from hamilton.ad_hoc_utils import create_temporary_module
 from hamilton.function_modifiers import parameterize, source

 NUM_ITERS = 100

 import numpy as np
 import pandas as pd


 def foo_0(memory_size: int = 100_000_000) -> pd.DataFrame:
     """
     Generates a large DataFrame of random floats occupying roughly memory_size bytes.

     The frame always has one million rows; the number of float64 columns is
     derived from the requested size, rounding down to whole columns.

     Parameters:
     memory_size (int): Desired memory size of the DataFrame in bytes.
         Default is 100_000_000 (about 100 MB). Requests smaller than one
         column (8_000_000 bytes) yield an empty DataFrame.

     Returns:
     pd.DataFrame: Generated DataFrame with approximate memory usage of
         memory_size bytes and columns named col_0, col_1, and so on.
     """
     # Number of rows in the DataFrame
     num_rows = 10**6

     # One column holds num_rows float64 values at 8 bytes each, so this is the
     # cost of adding a single column (not a single row).
     bytes_per_column = 8 * num_rows
     num_cols = int(memory_size // bytes_per_column)

     # Create a DataFrame with random data
     data = {f"col_{i}": np.random.random(num_rows) for i in range(num_cols)}
     df = pd.DataFrame(data)

     # DataFrame.info() writes its report to stdout itself and returns None, so
     # wrapping it in print() would add a stray "None" line to the output.
     df.info(memory_usage="deep")
     return df


 # Running tally of foo_i invocations; it only labels the progress output.
 count = 0


 @parameterize(
     **{
         f"foo_{i}": {"foo_i_minus_one": source(f"foo_{i - 1}")}
         for i in range(1, NUM_ITERS)
     }
 )
 def foo_i(foo_i_minus_one: pd.DataFrame) -> pd.DataFrame:
     """
     Returns the upstream DataFrame with every value multiplied by 1.01.

     The parameterize decorator receives one entry per name foo_1 through
     foo_(NUM_ITERS - 1); each entry maps the foo_i_minus_one argument to the
     source whose name carries the next lower index.

     Parameters:
     foo_i_minus_one (pd.DataFrame): Output of the previous link in the chain.

     Returns:
     pd.DataFrame: A new DataFrame holding the scaled values.
     """
     global count
     count = count + 1
     # The label follows invocation order, not the name of the node being run.
     print(f"foo_{count}")
     scaled = foo_i_minus_one * 1.01
     return scaled


 if __name__ == "__main__":
     # Wrap the two functions with create_temporary_module so they can be
     # handed to the driver builder as a module.
     temp_module = create_temporary_module(foo_i, foo_0)
     builder = driver.Builder().with_modules(temp_module)
     hamilton_driver = builder.build()
     # Only the highest-numbered foo output is requested from the driver.
     final_node = f"foo_{NUM_ITERS - 1}"
     result = hamilton_driver.execute([final_node], inputs={"memory_size": 100_000_000})
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	"""
	Script to test memory.
	Run with mprof:
	pip install memory_profiler
	mprof run test_memory.py
	mprof plot
	See https://github.com/apache/hamilton/pull/374 for more details.
	"""

	from hamilton import driver
	from hamilton.ad_hoc_utils import create_temporary_module
	from hamilton.function_modifiers import parameterize, source

	NUM_ITERS = 100

	import numpy as np
	import pandas as pd


	def foo_0(memory_size: int = 100_000_000) -> pd.DataFrame:
	    """
	    Generates a large DataFrame of random floats occupying roughly memory_size bytes.

	    The frame always has one million rows; the number of float64 columns is
	    derived from the requested size, rounding down to whole columns.

	    Parameters:
	    memory_size (int): Desired memory size of the DataFrame in bytes.
	        Default is 100_000_000 (about 100 MB). Requests smaller than one
	        column (8_000_000 bytes) yield an empty DataFrame.

	    Returns:
	    pd.DataFrame: Generated DataFrame with approximate memory usage of
	        memory_size bytes and columns named col_0, col_1, and so on.
	    """
	    # Number of rows in the DataFrame
	    num_rows = 10**6

	    # One column holds num_rows float64 values at 8 bytes each, so this is the
	    # cost of adding a single column (not a single row).
	    bytes_per_column = 8 * num_rows
	    num_cols = int(memory_size // bytes_per_column)

	    # Create a DataFrame with random data
	    data = {f"col_{i}": np.random.random(num_rows) for i in range(num_cols)}
	    df = pd.DataFrame(data)

	    # DataFrame.info() writes its report to stdout itself and returns None, so
	    # wrapping it in print() would add a stray "None" line to the output.
	    df.info(memory_usage="deep")
	    return df


	# Running tally of foo_i invocations; it only labels the progress output.
	count = 0


	@parameterize(
	    **{
	        f"foo_{i}": {"foo_i_minus_one": source(f"foo_{i - 1}")}
	        for i in range(1, NUM_ITERS)
	    }
	)
	def foo_i(foo_i_minus_one: pd.DataFrame) -> pd.DataFrame:
	    """
	    Returns the upstream DataFrame with every value multiplied by 1.01.

	    The parameterize decorator receives one entry per name foo_1 through
	    foo_(NUM_ITERS - 1); each entry maps the foo_i_minus_one argument to the
	    source whose name carries the next lower index.

	    Parameters:
	    foo_i_minus_one (pd.DataFrame): Output of the previous link in the chain.

	    Returns:
	    pd.DataFrame: A new DataFrame holding the scaled values.
	    """
	    global count
	    count += 1
	    # The label follows invocation order, not the name of the node being run.
	    print(f"foo_{count}")
	    return foo_i_minus_one * 1.01


	if __name__ == "__main__":
	    # Wrap the two functions with create_temporary_module so they can be
	    # handed to the driver builder as a module.
	    mod = create_temporary_module(foo_i, foo_0)
	    dr = driver.Builder().with_modules(mod).build()
	    # Only the highest-numbered foo output is requested from the driver.
	    output = dr.execute([f"foo_{NUM_ITERS - 1}"], inputs=dict(memory_size=100_000_000))