perf profiling
diff --git a/examples/materialization/using_types/run.py b/examples/materialization/using_types/run.py index 973c910..eec65a4 100644 --- a/examples/materialization/using_types/run.py +++ b/examples/materialization/using_types/run.py
@@ -1,10 +1,15 @@ +import logging + import simple_etl from hamilton_sdk import adapters from hamilton import driver +from hamilton.log_setup import setup_logging + +setup_logging(logging.DEBUG) tracker = adapters.HamiltonTracker( - project_id=7, # modify this as needed + project_id=15, # modify this as needed username="elijah@dagworks.io", dag_name="my_version_of_the_dag", tags={"environment": "DEV", "team": "MY_TEAM", "version": "X"},
diff --git a/examples/materialization/using_types/simple_etl.png b/examples/materialization/using_types/simple_etl.png index 13c5caf..a6595a2 100644 --- a/examples/materialization/using_types/simple_etl.png +++ b/examples/materialization/using_types/simple_etl.png Binary files differ
diff --git a/examples/materialization/using_types/simple_etl.py b/examples/materialization/using_types/simple_etl.py index d14c266..73ae688 100644 --- a/examples/materialization/using_types/simple_etl.py +++ b/examples/materialization/using_types/simple_etl.py
@@ -1,8 +1,17 @@ +from hamilton.telemetry import disable_telemetry + +disable_telemetry() +import logging + import pandas as pd from sklearn import datasets +from hamilton import node from hamilton.function_modifiers import loader, saver from hamilton.io import utils as io_utils +from hamilton.log_setup import setup_logging + +setup_logging(logging.INFO) @loader() @@ -22,3 +31,15 @@ transformed_data.to_csv(filepath) metadata = io_utils.get_file_and_dataframe_metadata(filepath, transformed_data) return metadata + + +if __name__ == "__main__": + import time + + from hamilton_sdk.tracking import runs + + df, metadata = raw_data() + t1 = time.time() + stats = runs.process_result(df, node.Node.from_fn(raw_data)) + t2 = time.time() + print(t2 - t1)
diff --git a/ui/sdk/src/hamilton_sdk/tracking/pandas_col_stats.py b/ui/sdk/src/hamilton_sdk/tracking/pandas_col_stats.py index c59a92d..a013b1c 100644 --- a/ui/sdk/src/hamilton_sdk/tracking/pandas_col_stats.py +++ b/ui/sdk/src/hamilton_sdk/tracking/pandas_col_stats.py
@@ -1,5 +1,6 @@ from typing import Dict, List, Union +import numpy as np import pandas as pd from hamilton_sdk.tracking import dataframe_stats as dfs @@ -45,19 +46,8 @@ def histogram(col: pd.Series, num_hist_bins: int = 10) -> Dict[str, int]: - try: - hist_counts = ( - col.value_counts( - bins=num_hist_bins, - ) - .sort_index() - .to_dict() - ) - except ValueError: - return {} - except AttributeError: - return {} - return {str(interval): interval_value for interval, interval_value in hist_counts.items()} + hist, bins = np.histogram(col, bins=num_hist_bins) + return {str(interval): interval_value for interval, interval_value in zip(bins, hist)} def numeric_column_stats(
diff --git a/ui/sdk/src/hamilton_sdk/tracking/pandas_stats.py b/ui/sdk/src/hamilton_sdk/tracking/pandas_stats.py index e6ec8c8..a076344 100644 --- a/ui/sdk/src/hamilton_sdk/tracking/pandas_stats.py +++ b/ui/sdk/src/hamilton_sdk/tracking/pandas_stats.py
@@ -1,3 +1,4 @@ +import logging from typing import Any, Dict, Union import pandas as pd @@ -12,7 +13,11 @@ - for object types we should :shrug: """ + dr = driver.Builder().with_modules(pcs).with_config({"config_key": "config_value"}).build() +logger = logging.getLogger(__name__) + +import time def _compute_stats(df: pd.DataFrame) -> Dict[str, Dict[str, Any]]: @@ -48,9 +53,12 @@ TODO: profile this and see where we can speed things up. """ try: + t1 = time.time() res = dr.execute( [target_output], inputs={"col": col, "name": name, "position": position} ) + + logger.info(f"Computed stats for column {name}, time taken was {time.time() - t1}") res = res[target_output].to_dict() except Exception: # minimum that we want -- ideally we have hamilton handle errors and do best effort.