modules/research-framework/simexr_mod/db/utils/transform_utils.py - airavata - Git at Google

 from typing import Any

 import pandas as pd


 def _is_listy(v: Any) -> bool:
     """Report whether *v* is a list, tuple, numpy array or pandas Series.

     Any other object (including strings, dicts and ``None``) yields ``False``.
     """
     import numpy as np

     sequence_types = (list, tuple, np.ndarray, pd.Series)
     return isinstance(v, sequence_types)


 def _to_list(v: Any) -> list:
     """Return *v* as a plain Python list.

     Conversion rules:
       - numpy array: ``tolist()``; a 0-d array becomes a one-element list
       - pandas Series: ``to_list()``
       - list / tuple: a new list with the same elements
       - anything else: treated as a scalar and wrapped as ``[v]``
     """
     import numpy as np
     if isinstance(v, np.ndarray):
         # tolist() on a 0-d array returns a bare scalar rather than a list,
         # which would break the declared return type and any len() call on
         # the result; wrap that case so the result is always a list.
         return v.tolist() if v.ndim else [v.tolist()]
     if isinstance(v, pd.Series):
         return v.to_list()
     if isinstance(v, (list, tuple)):
         return list(v)
     # scalar → list of one
     return [v]


 def _explode_row(model_id: str, ts: Any, params: dict, outputs: dict) -> pd.DataFrame:
     """
     Expand one record into a dataframe with one row per sequence position.

     Values in ``params`` / ``outputs`` may be scalars or list-like (as judged
     by ``_is_listy``). When a key appears in both mappings, the ``params``
     value is the one used.

     - No list-like value at all: the result is a single row holding
       ``model_id``, ``ts``, ``step`` = 0 and every key of outputs and params.
     - Otherwise the row count is the longest list-like length; shorter
       sequences are padded with None, scalars are repeated on every row, and
       ``step`` runs 0..rows-1. Columns follow ``model_id``, ``ts``, ``step``,
       then params keys, then the outputs keys not already seen.
     """
     # Key order: params first, then whatever outputs adds.
     ordered_keys = list(params)
     ordered_keys += [key for key in outputs if key not in params]

     # Resolve each key to the single value that will feed its column.
     resolved = {
         key: params[key] if key in params else outputs.get(key)
         for key in ordered_keys
     }

     # Materialise only the list-like values; these decide the row count.
     expanded = {
         key: _to_list(val) for key, val in resolved.items() if _is_listy(val)
     }

     if not expanded:
         # Purely scalar record → exactly one row, params overriding outputs.
         record = {"model_id": model_id, "ts": ts, "step": 0, **outputs, **params}
         return pd.DataFrame([record])

     n_rows = max(len(seq) for seq in expanded.values())

     columns = {
         "model_id": [model_id] * n_rows,
         "ts": [ts] * n_rows,
         "step": list(range(n_rows)),
     }
     for key, val in resolved.items():
         if key in expanded:
             seq = expanded[key]
             # n_rows is the maximum length, so a sequence can only need padding.
             columns[key] = seq + [None] * (n_rows - len(seq))
         else:
             columns[key] = [val] * n_rows

     return pd.DataFrame(columns)
	from typing import Any

	import pandas as pd


	def _is_listy(v: Any) -> bool:
	    """Check if value is list-like (list, tuple, numpy array, pandas Series).

	    Every other object, including strings and ``None``, returns ``False``.
	    """
	    import numpy as np
	    return isinstance(v, (list, tuple, np.ndarray, pd.Series))


	def _to_list(v: Any) -> list:
	    """Convert value to list format.

	    - numpy array: ``tolist()``; a 0-d array becomes a one-element list
	    - pandas Series: ``to_list()``
	    - list / tuple: a new list with the same elements
	    - anything else: treated as a scalar and wrapped as ``[v]``
	    """
	    import numpy as np
	    if isinstance(v, np.ndarray):
	        # tolist() on a 0-d array returns a bare scalar, not a list; wrap it
	        # so the result always honours the declared return type.
	        return v.tolist() if v.ndim else [v.tolist()]
	    if isinstance(v, pd.Series):
	        return v.to_list()
	    if isinstance(v, (list, tuple)):
	        return list(v)
	    # scalar → list of one
	    return [v]


	def _explode_row(model_id: str, ts: Any, params: dict, outputs: dict) -> pd.DataFrame:
	    """
	    Explode a single row where some fields in params/outputs may be arrays.

	    Strategy:
	      - Collect all keys from params + outputs (params first, duplicates dropped)
	      - Determine the per-key sequence lengths (only for list-like values)
	      - If no list-like values exist → return a single-row dataframe
	      - Otherwise, define max_len = max(list lengths)
	      - For each key:
	          * if list-like: pad to max_len with None
	          * if scalar: repeat the scalar max_len times
	      - Return a dataframe with max_len rows, adding a 'step' index (0..max_len-1)

	    When a key is present in both mappings the value from ``params`` is used.
	    """
	    # Flatten key space. The views must be unpacked: passing the two
	    # dict_keys objects themselves to dict.fromkeys would try to hash them.
	    all_keys = list(dict.fromkeys([*params.keys(), *outputs.keys()]))

	    def _value_for(k: str) -> Any:
	        # prefer params[k] over outputs[k] when both are present
	        return params[k] if k in params else outputs.get(k, None)

	    # Compute lengths for list-like values
	    lengths = [
	        len(_to_list(_value_for(k)))
	        for k in all_keys
	        if _is_listy(_value_for(k))
	    ]

	    if not lengths:
	        # No arrays: single-row record
	        row = {"model_id": model_id, "ts": ts, "step": 0}
	        # Merge params & outputs; params take precedence on key collisions.
	        # This needs ** unpacking: a bare {outputs, params} is a set literal
	        # of dicts and raises TypeError.
	        merged = {**outputs, **params}
	        row.update(merged)
	        return pd.DataFrame([row])

	    max_len = max(lengths)

	    def _series_for(k: str) -> list:
	        v = _value_for(k)
	        if _is_listy(v):
	            lst = _to_list(v)
	            # max_len is the longest sequence, so only padding can be needed.
	            return lst + [None] * (max_len - len(lst))
	        # scalar → repeat
	        return [v] * max_len

	    data = {
	        "model_id": [model_id] * max_len,
	        "ts": [ts] * max_len,
	        "step": list(range(max_len)),
	    }
	    for k in all_keys:
	        data[k] = _series_for(k)

	    return pd.DataFrame(data)