# blob: 2b125c1dd82dfd9f216bdc158ffb37d92a71e855
from dataclasses import dataclass
from pathlib import Path
from typing import Tuple, Dict, Any
import os
# Note: These imports may need adjustment based on actual project structure
from code.refactor.llm_refactor import refactor_to_single_entry
from code.utils.github_utils import fetch_notebook_from_github
from code.utils.notebook_utils import notebook_to_script
@dataclass
class ExternalScriptImporter:
    """
    Full pipeline for an external notebook or script:
    1) If it's a notebook, convert to .py
    2) Refactor to single-entry `simulate(**params)`
    3) Extract metadata (from refactorer)
    4) Iterative smoke-tests & auto-correction
    5) Return (model_id, metadata)
    """

    # Root directory that converted .py scripts are written into.
    models_root: Path = Path("external_models")

    def __post_init__(self) -> None:
        """Ensure the models root directory exists before any import runs."""
        self.models_root.mkdir(parents=True, exist_ok=True)

    def import_and_refactor(
        self,
        source_url: str,
        model_name: str,
        dest_dir: str,
        max_smoke_iters: int = 3,
        llm_model: str = "gpt-5-mini",
    ) -> Tuple[str, Dict[str, Any]]:
        """
        Import and refactor external script/notebook.

        Args:
            source_url: URL to the source file
            model_name: Name for the model
            dest_dir: Destination directory for the fetched notebook
            max_smoke_iters: Maximum smoke test iterations
            llm_model: LLM model to use for refactoring
                NOTE(review): currently unused — `refactor_to_single_entry`
                is called without it; confirm whether it should be passed
                through.

        Returns:
            Tuple of (model_id, metadata)
        """
        print(f"[TRANSFORM_CODE] Starting import_and_refactor for {source_url}")
        print(f"[TRANSFORM_CODE] Fetching notebook from GitHub...")
        nb_path = fetch_notebook_from_github(source_url, dest_dir=dest_dir)
        print(f"[TRANSFORM_CODE] Notebook fetched: {nb_path}")

        print(f"[TRANSFORM_CODE] Converting notebook to script...")
        py_path = notebook_to_script(nb_path, output_dir=str(self.models_root))
        print(f"[TRANSFORM_CODE] Script created: {py_path}")

        # Refactor into single entrypoint + extract metadata
        print(f"[TRANSFORM_CODE] Calling refactor_to_single_entry...")
        script_path, metadata = refactor_to_single_entry(Path(py_path))
        print(f"[TRANSFORM_CODE] Refactoring completed. Script path: {script_path}")

        # Defensive copy: tolerate a None/immutable mapping from the refactorer,
        # then set a user-facing name for slugging without clobbering one the
        # refactorer already provided.
        metadata = dict(metadata or {})
        metadata.setdefault("model_name", model_name)
        print(f"[TRANSFORM_CODE] Metadata: {metadata}")

        # Iterative smoke-test + correction loop.
        try:
            # Lazy import to avoid circular dependencies; only an ImportError
            # can result from the import itself (ModuleNotFoundError is a
            # subclass, so listing it separately was redundant; catching
            # NameError here would silently swallow bugs from refine()).
            from execute.test.simulation_refiner import SimulationRefiner

            refiner = SimulationRefiner(
                script_path=script_path,
                model_name=model_name,
                max_iterations=max_smoke_iters,
            )
            model_id = refiner.refine()
        except ImportError:
            # Fallback when the refiner package is unavailable: derive a
            # short, deterministic id from the name + path (md5 used as a
            # fingerprint only, not for security).
            import hashlib

            model_id = hashlib.md5(
                f"{model_name}_{script_path}".encode()
            ).hexdigest()[:12]

        return model_id, metadata