# examples/lineage/data_loading.py - hamilton - Git at Google

 from typing import List

 import pandas as pd

 from hamilton.function_modifiers import extract_columns, tag, tag_outputs


 def _sanitize_columns(
     df_columns: List[str],
 ) -> List[str]:
     """Helper function to sanitize column names.

     :param df_columns: the current column names
     :return: sanitized column names
     """
     return [c.strip().replace("/", "_per_").replace(" ", "_").lower() for c in df_columns]


 # Columns we want to expose for transformations. The list is unpacked into
 # @extract_columns on titanic_data below. Names are written in their sanitized
 # (lowercase) form because titanic_data runs _sanitize_columns on the loaded
 # dataframe before returning it.
 columns_to_extract = [
     "passengerid",
     "survived",
     "pclass",
     "name",
     "sex",
     "age",
     "sibsp",
     "parch",
     "ticket",
     "fare",
     "cabin",
     "embarked",
 ]


 @tag_outputs(age={"PII": "true"}, sex={"PII": "true"})
 @extract_columns(*columns_to_extract)  # expose columns for feature functions
 @tag(
     source="prod.titantic",
     owner="data-engineering",
     importance="production",
     info="https://internal.wikipage.net/",
     target_="titanic_data",
 )
 def titanic_data(index_col: str, location: str) -> pd.DataFrame:
     """Input data that someone in data engineering has provided for us.

     The CSV at ``location`` is read, its column names are sanitized with
     ``_sanitize_columns`` and only then is ``index_col`` set as the index, so
     ``index_col`` must be given in its sanitized (lowercase) form.

     Here are the features in the data:
         survived - Survival (0 = No; 1 = Yes)
         class - Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
         name - Name
         sex - Sex
         age - Age
         sibsp - Number of Siblings/Spouses Aboard
         parch - Number of Parents/Children Aboard
         ticket - Ticket Number
         fare - Passenger Fare
         cabin - Cabin
         embarked - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
         boat - Lifeboat (if survived)
         body - Body number (if did not survive and body was recovered)

     :param index_col: the (sanitized) column to use as the index
     :param location: the path location of the CSV data
     :return: a dataframe with sanitized column names, indexed by ``index_col``
     """
     # NOTE(review): the feature list above says "class", "boat" and "body",
     # while columns_to_extract names "pclass" and omits the last two; confirm
     # against the actual file.
     raw = pd.read_csv(location)  # pretend we changed how to load this.
     raw.columns = _sanitize_columns(raw.columns)
     return raw.set_index(index_col)


 def target(titanic_data: pd.DataFrame, target_col: str) -> pd.Series:
     """Select the column we want to predict from the loaded dataframe.

     :param titanic_data: the dataframe to take the target from
     :param target_col: name of the column holding the target values
     :return: the values of ``target_col``
     """
     target_values = titanic_data[target_col]
     return target_values
	from typing import List

	import pandas as pd

	from hamilton.function_modifiers import extract_columns, tag, tag_outputs


	def _sanitize_columns(
	df_columns: List[str],
	) -> List[str]:
	"""Helper function to sanitize column names.

	:param df_columns: the current column names
	:return: sanitized column names
	"""
	return [c.strip().replace("/", "_per_").replace(" ", "_").lower() for c in df_columns]


	# Columns we want to expose for transformations; unpacked into
	# @extract_columns on titanic_data. Names are in sanitized (lowercase) form.
	columns_to_extract = [
	    "passengerid",
	    "survived",
	    "pclass",
	    "name",
	    "sex",
	    "age",
	    "sibsp",
	    "parch",
	    "ticket",
	    "fare",
	    "cabin",
	    "embarked",
	]


	@tag_outputs(age={"PII": "true"}, sex={"PII": "true"})
	@extract_columns(*columns_to_extract)  # expose columns for feature functions
	@tag(
	    source="prod.titantic",
	    owner="data-engineering",
	    importance="production",
	    info="https://internal.wikipage.net/",
	    target_="titanic_data",
	)
	def titanic_data(index_col: str, location: str) -> pd.DataFrame:
	    """Input data that someone in data engineering has provided for us.

	    Column names are sanitized with ``_sanitize_columns`` before the index
	    is set, so ``index_col`` must be given in its sanitized (lowercase) form.

	    Here are the features in the data:
	        survived - Survival (0 = No; 1 = Yes)
	        class - Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
	        name - Name
	        sex - Sex
	        age - Age
	        sibsp - Number of Siblings/Spouses Aboard
	        parch - Number of Parents/Children Aboard
	        ticket - Ticket Number
	        fare - Passenger Fare
	        cabin - Cabin
	        embarked - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
	        boat - Lifeboat (if survived)
	        body - Body number (if did not survive and body was recovered)

	    :param index_col: the (sanitized) column to use as the index
	    :param location: the path location of the data
	    :return: a dataframe with sanitized column names, indexed by ``index_col``
	    """
	    df = pd.read_csv(location)  # pretend we changed how to load this.
	    # Sanitize first: set_index below looks the column up by its new name.
	    df.columns = _sanitize_columns(df.columns)
	    df = df.set_index(index_col)
	    return df


	def target(titanic_data: pd.DataFrame, target_col: str) -> pd.Series:
	    """The target column values we want to predict.

	    :param titanic_data: the dataframe to take the target from
	    :param target_col: name of the column holding the target values
	    :return: the values of ``target_col``
	    """
	    return titanic_data[target_col]