blob: 8e2b6163e6cc792294bc1109180ce0313f0738b3 [file] [log] [blame]
from pathlib import Path
import pandas as pd
def pdl_data(pdl_file: str, data_dir: str = "data/") -> pd.DataFrame:
"""Load raw People Data Labs data stored locally"""
return pd.read_json(Path(data_dir, pdl_file))
def stock_data(stock_file: str, data_dir: str = "data/") -> pd.DataFrame:
"""Load raw stock data stored locally"""
return pd.read_json(Path(data_dir, stock_file))
def company_info(pdl_data: pd.DataFrame) -> pd.DataFrame:
"""Select columns containing general company info"""
columns = [
"id",
"ticker",
"website",
"name",
"display_name",
"legal_name",
"founded",
"industry",
"type",
"summary",
"total_funding_raised",
"latest_funding_stage",
"number_funding_rounds",
"last_funding_date",
"inferred_revenue",
]
return pdl_data[columns]
def employee_count_by_month_df(pdl_data: pd.DataFrame) -> pd.DataFrame:
"""Normalized employee count data"""
return (
pd.json_normalize(pdl_data["employee_count_by_month"])
.assign(ticker=pdl_data["ticker"])
.melt(
id_vars="ticker",
var_name="year_month",
value_name="employee_count",
)
.astype({"year_month": "datetime64[ns]"})
)
def n_company_by_funding_stage(company_info: pd.DataFrame) -> pd.Series:
"""Get the number of company per funding stage"""
return (
company_info.groupby("latest_funding_stage")["latest_funding_stage"]
.value_counts()
.sort_values(ascending=False)
)
def selected_companies(company_info: pd.DataFrame, rounds_selection: list[str]) -> pd.DataFrame:
"""Companies with `latest_funding_stage` included in `rounds_selection"""
return company_info.loc[company_info.latest_funding_stage.isin(rounds_selection)]
def employees_since_last_funding_round(
employee_count_by_month_df: pd.DataFrame,
selected_companies: pd.DataFrame,
) -> pd.DataFrame:
"""Select employee count data since the last funding round"""
employee_count_by_month_df = employee_count_by_month_df.loc[
employee_count_by_month_df.ticker.isin(selected_companies.ticker)
]
df = pd.merge(
left=employee_count_by_month_df,
right=selected_companies[["ticker", "last_funding_date"]],
on="ticker",
how="left",
)
return df.loc[df.year_month > df.last_funding_date]
def _growth_rate(group):
"""aggregation for growth rate; data needs to be sorted"""
return (group.iloc[-1] - group.iloc[0]) / group.iloc[0]
def employee_growth_rate_since_last_funding_round(
employees_since_last_funding_round: pd.DataFrame,
) -> pd.DataFrame:
"""Employee count growth rate since last funding round"""
return (
employees_since_last_funding_round.sort_values(by="year_month", ascending=True)
.groupby("ticker")["employee_count"]
.aggregate(_growth_rate)
.sort_values(ascending=False)
.reset_index()
.rename(columns={"employee_count": "employee_growth"})
)
def stock_growth_rate_since_last_funding_round(
stock_data: pd.DataFrame,
employees_since_last_funding_round: pd.DataFrame,
) -> pd.DataFrame:
"""Stock data since last funding round.
Returns None is no stock history or window found.
NOTE. We use the minimum date from the employee count history instead of the true
funding round date to ensure growth rates cover the same period
"""
period_start = (
employees_since_last_funding_round.groupby("ticker")["year_month"].min().reset_index()
)
df = pd.merge(left=stock_data, right=period_start, on="ticker", how="inner")
stock_growth = dict()
for idx, row in df.iterrows():
history = pd.json_normalize(row["historical_price"]).astype({"date": "datetime64[ns]"})
# skip ticker if history is empty
if history.empty:
stock_growth[row.ticker] = None
continue
window = history[history.date > row.year_month]
# skip ticker if window is empty
if window.empty:
stock_growth[row.ticker] = None
continue
stock_growth[row.ticker] = _growth_rate(window["close"])
return (
pd.DataFrame()
.from_dict(stock_growth, orient="index")
.reset_index()
.rename(columns={"index": "ticker", 0: "stock_growth"})
)
def augmented_company_info(
selected_companies: pd.DataFrame,
employee_growth_rate_since_last_funding_round: pd.DataFrame,
stock_growth_rate_since_last_funding_round: pd.DataFrame,
) -> pd.DataFrame:
"""Merge employee count and stock growth with company info"""
df = pd.merge(
selected_companies,
employee_growth_rate_since_last_funding_round,
on="ticker",
how="left",
)
df = pd.merge(
df,
stock_growth_rate_since_last_funding_round,
on="ticker",
how="left",
)
return df