"""
Eggs study: analysis pipeline.

Reads raw egg production data (via Our World in Data, built on FAOSTAT),
cleans it, derives the global trend + producer concentration, and forecasts
world egg production through 2030 with a Holt linear-trend model.

Output: ../../src/data/eggs/clean.json  (consumed by the static site)

Run:
    cd analysis
    .venv/bin/python eggs/pipeline.py

Source: Our World in Data, "Egg production", based on FAOSTAT (FAO of the UN).
Item: Eggs (Primary). Element: Production (tonnes).
https://ourworldindata.org/grapher/egg-production-thousand-tonnes
"""

import sys
from pathlib import Path

import pandas as pd

# The shared `_lib` helpers are expected one directory above this file; put
# that directory on sys.path so the script works when run directly.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from _lib import holt_forecast, write_clean  # noqa: E402

# Resolve first: with a relative __file__ (e.g. "eggs/pipeline.py") the
# un-resolved `.parent.parent` collapses to "." and OUT would point at the
# wrong directory.
HERE = Path(__file__).resolve().parent
RAW = HERE / "raw" / "egg-production.csv"
OUT = HERE.parent.parent / "src" / "data" / "eggs" / "clean.json"

FORECAST_TO = 2030
PROD_COL = "eggs__00001783__production__005510__tonnes"
SOURCE_URL = "https://ourworldindata.org/grapher/egg-production-thousand-tonnes"


def is_country(code: str) -> bool:
    """Return True when *code* looks like a real country's ISO code.

    Real countries carry a 3-letter ISO code. Region aggregates (FAO),
    continents and income groups use blank codes or an OWID_ prefix; both are
    rejected by the length test alone, since no 3-character string can start
    with "OWID".
    """
    return isinstance(code, str) and len(code) == 3


def load() -> pd.DataFrame:
    """Read the raw CSV and return tidy rows of entity/code/year/tonnes.

    The production column (``PROD_COL``) is renamed to ``tonnes``, rows
    without a production value are dropped, ``year`` is cast to int and
    missing codes become empty strings.
    """
    df = pd.read_csv(RAW)
    df = df.rename(columns={PROD_COL: "tonnes"})
    df = df[["entity", "code", "year", "tonnes"]].dropna(subset=["tonnes"])
    df["year"] = df["year"].astype(int)
    df["code"] = df["code"].fillna("")
    return df


def world_series(df: pd.DataFrame) -> pd.Series:
    """Return the "World" production values indexed by year, ascending."""
    world_rows = df[df["entity"] == "World"]
    return world_rows.set_index("year")["tonnes"].sort_index()


def _top_producer_bars(
    df: pd.DataFrame, year: int, world_total: float, limit: int = 10
) -> list[dict]:
    """Build bar-chart entries for the *limit* largest country producers.

    Only rows for *year* whose code passes ``is_country`` are considered.
    Each entry holds the entity name, its production in million tonnes and
    its share of *world_total* formatted as a percentage string.
    """
    in_year = df[(df["year"] == year) & (df["code"].map(is_country))]
    top = (
        in_year.sort_values("tonnes", ascending=False)
        .head(limit)[["entity", "code", "tonnes"]]
        .reset_index(drop=True)
    )
    return [
        {
            "label": r.entity,
            "value": round(r.tonnes / 1e6, 3),
            "note": f"{round(100 * r.tonnes / world_total, 1)}%",
        }
        for r in top.itertuples()
    ]


def main() -> None:
    """Run the pipeline: load, summarise, forecast, write ``OUT``, report.

    Raises:
        ValueError: if the raw data has no "World" rows with a production
            value, since every headline figure is derived from that series.
    """
    df = load()
    world = world_series(df)
    if world.empty:
        # Fail with context instead of an IndexError from world.index[-1].
        raise ValueError(f'no "World" rows with production values in {RAW}')

    latest_year = int(world.index[-1])
    first_year = int(world.index[0])
    world_latest = float(world.loc[latest_year])

    # --- Top producing countries, latest year ---
    bars = _top_producer_bars(df, latest_year, world_latest)

    # `holt_forecast` comes from `_lib`; this code only relies on the result
    # being a mapping with an indexable "mean" entry.
    fc = holt_forecast(world.values, latest_year, FORECAST_TO, ndigits=3, scale=1e6)

    payload = {
        "meta": {
            "source": "Our World in Data, Egg production (FAOSTAT, FAO of the UN)",
            "sourceUrl": SOURCE_URL,
            "item": "Eggs (Primary), Production (tonnes)",
            "asOf": f"{latest_year} (latest full year in FAOSTAT release)",
            "unitNote": "Values shown in million tonnes of eggs (whole-egg weight, in shell).",
            "method": "Holt linear-trend exponential smoothing (un-damped); 80% band from residual std scaled by sqrt(horizon).",
        },
        "headline": {
            "latestYear": latest_year,
            "worldMilTonnes": round(world_latest / 1e6, 1),
            "growthSince": first_year,
            "growthFactor": round(world_latest / float(world.loc[first_year]), 1),
            "forecastYear": FORECAST_TO,
            "forecastMilTonnes": fc["mean"][-1],
        },
        "series": [
            {"year": int(yr), "value": round(float(v) / 1e6, 3)}
            for yr, v in world.items()
        ],
        "forecast": fc,
        "bars": bars,
    }

    write_clean(OUT, payload)

    print(f"wrote {OUT.relative_to(HERE.parent.parent)}")
    print(
        f" world {latest_year}: {payload['headline']['worldMilTonnes']} Mt "
        f"-> forecast {FORECAST_TO}: {fc['mean'][-1]} Mt"
    )
    print(f" growth since {first_year}: {payload['headline']['growthFactor']}x")
    if bars:
        print(f" top producer: {bars[0]['label']} ({bars[0]['note']})")
    else:
        # The output is already written; don't crash on the summary line.
        print(" top producer: n/a (no country rows for the latest year)")


if __name__ == "__main__":
    main()