"""
Eggs study: analysis pipeline.

Reads raw egg production data (via Our World in Data, built on FAOSTAT),
cleans it, derives the global trend + producer concentration, and
forecasts world egg production through 2030 with a Holt linear-trend model.

Output: ../../src/data/eggs/clean.json  (relative to this file's directory; consumed by the static site)

Run:
    cd analysis
    .venv/bin/python eggs/pipeline.py

Source: Our World in Data, "Egg production", based on FAOSTAT (FAO of the
        UN). Item: Eggs (Primary). Element: Production (tonnes).
        https://ourworldindata.org/grapher/egg-production-thousand-tonnes
"""

from pathlib import Path

import pandas as pd

import sys
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from _lib import holt_forecast, write_clean

# Resolve the script location first so the derived paths stay correct when the
# file is invoked through a relative path: an unresolved "eggs/pipeline.py"
# gives HERE == Path("eggs"), and HERE.parent.parent then collapses to "."
# instead of climbing to the repository root. This also matches the resolved
# path already used for the sys.path entry above.
HERE = Path(__file__).resolve().parent
RAW = HERE / "raw" / "egg-production.csv"  # raw input CSV
OUT = HERE.parent.parent / "src" / "data" / "eggs" / "clean.json"  # cleaned output JSON

FORECAST_TO = 2030  # last year covered by the forecast
PROD_COL = "eggs__00001783__production__005510__tonnes"  # production column in the raw CSV
SOURCE_URL = "https://ourworldindata.org/grapher/egg-production-thousand-tonnes"


def is_country(code: str) -> bool:
    """Return True when ``code`` looks like a country's 3-letter ISO code.

    Anything that is not a string (e.g. a missing value), any string whose
    length is not exactly three, and any string starting with ``OWID`` is
    rejected, which filters out regions, continents and income groups that
    carry blank codes or an ``OWID_`` prefix.
    """
    if not isinstance(code, str):
        return False
    if len(code) != 3:
        return False
    return not code.startswith("OWID")


def load() -> pd.DataFrame:
    """Read the raw CSV at ``RAW`` and return a tidy production table.

    The ``PROD_COL`` column is renamed to ``tonnes``; only ``entity``,
    ``code``, ``year`` and ``tonnes`` are kept. Rows without a ``tonnes``
    value are dropped, ``year`` is cast to int and missing ``code`` values
    become empty strings.
    """
    wanted = ["entity", "code", "year", "tonnes"]
    raw = pd.read_csv(RAW)
    tidy = raw.rename(columns={PROD_COL: "tonnes"})[wanted]
    tidy = tidy.dropna(subset=["tonnes"])
    return tidy.assign(
        year=tidy["year"].astype(int),
        code=tidy["code"].fillna(""),
    )


def world_series(df: pd.DataFrame) -> pd.Series:
    """Return the ``tonnes`` values of the "World" entity, indexed by year.

    The result is sorted by year in ascending order.
    """
    is_world = df["entity"] == "World"
    by_year = df.loc[is_world].set_index("year")
    return by_year["tonnes"].sort_index()


def _top_producer_bars(df: pd.DataFrame, year: int, world_total: float, n: int = 10) -> list:
    """Build bar-chart entries for the ``n`` largest producing countries in ``year``.

    Each entry holds the entity name, its production in million tonnes
    (3 decimals) and its share of ``world_total`` as a percentage string.
    Only rows whose code passes ``is_country`` are considered.
    """
    countries = df[(df["year"] == year) & (df["code"].map(is_country))]
    top = (
        countries.sort_values("tonnes", ascending=False)
        .head(n)[["entity", "code", "tonnes"]]
        .reset_index(drop=True)
    )
    return [
        {
            "label": r.entity,
            "value": round(r.tonnes / 1e6, 3),
            "note": f"{round(100 * r.tonnes / world_total, 1)}%",
        }
        for r in top.itertuples()
    ]


def main() -> None:
    """Run the pipeline: load, summarise, forecast, write ``OUT`` and print a recap.

    Raises:
        ValueError: if the loaded data contains no "World" rows, since every
            headline figure and the forecast are derived from that series.
    """
    df = load()
    world = world_series(df)
    if world.empty:
        # Fail with an actionable message instead of an opaque IndexError on
        # world.index[-1] below.
        raise ValueError(f'no "World" rows with production values found in {RAW}')

    latest_year = int(world.index[-1])
    first_year = int(world.index[0])
    world_latest = float(world.loc[latest_year])

    # --- Top producing countries, latest year ---
    bars = _top_producer_bars(df, latest_year, world_latest)

    # fc is stored as-is under "forecast"; fc["mean"][-1] is used as the
    # headline value (presumably the point forecast for FORECAST_TO; confirm
    # against _lib.holt_forecast).
    fc = holt_forecast(world.values, latest_year, FORECAST_TO, ndigits=3, scale=1e6)

    payload = {
        "meta": {
            "source": "Our World in Data, Egg production (FAOSTAT, FAO of the UN)",
            "sourceUrl": SOURCE_URL,
            "item": "Eggs (Primary), Production (tonnes)",
            "asOf": f"{latest_year} (latest full year in FAOSTAT release)",
            "unitNote": "Values shown in million tonnes of eggs (whole-egg weight, in shell).",
            "method": "Holt linear-trend exponential smoothing (un-damped); 80% band from residual std scaled by sqrt(horizon).",
        },
        "headline": {
            "latestYear": latest_year,
            "worldMilTonnes": round(world_latest / 1e6, 1),
            "growthSince": first_year,
            "growthFactor": round(world_latest / float(world.loc[first_year]), 1),
            "forecastYear": FORECAST_TO,
            "forecastMilTonnes": fc["mean"][-1],
        },
        "series": [
            {"year": int(yr), "value": round(float(v) / 1e6, 3)}
            for yr, v in world.items()
        ],
        "forecast": fc,
        "bars": bars,
    }

    write_clean(OUT, payload)
    print(f"wrote {OUT.relative_to(HERE.parent.parent)}")
    print(
        f"  world {latest_year}: {payload['headline']['worldMilTonnes']} Mt "
        f"-> forecast {FORECAST_TO}: {fc['mean'][-1]} Mt"
    )
    print(f"  growth since {first_year}: {payload['headline']['growthFactor']}x")
    if bars:
        print(f"  top producer: {bars[0]['label']} ({bars[0]['note']})")
    else:
        # No country-level rows for the latest year: the file was still
        # written, so report that rather than crashing on bars[0].
        print("  top producer: n/a (no country rows for latest year)")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()