"""
Coffee study: analysis pipeline.

Reads raw FAO green-coffee production data (via Our World in Data),
cleans it, derives the global trend + producer concentration, and
forecasts world production through 2030 with a Holt linear-trend model.

Output: ../../src/data/coffee/clean.json  (consumed by the static site)

Run:
    cd analysis
    .venv/bin/python coffee/pipeline.py

Source: Our World in Data — "Coffee bean production", based on
        FAOSTAT (FAO of the UN). Item: Coffee, green. Element: Production (tonnes).
        https://ourworldindata.org/grapher/coffee-bean-production
"""

from pathlib import Path

import pandas as pd

import sys
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from _lib import holt_forecast, write_clean

# Directory that holds this script; the raw input lives beneath it.
HERE = Path(__file__).parent
# Raw CSV read by load().
RAW = HERE.joinpath("raw", "coffee-bean-production.csv")
# Destination of the cleaned JSON payload, two directory levels above HERE.
OUT = HERE.parent.parent.joinpath("src", "data", "coffee", "clean.json")

# Region aggregates / income groups present in the FAO file that are NOT countries.
# Each entry is matched as a case-sensitive substring of the entity name (see
# is_country), so a short token such as "Eastern" covers a whole family of
# aggregate names.
NON_COUNTRIES = (
    "World", "Africa", "Asia", "Americas", "Europe", "Oceania",
    "European Union", "income countries", "Income", "(FAO)",
    "Net Food", "Least Developed", "Land Locked", "Small Island",
    "Low Income Food Deficit", "South America", "Central America",
    "North America", "Caribbean", "Eastern", "Western", "Southern",
    "Northern", "Middle", "Sub-Saharan",
)

# Countries / territories whose own names contain one of the tokens above
# (e.g. "Africa" is a substring of "Central African Republic"). Substring
# matching alone would misclassify them as aggregates, so they are allowed
# through by exact name.
_TOKEN_COLLISIONS = frozenset({
    "Central African Republic",
    "South Africa",
    "Western Sahara",
    "Northern Mariana Islands",
    "French Southern Territories",
})

# Final year of the forecast horizon (passed to holt_forecast in main()).
FORECAST_TO = 2030
# Name of the production column in the raw CSV; load() renames it to "tonnes".
PROD_COL = "coffee__green__00000656__production__005510__tonnes"


def is_country(name: str) -> bool:
    """Return True when *name* does not look like a regional/income aggregate.

    A name is treated as an aggregate when any NON_COUNTRIES token occurs in
    it as a case-sensitive substring. Names listed in _TOKEN_COLLISIONS are
    real countries or territories that merely contain such a token, so they
    are accepted before the token scan runs.
    """
    if name in _TOKEN_COLLISIONS:
        return True
    return not any(token in name for token in NON_COUNTRIES)


def load() -> pd.DataFrame:
    """Read the raw CSV at RAW and return its usable production rows.

    The PROD_COL column is renamed to ``tonnes``, only the ``entity``,
    ``code``, ``year`` and ``tonnes`` columns are kept, rows without a
    ``tonnes`` value are discarded, and ``year`` is cast to ``int``.
    """
    raw = pd.read_csv(RAW)
    renamed = raw.rename(columns={PROD_COL: "tonnes"})
    kept = renamed.loc[:, ["entity", "code", "year", "tonnes"]]
    with_values = kept.dropna(subset=["tonnes"])
    return with_values.astype({"year": int})


def world_series(df: pd.DataFrame) -> pd.Series:
    """Return the ``tonnes`` values of the "World" entity, indexed by year.

    Rows are selected by an exact match on ``entity == "World"`` and the
    resulting series is sorted by ascending year. No years are trimmed:
    every World row present in *df* is returned, including the most recent.
    """
    is_world = df["entity"].eq("World")
    by_year = df.loc[is_world, ["year", "tonnes"]].set_index("year")
    return by_year["tonnes"].sort_index()


def main() -> None:
    """Run the coffee pipeline end to end and write the site payload.

    Loads the production rows, extracts the World series, ranks the ten
    largest entities that pass is_country for the most recent World year,
    calls holt_forecast on the World values up to FORECAST_TO, passes the
    assembled payload to write_clean, and prints a short summary.
    """
    frame = load()
    world = world_series(frame)
    # world_series returns the series sorted by year, so the ends of the
    # index are the newest and oldest years respectively.
    latest_year = int(world.index[-1])
    first_year = int(world.index[0])
    world_latest = float(world.loc[latest_year])

    # --- Top producers, latest year ---
    in_latest_year = frame["year"] == latest_year
    passes_name_filter = frame["entity"].map(is_country)
    ranked = frame[in_latest_year & passes_name_filter].sort_values(
        "tonnes", ascending=False
    )
    top = ranked.head(10)[["entity", "code", "tonnes"]].reset_index(drop=True)

    top_records = []
    for row in top.itertuples(index=False):
        top_records.append(
            {
                "country": row.entity,
                "code": row.code,
                "milTonnes": round(row.tonnes / 1e6, 3),
                # Share of the World total for the same year.
                "sharePct": round(100 * row.tonnes / world_latest, 1),
            }
        )

    fc = holt_forecast(world.values, latest_year, FORECAST_TO, ndigits=3, scale=1e6)
    # The last element of fc["mean"] is reported as the headline forecast.
    forecast_final = fc["mean"][-1]

    meta = {
        "source": "Our World in Data — Coffee bean production (FAOSTAT, FAO of the UN)",
        "sourceUrl": "https://ourworldindata.org/grapher/coffee-bean-production",
        "item": "Coffee, green — Production (tonnes)",
        "asOf": f"{latest_year} (latest full year in FAOSTAT release)",
        "unitNote": "Values shown in million tonnes of green coffee.",
        "method": "Holt linear-trend exponential smoothing; 80% band from residual std scaled by sqrt(horizon).",
    }
    headline = {
        "latestYear": latest_year,
        "worldMilTonnes": round(world_latest / 1e6, 2),
        "growthSince": first_year,
        "growthFactor": round(world_latest / float(world.loc[first_year]), 1),
        "forecastYear": FORECAST_TO,
        "forecastMilTonnes": forecast_final,
    }
    series_points = []
    for yr, value in world.items():
        series_points.append({"year": int(yr), "milTonnes": round(float(value) / 1e6, 3)})

    payload = {
        "meta": meta,
        "headline": headline,
        "worldSeries": series_points,
        "topProducers": top_records,
        "forecast": fc,
    }

    write_clean(OUT, payload)

    leader = top_records[0]
    print(f"wrote {OUT.relative_to(HERE.parent.parent)}")
    print(
        f"  world {latest_year}: {headline['worldMilTonnes']} Mt "
        f"-> forecast {FORECAST_TO}: {forecast_final} Mt"
    )
    print(f"  top producer: {leader['country']} "
          f"({leader['sharePct']}%)")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()