"""
Oceans study: analysis pipeline.

Reads raw FAO wild-capture fisheries data (via Our World in Data),
cleans it, derives the world wild-catch trend + top capture nations, and
forecasts world capture through 2030 with a Holt DAMPED-trend model.

The world wild catch is a famously flat series: it climbed steeply until the
late 1980s and has bounced around ~90 million tonnes ever since. A damped
trend (damped_trend=True) fits a plateauing series far better than the
undamped Holt linear trend used in the coffee study; this is a deliberately
different model for a deliberately different shape.

Output: ../../src/data/oceans/clean.json, relative to this file's directory
        (consumed by the static site)

Run:
    cd analysis
    .venv/bin/python oceans/pipeline.py

Source: Our World in Data — "Capture fishery production", based on
        FAOSTAT (FAO of the UN). Element: Capture fisheries production
        (tonnes, live weight). Column: er_fsh_capt_mt.
        https://ourworldindata.org/grapher/capture-fishery-production
"""

import re
from pathlib import Path

import pandas as pd

import sys
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from _lib import holt_forecast, write_clean

# Directory containing this script. Resolved to an absolute path (the same way
# the sys.path entry above is built) so the derived paths below do not depend
# on the working directory or on how the interpreter spelled __file__.
HERE = Path(__file__).resolve().parent
# Raw CSV export read by load().
RAW = HERE / "raw" / "capture-fishery-production.csv"
# JSON payload written by main(): two directory levels above HERE, under src/.
OUT = HERE.parent.parent / "src" / "data" / "oceans" / "clean.json"

# Last year of the forecast horizon handed to holt_forecast().
FORECAST_TO = 2030
# Name of the capture-tonnage column in the raw CSV; load() renames it "tonnes".
CAPTURE_COL = "er_fsh_capt_mt"

# Entities in the OWID/FAO file that are NOT individual countries: the World
# aggregate, FAO / World Bank regional groupings, income groups, and the EU
# bloc. Each is recognised by a suffix, a prefix, or an exact name, so that
# real countries whose names merely contain a region word (for example
# "Central African Republic", "South Africa") are NOT excluded.
AGGREGATE_PATTERNS = tuple(
    re.compile(pattern, flags)
    for pattern, flags in (
        (r"\(FAO\)$", 0),
        (r"\(WB\)$", 0),
        (r"income countries$", re.IGNORECASE),
        (r"^World$", 0),
        (r"^European Union", 0),
    )
)


def is_country(name: str) -> bool:
    """Return True unless *name* matches one of ``AGGREGATE_PATTERNS``."""
    for pattern in AGGREGATE_PATTERNS:
        if pattern.search(name) is not None:
            return False
    return True


def load() -> pd.DataFrame:
    """Read the raw CSV at ``RAW`` and return a trimmed long-format table.

    The ``CAPTURE_COL`` column is renamed to ``tonnes``; only the ``entity``,
    ``code``, ``year`` and ``tonnes`` columns are kept; rows with a missing
    ``tonnes`` value are dropped; and ``year`` is cast to ``int``.
    """
    wanted = ["entity", "code", "year", "tonnes"]
    frame = pd.read_csv(RAW).rename(columns={CAPTURE_COL: "tonnes"})
    frame = frame[wanted].dropna(subset=["tonnes"])
    return frame.assign(year=frame["year"].astype(int))


def world_series(df: pd.DataFrame) -> pd.Series:
    """Return the "World" entity's ``tonnes`` values indexed by ascending year.

    Only rows whose ``entity`` equals "World" exactly are used. The result is
    expected to be a continuous annual series; this function does not check
    for gaps.
    """
    world_rows = df.loc[df["entity"] == "World"]
    by_year = world_rows.set_index("year")
    return by_year["tonnes"].sort_index()


def main() -> None:
    """Run the oceans pipeline end to end and write the site payload.

    Loads the trimmed capture table, extracts the World series, ranks the ten
    largest capture nations in the latest World year, forecasts the World
    series to ``FORECAST_TO`` with ``holt_forecast`` (damped trend), passes
    the assembled payload to ``write_clean`` and prints a short summary.
    """

    def mil(tonnes) -> float:
        # Tonnes -> million tonnes, one decimal place (headline precision).
        return round(float(tonnes) / 1e6, 1)

    df = load()
    world = world_series(df)

    # Headline reference points, read from the year-sorted World series.
    latest_year = int(world.index[-1])
    first_year = int(world.index[0])
    world_latest = float(world.loc[latest_year])
    peak_year = int(world.idxmax())

    # --- Top capture nations, latest world year (shares vs the world total) ---
    in_latest_year = df["year"] == latest_year
    from_country = df["entity"].map(is_country)
    ranked = df[in_latest_year & from_country].sort_values(
        "tonnes", ascending=False
    )
    top = ranked.head(10)[["entity", "tonnes"]].reset_index(drop=True)

    bars = []
    for r in top.itertuples():
        bars.append(
            {
                "label": r.entity,
                "value": round(r.tonnes / 1e6, 2),
                "note": f"{round(100 * r.tonnes / world_latest, 1)}% of world",
            }
        )

    # Forecast the World series from the latest observed year to FORECAST_TO.
    fc = holt_forecast(
        world.values,
        latest_year,
        FORECAST_TO,
        damped=True,
        ndigits=3,
        nonneg=True,
        scale=1e6,
    )

    meta = {
        "source": "Our World in Data — Capture fishery production (FAOSTAT, FAO of the UN)",
        "sourceUrl": "https://ourworldindata.org/grapher/capture-fishery-production",
        "item": "Capture fisheries production (wild catch, tonnes live weight)",
        "asOf": f"{latest_year} (latest full year in the FAOSTAT release)",
        "unitNote": "Values shown in million tonnes of wild capture (live weight).",
        "method": "Holt damped-trend exponential smoothing; 80% band from residual std scaled by sqrt(horizon).",
    }
    headline = {
        "latestYear": latest_year,
        "worldMilTonnes": mil(world_latest),
        "peakYear": peak_year,
        "peakMilTonnes": mil(world.loc[peak_year]),
        "firstYear": first_year,
        "firstMilTonnes": mil(world.loc[first_year]),
        "forecastYear": FORECAST_TO,
        "forecastMilTonnes": fc["mean"][-1],
    }
    # Historical World series in million tonnes, three decimal places.
    series = []
    for yr, v in world.items():
        series.append({"year": int(yr), "value": round(float(v) / 1e6, 3)})

    payload = {
        "meta": meta,
        "headline": headline,
        "series": series,
        "forecast": fc,
        "bars": bars,
    }

    write_clean(OUT, payload)

    # Console summary of what was written.
    print(f"wrote {OUT.relative_to(HERE.parent.parent)}")
    print(
        f"  world {latest_year}: {payload['headline']['worldMilTonnes']} Mt "
        f"(peak {peak_year}: {payload['headline']['peakMilTonnes']} Mt) "
        f"-> forecast {FORECAST_TO}: {fc['mean'][-1]} Mt"
    )
    print(f"  top nation: {bars[0]['label']} ({bars[0]['note']})")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()