"""
Pine & timber study: analysis pipeline.

Reads raw FAOSTAT Forestry data (Roundwood, Production, m3), cleans it,
derives the global trend and producer concentration, and forecasts world
roundwood production through 2030 with a Holt linear-trend model.

Output: ../../src/data/pine/clean.json  (consumed by the static site)

Run:
    cd analysis
    .venv/bin/python pine/pipeline.py

Source: FAOSTAT Forestry (FAO of the UN). Item: Roundwood (item code 1861).
        Element: Production (element code 5516). Unit: m3.
        Bulk file "Forestry_E_All_Data_(Normalized)", downloaded 2026-06-22.
        https://www.fao.org/faostat/en/#data/FO

Note on the data: this CSV was DOWNLOADED from the FAOSTAT bulk endpoint
(bulks-faostat.fao.org), not transcribed by hand. "Roundwood" is the broadest
FAOSTAT wood category and includes both industrial roundwood and woodfuel.
"""

from pathlib import Path

import pandas as pd

import sys
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from _lib import holt_forecast, write_clean

# Resolve first: Path.parent is purely lexical, so with a relative __file__
# (e.g. "pine/pipeline.py" as the __main__ script on Python < 3.9)
# HERE.parent.parent would collapse to "." and OUT would land under the
# current working directory instead of two levels above this folder.
HERE = Path(__file__).resolve().parent
# Raw input CSV, stored next to this script.
RAW = HERE / "raw" / "roundwood-production-faostat.csv"
# Cleaned JSON payload, written two directory levels above this folder.
OUT = HERE.parent.parent / "src" / "data" / "pine" / "clean.json"

# Last year covered by the forecast.
FORECAST_TO = 2030

# FAOSTAT Area Codes >= 5000 are regional / income-group aggregates.
# Area code 351 is "China" (an aggregate of mainland + Hong Kong + Macao +
# Taiwan); we keep "China, mainland" (41) instead to avoid double counting.
AGG_MIN_CODE = 5000
CHINA_AGG_CODE = 351
# Area code whose rows are used as the world total series; it is itself the
# lowest aggregate code, so it is never treated as a country.
WORLD_CODE = 5000


def load() -> pd.DataFrame:
    """Read the raw CSV at RAW and return it as a tidy four-column frame.

    Returns:
        A DataFrame with columns ``area_code``, ``entity``, ``year`` and
        ``m3`` (renamed from ``Area Code``, ``Area``, ``Year`` and ``Value``).
        Rows without a value are dropped; ``year`` and ``area_code`` are
        cast to int.
    """
    # Raw header -> internal name; insertion order is the output column order.
    column_map = {
        "Area Code": "area_code",
        "Area": "entity",
        "Year": "year",
        "Value": "m3",
    }
    raw = pd.read_csv(RAW)
    tidy = raw.rename(columns=column_map)[list(column_map.values())]
    # Drop rows with no reported value before casting the key columns.
    tidy = tidy.dropna(subset=["m3"])
    return tidy.astype({"year": int, "area_code": int})


def is_country(area_code: int) -> bool:
    """Return True when *area_code* should be counted as a single country.

    A code is rejected when it is at or above AGG_MIN_CODE, or when it equals
    CHINA_AGG_CODE.
    """
    is_aggregate = area_code >= AGG_MIN_CODE
    is_china_rollup = area_code == CHINA_AGG_CODE
    return not (is_aggregate or is_china_rollup)


def world_series(df: pd.DataFrame) -> pd.Series:
    """Return the ``m3`` values of the WORLD_CODE rows, indexed by year.

    Args:
        df: Frame with ``area_code``, ``year`` and ``m3`` columns.

    Returns:
        The ``m3`` Series of the rows whose area code is WORLD_CODE, indexed
        by ``year`` and sorted by that index in ascending order.
    """
    is_world = df["area_code"] == WORLD_CODE
    world_rows = df.loc[is_world]
    by_year = world_rows.set_index("year")
    return by_year["m3"].sort_index()


def main() -> None:
    """Run the pipeline end to end and write the JSON payload to OUT.

    Loads the cleaned table, extracts the world series, ranks the ten largest
    producing countries in the latest world year, forecasts the world series
    through FORECAST_TO with ``holt_forecast``, passes the assembled payload
    to ``write_clean`` and prints a short summary.
    """
    table = load()
    world = world_series(table)

    # The world series is sorted by year, so its two ends give the span.
    first_year = int(world.index[0])
    latest_year = int(world.index[-1])
    world_latest = float(world.loc[latest_year])

    # --- Top producing countries, latest year ---
    in_latest_year = table["year"] == latest_year
    is_national = table["area_code"].map(is_country)
    latest_rows = table[in_latest_year & is_national]
    ranked = latest_rows.sort_values("m3", ascending=False)
    top = ranked.head(10)[["entity", "m3"]].reset_index(drop=True)

    bars = []
    for name, volume in zip(top["entity"], top["m3"]):
        share_pct = round(100 * volume / world_latest, 1)
        bars.append(
            {
                "label": name.replace("China, mainland", "China"),
                "value": round(volume / 1e6, 1),
                "note": f"{share_pct}%",
            }
        )

    fc = holt_forecast(world.values, latest_year, FORECAST_TO, ndigits=1, scale=1e6)

    meta = {
        "source": "FAOSTAT Forestry (FAO of the UN), Roundwood, Production",
        "sourceUrl": "https://www.fao.org/faostat/en/#data/FO",
        "item": "Roundwood (item 1861), Production, m3",
        "asOf": f"{latest_year} (FAOSTAT bulk file downloaded 2026-06-22)",
        "unitNote": "Values shown in million cubic metres (M m3) of roundwood.",
        "method": "Holt linear-trend exponential smoothing; 80% band from residual std scaled by sqrt(horizon).",
    }

    headline = {
        "latestYear": latest_year,
        "worldMilM3": round(world_latest / 1e6, 0),
        "growthSince": first_year,
        # Ratio of the latest world value to the first one in the series.
        "growthFactor": round(world_latest / float(world.loc[first_year]), 1),
        "topProducer": bars[0]["label"],
        "topProducerSharePct": round(100 * float(top.iloc[0]["m3"]) / world_latest, 1),
        "forecastYear": FORECAST_TO,
        "forecastMilM3": fc["mean"][-1],
    }

    series = []
    for yr, v in world.items():
        series.append({"year": int(yr), "value": round(float(v) / 1e6, 1)})

    payload = {
        "meta": meta,
        "headline": headline,
        "series": series,
        "forecast": fc,
        "bars": bars,
    }

    write_clean(OUT, payload)

    leader = bars[0]
    print(f"wrote {OUT.relative_to(HERE.parent.parent)}")
    print(
        f"  world {latest_year}: {headline['worldMilM3']} M m3 "
        f"-> forecast {FORECAST_TO}: {fc['mean'][-1]} M m3"
    )
    print(
        f"  top producer: {leader['label']} ({leader['note']}); "
        f"growth x{headline['growthFactor']} since {first_year}"
    )


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()