"""
Coffee study: analysis pipeline.

Reads raw FAO green-coffee production data (via Our World in Data), cleans it,
derives the global trend + producer concentration, and forecasts world
production through 2030 with a Holt linear-trend model.

Output: ../../src/data/coffee/clean.json (consumed by the static site)

Run:
    cd analysis
    .venv/bin/python coffee/pipeline.py

Source: Our World in Data — "Coffee bean production", based on FAOSTAT (FAO of the UN).
Item: Coffee, green. Element: Production (tonnes).
https://ourworldindata.org/grapher/coffee-bean-production
"""

import sys
from pathlib import Path

import pandas as pd

# The shared helpers are imported from the directory one level above this
# script's folder, so that directory has to be on sys.path before the import.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from _lib import holt_forecast, write_clean  # noqa: E402

HERE = Path(__file__).parent
RAW = HERE / "raw" / "coffee-bean-production.csv"
OUT = HERE.parent.parent / "src" / "data" / "coffee" / "clean.json"

# Region aggregates / income groups present in the FAO file that are NOT
# countries. Each token is matched as a case-sensitive substring of the
# entity name (see is_country).
NON_COUNTRIES = (
    "World",
    "Africa",
    "Asia",
    "Americas",
    "Europe",
    "Oceania",
    "European Union",
    "income countries",
    "Income",
    "(FAO)",
    "Net Food",
    "Least Developed",
    "Land Locked",
    "Small Island",
    "Low Income Food Deficit",
    "South America",
    "Central America",
    "North America",
    "Caribbean",
    "Eastern",
    "Western",
    "Southern",
    "Northern",
    "Middle",
    "Sub-Saharan",
)

# Real countries/territories whose names contain one of the NON_COUNTRIES
# tokens ("Africa", "Western", "Northern") and would otherwise be rejected
# by the substring test.
COUNTRY_NAME_EXCEPTIONS = frozenset(
    {
        "South Africa",
        "Central African Republic",
        "Western Sahara",
        "Northern Mariana Islands",
    }
)

FORECAST_TO = 2030
PROD_COL = "coffee__green__00000656__production__005510__tonnes"


def is_country(name: str) -> bool:
    """Return True when *name* is a country rather than an aggregate row.

    A name is treated as an aggregate when it contains any NON_COUNTRIES
    token as a case-sensitive substring. Names listed in
    COUNTRY_NAME_EXCEPTIONS are always accepted, because they are genuine
    countries that merely contain a region word (e.g. "South Africa"
    contains "Africa").
    """
    if name in COUNTRY_NAME_EXCEPTIONS:
        return True
    return not any(token in name for token in NON_COUNTRIES)


def load() -> pd.DataFrame:
    """Read the raw CSV and return the rows that carry a production value.

    Returns:
        A frame with the columns ``entity``, ``code``, ``year`` (int) and
        ``tonnes``; rows whose ``tonnes`` is missing are dropped.

    Raises:
        KeyError: if any of the four expected columns is absent after the
            production column has been renamed to ``tonnes``.
    """
    df = pd.read_csv(RAW).rename(columns={PROD_COL: "tonnes"})

    wanted = ["entity", "code", "year", "tonnes"]
    missing = [col for col in wanted if col not in df.columns]
    if missing:
        # Same exception type the bare column selection would raise, but
        # with the file name and the offending columns spelled out.
        raise KeyError(f"{RAW.name} is missing expected column(s): {missing}")

    df = df[wanted].dropna(subset=["tonnes"])
    return df.assign(year=df["year"].astype(int))


def world_series(df: pd.DataFrame) -> pd.Series:
    """Return the ``World`` rows as a tonnes series indexed by ascending year.

    No years are trimmed: the series is returned exactly as found in *df*,
    including the most recent year.
    """
    # NOTE(review): main() labels the last year "latest full year", but
    # nothing here checks for or removes a partial / provisional final year.
    world_rows = df[df["entity"] == "World"]
    return world_rows.set_index("year")["tonnes"].sort_index()


def main() -> None:
    """Build the clean JSON payload, write it to OUT and print a summary.

    Raises:
        ValueError: if the input holds no ``World`` rows, since every
            headline figure is derived from the world series.
    """
    df = load()
    world = world_series(df)
    if world.empty:
        raise ValueError(f"no 'World' rows with production values found in {RAW}")

    first_year = int(world.index[0])
    latest_year = int(world.index[-1])

    # --- Top producers, latest year ---
    latest = df[(df["year"] == latest_year) & (df["entity"].map(is_country))]
    top = (
        latest.sort_values("tonnes", ascending=False)
        .head(10)[["entity", "code", "tonnes"]]
        .reset_index(drop=True)
    )
    world_latest = float(world.loc[latest_year])
    top_records = [
        {
            "country": r.entity,
            "code": r.code,
            "milTonnes": round(r.tonnes / 1e6, 3),
            "sharePct": round(100 * r.tonnes / world_latest, 1),
        }
        for r in top.itertuples()
    ]

    # holt_forecast comes from _lib and is not visible here; the code below
    # only relies on the result exposing a "mean" sequence whose last item is
    # used as the FORECAST_TO headline value — confirm against _lib.
    fc = holt_forecast(world.values, latest_year, FORECAST_TO, ndigits=3, scale=1e6)

    payload = {
        "meta": {
            "source": "Our World in Data — Coffee bean production (FAOSTAT, FAO of the UN)",
            "sourceUrl": "https://ourworldindata.org/grapher/coffee-bean-production",
            "item": "Coffee, green — Production (tonnes)",
            "asOf": f"{latest_year} (latest full year in FAOSTAT release)",
            "unitNote": "Values shown in million tonnes of green coffee.",
            "method": "Holt linear-trend exponential smoothing; 80% band from residual std scaled by sqrt(horizon).",
        },
        "headline": {
            "latestYear": latest_year,
            "worldMilTonnes": round(world_latest / 1e6, 2),
            "growthSince": first_year,
            "growthFactor": round(world_latest / float(world.loc[first_year]), 1),
            "forecastYear": FORECAST_TO,
            "forecastMilTonnes": fc["mean"][-1],
        },
        "worldSeries": [
            {"year": int(yr), "milTonnes": round(float(v) / 1e6, 3)}
            for yr, v in world.items()
        ],
        "topProducers": top_records,
        "forecast": fc,
    }

    write_clean(OUT, payload)
    print(f"wrote {OUT.relative_to(HERE.parent.parent)}")
    print(
        f" world {latest_year}: {payload['headline']['worldMilTonnes']} Mt "
        f"-> forecast {FORECAST_TO}: {fc['mean'][-1]} Mt"
    )
    # Skip the producer line when the latest year has no country rows
    # instead of failing on an empty list after the file is already written.
    if top_records:
        print(f" top producer: {top_records[0]['country']} "
              f"({top_records[0]['sharePct']}%)")


if __name__ == "__main__":
    main()