""" Wines study: analysis pipeline. Reads raw FAO wine production data (via Our World in Data), cleans it, derives the global trend + producer concentration, and forecasts world production through 2030 with a Holt linear-trend model. Output: ../../src/data/wines/clean.json (consumed by the static site) Run: cd analysis .venv/bin/python wines/pipeline.py Source: Our World in Data, "Wine production", based on FAOSTAT (FAO of the UN). Item: Wine. Element: Production (tonnes). Data was downloaded from the OWID grapher CSV, not transcribed. https://ourworldindata.org/grapher/wine-production """ from pathlib import Path import pandas as pd import sys sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from _lib import holt_forecast, write_clean HERE = Path(__file__).parent RAW = HERE / "raw" / "wine-production.csv" OUT = HERE.parent.parent / "src" / "data" / "wines" / "clean.json" # Region aggregates / income groups present in the FAO file that are NOT countries. NON_COUNTRIES = ( "World", "Africa", "Asia", "Americas", "Europe", "Oceania", "European Union", "income countries", "Income", "(FAO)", "Net Food", "Least Developed", "Land Locked", "Small Island", "Low Income Food Deficit", "South America", "Central America", "North America", "Caribbean", "Eastern", "Western", "Southern", "Northern", "Middle", "Sub-Saharan", "(27)", ) FORECAST_TO = 2030 PROD_COL = "wine__00000564__production__005510__tonnes" def is_country(name: str) -> bool: return not any(token in name for token in NON_COUNTRIES) def load() -> pd.DataFrame: df = pd.read_csv(RAW) df = df.rename(columns={PROD_COL: "tonnes"}) df = df[["entity", "code", "year", "tonnes"]].dropna(subset=["tonnes"]) df["year"] = df["year"].astype(int) return df def world_series(df: pd.DataFrame) -> pd.Series: return df[df["entity"] == "World"].set_index("year")["tonnes"].sort_index() def main() -> None: df = load() world = world_series(df) latest_year = int(world.index[-1]) first_year = int(world.index[0]) world_latest = float(world.loc[latest_year]) # --- Top producing countries, latest year --- latest = df[(df["year"] == latest_year) & (df["entity"].map(is_country))] top = ( latest.sort_values("tonnes", ascending=False) .head(10)[["entity", "tonnes"]] .reset_index(drop=True) ) bars = [ { "label": r.entity, "value": round(r.tonnes / 1e6, 3), "note": f"{round(100 * r.tonnes / world_latest, 1)}%", } for r in top.itertuples() ] series = [ {"year": int(yr), "value": round(float(v) / 1e6, 3)} for yr, v in world.items() ] fc = holt_forecast(world.values, latest_year, FORECAST_TO, ndigits=3, scale=1e6) payload = { "meta": { "source": "Our World in Data, Wine production (FAOSTAT, FAO of the UN)", "sourceUrl": "https://ourworldindata.org/grapher/wine-production", "item": "Wine, Production (tonnes)", "asOf": f"{latest_year} (latest full year in the FAOSTAT release; data downloaded from OWID, not transcribed)", "unitNote": "Values shown in million tonnes of wine. FAOSTAT reports wine output by mass, not volume, so figures differ from OIV hectolitre series.", "method": "Holt linear-trend exponential smoothing; 80% band from residual std scaled by sqrt(horizon).", }, "headline": { "latestYear": latest_year, "worldMilTonnes": round(world_latest / 1e6, 2), "growthSince": first_year, "firstYearMilTonnes": round(float(world.loc[first_year]) / 1e6, 2), "topProducer": bars[0]["label"], "topProducerShare": bars[0]["note"], "forecastYear": FORECAST_TO, "forecastMilTonnes": fc["mean"][-1], }, "series": series, "forecast": fc, "bars": bars, } write_clean(OUT, payload) print(f"wrote {OUT.relative_to(HERE.parent.parent)}") print( f" world {latest_year}: {payload['headline']['worldMilTonnes']} Mt " f"-> forecast {FORECAST_TO}: {fc['mean'][-1]} Mt" ) print(f" top producer: {bars[0]['label']} ({bars[0]['note']})") if __name__ == "__main__": main()