"""One-off: pivot the legacy 2025_07_15_metadata_fixed.csv into the merged xlsx.

The 2025-07-15 pilot batch was indexed by a separate CSV with one row per
(machine, HHMMSS, ROI). The unified xlsx instead has one row per fly
(machine, ROI) with both `training_date_time` and `testing_date_time`.
This script pivots the CSV to match that schema and appends the result
to the xlsx, after backing up the original.

Idempotent: if any row for date == 2025-07-15 already exists, abort.

Run:
    python merge_2025_07_15_into_xlsx.py
"""

from __future__ import annotations

import shutil
import sys
from datetime import datetime
from pathlib import Path

import pandas as pd

from config import VIDEO_INFO_XLSX, DATA_METADATA

LEGACY_CSV = DATA_METADATA / "2025_07_15_metadata_fixed.csv"

# Per-machine pairing of training-session HHMMSS → testing-session HHMMSS.
# Single-session machines (268, 139) get None for the testing field.
SESSION_PAIRS: dict[int, tuple[str, str | None]] = {
     76: ("16-03-10", "16-31-34"),
    145: ("16-03-27", "16-31-41"),
    268: ("16-32-05", None),         # only one recording; treat as training
    139: ("16-31-52", None),         # only one recording; never tracked
}


def hhmmss_to_xlsx_time(date: str, hhmmss: str) -> str:
    """'16-03-10' on date 2025-07-15 → '20250715_403PM'.

    The xlsx uses HHMMam/pm format (the regex in export_video_db_index.py
    accepts AM/PM with optional minutes). 16:03 → 4:03 PM → '403PM'.
    """
    h, m, _s = (int(p) for p in hhmmss.split("-"))
    suffix = "AM" if h < 12 else "PM"
    h12 = h if h == 12 else h % 12
    ymd = date.replace("-", "")
    if m == 0:
        return f"{ymd}_{h12}{suffix}"
    return f"{ymd}_{h12}{m:02d}{suffix}"


def main() -> None:
    if not LEGACY_CSV.exists():
        sys.exit(f"legacy CSV not found at {LEGACY_CSV}")
    if not VIDEO_INFO_XLSX.exists():
        sys.exit(f"xlsx not found at {VIDEO_INFO_XLSX}")

    csv = pd.read_csv(LEGACY_CSV)
    xlsx = pd.read_excel(VIDEO_INFO_XLSX)

    # Idempotency check: if 2025-07-15 already in the xlsx, refuse.
    existing_dates = pd.to_datetime(xlsx["date"]).dt.strftime("%Y-%m-%d")
    if (existing_dates == "2025-07-15").any():
        sys.exit("xlsx already contains 2025-07-15 rows; nothing to do.")

    # Build one row per (machine, ROI). The legacy CSV has duplicate rows
    # per session — collapse on (machine, ROI) and pick metadata from any.
    csv["machine_int"] = csv["machine_name"].astype(int)
    by_fly = csv.groupby(["machine_int", "ROI"], as_index=False).agg(
        genotype=("genotype", "first"),
        group=("group", "first"),
    )

    rows = []
    for _, fly in by_fly.iterrows():
        machine_int = int(fly["machine_int"])
        if machine_int not in SESSION_PAIRS:
            print(f"  skip machine {machine_int}: no session pairing defined")
            continue
        train_hhmmss, test_hhmmss = SESSION_PAIRS[machine_int]
        rows.append({
            "source_date": "20250715",
            "date": pd.Timestamp("2025-07-15"),
            "machine_name": f"ETHOSCOPE_{machine_int:03d}",
            "roi": int(fly["ROI"]),
            "species": "Melanogaster/CS" if fly["genotype"] == "CS" else fly["genotype"],
            "male": fly["group"],            # 'trained' / 'naive' already canonical
            "collected": pd.NaT,
            "training_date_time": hhmmss_to_xlsx_time("2025-07-15", train_hhmmss),
            "testing_date_time":  hhmmss_to_xlsx_time("2025-07-15", test_hhmmss) if test_hhmmss else "",
            "training_length_hr": pd.NA,
            "consolidation_length_hr": pd.NA,
            "memory": pd.NA,
            "age": pd.NA,
        })

    new_df = pd.DataFrame(rows)
    print(f"adding {len(new_df)} rows for the 2025-07-15 batch:")
    print(new_df[["machine_name", "roi", "male", "training_date_time", "testing_date_time"]])

    # Back up the xlsx, then append.
    backup = VIDEO_INFO_XLSX.with_suffix(
        f".backup_{datetime.now():%Y%m%d_%H%M%S}.xlsx"
    )
    shutil.copy2(VIDEO_INFO_XLSX, backup)
    print(f"\nbacked up xlsx → {backup}")

    merged = pd.concat([xlsx, new_df], ignore_index=True)
    merged.to_excel(VIDEO_INFO_XLSX, index=False)
    print(f"wrote {VIDEO_INFO_XLSX}  ({len(merged)} rows total)")


if __name__ == "__main__":
    main()