"""One-off: pivot the legacy 2025_07_15_metadata_fixed.csv into the merged xlsx. The 2025-07-15 pilot batch was indexed by a separate CSV with one row per (machine, HHMMSS, ROI). The unified xlsx instead has one row per fly (machine, ROI) with both `training_date_time` and `testing_date_time`. This script pivots the CSV to match that schema and appends the result to the xlsx, after backing up the original. Idempotent: if any row for date == 2025-07-15 already exists, abort. Run: python merge_2025_07_15_into_xlsx.py """ from __future__ import annotations import shutil import sys from datetime import datetime from pathlib import Path import pandas as pd from config import VIDEO_INFO_XLSX, DATA_METADATA LEGACY_CSV = DATA_METADATA / "2025_07_15_metadata_fixed.csv" # Per-machine pairing of training-session HHMMSS → testing-session HHMMSS. # Single-session machines (268, 139) get None for the testing field. SESSION_PAIRS: dict[int, tuple[str, str | None]] = { 76: ("16-03-10", "16-31-34"), 145: ("16-03-27", "16-31-41"), 268: ("16-32-05", None), # only one recording; treat as training 139: ("16-31-52", None), # only one recording; never tracked } def hhmmss_to_xlsx_time(date: str, hhmmss: str) -> str: """'16-03-10' on date 2025-07-15 → '20250715_403PM'. The xlsx uses HHMMam/pm format (the regex in export_video_db_index.py accepts AM/PM with optional minutes). 16:03 → 4:03 PM → '403PM'. """ h, m, _s = (int(p) for p in hhmmss.split("-")) suffix = "AM" if h < 12 else "PM" h12 = h if h == 12 else h % 12 ymd = date.replace("-", "") if m == 0: return f"{ymd}_{h12}{suffix}" return f"{ymd}_{h12}{m:02d}{suffix}" def main() -> None: if not LEGACY_CSV.exists(): sys.exit(f"legacy CSV not found at {LEGACY_CSV}") if not VIDEO_INFO_XLSX.exists(): sys.exit(f"xlsx not found at {VIDEO_INFO_XLSX}") csv = pd.read_csv(LEGACY_CSV) xlsx = pd.read_excel(VIDEO_INFO_XLSX) # Idempotency check: if 2025-07-15 already in the xlsx, refuse. existing_dates = pd.to_datetime(xlsx["date"]).dt.strftime("%Y-%m-%d") if (existing_dates == "2025-07-15").any(): sys.exit("xlsx already contains 2025-07-15 rows; nothing to do.") # Build one row per (machine, ROI). The legacy CSV has duplicate rows # per session — collapse on (machine, ROI) and pick metadata from any. csv["machine_int"] = csv["machine_name"].astype(int) by_fly = csv.groupby(["machine_int", "ROI"], as_index=False).agg( genotype=("genotype", "first"), group=("group", "first"), ) rows = [] for _, fly in by_fly.iterrows(): machine_int = int(fly["machine_int"]) if machine_int not in SESSION_PAIRS: print(f" skip machine {machine_int}: no session pairing defined") continue train_hhmmss, test_hhmmss = SESSION_PAIRS[machine_int] rows.append({ "source_date": "20250715", "date": pd.Timestamp("2025-07-15"), "machine_name": f"ETHOSCOPE_{machine_int:03d}", "roi": int(fly["ROI"]), "species": "Melanogaster/CS" if fly["genotype"] == "CS" else fly["genotype"], "male": fly["group"], # 'trained' / 'naive' already canonical "collected": pd.NaT, "training_date_time": hhmmss_to_xlsx_time("2025-07-15", train_hhmmss), "testing_date_time": hhmmss_to_xlsx_time("2025-07-15", test_hhmmss) if test_hhmmss else "", "training_length_hr": pd.NA, "consolidation_length_hr": pd.NA, "memory": pd.NA, "age": pd.NA, }) new_df = pd.DataFrame(rows) print(f"adding {len(new_df)} rows for the 2025-07-15 batch:") print(new_df[["machine_name", "roi", "male", "training_date_time", "testing_date_time"]]) # Back up the xlsx, then append. backup = VIDEO_INFO_XLSX.with_suffix( f".backup_{datetime.now():%Y%m%d_%H%M%S}.xlsx" ) shutil.copy2(VIDEO_INFO_XLSX, backup) print(f"\nbacked up xlsx → {backup}") merged = pd.concat([xlsx, new_df], ignore_index=True) merged.to_excel(VIDEO_INFO_XLSX, index=False) print(f"wrote {VIDEO_INFO_XLSX} ({len(merged)} rows total)") if __name__ == "__main__": main()