Merge 2025-07-15 batch into the xlsx; tools to detect & re-track

- merge_2025_07_15_into_xlsx.py: pivot the legacy 2025_07_15_metadata_fixed.csv into the unified xlsx schema (one row per fly, training_date_time + testing_date_time). Backs up the xlsx before writing. 24 new rows across machines 076 / 139 / 145 / 268. - pick_targets.py: --video flag to bypass the inventory's in_xlsx filter, so a specific mp4 can be picked outside the normal flow. - explore_barrier_signal.py: visualises raw y(t), per-frame inter-fly distance, and sliding min/mean distance against a known barrier-opening time. Used for prototyping the detector. - detect_barrier_opening.py: per-ROI sliding-window mean-distance change-point estimator (median across ROIs). Currently noisy on a one-video calibration set; will be re-tuned once the 4 missing 2025-07-15 videos are re-tracked. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-01 10:28:25 +01:00 · 2026-05-01 10:28:25 +01:00 · 847d2cbd1b
commit 847d2cbd1b
parent 8f3c4ca89c
4 changed files with 480 additions and 1 deletions
--- a/scripts/merge_2025_07_15_into_xlsx.py
+++ b/scripts/merge_2025_07_15_into_xlsx.py
@ -0,0 +1,115 @@
+"""One-off: pivot the legacy 2025_07_15_metadata_fixed.csv into the merged xlsx.
+
+The 2025-07-15 pilot batch was indexed by a separate CSV with one row per
+(machine, HHMMSS, ROI). The unified xlsx instead has one row per fly
+(machine, ROI) with both `training_date_time` and `testing_date_time`.
+This script pivots the CSV to match that schema and appends the result
+to the xlsx, after backing up the original.
+
+Idempotent: if any row for date == 2025-07-15 already exists, abort.
+
+Run:
+    python merge_2025_07_15_into_xlsx.py
+"""
+
+from __future__ import annotations
+
+import shutil
+import sys
+from datetime import datetime
+from pathlib import Path
+
+import pandas as pd
+
+from config import VIDEO_INFO_XLSX, DATA_METADATA
+
+LEGACY_CSV = DATA_METADATA / "2025_07_15_metadata_fixed.csv"
+
+# Per-machine pairing of training-session HHMMSS → testing-session HHMMSS.
+# Single-session machines (268, 139) get None for the testing field.
+SESSION_PAIRS: dict[int, tuple[str, str | None]] = {
+     76: ("16-03-10", "16-31-34"),
+    145: ("16-03-27", "16-31-41"),
+    268: ("16-32-05", None),         # only one recording; treat as training
+    139: ("16-31-52", None),         # only one recording; never tracked
+}
+
+
+def hhmmss_to_xlsx_time(date: str, hhmmss: str) -> str:
+    """'16-03-10' on date 2025-07-15 → '20250715_403PM'.
+
+    The xlsx uses HHMMam/pm format (the regex in export_video_db_index.py
+    accepts AM/PM with optional minutes). 16:03 → 4:03 PM → '403PM'.
+    """
+    h, m, _s = (int(p) for p in hhmmss.split("-"))
+    suffix = "AM" if h < 12 else "PM"
+    h12 = h if h == 12 else h % 12
+    ymd = date.replace("-", "")
+    if m == 0:
+        return f"{ymd}_{h12}{suffix}"
+    return f"{ymd}_{h12}{m:02d}{suffix}"
+
+
+def main() -> None:
+    if not LEGACY_CSV.exists():
+        sys.exit(f"legacy CSV not found at {LEGACY_CSV}")
+    if not VIDEO_INFO_XLSX.exists():
+        sys.exit(f"xlsx not found at {VIDEO_INFO_XLSX}")
+
+    csv = pd.read_csv(LEGACY_CSV)
+    xlsx = pd.read_excel(VIDEO_INFO_XLSX)
+
+    # Idempotency check: if 2025-07-15 already in the xlsx, refuse.
+    existing_dates = pd.to_datetime(xlsx["date"]).dt.strftime("%Y-%m-%d")
+    if (existing_dates == "2025-07-15").any():
+        sys.exit("xlsx already contains 2025-07-15 rows; nothing to do.")
+
+    # Build one row per (machine, ROI). The legacy CSV has duplicate rows
+    # per session — collapse on (machine, ROI) and pick metadata from any.
+    csv["machine_int"] = csv["machine_name"].astype(int)
+    by_fly = csv.groupby(["machine_int", "ROI"], as_index=False).agg(
+        genotype=("genotype", "first"),
+        group=("group", "first"),
+    )
+
+    rows = []
+    for _, fly in by_fly.iterrows():
+        machine_int = int(fly["machine_int"])
+        if machine_int not in SESSION_PAIRS:
+            print(f"  skip machine {machine_int}: no session pairing defined")
+            continue
+        train_hhmmss, test_hhmmss = SESSION_PAIRS[machine_int]
+        rows.append({
+            "source_date": "20250715",
+            "date": pd.Timestamp("2025-07-15"),
+            "machine_name": f"ETHOSCOPE_{machine_int:03d}",
+            "roi": int(fly["ROI"]),
+            "species": "Melanogaster/CS" if fly["genotype"] == "CS" else fly["genotype"],
+            "male": fly["group"],            # 'trained' / 'naive' already canonical
+            "collected": pd.NaT,
+            "training_date_time": hhmmss_to_xlsx_time("2025-07-15", train_hhmmss),
+            "testing_date_time":  hhmmss_to_xlsx_time("2025-07-15", test_hhmmss) if test_hhmmss else "",
+            "training_length_hr": pd.NA,
+            "consolidation_length_hr": pd.NA,
+            "memory": pd.NA,
+            "age": pd.NA,
+        })
+
+    new_df = pd.DataFrame(rows)
+    print(f"adding {len(new_df)} rows for the 2025-07-15 batch:")
+    print(new_df[["machine_name", "roi", "male", "training_date_time", "testing_date_time"]])
+
+    # Back up the xlsx, then append.
+    backup = VIDEO_INFO_XLSX.with_suffix(
+        f".backup_{datetime.now():%Y%m%d_%H%M%S}.xlsx"
+    )
+    shutil.copy2(VIDEO_INFO_XLSX, backup)
+    print(f"\nbacked up xlsx → {backup}")
+
+    merged = pd.concat([xlsx, new_df], ignore_index=True)
+    merged.to_excel(VIDEO_INFO_XLSX, index=False)
+    print(f"wrote {VIDEO_INFO_XLSX}  ({len(merged)} rows total)")
+
+
+if __name__ == "__main__":
+    main()