Merge 2025-07-15 batch into the xlsx; tools to detect & re-track

- merge_2025_07_15_into_xlsx.py: pivot the legacy 2025_07_15_metadata_fixed.csv
  into the unified xlsx schema (one row per fly, training_date_time +
  testing_date_time). Backs up the xlsx before writing. 24 new rows
  across machines 076 / 139 / 145 / 268.
- pick_targets.py: --video flag to bypass the inventory's in_xlsx filter,
  so a specific mp4 can be picked outside the normal flow.
- explore_barrier_signal.py: visualises raw y(t), per-frame inter-fly
  distance, and sliding min/mean distance against a known
  barrier-opening time. Used for prototyping the detector.
- detect_barrier_opening.py: per-ROI sliding-window mean-distance
  change-point estimator (median across ROIs). Currently noisy on a
  one-video calibration set; will be re-tuned once the 4 missing
  2025-07-15 videos are re-tracked.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Giorgio Gilestro 2026-05-01 10:28:25 +01:00
parent 8f3c4ca89c
commit 847d2cbd1b
4 changed files with 480 additions and 1 deletions

View file

@ -0,0 +1,115 @@
"""One-off: pivot the legacy 2025_07_15_metadata_fixed.csv into the merged xlsx.
The 2025-07-15 pilot batch was indexed by a separate CSV with one row per
(machine, HHMMSS, ROI). The unified xlsx instead has one row per fly
(machine, ROI) with both `training_date_time` and `testing_date_time`.
This script pivots the CSV to match that schema and appends the result
to the xlsx, after backing up the original.
Idempotent: if any row for date == 2025-07-15 already exists, abort.
Run:
python merge_2025_07_15_into_xlsx.py
"""
from __future__ import annotations
import shutil
import sys
from datetime import datetime
from pathlib import Path
import pandas as pd
from config import VIDEO_INFO_XLSX, DATA_METADATA
# Input for this script: the legacy per-session metadata CSV of the
# 2025-07-15 pilot batch, located under DATA_METADATA.
LEGACY_CSV = DATA_METADATA / "2025_07_15_metadata_fixed.csv"
# Per-machine pairing of training-session HHMMSS → testing-session HHMMSS.
# Single-session machines (268, 139) get None for the testing field.
SESSION_PAIRS: dict[int, tuple[str, str | None]] = {
76: ("16-03-10", "16-31-34"),
145: ("16-03-27", "16-31-41"),
268: ("16-32-05", None), # only one recording; treat as training
139: ("16-31-52", None), # only one recording; never tracked
}
def hhmmss_to_xlsx_time(date: str, hhmmss: str) -> str:
    """Convert a 24-hour 'HH-MM-SS' stamp into the xlsx date-time string.

    Example: '16-03-10' on date '2025-07-15' → '20250715_403PM'
    (16:03 → 4:03 PM → '403PM'). The minutes are omitted when they are
    zero ('16-00-00' → '20250715_4PM'); the seconds are always dropped.

    The xlsx uses HHMMam/pm format (the regex in export_video_db_index.py
    accepts AM/PM with optional minutes).

    Args:
        date: date as 'YYYY-MM-DD'; the dashes are removed in the output.
        hhmmss: 24-hour time as 'HH-MM-SS'.

    Returns:
        '<YYYYMMDD>_<hour>[<MM>]<AM|PM>' with the hour on a 12-hour clock.

    Raises:
        ValueError: if `hhmmss` is not exactly three '-'-separated integers.
    """
    h, m, _s = (int(p) for p in hhmmss.split("-"))
    suffix = "AM" if h < 12 else "PM"
    # A 12-hour clock has no hour 0: both 00:xx and 12:xx are shown as 12.
    h12 = h % 12 or 12
    ymd = date.replace("-", "")
    if m == 0:
        return f"{ymd}_{h12}{suffix}"
    return f"{ymd}_{h12}{m:02d}{suffix}"
def main() -> None:
    """Pivot the legacy 2025-07-15 CSV and append it to the merged xlsx.

    Steps:
      1. Check that both LEGACY_CSV and VIDEO_INFO_XLSX exist.
      2. Refuse to run if the xlsx already holds any row dated 2025-07-15.
      3. Collapse the CSV to one row per (machine, ROI) and build one xlsx
         row per fly, using SESSION_PAIRS for the training/testing stamps.
      4. Copy the xlsx to a timestamped backup, then overwrite it with the
         original rows plus the new ones.

    Exits via `sys.exit` with a message when an input file is missing, when
    the batch is already present, or when there is nothing to add. In all
    of those cases the xlsx is left untouched and no backup is created.
    """
    if not LEGACY_CSV.exists():
        sys.exit(f"legacy CSV not found at {LEGACY_CSV}")
    if not VIDEO_INFO_XLSX.exists():
        sys.exit(f"xlsx not found at {VIDEO_INFO_XLSX}")

    legacy = pd.read_csv(LEGACY_CSV)
    xlsx = pd.read_excel(VIDEO_INFO_XLSX)

    # Idempotency check: if 2025-07-15 already in the xlsx, refuse.
    existing_dates = pd.to_datetime(xlsx["date"]).dt.strftime("%Y-%m-%d")
    if (existing_dates == "2025-07-15").any():
        sys.exit("xlsx already contains 2025-07-15 rows; nothing to do.")

    # Build one row per (machine, ROI). The legacy CSV has duplicate rows
    # per session — collapse on (machine, ROI) and pick metadata from any.
    legacy["machine_int"] = legacy["machine_name"].astype(int)
    by_fly = legacy.groupby(["machine_int", "ROI"], as_index=False).agg(
        genotype=("genotype", "first"),
        group=("group", "first"),
    )

    rows = []
    for _, fly in by_fly.iterrows():
        machine_int = int(fly["machine_int"])
        if machine_int not in SESSION_PAIRS:
            print(f" skip machine {machine_int}: no session pairing defined")
            continue
        train_hhmmss, test_hhmmss = SESSION_PAIRS[machine_int]
        rows.append({
            "source_date": "20250715",
            "date": pd.Timestamp("2025-07-15"),
            "machine_name": f"ETHOSCOPE_{machine_int:03d}",
            "roi": int(fly["ROI"]),
            "species": "Melanogaster/CS" if fly["genotype"] == "CS" else fly["genotype"],
            "male": fly["group"],  # 'trained' / 'naive' already canonical
            "collected": pd.NaT,
            "training_date_time": hhmmss_to_xlsx_time("2025-07-15", train_hhmmss),
            # Single-session machines have no testing recording: leave blank.
            "testing_date_time": hhmmss_to_xlsx_time("2025-07-15", test_hhmmss) if test_hhmmss else "",
            "training_length_hr": pd.NA,
            "consolidation_length_hr": pd.NA,
            "memory": pd.NA,
            "age": pd.NA,
        })

    # With no rows the DataFrame below would have no columns, so the preview
    # print would raise KeyError; stop before backing up or rewriting.
    if not rows:
        sys.exit("no rows to add: the legacy CSV has no machine listed in SESSION_PAIRS.")

    new_df = pd.DataFrame(rows)
    print(f"adding {len(new_df)} rows for the 2025-07-15 batch:")
    print(new_df[["machine_name", "roi", "male", "training_date_time", "testing_date_time"]])

    # Back up the xlsx, then append.
    backup = VIDEO_INFO_XLSX.with_suffix(
        f".backup_{datetime.now():%Y%m%d_%H%M%S}.xlsx"
    )
    shutil.copy2(VIDEO_INFO_XLSX, backup)
    print(f"\nbacked up xlsx → {backup}")

    merged = pd.concat([xlsx, new_df], ignore_index=True)
    merged.to_excel(VIDEO_INFO_XLSX, index=False)
    print(f"wrote {VIDEO_INFO_XLSX} ({len(merged)} rows total)")
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()