Merge 2025-07-15 batch into the xlsx; tools to detect & re-track
- merge_2025_07_15_into_xlsx.py: pivot the legacy 2025_07_15_metadata_fixed.csv into the unified xlsx schema (one row per fly, training_date_time + testing_date_time). Backs up the xlsx before writing. 24 new rows across machines 076 / 139 / 145 / 268. - pick_targets.py: --video flag to bypass the inventory's in_xlsx filter, so a specific mp4 can be picked outside the normal flow. - explore_barrier_signal.py: visualises raw y(t), per-frame inter-fly distance, and sliding min/mean distance against a known barrier-opening time. Used for prototyping the detector. - detect_barrier_opening.py: per-ROI sliding-window mean-distance change-point estimator (median across ROIs). Currently noisy on a one-video calibration set; will be re-tuned once the 4 missing 2025-07-15 videos are re-tracked. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
8f3c4ca89c
commit
847d2cbd1b
4 changed files with 480 additions and 1 deletions
115
scripts/merge_2025_07_15_into_xlsx.py
Normal file
115
scripts/merge_2025_07_15_into_xlsx.py
Normal file
|
|
@ -0,0 +1,115 @@
|
|||
"""One-off: pivot the legacy 2025_07_15_metadata_fixed.csv into the merged xlsx.
|
||||
|
||||
The 2025-07-15 pilot batch was indexed by a separate CSV with one row per
|
||||
(machine, HHMMSS, ROI). The unified xlsx instead has one row per fly
|
||||
(machine, ROI) with both `training_date_time` and `testing_date_time`.
|
||||
This script pivots the CSV to match that schema and appends the result
|
||||
to the xlsx, after backing up the original.
|
||||
|
||||
Idempotent: if any row for date == 2025-07-15 already exists, abort.
|
||||
|
||||
Run:
|
||||
python merge_2025_07_15_into_xlsx.py
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import shutil
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from config import VIDEO_INFO_XLSX, DATA_METADATA
|
||||
|
||||
LEGACY_CSV = DATA_METADATA / "2025_07_15_metadata_fixed.csv"
|
||||
|
||||
# Per-machine pairing of training-session HHMMSS → testing-session HHMMSS.
|
||||
# Single-session machines (268, 139) get None for the testing field.
|
||||
SESSION_PAIRS: dict[int, tuple[str, str | None]] = {
|
||||
76: ("16-03-10", "16-31-34"),
|
||||
145: ("16-03-27", "16-31-41"),
|
||||
268: ("16-32-05", None), # only one recording; treat as training
|
||||
139: ("16-31-52", None), # only one recording; never tracked
|
||||
}
|
||||
|
||||
|
||||
def hhmmss_to_xlsx_time(date: str, hhmmss: str) -> str:
|
||||
"""'16-03-10' on date 2025-07-15 → '20250715_403PM'.
|
||||
|
||||
The xlsx uses HHMMam/pm format (the regex in export_video_db_index.py
|
||||
accepts AM/PM with optional minutes). 16:03 → 4:03 PM → '403PM'.
|
||||
"""
|
||||
h, m, _s = (int(p) for p in hhmmss.split("-"))
|
||||
suffix = "AM" if h < 12 else "PM"
|
||||
h12 = h if h == 12 else h % 12
|
||||
ymd = date.replace("-", "")
|
||||
if m == 0:
|
||||
return f"{ymd}_{h12}{suffix}"
|
||||
return f"{ymd}_{h12}{m:02d}{suffix}"
|
||||
|
||||
|
||||
def main() -> None:
|
||||
if not LEGACY_CSV.exists():
|
||||
sys.exit(f"legacy CSV not found at {LEGACY_CSV}")
|
||||
if not VIDEO_INFO_XLSX.exists():
|
||||
sys.exit(f"xlsx not found at {VIDEO_INFO_XLSX}")
|
||||
|
||||
csv = pd.read_csv(LEGACY_CSV)
|
||||
xlsx = pd.read_excel(VIDEO_INFO_XLSX)
|
||||
|
||||
# Idempotency check: if 2025-07-15 already in the xlsx, refuse.
|
||||
existing_dates = pd.to_datetime(xlsx["date"]).dt.strftime("%Y-%m-%d")
|
||||
if (existing_dates == "2025-07-15").any():
|
||||
sys.exit("xlsx already contains 2025-07-15 rows; nothing to do.")
|
||||
|
||||
# Build one row per (machine, ROI). The legacy CSV has duplicate rows
|
||||
# per session — collapse on (machine, ROI) and pick metadata from any.
|
||||
csv["machine_int"] = csv["machine_name"].astype(int)
|
||||
by_fly = csv.groupby(["machine_int", "ROI"], as_index=False).agg(
|
||||
genotype=("genotype", "first"),
|
||||
group=("group", "first"),
|
||||
)
|
||||
|
||||
rows = []
|
||||
for _, fly in by_fly.iterrows():
|
||||
machine_int = int(fly["machine_int"])
|
||||
if machine_int not in SESSION_PAIRS:
|
||||
print(f" skip machine {machine_int}: no session pairing defined")
|
||||
continue
|
||||
train_hhmmss, test_hhmmss = SESSION_PAIRS[machine_int]
|
||||
rows.append({
|
||||
"source_date": "20250715",
|
||||
"date": pd.Timestamp("2025-07-15"),
|
||||
"machine_name": f"ETHOSCOPE_{machine_int:03d}",
|
||||
"roi": int(fly["ROI"]),
|
||||
"species": "Melanogaster/CS" if fly["genotype"] == "CS" else fly["genotype"],
|
||||
"male": fly["group"], # 'trained' / 'naive' already canonical
|
||||
"collected": pd.NaT,
|
||||
"training_date_time": hhmmss_to_xlsx_time("2025-07-15", train_hhmmss),
|
||||
"testing_date_time": hhmmss_to_xlsx_time("2025-07-15", test_hhmmss) if test_hhmmss else "",
|
||||
"training_length_hr": pd.NA,
|
||||
"consolidation_length_hr": pd.NA,
|
||||
"memory": pd.NA,
|
||||
"age": pd.NA,
|
||||
})
|
||||
|
||||
new_df = pd.DataFrame(rows)
|
||||
print(f"adding {len(new_df)} rows for the 2025-07-15 batch:")
|
||||
print(new_df[["machine_name", "roi", "male", "training_date_time", "testing_date_time"]])
|
||||
|
||||
# Back up the xlsx, then append.
|
||||
backup = VIDEO_INFO_XLSX.with_suffix(
|
||||
f".backup_{datetime.now():%Y%m%d_%H%M%S}.xlsx"
|
||||
)
|
||||
shutil.copy2(VIDEO_INFO_XLSX, backup)
|
||||
print(f"\nbacked up xlsx → {backup}")
|
||||
|
||||
merged = pd.concat([xlsx, new_df], ignore_index=True)
|
||||
merged.to_excel(VIDEO_INFO_XLSX, index=False)
|
||||
print(f"wrote {VIDEO_INFO_XLSX} ({len(merged)} rows total)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue