Add offline tracking pipeline for video backlog

The 2024 video set in all_video_info_merged.xlsx covers 63 (date, machine) sessions — 129 video instances — that have no auto-detectable targets, so ROI placement requires manual reference-point selection. This commit adds the three-stage pipeline that lets a user click for an hour, then walk away while the tracker grinds overnight:

1. build_video_inventory.py — scan /mnt/ethoscope_data/videos/ and join against the xlsx, producing data/metadata/video_inventory.csv
2. pick_targets.py — interactive matplotlib/Tk picker. User clicks TOP/CORNER/LEFT (the L-shape ethoscope expects); after the third click the 6 ROI rectangles are drawn on top of the frame so geometry can be verified before saving. Also supports marking a video 'unusable' (FOV wrong) so it's permanently skipped, frame stepping by ±1s/±5%/midpoint, point editing in --redo mode, and a crosshair cursor that survives matplotlib's per-motion cursor reset.
3. track_videos.py — headless batch tracker. Reads the JSON sidecars, builds 6 ROIs from the HD-mating-arena geometry, runs MultiFlyTracker against the merged.mp4 via MovieVirtualCamera, writes SQLite DBs to data/tracked/. Idempotent (skips done DBs), parallel via --jobs, subclasses MovieVirtualCamera so frames stay BGR (MultiFlyTracker calls cvtColor(BGR2GRAY) without checking channel count).

Plus auto_detect_targets.py (fallback that runs ethoscope's auto-detector in case any videos do have visible target dots), monitor_tracking.py (progress + ETA from data/tracked/ ground truth, --watch for live view), and tracking_geometry.py (single source of truth for the affine math shared by picker and tracker).

requirements-tracking.txt pins the extra deps (opencv-python, openpyxl, gitpython, netifaces, mysql-connector-python) — these are only needed for the tracking pipeline, not the existing analysis notebooks.

Verified end-to-end on one of the user-picked videos: ~4000 rows/ROI in a 120s slice, fly bounding boxes in the expected 800-2000 px² band.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-27 17:25:26 +01:00 · 2026-04-27 17:25:26 +01:00 · e4da7691d5
commit e4da7691d5
parent e7e4db264d
11 changed files with 1296 additions and 0 deletions
--- a/scripts/build_video_inventory.py
+++ b/scripts/build_video_inventory.py
@ -0,0 +1,150 @@
+"""Build an inventory of videos available on disk and join with the metadata xlsx.
+
+Scans /mnt/ethoscope_data/videos/<uuid>/<machine_name>/<date_time>/*.mp4
+and produces a CSV mapping each (date, machine_name) row in
+all_video_info_merged.xlsx to the corresponding merged.mp4 path on disk.
+
+Output: data/metadata/video_inventory.csv with columns:
+    machine_uuid, machine_name, session_date, session_time,
+    session_datetime, mp4_path, in_xlsx (bool), already_tracked (bool)
+"""
+
+from __future__ import annotations
+
+import re
+from pathlib import Path
+
+import pandas as pd
+
+from config import DATA_RAW, INVENTORY_CSV, VIDEO_INFO_XLSX, VIDEOS_ROOT
+
+# Matches a session directory name such as "2024-05-17_13-45-02":
+# group 1 is the date part (YYYY-MM-DD), group 2 the time part (HH-MM-SS).
+SESSION_RE = re.compile(r"^(\d{4}-\d{2}-\d{2})_(\d{2}-\d{2}-\d{2})$")
+
+
+def scan_videos(videos_root: Path) -> pd.DataFrame:
+    """Walk videos_root and return one row per merged.mp4 found.
+
+    Args:
+        videos_root: Root directory containing <uuid>/<machine_name>/<date_time>/.
+
+    Returns:
+        DataFrame with columns: machine_uuid, machine_name, session_date,
+        session_time, session_datetime, mp4_path.
+    """
+    rows = []
+    for uuid_dir in sorted(videos_root.iterdir()):
+        if not uuid_dir.is_dir():
+            continue
+        for machine_dir in uuid_dir.iterdir():
+            if not machine_dir.is_dir() or not machine_dir.name.startswith("ETHOSCOPE_"):
+                continue
+            for session_dir in machine_dir.iterdir():
+                if not session_dir.is_dir():
+                    continue
+                m = SESSION_RE.match(session_dir.name)
+                if not m:
+                    continue
+                date_str, time_str = m.group(1), m.group(2)
+                # Prefer *_merged.mp4 if present
+                merged = sorted(session_dir.glob("*_merged.mp4"))
+                if not merged:
+                    merged = sorted(session_dir.glob("*.mp4"))
+                if not merged:
+                    continue
+                rows.append(
+                    {
+                        "machine_uuid": uuid_dir.name,
+                        "machine_name": machine_dir.name,
+                        "session_date": date_str,
+                        "session_time": time_str,
+                        "session_datetime": f"{date_str}_{time_str}",
+                        "mp4_path": str(merged[0]),
+                    }
+                )
+    return pd.DataFrame(rows)
+
+
+def already_tracked_set(data_raw: Path) -> set[tuple[str, str]]:
+    """Return the set of (date, time) sessions for which a tracking DB exists.
+
+    DBs are named like:
+        2025-07-15_16-03-10_<uuid>__1920x1088@25fps-28q_merged_tracking.db
+    """
+    out = set()
+    for db in data_raw.glob("*_tracking.db"):
+        m = re.match(r"^(\d{4}-\d{2}-\d{2})_(\d{2}-\d{2}-\d{2})_", db.name)
+        if m:
+            out.add((m.group(1), m.group(2)))
+    return out
+
+
+def main() -> None:
+    print(f"Scanning {VIDEOS_ROOT} ...")
+    videos_df = scan_videos(VIDEOS_ROOT)
+    print(f"  found {len(videos_df)} video sessions on disk")
+
+    print(f"Loading metadata xlsx: {VIDEO_INFO_XLSX}")
+    meta = pd.read_excel(VIDEO_INFO_XLSX)
+    meta["session_date"] = meta["date"].dt.strftime("%Y-%m-%d")
+
+    # The xlsx has one row per (date, machine, ROI) — collapse to unique sessions
+    meta_sessions = (
+        meta[["session_date", "machine_name"]].drop_duplicates().reset_index(drop=True)
+    )
+    print(f"  xlsx contains {len(meta_sessions)} unique (date, machine) sessions")
+
+    # Mark which video sessions are referenced by the xlsx
+    xlsx_keys = set(zip(meta_sessions["session_date"], meta_sessions["machine_name"]))
+    videos_df["in_xlsx"] = videos_df.apply(
+        lambda r: (r["session_date"], r["machine_name"]) in xlsx_keys, axis=1
+    )
+
+    # Mark which already have tracking DBs in data/raw/
+    tracked = already_tracked_set(DATA_RAW)
+    videos_df["already_tracked"] = videos_df.apply(
+        lambda r: (r["session_date"], r["session_time"]) in tracked, axis=1
+    )
+
+    INVENTORY_CSV.parent.mkdir(parents=True, exist_ok=True)
+    videos_df.sort_values(["session_date", "machine_name", "session_time"]).to_csv(
+        INVENTORY_CSV, index=False
+    )
+
+    # Coverage report
+    in_xlsx = videos_df["in_xlsx"]
+    needed = videos_df[in_xlsx & ~videos_df["already_tracked"]]
+    n_xlsx_sessions = len(meta_sessions)
+    n_with_video = videos_df[in_xlsx].drop_duplicates(
+        ["session_date", "machine_name"]
+    ).shape[0]
+
+    # xlsx sessions that have no video on disk
+    found_keys = set(
+        zip(
+            videos_df.loc[in_xlsx, "session_date"],
+            videos_df.loc[in_xlsx, "machine_name"],
+        )
+    )
+    missing = sorted(xlsx_keys - found_keys)
+
+    print()
+    print("=" * 70)
+    print(f"Wrote inventory: {INVENTORY_CSV}")
+    print(f"  total video sessions on disk: {len(videos_df)}")
+    print(f"  xlsx unique sessions:         {n_xlsx_sessions}")
+    print(f"  xlsx sessions with video:     {n_with_video}")
+    print(f"  xlsx sessions missing video:  {len(missing)}")
+    print(f"  already tracked (DB exists):  {videos_df['already_tracked'].sum()}")
+    print(f"  TO TRACK (in_xlsx & ~tracked, video instances): {len(needed)}")
+
+    if missing:
+        print()
+        print("xlsx sessions with NO matching video on disk:")
+        for d, m in missing[:20]:
+            print(f"  {d}  {m}")
+        if len(missing) > 20:
+            print(f"  ... and {len(missing) - 20} more")
+
+
+# Build the inventory only when this file is run as a script, not on import.
+if __name__ == "__main__":
+    main()