Add offline tracking pipeline for video backlog

The 2024 video set in all_video_info_merged.xlsx covers 63 (date, machine) sessions — 129 video instances — that have no auto-detectable targets, so ROI placement requires manual reference-point selection. This commit adds the three-stage pipeline that lets a user click for an hour, then walk away while the tracker grinds overnight: 1. build_video_inventory.py — scan /mnt/ethoscope_data/videos/ and join against the xlsx, producing data/metadata/video_inventory.csv 2. pick_targets.py — interactive matplotlib/Tk picker. User clicks TOP/CORNER/LEFT (the L-shape ethoscope expects); after the third click the 6 ROI rectangles are drawn on top of the frame so geometry can be verified before saving. Also supports marking a video 'unusable' (FOV wrong) so it's permanently skipped, frame stepping by ±1s/±5%/midpoint, point editing in --redo mode, and a crosshair cursor that survives matplotlib's per-motion cursor reset. 3. track_videos.py — headless batch tracker. Reads the JSON sidecars, builds 6 ROIs from the HD-mating-arena geometry, runs MultiFlyTracker against the merged.mp4 via MovieVirtualCamera, writes SQLite DBs to data/tracked/. Idempotent (skips done DBs), parallel via --jobs, subclasses MovieVirtualCamera so frames stay BGR (MultiFlyTracker calls cvtColor(BGR2GRAY) without checking channel count). Plus auto_detect_targets.py (fallback that runs ethoscope's auto-detector in case any videos do have visible target dots), monitor_tracking.py (progress + ETA from data/tracked/ ground truth, --watch for live view), and tracking_geometry.py (single source of truth for the affine math shared by picker and tracker). requirements-tracking.txt pins the extra deps (opencv-python, openpyxl, gitpython, netifaces, mysql-connector-python) — these are only needed for the tracking pipeline, not the existing analysis notebooks. Verified end-to-end on one of the user-picked videos: ~4000 rows/ROI in a 120s slice, fly bounding boxes in the expected 800-2000 px² band. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-27 17:25:26 +01:00 · 2026-04-27 17:25:26 +01:00 · e4da7691d5
commit e4da7691d5
parent e7e4db264d
11 changed files with 1296 additions and 0 deletions
--- a/scripts/monitor_tracking.py
+++ b/scripts/monitor_tracking.py
@ -0,0 +1,155 @@
+"""Live progress + ETA for the offline tracker batch.
+
+Counts ground-truth (DBs on disk) rather than parsing log lines, so it works
+whether the batch is running fresh or was resumed after a crash. Errors are
+parsed out of any *.log files in data/logs/.
+
+Usage:
+    python monitor_tracking.py              # one snapshot, exit
+    python monitor_tracking.py --watch      # refresh every 10 s
+    python monitor_tracking.py --watch 30   # refresh every 30 s
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import time
+from datetime import datetime, timedelta
+from pathlib import Path
+
+from config import LOGS_DIR, TARGETS_DIR, TRACKING_OUTPUT_DIR
+
+
+def count_target_jsons() -> tuple[int, int, list[str]]:
+    """Return (n_pickable, n_unusable, unusable_video_stems)."""
+    pickable = 0
+    unusable_stems: list[str] = []
+    for j in TARGETS_DIR.glob("*.json"):
+        try:
+            d = json.loads(j.read_text())
+        except Exception:
+            continue
+        if d.get("unusable"):
+            unusable_stems.append(j.stem)
+        elif d.get("reference_points"):
+            pickable += 1
+    return pickable, len(unusable_stems), unusable_stems
+
+
+def count_tracked_dbs() -> tuple[int, datetime | None, str | None]:
+    """Return (n_dbs, mtime_of_newest, name_of_newest)."""
+    dbs = list(TRACKING_OUTPUT_DIR.glob("*_tracking.db"))
+    if not dbs:
+        return 0, None, None
+    newest = max(dbs, key=lambda p: p.stat().st_mtime)
+    return len(dbs), datetime.fromtimestamp(newest.stat().st_mtime), newest.stem
+
+
+def parse_recent_errors(log_dir: Path, tail_lines: int = 5000) -> list[str]:
+    """Scan the most recent *.log file for lines reporting errors."""
+    if not log_dir.exists():
+        return []
+    logs = sorted(log_dir.glob("*.log"), key=lambda p: p.stat().st_mtime)
+    if not logs:
+        return []
+    latest = logs[-1]
+    try:
+        with latest.open() as f:
+            tail = f.readlines()[-tail_lines:]
+    except Exception:
+        return []
+    out = []
+    for line in tail:
+        if re.search(r":\s*error\b", line) or " error: " in line.lower():
+            out.append(line.rstrip())
+    return out
+
+
+def db_completion_history() -> list[float]:
+    """Return mtimes of all tracking DBs, sorted ascending. Used for rate."""
+    return sorted(p.stat().st_mtime for p in TRACKING_OUTPUT_DIR.glob("*_tracking.db"))
+
+
+def fmt_duration(seconds: float) -> str:
+    if seconds < 60:
+        return f"{int(seconds)} s"
+    if seconds < 3600:
+        return f"{int(seconds // 60)} min"
+    h = int(seconds // 3600)
+    m = int((seconds % 3600) // 60)
+    return f"{h} h {m} min"
+
+
+def snapshot() -> str:
+    pickable, unusable, _ = count_target_jsons()
+    tracked, last_mtime, last_name = count_tracked_dbs()
+    history = db_completion_history()
+    errors = parse_recent_errors(LOGS_DIR)
+
+    lines = [f"tracking progress @ {datetime.now():%Y-%m-%d %H:%M:%S}"]
+    lines.append(f"  pickable JSONs:    {pickable}")
+    lines.append(f"  unusable JSONs:    {unusable}  (skipped by tracker)")
+    pct = (tracked / pickable * 100) if pickable else 0
+    lines.append(
+        f"  DBs on disk:       {tracked} / {pickable}  ({pct:.0f}%)"
+    )
+    lines.append(f"  errors in log:     {len(errors)}")
+
+    # Rate from the last 10 completions, when available.
+    if len(history) >= 2:
+        window = history[-min(10, len(history)) :]
+        span = window[-1] - window[0]
+        if span > 0:
+            rate_per_hour = (len(window) - 1) / span * 3600
+            lines.append(f"  rate (last {len(window) - 1}):    {rate_per_hour:.1f} videos/hour")
+            remaining = max(0, pickable - tracked)
+            if rate_per_hour > 0 and remaining > 0:
+                eta_sec = remaining * 3600 / rate_per_hour
+                eta_at = datetime.now() + timedelta(seconds=eta_sec)
+                lines.append(
+                    f"  ETA remaining:     {fmt_duration(eta_sec)}  "
+                    f"(done by {eta_at:%H:%M %a})"
+                )
+
+    if last_mtime is not None and last_name is not None:
+        ago = (datetime.now() - last_mtime).total_seconds()
+        lines.append(
+            f"  most recent DB:    {last_name[:60]}...  ({fmt_duration(ago)} ago)"
+        )
+
+    if errors:
+        lines.append("")
+        lines.append(f"  recent errors ({min(5, len(errors))} of {len(errors)}):")
+        for e in errors[-5:]:
+            lines.append(f"    {e[:120]}")
+
+    return "\n".join(lines)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--watch", nargs="?", type=int, const=10, default=None,
+        help="refresh every N seconds (default 10 if flag given without value)",
+    )
+    args = parser.parse_args()
+
+    if args.watch is None:
+        print(snapshot())
+        return
+
+    try:
+        while True:
+            # Clear screen and reprint
+            print("\033[2J\033[H", end="")
+            print(snapshot())
+            print(f"\n(refreshing every {args.watch}s — Ctrl-C to exit)")
+            time.sleep(args.watch)
+    except KeyboardInterrupt:
+        print()
+
+
+if __name__ == "__main__":
+    main()