From 2e80b834cab5648e200c8b251191f12f5811fa8a Mon Sep 17 00:00:00 2001
From: Giorgio Gilestro <giorgio@gilest.ro>
Date: Fri, 1 May 2026 11:13:05 +0100
Subject: [PATCH] Add video duration_s to inventory and propagate to merged TSV

build_video_inventory.py now opens each mp4 with cv2 to record
duration_s. Durations are cached: a video that already has a duration
in the previous inventory keeps it, so re-runs only pay the cv2 cost
for new recordings (and for videos whose duration could not be read).

export_video_db_index.py looks up the matched video's duration and
writes it as training_video_duration_s / testing_video_duration_s
alongside the existing path columns. Useful for spotting unusually
short or long sessions and for sanity checks on the tracker output.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 scripts/build_video_inventory.py | 47 ++++++++++++++++++++++++++++++++
 scripts/export_video_db_index.py | 29 ++++++++++++--------
 2 files changed, 64 insertions(+), 12 deletions(-)

diff --git a/scripts/build_video_inventory.py b/scripts/build_video_inventory.py
index e931137..5cac340 100644
--- a/scripts/build_video_inventory.py
+++ b/scripts/build_video_inventory.py
@@ -64,6 +64,51 @@ def scan_videos(videos_root: Path) -> pd.DataFrame:
     return pd.DataFrame(rows)
 
 
+def video_duration_s(mp4_path: str) -> float | None:
+    """Return the video's duration in seconds (frames / fps), or None if unreadable."""
+    import cv2  # local import — heavy module, only needed when computing
+    cap = cv2.VideoCapture(mp4_path)
+    try:
+        if not cap.isOpened():
+            return None
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
+    finally:
+        cap.release()  # always free the capture, including the not-opened path
+    return float(frames / fps) if fps > 0 and frames > 0 else None  # NaN-safe
+
+
+def add_durations(videos_df: pd.DataFrame, prev_inv_path: Path) -> pd.DataFrame:
+    """Return a copy of videos_df with an added duration_s column.
+
+    Durations already recorded in the CSV at prev_inv_path (looked up by
+    mp4_path) are reused; every other video is measured with
+    video_duration_s, which may yield None.
+    """
+    known: dict[str, float] = {}
+    if prev_inv_path.exists():
+        old_inv = pd.read_csv(prev_inv_path)
+        if "duration_s" in old_inv.columns:
+            # Rows with a missing duration are left out so they get retried.
+            usable = old_inv.dropna(subset=["duration_s"])
+            known = dict(zip(usable["mp4_path"], map(float, usable["duration_s"])))
+
+    paths = videos_df["mp4_path"]
+    n_missing = sum(p not in known for p in paths)
+    if n_missing:
+        print(f"  computing duration for {n_missing} new video(s)…")
+    try:
+        from tqdm.auto import tqdm
+    except ImportError:
+        def tqdm(it, **_): return it
+    bar = tqdm(paths, desc="durations", unit="vid")
+    # Reason: lookup-then-measure keeps cv2 out of the loop for cached videos.
+    durations = [known[p] if p in known else video_duration_s(p) for p in bar]
+    annotated = videos_df.copy()
+    annotated["duration_s"] = durations
+    return annotated
+
+
 def already_tracked_set(tracked_dir: Path) -> set[tuple[str, str]]:
     """Return the set of (date, time) sessions for which a tracking DB exists.
 
@@ -83,6 +128,8 @@ def main() -> None:
     videos_df = scan_videos(VIDEOS_ROOT)
     print(f"  found {len(videos_df)} video sessions on disk")
 
+    videos_df = add_durations(videos_df, INVENTORY_CSV)
+
     print(f"Loading metadata xlsx: {VIDEO_INFO_XLSX}")
     meta = pd.read_excel(VIDEO_INFO_XLSX)
     meta["session_date"] = meta["date"].dt.strftime("%Y-%m-%d")
diff --git a/scripts/export_video_db_index.py b/scripts/export_video_db_index.py
index 1e9a582..0caa0d4 100644
--- a/scripts/export_video_db_index.py
+++ b/scripts/export_video_db_index.py
@@ -57,6 +57,7 @@ def parse_xlsx_time(value: str) -> tuple[str, int] | None:
 def build_session_index(inventory: pd.DataFrame) -> dict[tuple[str, str], list[dict]]:
     """Index inventory rows by (date, machine_name) → list of session dicts."""
     idx: dict[tuple[str, str], list[dict]] = {}
+    has_duration = "duration_s" in inventory.columns
     for row in inventory.itertuples(index=False):
         h, m, _s = (int(p) for p in str(row.session_time).split("-"))
         key = (row.session_date, row.machine_name)
@@ -64,6 +65,7 @@ def build_session_index(inventory: pd.DataFrame) -> dict[tuple[str, str], list[d
             "mp4_path": row.mp4_path,
             "session_datetime": row.session_datetime,
             "minutes": h * 60 + m,
+            "duration_s": float(row.duration_s) if has_duration and pd.notna(row.duration_s) else None,
         })
     return idx
 
@@ -83,7 +85,7 @@ def resolve_session(
     when: str,
     fallback_date: str | None,
     index: dict[tuple[str, str], list[dict]],
-) -> tuple[str, str]:
+) -> tuple[str, str, float | None]:
     """Look up the video + db whose start time is closest to `when`.
 
     Match strategy:
@@ -95,16 +97,18 @@ def resolve_session(
 
     Among candidates, pick the video whose start minute is closest to the
     xlsx-claimed time, within ±_TIME_TOLERANCE_MIN.
+
+    Returns (mp4_path, db_path, duration_s) — empty strings / None if no match.
     """
     parsed = parse_xlsx_time(when)
     if parsed is None:
-        return "", ""
+        return "", "", None
     date, target_min = parsed
     candidates = index.get((date, machine_name), [])
     if not candidates and fallback_date:
         candidates = index.get((fallback_date, machine_name), [])
     if not candidates:
-        return "", ""
+        return "", "", None
 
     def _gap(target: int, c: dict) -> int:
         # Reason: xlsx times like '1230AM' are ambiguous (12 AM vs 12 PM).
@@ -114,9 +118,9 @@ def resolve_session(
 
     best = min(candidates, key=lambda c: _gap(target_min, c))
     if _gap(target_min, best) > _TIME_TOLERANCE_MIN:
-        return "", ""
+        return "", "", None
     db = db_path_for_video(best["mp4_path"])
-    return best["mp4_path"], (str(db) if db else "")
+    return best["mp4_path"], (str(db) if db else ""), best.get("duration_s")
 
 
 # Variants of "naive" the xlsx has accumulated: 'naïve', 'niave', plus
@@ -151,19 +155,20 @@ def main() -> None:
     _normalize_metadata(df)
     date_iso = pd.to_datetime(df["date"]).dt.strftime("%Y-%m-%d")
 
-    train_videos, train_dbs, test_videos, test_dbs = [], [], [], []
+    train_videos, train_dbs, train_durs = [], [], []
+    test_videos, test_dbs, test_durs = [], [], []
     for fallback, row in zip(date_iso, df.itertuples(index=False)):
-        tv, td = resolve_session(row.machine_name, row.training_date_time, fallback, index)
-        sv, sd = resolve_session(row.machine_name, row.testing_date_time, fallback, index)
-        train_videos.append(tv)
-        train_dbs.append(td)
-        test_videos.append(sv)
-        test_dbs.append(sd)
+        tv, td, tdur = resolve_session(row.machine_name, row.training_date_time, fallback, index)
+        sv, sd, sdur = resolve_session(row.machine_name, row.testing_date_time, fallback, index)
+        train_videos.append(tv); train_dbs.append(td); train_durs.append(tdur)
+        test_videos.append(sv);  test_dbs.append(sd);  test_durs.append(sdur)
 
     df["training_video_path"] = train_videos
     df["training_db_path"] = train_dbs
+    df["training_video_duration_s"] = train_durs
     df["testing_video_path"] = test_videos
     df["testing_db_path"] = test_dbs
+    df["testing_video_duration_s"] = test_durs
 
     # Reason: an analyst flag for excluding individual fly/session rows that
     # turn out to be too noisy or otherwise unusable. Default True; flip to