From 2e80b834cab5648e200c8b251191f12f5811fa8a Mon Sep 17 00:00:00 2001 From: Giorgio Gilestro Date: Fri, 1 May 2026 11:13:05 +0100 Subject: [PATCH] Add video duration_s to inventory and propagate to merged TSV build_video_inventory.py now opens each mp4 with cv2 to record duration_s. Cached: a video already in the previous inventory keeps its computed duration, so re-runs only pay the cv2 cost for new recordings. export_video_db_index.py looks up the matched video's duration and writes it as training_video_duration_s / testing_video_duration_s alongside the existing path columns. Useful for spotting unusually short or long sessions and for sanity checks on the tracker output. Co-Authored-By: Claude Opus 4.7 --- scripts/build_video_inventory.py | 47 ++++++++++++++++++++++++++++++++ scripts/export_video_db_index.py | 29 ++++++++++++-------- 2 files changed, 64 insertions(+), 12 deletions(-) diff --git a/scripts/build_video_inventory.py b/scripts/build_video_inventory.py index e931137..5cac340 100644 --- a/scripts/build_video_inventory.py +++ b/scripts/build_video_inventory.py @@ -64,6 +64,51 @@ def scan_videos(videos_root: Path) -> pd.DataFrame: return pd.DataFrame(rows) +def video_duration_s(mp4_path: str) -> float | None: + """Read video duration in seconds via cv2. Returns None on failure.""" + import cv2 # local import — heavy module, only needed when computing + cap = cv2.VideoCapture(mp4_path) + if not cap.isOpened(): + return None + fps = cap.get(cv2.CAP_PROP_FPS) + frames = cap.get(cv2.CAP_PROP_FRAME_COUNT) + cap.release() + if fps <= 0 or frames <= 0: + return None + return float(frames / fps) + + +def add_durations(videos_df: pd.DataFrame, prev_inv_path: Path) -> pd.DataFrame: + """Annotate videos_df with a duration_s column. + + Reuses durations from the previous inventory CSV when present + (keyed on mp4_path) — only newly-discovered videos pay the cv2 open cost. + """ + cache: dict[str, float] = {} + if prev_inv_path.exists(): + prev = pd.read_csv(prev_inv_path) + if "duration_s" in prev.columns: + for _, r in prev.dropna(subset=["duration_s"]).iterrows(): + cache[r["mp4_path"]] = float(r["duration_s"]) + + durations: list[float | None] = [] + todo_count = sum(1 for p in videos_df["mp4_path"] if p not in cache) + if todo_count: + print(f" computing duration for {todo_count} new video(s)…") + try: + from tqdm.auto import tqdm + except ImportError: + def tqdm(it, **_): return it + for mp4_path in tqdm(videos_df["mp4_path"], desc="durations", unit="vid"): + if mp4_path in cache: + durations.append(cache[mp4_path]) + else: + durations.append(video_duration_s(mp4_path)) + videos_df = videos_df.copy() + videos_df["duration_s"] = durations + return videos_df + + def already_tracked_set(tracked_dir: Path) -> set[tuple[str, str]]: """Return the set of (date, time) sessions for which a tracking DB exists. @@ -83,6 +128,8 @@ def main() -> None: videos_df = scan_videos(VIDEOS_ROOT) print(f" found {len(videos_df)} video sessions on disk") + videos_df = add_durations(videos_df, INVENTORY_CSV) + print(f"Loading metadata xlsx: {VIDEO_INFO_XLSX}") meta = pd.read_excel(VIDEO_INFO_XLSX) meta["session_date"] = meta["date"].dt.strftime("%Y-%m-%d") diff --git a/scripts/export_video_db_index.py b/scripts/export_video_db_index.py index 1e9a582..0caa0d4 100644 --- a/scripts/export_video_db_index.py +++ b/scripts/export_video_db_index.py @@ -57,6 +57,7 @@ def parse_xlsx_time(value: str) -> tuple[str, int] | None: def build_session_index(inventory: pd.DataFrame) -> dict[tuple[str, str], list[dict]]: """Index inventory rows by (date, machine_name) → list of session dicts.""" idx: dict[tuple[str, str], list[dict]] = {} + has_duration = "duration_s" in inventory.columns for row in inventory.itertuples(index=False): h, m, _s = (int(p) for p in str(row.session_time).split("-")) key = (row.session_date, row.machine_name) @@ -64,6 +65,7 @@ def build_session_index(inventory: pd.DataFrame) -> dict[tuple[str, str], list[d "mp4_path": row.mp4_path, "session_datetime": row.session_datetime, "minutes": h * 60 + m, + "duration_s": float(row.duration_s) if has_duration and pd.notna(row.duration_s) else None, }) return idx @@ -83,7 +85,7 @@ def resolve_session( when: str, fallback_date: str | None, index: dict[tuple[str, str], list[dict]], -) -> tuple[str, str]: +) -> tuple[str, str, float | None]: """Look up the video + db whose start time is closest to `when`. Match strategy: @@ -95,16 +97,18 @@ def resolve_session( Among candidates, pick the video whose start minute is closest to the xlsx-claimed time, within ±_TIME_TOLERANCE_MIN. + + Returns (mp4_path, db_path, duration_s) — empty strings / None if no match. """ parsed = parse_xlsx_time(when) if parsed is None: - return "", "" + return "", "", None date, target_min = parsed candidates = index.get((date, machine_name), []) if not candidates and fallback_date: candidates = index.get((fallback_date, machine_name), []) if not candidates: - return "", "" + return "", "", None def _gap(target: int, c: dict) -> int: # Reason: xlsx times like '1230AM' are ambiguous (12 AM vs 12 PM). @@ -114,9 +118,9 @@ def resolve_session( best = min(candidates, key=lambda c: _gap(target_min, c)) if _gap(target_min, best) > _TIME_TOLERANCE_MIN: - return "", "" + return "", "", None db = db_path_for_video(best["mp4_path"]) - return best["mp4_path"], (str(db) if db else "") + return best["mp4_path"], (str(db) if db else ""), best.get("duration_s") # Variants of "naive" the xlsx has accumulated: 'naïve', 'niave', plus @@ -151,19 +155,20 @@ def main() -> None: _normalize_metadata(df) date_iso = pd.to_datetime(df["date"]).dt.strftime("%Y-%m-%d") - train_videos, train_dbs, test_videos, test_dbs = [], [], [], [] + train_videos, train_dbs, train_durs = [], [], [] + test_videos, test_dbs, test_durs = [], [], [] for fallback, row in zip(date_iso, df.itertuples(index=False)): - tv, td = resolve_session(row.machine_name, row.training_date_time, fallback, index) - sv, sd = resolve_session(row.machine_name, row.testing_date_time, fallback, index) - train_videos.append(tv) - train_dbs.append(td) - test_videos.append(sv) - test_dbs.append(sd) + tv, td, tdur = resolve_session(row.machine_name, row.training_date_time, fallback, index) + sv, sd, sdur = resolve_session(row.machine_name, row.testing_date_time, fallback, index) + train_videos.append(tv); train_dbs.append(td); train_durs.append(tdur) + test_videos.append(sv); test_dbs.append(sd); test_durs.append(sdur) df["training_video_path"] = train_videos df["training_db_path"] = train_dbs + df["training_video_duration_s"] = train_durs df["testing_video_path"] = test_videos df["testing_db_path"] = test_dbs + df["testing_video_duration_s"] = test_durs # Reason: an analyst flag for excluding individual fly/session rows that # turn out to be too noisy or otherwise unusable. Default True; flip to