"""Build an inventory of videos available on disk and join with the metadata xlsx. Scans /mnt/ethoscope_data/videos////*.mp4 and produces a CSV mapping each (date, machine_name) row in all_video_info_merged.xlsx to the corresponding merged.mp4 path on disk. Output: data/metadata/video_inventory.csv with columns: machine_uuid, machine_name, session_date, session_time, mp4_path, in_xlsx (bool), already_tracked (bool) """ from __future__ import annotations import re from pathlib import Path import pandas as pd from config import INVENTORY_CSV, TRACKING_OUTPUT_DIR, VIDEO_INFO_XLSX, VIDEOS_ROOT SESSION_RE = re.compile(r"^(\d{4}-\d{2}-\d{2})_(\d{2}-\d{2}-\d{2})$") def scan_videos(videos_root: Path) -> pd.DataFrame: """Walk videos_root and return one row per merged.mp4 found. Args: videos_root: Root directory containing ///. Returns: DataFrame with columns: machine_uuid, machine_name, session_date, session_time, session_datetime, mp4_path. """ rows = [] for uuid_dir in sorted(videos_root.iterdir()): if not uuid_dir.is_dir(): continue for machine_dir in uuid_dir.iterdir(): if not machine_dir.is_dir() or not machine_dir.name.startswith("ETHOSCOPE_"): continue for session_dir in machine_dir.iterdir(): if not session_dir.is_dir(): continue m = SESSION_RE.match(session_dir.name) if not m: continue date_str, time_str = m.group(1), m.group(2) # Prefer *_merged.mp4 if present merged = sorted(session_dir.glob("*_merged.mp4")) if not merged: merged = sorted(session_dir.glob("*.mp4")) if not merged: continue rows.append( { "machine_uuid": uuid_dir.name, "machine_name": machine_dir.name, "session_date": date_str, "session_time": time_str, "session_datetime": f"{date_str}_{time_str}", "mp4_path": str(merged[0]), } ) return pd.DataFrame(rows) def video_duration_s(mp4_path: str) -> float | None: """Read video duration in seconds via cv2. Returns None on failure.""" import cv2 # local import — heavy module, only needed when computing cap = cv2.VideoCapture(mp4_path) if not cap.isOpened(): return None fps = cap.get(cv2.CAP_PROP_FPS) frames = cap.get(cv2.CAP_PROP_FRAME_COUNT) cap.release() if fps <= 0 or frames <= 0: return None return float(frames / fps) def add_durations(videos_df: pd.DataFrame, prev_inv_path: Path) -> pd.DataFrame: """Annotate videos_df with a duration_s column. Reuses durations from the previous inventory CSV when present (keyed on mp4_path) — only newly-discovered videos pay the cv2 open cost. """ cache: dict[str, float] = {} if prev_inv_path.exists(): prev = pd.read_csv(prev_inv_path) if "duration_s" in prev.columns: for _, r in prev.dropna(subset=["duration_s"]).iterrows(): cache[r["mp4_path"]] = float(r["duration_s"]) durations: list[float | None] = [] todo_count = sum(1 for p in videos_df["mp4_path"] if p not in cache) if todo_count: print(f" computing duration for {todo_count} new video(s)…") try: from tqdm.auto import tqdm except ImportError: def tqdm(it, **_): return it for mp4_path in tqdm(videos_df["mp4_path"], desc="durations", unit="vid"): if mp4_path in cache: durations.append(cache[mp4_path]) else: durations.append(video_duration_s(mp4_path)) videos_df = videos_df.copy() videos_df["duration_s"] = durations return videos_df def already_tracked_set(tracked_dir: Path) -> set[tuple[str, str]]: """Return the set of (date, time) sessions for which a tracking DB exists. DBs are named like: 2025-07-15_16-03-10___1920x1088@25fps-28q_merged_tracking.db """ out = set() for db in tracked_dir.glob("*_tracking.db"): m = re.match(r"^(\d{4}-\d{2}-\d{2})_(\d{2}-\d{2}-\d{2})_", db.name) if m: out.add((m.group(1), m.group(2))) return out def main() -> None: print(f"Scanning {VIDEOS_ROOT} ...") videos_df = scan_videos(VIDEOS_ROOT) print(f" found {len(videos_df)} video sessions on disk") videos_df = add_durations(videos_df, INVENTORY_CSV) print(f"Loading metadata xlsx: {VIDEO_INFO_XLSX}") meta = pd.read_excel(VIDEO_INFO_XLSX) meta["session_date"] = meta["date"].dt.strftime("%Y-%m-%d") # The xlsx has one row per (date, machine, ROI) — collapse to unique sessions meta_sessions = ( meta[["session_date", "machine_name"]].drop_duplicates().reset_index(drop=True) ) print(f" xlsx contains {len(meta_sessions)} unique (date, machine) sessions") # Mark which video sessions are referenced by the xlsx xlsx_keys = set(zip(meta_sessions["session_date"], meta_sessions["machine_name"])) videos_df["in_xlsx"] = videos_df.apply( lambda r: (r["session_date"], r["machine_name"]) in xlsx_keys, axis=1 ) # Mark which already have tracking DBs in TRACKING_OUTPUT_DIR tracked = already_tracked_set(TRACKING_OUTPUT_DIR) videos_df["already_tracked"] = videos_df.apply( lambda r: (r["session_date"], r["session_time"]) in tracked, axis=1 ) INVENTORY_CSV.parent.mkdir(parents=True, exist_ok=True) videos_df.sort_values(["session_date", "machine_name", "session_time"]).to_csv( INVENTORY_CSV, index=False ) # Coverage report in_xlsx = videos_df["in_xlsx"] needed = videos_df[in_xlsx & ~videos_df["already_tracked"]] n_xlsx_sessions = len(meta_sessions) n_with_video = videos_df[in_xlsx].drop_duplicates( ["session_date", "machine_name"] ).shape[0] # xlsx sessions that have no video on disk found_keys = set( zip( videos_df.loc[in_xlsx, "session_date"], videos_df.loc[in_xlsx, "machine_name"], ) ) missing = sorted(xlsx_keys - found_keys) print() print("=" * 70) print(f"Wrote inventory: {INVENTORY_CSV}") print(f" total video sessions on disk: {len(videos_df)}") print(f" xlsx unique sessions: {n_xlsx_sessions}") print(f" xlsx sessions with video: {n_with_video}") print(f" xlsx sessions missing video: {len(missing)}") print(f" already tracked (DB exists): {videos_df['already_tracked'].sum()}") print(f" TO TRACK (in_xlsx & ~tracked, video instances): {len(needed)}") if missing: print() print("xlsx sessions with NO matching video on disk:") for d, m in missing[:20]: print(f" {d} {m}") if len(missing) > 20: print(f" ... and {len(missing) - 20} more") if __name__ == "__main__": main()