Add video duration_s to inventory and propagate to merged TSV
build_video_inventory.py now opens each mp4 with cv2 to record duration_s. Durations are cached: a video already present in the previous inventory keeps its computed duration, so re-runs only pay the cv2 cost for new recordings.

export_video_db_index.py looks up the matched video's duration and writes it as training_video_duration_s / testing_video_duration_s alongside the existing path columns. This is useful for spotting unusually short or long sessions and for sanity checks on the tracker output.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
847d2cbd1b
commit
2e80b834ca
2 changed files with 64 additions and 12 deletions
|
|
@ -64,6 +64,51 @@ def scan_videos(videos_root: Path) -> pd.DataFrame:
|
|||
return pd.DataFrame(rows)
|
||||
|
||||
|
||||
def video_duration_s(mp4_path: str) -> float | None:
    """Return the duration of a video in seconds, or None if it cannot be read.

    The duration is computed as frame count divided by frames-per-second,
    both as reported by cv2 for the opened file.

    Args:
        mp4_path: Path of the video file to probe.

    Returns:
        Duration in seconds, or None when the file cannot be opened or cv2
        reports a non-positive or non-finite FPS / frame count.
    """
    import math

    import cv2  # local import — heavy module, only needed when computing

    cap = cv2.VideoCapture(str(mp4_path))
    try:
        if not cap.isOpened():
            return None
        fps = cap.get(cv2.CAP_PROP_FPS)
        frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    finally:
        # Release on every path, including the "could not open" early return
        # and any exception raised while querying properties.
        cap.release()

    # NaN compares False against 0, so it would slip past the <= 0 guard
    # below; reject non-finite values explicitly.
    if not (math.isfinite(fps) and math.isfinite(frames)):
        return None
    if fps <= 0 or frames <= 0:
        return None
    return float(frames / fps)
|
||||
|
||||
|
||||
def add_durations(videos_df: pd.DataFrame, prev_inv_path: Path) -> pd.DataFrame:
    """Return a copy of videos_df with an added duration_s column.

    Durations found in the previous inventory CSV (keyed on mp4_path) are
    reused, so only videos without a cached value pay the cv2 open cost.
    Videos whose duration could not be determined get None and are retried
    on the next run, because missing durations are never cached.

    Args:
        videos_df: One row per video. Must have an mp4_path column unless
            it has no rows at all.
        prev_inv_path: Path of the previous inventory CSV. It may be absent,
            zero-byte, or lack the mp4_path / duration_s columns; in those
            cases nothing is reused.

    Returns:
        A new DataFrame; videos_df itself is not modified.
    """
    # A scan that found no videos yields a frame with no columns at all, so
    # there is no mp4_path to iterate; just attach an empty duration column.
    if len(videos_df) == 0:
        empty = videos_df.copy()
        empty["duration_s"] = pd.Series(dtype="float64")
        return empty

    cache: dict[str, float] = {}
    if prev_inv_path.exists():
        try:
            prev = pd.read_csv(prev_inv_path)
        except pd.errors.EmptyDataError:
            # Zero-byte inventory: nothing to reuse.
            prev = pd.DataFrame()
        if {"mp4_path", "duration_s"} <= set(prev.columns):
            # Coerce unparsable durations to NaN so they are dropped and
            # recomputed instead of aborting the whole run.
            known = prev.assign(
                duration_s=pd.to_numeric(prev["duration_s"], errors="coerce")
            ).dropna(subset=["mp4_path", "duration_s"])
            cache = {
                path: float(duration)
                for path, duration in zip(known["mp4_path"], known["duration_s"])
            }

    todo_count = sum(1 for p in videos_df["mp4_path"] if p not in cache)
    if todo_count:
        print(f" computing duration for {todo_count} new video(s)…")

    try:
        from tqdm.auto import tqdm
    except ImportError:
        # tqdm is optional; fall back to plain iteration without a progress bar.
        def tqdm(it, **_):
            return it

    durations: list[float | None] = [
        cache[mp4_path] if mp4_path in cache else video_duration_s(mp4_path)
        for mp4_path in tqdm(videos_df["mp4_path"], desc="durations", unit="vid")
    ]

    videos_df = videos_df.copy()
    videos_df["duration_s"] = durations
    return videos_df
|
||||
|
||||
|
||||
def already_tracked_set(tracked_dir: Path) -> set[tuple[str, str]]:
|
||||
"""Return the set of (date, time) sessions for which a tracking DB exists.
|
||||
|
||||
|
|
@ -83,6 +128,8 @@ def main() -> None:
|
|||
videos_df = scan_videos(VIDEOS_ROOT)
|
||||
print(f" found {len(videos_df)} video sessions on disk")
|
||||
|
||||
videos_df = add_durations(videos_df, INVENTORY_CSV)
|
||||
|
||||
print(f"Loading metadata xlsx: {VIDEO_INFO_XLSX}")
|
||||
meta = pd.read_excel(VIDEO_INFO_XLSX)
|
||||
meta["session_date"] = meta["date"].dt.strftime("%Y-%m-%d")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue