Add video duration_s to inventory and propagate to merged TSV
build_video_inventory.py now opens each mp4 with cv2 to record duration_s. Cached: a video already in the previous inventory keeps its computed duration, so re-runs only pay the cv2 cost for new recordings. export_video_db_index.py looks up the matched video's duration and writes it as training_video_duration_s / testing_video_duration_s alongside the existing path columns. Useful for spotting unusually short or long sessions and for sanity checks on the tracker output. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
847d2cbd1b
commit
2e80b834ca
2 changed files with 64 additions and 12 deletions
|
|
@ -64,6 +64,51 @@ def scan_videos(videos_root: Path) -> pd.DataFrame:
|
||||||
return pd.DataFrame(rows)
|
return pd.DataFrame(rows)
|
||||||
|
|
||||||
|
|
||||||
|
def video_duration_s(mp4_path: str) -> float | None:
    """Read video duration in seconds via cv2. Returns None on failure.

    The duration is computed as frame count divided by frames-per-second,
    both taken from the properties cv2 reports for the opened file; this
    function does not read any frames itself.

    Args:
        mp4_path: path of the video file to open.

    Returns:
        The duration in seconds, or None when the file cannot be opened or
        when the reported FPS / frame count is not a positive number.
    """
    import cv2  # local import — heavy module, only needed when computing

    cap = cv2.VideoCapture(mp4_path)
    try:
        if not cap.isOpened():
            return None
        fps = cap.get(cv2.CAP_PROP_FPS)
        frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    finally:
        # Reason: release the capture on every exit path, including the
        # not-opened early return and an exception raised by cap.get().
        cap.release()

    # Reason: phrased as a positive test so that NaN values are rejected
    # too — `nan <= 0` is False and would otherwise yield a NaN duration.
    if not (fps > 0 and frames > 0):
        return None
    return float(frames / fps)
|
||||||
|
|
||||||
|
|
||||||
|
def add_durations(videos_df: pd.DataFrame, prev_inv_path: Path) -> pd.DataFrame:
    """Annotate videos_df with a duration_s column.

    Reuses durations from the previous inventory CSV when present
    (keyed on mp4_path) — only newly-discovered videos pay the cv2 open cost.
    Rows of the previous inventory with a missing duration_s are not cached,
    so those videos are probed again on this run.

    Args:
        videos_df: inventory frame with an ``mp4_path`` column. It is not
            modified; a copy is annotated and returned.
        prev_inv_path: path of the previous inventory CSV. It may not exist,
            and it may lack the ``duration_s`` / ``mp4_path`` columns; in
            either case no durations are reused.

    Returns:
        A copy of ``videos_df`` with an added ``duration_s`` column holding
        the duration in seconds, or a missing value when it could not be
        determined.
    """
    videos_df = videos_df.copy()

    # Reason: a frame built from an empty list of rows has no columns at
    # all, so indexing "mp4_path" below would raise KeyError. Nothing to
    # probe in that case — just attach an empty column.
    if videos_df.empty:
        videos_df["duration_s"] = pd.Series(dtype="float64")
        return videos_df

    cache: dict[str, float] = {}
    if prev_inv_path.exists():
        prev = pd.read_csv(prev_inv_path)
        # Reason: only trust the previous file when both the key and the
        # value column are present (older inventories may predate them).
        if {"mp4_path", "duration_s"} <= set(prev.columns):
            known = prev.dropna(subset=["mp4_path", "duration_s"])
            # On duplicate paths the last row wins, as with a plain loop.
            cache = dict(
                zip(known["mp4_path"], known["duration_s"].astype(float).tolist())
            )

    paths = videos_df["mp4_path"].tolist()
    todo_count = sum(p not in cache for p in paths)
    if todo_count:
        print(f" computing duration for {todo_count} new video(s)…")

    try:
        from tqdm.auto import tqdm
    except ImportError:
        # tqdm is optional: fall back to a pass-through with the same call shape.
        def tqdm(it, **_):
            return it

    videos_df["duration_s"] = [
        cache[p] if p in cache else video_duration_s(p)
        for p in tqdm(paths, desc="durations", unit="vid")
    ]
    return videos_df
|
||||||
|
|
||||||
|
|
||||||
def already_tracked_set(tracked_dir: Path) -> set[tuple[str, str]]:
|
def already_tracked_set(tracked_dir: Path) -> set[tuple[str, str]]:
|
||||||
"""Return the set of (date, time) sessions for which a tracking DB exists.
|
"""Return the set of (date, time) sessions for which a tracking DB exists.
|
||||||
|
|
||||||
|
|
@ -83,6 +128,8 @@ def main() -> None:
|
||||||
videos_df = scan_videos(VIDEOS_ROOT)
|
videos_df = scan_videos(VIDEOS_ROOT)
|
||||||
print(f" found {len(videos_df)} video sessions on disk")
|
print(f" found {len(videos_df)} video sessions on disk")
|
||||||
|
|
||||||
|
videos_df = add_durations(videos_df, INVENTORY_CSV)
|
||||||
|
|
||||||
print(f"Loading metadata xlsx: {VIDEO_INFO_XLSX}")
|
print(f"Loading metadata xlsx: {VIDEO_INFO_XLSX}")
|
||||||
meta = pd.read_excel(VIDEO_INFO_XLSX)
|
meta = pd.read_excel(VIDEO_INFO_XLSX)
|
||||||
meta["session_date"] = meta["date"].dt.strftime("%Y-%m-%d")
|
meta["session_date"] = meta["date"].dt.strftime("%Y-%m-%d")
|
||||||
|
|
|
||||||
|
|
@ -57,6 +57,7 @@ def parse_xlsx_time(value: str) -> tuple[str, int] | None:
|
||||||
def build_session_index(inventory: pd.DataFrame) -> dict[tuple[str, str], list[dict]]:
|
def build_session_index(inventory: pd.DataFrame) -> dict[tuple[str, str], list[dict]]:
|
||||||
"""Index inventory rows by (date, machine_name) → list of session dicts."""
|
"""Index inventory rows by (date, machine_name) → list of session dicts."""
|
||||||
idx: dict[tuple[str, str], list[dict]] = {}
|
idx: dict[tuple[str, str], list[dict]] = {}
|
||||||
|
has_duration = "duration_s" in inventory.columns
|
||||||
for row in inventory.itertuples(index=False):
|
for row in inventory.itertuples(index=False):
|
||||||
h, m, _s = (int(p) for p in str(row.session_time).split("-"))
|
h, m, _s = (int(p) for p in str(row.session_time).split("-"))
|
||||||
key = (row.session_date, row.machine_name)
|
key = (row.session_date, row.machine_name)
|
||||||
|
|
@ -64,6 +65,7 @@ def build_session_index(inventory: pd.DataFrame) -> dict[tuple[str, str], list[d
|
||||||
"mp4_path": row.mp4_path,
|
"mp4_path": row.mp4_path,
|
||||||
"session_datetime": row.session_datetime,
|
"session_datetime": row.session_datetime,
|
||||||
"minutes": h * 60 + m,
|
"minutes": h * 60 + m,
|
||||||
|
"duration_s": float(row.duration_s) if has_duration and pd.notna(row.duration_s) else None,
|
||||||
})
|
})
|
||||||
return idx
|
return idx
|
||||||
|
|
||||||
|
|
@ -83,7 +85,7 @@ def resolve_session(
|
||||||
when: str,
|
when: str,
|
||||||
fallback_date: str | None,
|
fallback_date: str | None,
|
||||||
index: dict[tuple[str, str], list[dict]],
|
index: dict[tuple[str, str], list[dict]],
|
||||||
) -> tuple[str, str]:
|
) -> tuple[str, str, float | None]:
|
||||||
"""Look up the video + db whose start time is closest to `when`.
|
"""Look up the video + db whose start time is closest to `when`.
|
||||||
|
|
||||||
Match strategy:
|
Match strategy:
|
||||||
|
|
@ -95,16 +97,18 @@ def resolve_session(
|
||||||
|
|
||||||
Among candidates, pick the video whose start minute is closest to the
|
Among candidates, pick the video whose start minute is closest to the
|
||||||
xlsx-claimed time, within ±_TIME_TOLERANCE_MIN.
|
xlsx-claimed time, within ±_TIME_TOLERANCE_MIN.
|
||||||
|
|
||||||
|
Returns (mp4_path, db_path, duration_s) — empty strings / None if no match.
|
||||||
"""
|
"""
|
||||||
parsed = parse_xlsx_time(when)
|
parsed = parse_xlsx_time(when)
|
||||||
if parsed is None:
|
if parsed is None:
|
||||||
return "", ""
|
return "", "", None
|
||||||
date, target_min = parsed
|
date, target_min = parsed
|
||||||
candidates = index.get((date, machine_name), [])
|
candidates = index.get((date, machine_name), [])
|
||||||
if not candidates and fallback_date:
|
if not candidates and fallback_date:
|
||||||
candidates = index.get((fallback_date, machine_name), [])
|
candidates = index.get((fallback_date, machine_name), [])
|
||||||
if not candidates:
|
if not candidates:
|
||||||
return "", ""
|
return "", "", None
|
||||||
|
|
||||||
def _gap(target: int, c: dict) -> int:
|
def _gap(target: int, c: dict) -> int:
|
||||||
# Reason: xlsx times like '1230AM' are ambiguous (12 AM vs 12 PM).
|
# Reason: xlsx times like '1230AM' are ambiguous (12 AM vs 12 PM).
|
||||||
|
|
@ -114,9 +118,9 @@ def resolve_session(
|
||||||
|
|
||||||
best = min(candidates, key=lambda c: _gap(target_min, c))
|
best = min(candidates, key=lambda c: _gap(target_min, c))
|
||||||
if _gap(target_min, best) > _TIME_TOLERANCE_MIN:
|
if _gap(target_min, best) > _TIME_TOLERANCE_MIN:
|
||||||
return "", ""
|
return "", "", None
|
||||||
db = db_path_for_video(best["mp4_path"])
|
db = db_path_for_video(best["mp4_path"])
|
||||||
return best["mp4_path"], (str(db) if db else "")
|
return best["mp4_path"], (str(db) if db else ""), best.get("duration_s")
|
||||||
|
|
||||||
|
|
||||||
# Variants of "naive" the xlsx has accumulated: 'naïve', 'niave', plus
|
# Variants of "naive" the xlsx has accumulated: 'naïve', 'niave', plus
|
||||||
|
|
@ -151,19 +155,20 @@ def main() -> None:
|
||||||
_normalize_metadata(df)
|
_normalize_metadata(df)
|
||||||
date_iso = pd.to_datetime(df["date"]).dt.strftime("%Y-%m-%d")
|
date_iso = pd.to_datetime(df["date"]).dt.strftime("%Y-%m-%d")
|
||||||
|
|
||||||
train_videos, train_dbs, test_videos, test_dbs = [], [], [], []
|
train_videos, train_dbs, train_durs = [], [], []
|
||||||
|
test_videos, test_dbs, test_durs = [], [], []
|
||||||
for fallback, row in zip(date_iso, df.itertuples(index=False)):
|
for fallback, row in zip(date_iso, df.itertuples(index=False)):
|
||||||
tv, td = resolve_session(row.machine_name, row.training_date_time, fallback, index)
|
tv, td, tdur = resolve_session(row.machine_name, row.training_date_time, fallback, index)
|
||||||
sv, sd = resolve_session(row.machine_name, row.testing_date_time, fallback, index)
|
sv, sd, sdur = resolve_session(row.machine_name, row.testing_date_time, fallback, index)
|
||||||
train_videos.append(tv)
|
train_videos.append(tv); train_dbs.append(td); train_durs.append(tdur)
|
||||||
train_dbs.append(td)
|
test_videos.append(sv); test_dbs.append(sd); test_durs.append(sdur)
|
||||||
test_videos.append(sv)
|
|
||||||
test_dbs.append(sd)
|
|
||||||
|
|
||||||
df["training_video_path"] = train_videos
|
df["training_video_path"] = train_videos
|
||||||
df["training_db_path"] = train_dbs
|
df["training_db_path"] = train_dbs
|
||||||
|
df["training_video_duration_s"] = train_durs
|
||||||
df["testing_video_path"] = test_videos
|
df["testing_video_path"] = test_videos
|
||||||
df["testing_db_path"] = test_dbs
|
df["testing_db_path"] = test_dbs
|
||||||
|
df["testing_video_duration_s"] = test_durs
|
||||||
|
|
||||||
# Reason: an analyst flag for excluding individual fly/session rows that
|
# Reason: an analyst flag for excluding individual fly/session rows that
|
||||||
# turn out to be too noisy or otherwise unusable. Default True; flip to
|
# turn out to be too noisy or otherwise unusable. Default True; flip to
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue