Add video duration_s to inventory and propagate to merged TSV

build_video_inventory.py now opens each mp4 with cv2 to record
duration_s. Cached: a video already in the previous inventory keeps
its computed duration, so re-runs only pay the cv2 cost for new
recordings.

export_video_db_index.py looks up the matched video's duration and
writes it as training_video_duration_s / testing_video_duration_s
alongside the existing path columns. Useful for spotting unusually
short or long sessions and for sanity checks on the tracker output.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Giorgio Gilestro 2026-05-01 11:13:05 +01:00
parent 847d2cbd1b
commit 2e80b834ca
2 changed files with 64 additions and 12 deletions

View file

@ -57,6 +57,7 @@ def parse_xlsx_time(value: str) -> tuple[str, int] | None:
def build_session_index(inventory: pd.DataFrame) -> dict[tuple[str, str], list[dict]]:
"""Index inventory rows by (date, machine_name) → list of session dicts."""
idx: dict[tuple[str, str], list[dict]] = {}
has_duration = "duration_s" in inventory.columns
for row in inventory.itertuples(index=False):
h, m, _s = (int(p) for p in str(row.session_time).split("-"))
key = (row.session_date, row.machine_name)
@ -64,6 +65,7 @@ def build_session_index(inventory: pd.DataFrame) -> dict[tuple[str, str], list[d
"mp4_path": row.mp4_path,
"session_datetime": row.session_datetime,
"minutes": h * 60 + m,
"duration_s": float(row.duration_s) if has_duration and pd.notna(row.duration_s) else None,
})
return idx
@ -83,7 +85,7 @@ def resolve_session(
when: str,
fallback_date: str | None,
index: dict[tuple[str, str], list[dict]],
) -> tuple[str, str]:
) -> tuple[str, str, float | None]:
"""Look up the video + db whose start time is closest to `when`.
Match strategy:
@ -95,16 +97,18 @@ def resolve_session(
Among candidates, pick the video whose start minute is closest to the
xlsx-claimed time, within ±_TIME_TOLERANCE_MIN.
Returns (mp4_path, db_path, duration_s) — empty strings / None if no match.
"""
parsed = parse_xlsx_time(when)
if parsed is None:
return "", ""
return "", "", None
date, target_min = parsed
candidates = index.get((date, machine_name), [])
if not candidates and fallback_date:
candidates = index.get((fallback_date, machine_name), [])
if not candidates:
return "", ""
return "", "", None
def _gap(target: int, c: dict) -> int:
# Reason: xlsx times like '1230AM' are ambiguous (12 AM vs 12 PM).
@ -114,9 +118,9 @@ def resolve_session(
best = min(candidates, key=lambda c: _gap(target_min, c))
if _gap(target_min, best) > _TIME_TOLERANCE_MIN:
return "", ""
return "", "", None
db = db_path_for_video(best["mp4_path"])
return best["mp4_path"], (str(db) if db else "")
return best["mp4_path"], (str(db) if db else ""), best.get("duration_s")
# Variants of "naive" the xlsx has accumulated: 'naïve', 'niave', plus
@ -151,19 +155,20 @@ def main() -> None:
_normalize_metadata(df)
date_iso = pd.to_datetime(df["date"]).dt.strftime("%Y-%m-%d")
train_videos, train_dbs, test_videos, test_dbs = [], [], [], []
train_videos, train_dbs, train_durs = [], [], []
test_videos, test_dbs, test_durs = [], [], []
for fallback, row in zip(date_iso, df.itertuples(index=False)):
tv, td = resolve_session(row.machine_name, row.training_date_time, fallback, index)
sv, sd = resolve_session(row.machine_name, row.testing_date_time, fallback, index)
train_videos.append(tv)
train_dbs.append(td)
test_videos.append(sv)
test_dbs.append(sd)
tv, td, tdur = resolve_session(row.machine_name, row.training_date_time, fallback, index)
sv, sd, sdur = resolve_session(row.machine_name, row.testing_date_time, fallback, index)
train_videos.append(tv); train_dbs.append(td); train_durs.append(tdur)
test_videos.append(sv); test_dbs.append(sd); test_durs.append(sdur)
df["training_video_path"] = train_videos
df["training_db_path"] = train_dbs
df["training_video_duration_s"] = train_durs
df["testing_video_path"] = test_videos
df["testing_db_path"] = test_dbs
df["testing_video_duration_s"] = test_durs
# Reason: an analyst flag for excluding individual fly/session rows that
# turn out to be too noisy or otherwise unusable. Default True; flip to