Deleted the 5 stale pre-pipeline tracking DBs and the data/raw/ directory. Dropped DATA_RAW from config.py; build_video_inventory now scans TRACKING_OUTPUT_DIR for already-tracked sessions. Notebooks no longer import DATA_RAW. README, PLANNING and todo updated to reflect that the repo holds only code + small curated metadata, never bulky DBs. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
150 lines
5.4 KiB
Python
150 lines
5.4 KiB
Python
"""Build an inventory of videos available on disk and join with the metadata xlsx.
|
|
|
|
Scans /mnt/ethoscope_data/videos/<uuid>/<machine_name>/<date_time>/*.mp4
|
|
and produces a CSV mapping each (date, machine_name) row in
|
|
all_video_info_merged.xlsx to the corresponding merged.mp4 path on disk.
|
|
|
|
Output: data/metadata/video_inventory.csv with columns:
|
|
machine_uuid, machine_name, session_date, session_time, session_datetime, mp4_path,
|
|
in_xlsx (bool), already_tracked (bool)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
|
|
from config import INVENTORY_CSV, TRACKING_OUTPUT_DIR, VIDEO_INFO_XLSX, VIDEOS_ROOT
|
|
|
|
# Session directory names look like "2025-07-15_16-03-10" (date, then time).
SESSION_RE = re.compile(r"^(\d{4}-\d{2}-\d{2})_(\d{2}-\d{2}-\d{2})$")


def scan_videos(videos_root: Path) -> pd.DataFrame:
    """Walk videos_root and return one row per session directory holding a video.

    Expected layout: ``<videos_root>/<uuid>/<machine_name>/<date_time>/*.mp4``.
    Only machine directories whose name starts with ``ETHOSCOPE_`` and session
    directories named ``YYYY-MM-DD_HH-MM-SS`` are considered. Within a session
    the first ``*_merged.mp4`` (in sorted order) is used; when there is none,
    the first ``*.mp4`` is used instead. Sessions without any mp4 are skipped.

    Args:
        videos_root: Root directory containing <uuid>/<machine_name>/<date_time>/.

    Returns:
        DataFrame with columns: machine_uuid, machine_name, session_date,
        session_time, session_datetime, mp4_path. The columns are present even
        when no video is found, so callers can always select them.

    Raises:
        OSError: If videos_root cannot be listed (e.g. it does not exist).
    """
    columns = [
        "machine_uuid",
        "machine_name",
        "session_date",
        "session_time",
        "session_datetime",
        "mp4_path",
    ]
    rows = []
    # Sort at every level so the row order does not depend on filesystem order.
    for uuid_dir in sorted(videos_root.iterdir()):
        if not uuid_dir.is_dir():
            continue
        for machine_dir in sorted(uuid_dir.iterdir()):
            if not machine_dir.is_dir():
                continue
            if not machine_dir.name.startswith("ETHOSCOPE_"):
                continue
            for session_dir in sorted(machine_dir.iterdir()):
                if not session_dir.is_dir():
                    continue
                m = SESSION_RE.match(session_dir.name)
                if not m:
                    continue
                date_str, time_str = m.group(1), m.group(2)
                # Prefer *_merged.mp4 if present, else fall back to any mp4.
                candidates = sorted(session_dir.glob("*_merged.mp4"))
                if not candidates:
                    candidates = sorted(session_dir.glob("*.mp4"))
                if not candidates:
                    continue
                rows.append(
                    {
                        "machine_uuid": uuid_dir.name,
                        "machine_name": machine_dir.name,
                        "session_date": date_str,
                        "session_time": time_str,
                        "session_datetime": f"{date_str}_{time_str}",
                        "mp4_path": str(candidates[0]),
                    }
                )
    # Explicit columns: an empty `rows` would otherwise give a column-less
    # frame and break any caller that selects columns by name.
    return pd.DataFrame(rows, columns=columns)
|
|
|
|
|
|
def already_tracked_set(tracked_dir: Path) -> set[tuple[str, str]]:
    """Return the set of (date, time) sessions for which a tracking DB exists.

    DBs are named like:
        2025-07-15_16-03-10_<uuid>__1920x1088@25fps-28q_merged_tracking.db

    Only the leading ``YYYY-MM-DD_HH-MM-SS_`` prefix of the file name is read;
    files matching ``*_tracking.db`` without that prefix are ignored.

    Args:
        tracked_dir: Directory searched (non-recursively) for ``*_tracking.db``.

    Returns:
        Set of ``(date, time)`` string pairs, e.g. ``("2025-07-15", "16-03-10")``.
    """
    # NOTE(review): the key drops the <uuid> part of the name, so two machines
    # whose sessions start in the same second are indistinguishable here —
    # confirm that is acceptable for the caller.
    prefix_re = re.compile(r"^(\d{4}-\d{2}-\d{2})_(\d{2}-\d{2}-\d{2})_")
    out = set()
    for db in tracked_dir.glob("*_tracking.db"):
        # A directory that happens to match the glob is not a tracking DB.
        if not db.is_file():
            continue
        m = prefix_re.match(db.name)
        if m:
            out.add((m.group(1), m.group(2)))
    return out
|
|
|
|
|
|
def main() -> None:
    """Build the video inventory CSV and print a coverage report.

    Steps:
        1. Scan VIDEOS_ROOT for video sessions on disk.
        2. Load VIDEO_INFO_XLSX and reduce it to unique (date, machine_name)
           sessions.
        3. Flag every video session with ``in_xlsx`` (its date/machine pair is
           listed in the xlsx) and ``already_tracked`` (a tracking DB with the
           same date/time prefix exists in TRACKING_OUTPUT_DIR).
        4. Write the table, sorted by date, machine and time, to INVENTORY_CSV
           (its parent directory is created if needed).
        5. Print summary counts and up to 20 xlsx sessions with no video.

    Raises:
        KeyError: If the xlsx lacks a ``date`` or ``machine_name`` column.
    """
    print(f"Scanning {VIDEOS_ROOT} ...")
    videos_df = scan_videos(VIDEOS_ROOT)
    print(f" found {len(videos_df)} video sessions on disk")

    print(f"Loading metadata xlsx: {VIDEO_INFO_XLSX}")
    meta = pd.read_excel(VIDEO_INFO_XLSX)
    # Coerce explicitly: if the dates were stored as text the column is not
    # datetime-typed and the .dt accessor would raise.
    meta["session_date"] = pd.to_datetime(meta["date"]).dt.strftime("%Y-%m-%d")

    # The xlsx has one row per (date, machine, ROI) — collapse to unique sessions.
    # Rows missing a date or machine name cannot identify a session, and a NaN
    # inside a key would make the sorted() call in the report below fail.
    meta_sessions = (
        meta[["session_date", "machine_name"]]
        .dropna()
        .drop_duplicates()
        .reset_index(drop=True)
    )
    print(f" xlsx contains {len(meta_sessions)} unique (date, machine) sessions")

    # Mark which video sessions are referenced by the xlsx
    xlsx_keys = set(zip(meta_sessions["session_date"], meta_sessions["machine_name"]))
    video_keys = list(zip(videos_df["session_date"], videos_df["machine_name"]))
    # Explicit bool dtype keeps `&` / `~` below valid even for an empty table.
    videos_df["in_xlsx"] = pd.Series(
        [key in xlsx_keys for key in video_keys],
        index=videos_df.index,
        dtype=bool,
    )

    # Mark which already have tracking DBs in TRACKING_OUTPUT_DIR
    tracked = already_tracked_set(TRACKING_OUTPUT_DIR)
    session_keys = zip(videos_df["session_date"], videos_df["session_time"])
    videos_df["already_tracked"] = pd.Series(
        [key in tracked for key in session_keys],
        index=videos_df.index,
        dtype=bool,
    )

    INVENTORY_CSV.parent.mkdir(parents=True, exist_ok=True)
    videos_df.sort_values(["session_date", "machine_name", "session_time"]).to_csv(
        INVENTORY_CSV, index=False
    )

    # Coverage report
    in_xlsx = videos_df["in_xlsx"]
    needed = videos_df[in_xlsx & ~videos_df["already_tracked"]]
    n_xlsx_sessions = len(meta_sessions)

    # xlsx sessions that do / do not have a video on disk
    found_keys = xlsx_keys.intersection(video_keys)
    n_with_video = len(found_keys)
    missing = sorted(xlsx_keys - found_keys)

    print()
    print("=" * 70)
    print(f"Wrote inventory: {INVENTORY_CSV}")
    print(f" total video sessions on disk: {len(videos_df)}")
    print(f" xlsx unique sessions: {n_xlsx_sessions}")
    print(f" xlsx sessions with video: {n_with_video}")
    print(f" xlsx sessions missing video: {len(missing)}")
    print(f" already tracked (DB exists): {videos_df['already_tracked'].sum()}")
    print(f" TO TRACK (in_xlsx & ~tracked, video instances): {len(needed)}")

    if missing:
        print()
        print("xlsx sessions with NO matching video on disk:")
        # Cap the listing so a large gap does not flood the terminal.
        for d, m in missing[:20]:
            print(f" {d} {m}")
        if len(missing) > 20:
            print(f" ... and {len(missing) - 20} more")
|
|
|
|
|
|
# Build the inventory only when run as a script, not when imported.
if __name__ == "__main__":
    main()
|