Remove data/raw/ entirely — all bulky data now lives under /mnt/data/projects/cupido/

Deleted the five stale pre-pipeline tracking DBs and the data/raw/ directory.
Dropped DATA_RAW from config.py; build_video_inventory now scans
TRACKING_OUTPUT_DIR for already-tracked sessions. Notebooks no longer
import DATA_RAW. README, PLANNING, and todo were updated to reflect that
the repo holds only code and small curated metadata, never bulky DBs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Giorgio Gilestro 2026-05-01 09:20:25 +01:00
parent 9f3ee24a23
commit 23050360ea
9 changed files with 37 additions and 70 deletions

View file

@@ -16,7 +16,7 @@ from pathlib import Path
import pandas as pd
from config import DATA_RAW, INVENTORY_CSV, VIDEO_INFO_XLSX, VIDEOS_ROOT
from config import INVENTORY_CSV, TRACKING_OUTPUT_DIR, VIDEO_INFO_XLSX, VIDEOS_ROOT
SESSION_RE = re.compile(r"^(\d{4}-\d{2}-\d{2})_(\d{2}-\d{2}-\d{2})$")
@@ -64,14 +64,14 @@ def scan_videos(videos_root: Path) -> pd.DataFrame:
return pd.DataFrame(rows)
def already_tracked_set(data_raw: Path) -> set[tuple[str, str]]:
def already_tracked_set(tracked_dir: Path) -> set[tuple[str, str]]:
"""Return the set of (date, time) sessions for which a tracking DB exists.
DBs are named like:
2025-07-15_16-03-10_<uuid>__1920x1088@25fps-28q_merged_tracking.db
"""
out = set()
for db in data_raw.glob("*_tracking.db"):
for db in tracked_dir.glob("*_tracking.db"):
m = re.match(r"^(\d{4}-\d{2}-\d{2})_(\d{2}-\d{2}-\d{2})_", db.name)
if m:
out.add((m.group(1), m.group(2)))
@@ -99,8 +99,8 @@ def main() -> None:
lambda r: (r["session_date"], r["machine_name"]) in xlsx_keys, axis=1
)
# Mark which already have tracking DBs in data/raw/
tracked = already_tracked_set(DATA_RAW)
# Mark which already have tracking DBs in TRACKING_OUTPUT_DIR
tracked = already_tracked_set(TRACKING_OUTPUT_DIR)
videos_df["already_tracked"] = videos_df.apply(
lambda r: (r["session_date"], r["session_time"]) in tracked, axis=1
)