Remove data/raw/ entirely — all bulky data now under /mnt/data/projects/cupido/
Deleted the 5 stale pre-pipeline tracking DBs and the data/raw/ directory. Dropped DATA_RAW from config.py; build_video_inventory now scans TRACKING_OUTPUT_DIR for already-tracked sessions. Notebooks no longer import DATA_RAW. README, PLANNING, and todo were updated to reflect that the repo holds only code and small curated metadata, never bulky DBs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
9f3ee24a23
commit
23050360ea
9 changed files with 37 additions and 70 deletions
|
|
@ -16,7 +16,7 @@ from pathlib import Path
|
|||
|
||||
import pandas as pd
|
||||
|
||||
from config import DATA_RAW, INVENTORY_CSV, VIDEO_INFO_XLSX, VIDEOS_ROOT
|
||||
from config import INVENTORY_CSV, TRACKING_OUTPUT_DIR, VIDEO_INFO_XLSX, VIDEOS_ROOT
|
||||
|
||||
SESSION_RE = re.compile(r"^(\d{4}-\d{2}-\d{2})_(\d{2}-\d{2}-\d{2})$")
|
||||
|
||||
|
|
@ -64,14 +64,14 @@ def scan_videos(videos_root: Path) -> pd.DataFrame:
|
|||
return pd.DataFrame(rows)
|
||||
|
||||
|
||||
def already_tracked_set(data_raw: Path) -> set[tuple[str, str]]:
|
||||
def already_tracked_set(tracked_dir: Path) -> set[tuple[str, str]]:
|
||||
"""Return the set of (date, time) sessions for which a tracking DB exists.
|
||||
|
||||
DBs are named like:
|
||||
2025-07-15_16-03-10_<uuid>__1920x1088@25fps-28q_merged_tracking.db
|
||||
"""
|
||||
out = set()
|
||||
for db in data_raw.glob("*_tracking.db"):
|
||||
for db in tracked_dir.glob("*_tracking.db"):
|
||||
m = re.match(r"^(\d{4}-\d{2}-\d{2})_(\d{2}-\d{2}-\d{2})_", db.name)
|
||||
if m:
|
||||
out.add((m.group(1), m.group(2)))
|
||||
|
|
@ -99,8 +99,8 @@ def main() -> None:
|
|||
lambda r: (r["session_date"], r["machine_name"]) in xlsx_keys, axis=1
|
||||
)
|
||||
|
||||
# Mark which already have tracking DBs in data/raw/
|
||||
tracked = already_tracked_set(DATA_RAW)
|
||||
# Mark which already have tracking DBs in TRACKING_OUTPUT_DIR
|
||||
tracked = already_tracked_set(TRACKING_OUTPUT_DIR)
|
||||
videos_df["already_tracked"] = videos_df.apply(
|
||||
lambda r: (r["session_date"], r["session_time"]) in tracked, axis=1
|
||||
)
|
||||
|
|
|
|||
|
|
@ -5,7 +5,6 @@ from pathlib import Path
|
|||
|
||||
# Where this code repository lives (the directory containing scripts/, notebooks/, ...).
|
||||
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
||||
DATA_RAW = PROJECT_ROOT / "data" / "raw"
|
||||
DATA_METADATA = PROJECT_ROOT / "data" / "metadata"
|
||||
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"
|
||||
FIGURES = PROJECT_ROOT / "figures"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue