Remove data/raw/ entirely — all bulky data now under /mnt/data/projects/cupido/

Deleted the 5 stale pre-pipeline tracking DBs and the data/raw/ directory.
Dropped DATA_RAW from config.py; build_video_inventory now scans
TRACKING_OUTPUT_DIR for already-tracked sessions. Notebooks no longer
import DATA_RAW. README, PLANNING and todo updated to reflect that the
repo holds only code + small curated metadata, never bulky DBs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Giorgio Gilestro 2026-05-01 09:20:25 +01:00
parent 9f3ee24a23
commit 23050360ea
9 changed files with 37 additions and 70 deletions

View file

@ -14,7 +14,7 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "import sys\nfrom pathlib import Path\n\nimport pandas as pd\nimport numpy as np\nimport sqlite3\nimport glob\nimport re\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom scipy.spatial.distance import euclidean\nfrom scipy import stats\n\n# ─── Where the data lives ────────────────────────────────────────────────\n# DATA_DIR holds everything bulky/regenerable: the metadata TSV and the\n# tracking SQLite DBs. It's mounted into the container at this fixed path.\n# REPO_ROOT is your checkout of the cupido repo, in your home directory.\n# Path.home() expands to /home/<your-username>, so this works for any\n# user (no hard-coded usernames).\nDATA_DIR = Path(\"/mnt/data/projects/cupido\")\nREPO_ROOT = Path.home() / \"cupido\"\n\nMETADATA_TSV = DATA_DIR / \"all_video_info_merged.tsv\"\nTRACKED_DBS = DATA_DIR / \"tracked\"\n\n# Sanity-check the data location up front so any failure here points at\n# the obvious thing — rather than crashing inside load_roi_data later.\nassert METADATA_TSV.exists(), f\"Metadata TSV not found at {METADATA_TSV}\"\nassert TRACKED_DBS.is_dir(), f\"Tracked-DB directory not found at {TRACKED_DBS}\"\n\n# Pull the in-repo path constants (DATA_RAW, DATA_METADATA, DATA_PROCESSED,\n# FIGURES) from scripts/config.py — single source of truth.\nsys.path.insert(0, str(REPO_ROOT / \"scripts\"))\nfrom config import DATA_RAW, DATA_METADATA, DATA_PROCESSED, FIGURES\n\n# Plotting style\nplt.style.use('seaborn-v0_8')\nsns.set_palette(\"husl\")\n"
"source": "import sys\nfrom pathlib import Path\n\nimport pandas as pd\nimport numpy as np\nimport sqlite3\nimport glob\nimport re\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom scipy.spatial.distance import euclidean\nfrom scipy import stats\n\n# ─── Where the data lives ────────────────────────────────────────────────\n# DATA_DIR holds everything bulky/regenerable: the metadata TSV and the\n# tracking SQLite DBs. It's mounted into the container at this fixed path.\n# REPO_ROOT is your checkout of the cupido repo, in your home directory.\n# Path.home() expands to /home/<your-username>, so this works for any\n# user (no hard-coded usernames).\nDATA_DIR = Path(\"/mnt/data/projects/cupido\")\nREPO_ROOT = Path.home() / \"cupido\"\n\nMETADATA_TSV = DATA_DIR / \"all_video_info_merged.tsv\"\nTRACKED_DBS = DATA_DIR / \"tracked\"\n\n# Sanity-check the data location up front so any failure here points at\n# the obvious thing — rather than crashing inside load_roi_data later.\nassert METADATA_TSV.exists(), f\"Metadata TSV not found at {METADATA_TSV}\"\nassert TRACKED_DBS.is_dir(), f\"Tracked-DB directory not found at {TRACKED_DBS}\"\n\n# Pull the in-repo path constants (DATA_METADATA, DATA_PROCESSED, FIGURES)\n# from scripts/config.py — single source of truth.\nsys.path.insert(0, str(REPO_ROOT / \"scripts\"))\nfrom config import DATA_METADATA, DATA_PROCESSED, FIGURES\n\n# Plotting style\nplt.style.use('seaborn-v0_8')\nsns.set_palette(\"husl\")\n"
},
{
"cell_type": "markdown",