Move personal TSV into repo's data/metadata/ folder

Personal copy of all_video_info_merged.tsv now lives at
~/cupido/data/metadata/all_video_info_merged.tsv (gitignored) instead
of ~/cupido_metadata.tsv. That sits next to the other small metadata
CSVs (barrier_opening, etc.) — the natural home for it. Updated all
five notebooks and processed/README accordingly.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Giorgio Gilestro 2026-05-01 09:30:22 +01:00
parent f08e4b843d
commit ac3b8c13f0
7 changed files with 14 additions and 10 deletions

View file

@ -10,7 +10,7 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "import sys\nfrom pathlib import Path\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom scipy.spatial.distance import euclidean\nfrom scipy import stats\n\n# ─── Where the data lives ────────────────────────────────────────────────\n# DATA_DIR holds everything bulky/regenerable: the metadata TSV and the\n# tracking SQLite DBs. It's mounted into the container at this fixed path.\n# REPO_ROOT is your checkout of the cupido repo, in your home directory.\n# Path.home() expands to /home/<your-username>, so this works for any\n# user (no hard-coded usernames).\nDATA_DIR = Path(\"/mnt/data/projects/cupido\")\nREPO_ROOT = Path.home() / \"cupido\"\n\nTRACKED_DBS = DATA_DIR / \"tracked\"\n\n# ─── The metadata TSV — shared master vs. your personal copy ─────────────\n# DATA_DIR is mounted read-only inside the container, so the shared TSV\n# at SHARED_TSV cannot be edited. Fine for read-only analysis. But if\n# you want to flip `include` flags (or otherwise customize the metadata\n# for your own analysis), copy it to your home folder ONCE:\n#\n# $ cp /mnt/data/projects/cupido/all_video_info_merged.tsv ~/cupido_metadata.tsv\n#\n# After that, the auto-select line below will pick up your personal copy\n# automatically. Other users are unaffected.\nSHARED_TSV = DATA_DIR / \"all_video_info_merged.tsv\"\nPERSONAL_TSV = Path.home() / \"cupido_metadata.tsv\"\nMETADATA_TSV = PERSONAL_TSV if PERSONAL_TSV.exists() else SHARED_TSV\n\n# Sanity-check the data location up front so any failure here points at\n# the obvious thing — rather than crashing inside load_roi_data later.\nassert METADATA_TSV.exists(), f\"Metadata TSV not found at {METADATA_TSV}\"\nassert TRACKED_DBS.is_dir(), f\"Tracked-DB directory not found at {TRACKED_DBS}\"\n\n# Pull the in-repo path constants (DATA_METADATA, DATA_PROCESSED, FIGURES)\n# from scripts/config.py — single source of truth.\nsys.path.insert(0, str(REPO_ROOT / \"scripts\"))\nfrom config import DATA_METADATA, DATA_PROCESSED, FIGURES\n\n# Plotting style\nplt.style.use('seaborn-v0_8')\nsns.set_palette(\"husl\")\n\nprint(f\"Data directory: {DATA_DIR}\")\nprint(f\"Repo root: {REPO_ROOT}\")\nprint(f\"Metadata TSV: {METADATA_TSV} ({'personal' if METADATA_TSV == PERSONAL_TSV else 'shared (read-only)'})\")\nprint(f\"Pandas version: {pd.__version__}\")\nprint(f\"NumPy version: {np.__version__}\")\n"
"source": "import sys\nfrom pathlib import Path\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom scipy.spatial.distance import euclidean\nfrom scipy import stats\n\n# ─── Where the data lives ────────────────────────────────────────────────\n# DATA_DIR holds everything bulky/regenerable: the metadata TSV and the\n# tracking SQLite DBs. It's mounted into the container at this fixed path.\n# REPO_ROOT is your checkout of the cupido repo, in your home directory.\n# Path.home() expands to /home/<your-username>, so this works for any\n# user (no hard-coded usernames).\nDATA_DIR = Path(\"/mnt/data/projects/cupido\")\nREPO_ROOT = Path.home() / \"cupido\"\n\nTRACKED_DBS = DATA_DIR / \"tracked\"\n\n# ─── The metadata TSV — shared master vs. your personal copy ─────────────\n# DATA_DIR is mounted read-only inside the container, so the shared TSV\n# at SHARED_TSV cannot be edited. Fine for read-only analysis. But if\n# you want to flip `include` flags (or otherwise customize the metadata\n# for your own analysis), copy it to your repo's data/metadata/ ONCE:\n#\n# $ cp /mnt/data/projects/cupido/all_video_info_merged.tsv ~/cupido/data/metadata/\n#\n# That location is gitignored, so your edits won't pollute the repo and\n# other users are unaffected. The auto-select line below picks up your\n# personal copy automatically once it's there.\nSHARED_TSV = DATA_DIR / \"all_video_info_merged.tsv\"\nPERSONAL_TSV = REPO_ROOT / \"data\" / \"metadata\" / \"all_video_info_merged.tsv\"\nMETADATA_TSV = PERSONAL_TSV if PERSONAL_TSV.exists() else SHARED_TSV\n\n# Sanity-check the data location up front so any failure here points at\n# the obvious thing — rather than crashing inside load_roi_data later.\nassert METADATA_TSV.exists(), f\"Metadata TSV not found at {METADATA_TSV}\"\nassert TRACKED_DBS.is_dir(), f\"Tracked-DB directory not found at {TRACKED_DBS}\"\n\n# Pull the in-repo path constants (DATA_METADATA, DATA_PROCESSED, FIGURES)\n# from scripts/config.py — single source of truth.\nsys.path.insert(0, str(REPO_ROOT / \"scripts\"))\nfrom config import DATA_METADATA, DATA_PROCESSED, FIGURES\n\n# Plotting style\nplt.style.use('seaborn-v0_8')\nsns.set_palette(\"husl\")\n\nprint(f\"Data directory: {DATA_DIR}\")\nprint(f\"Repo root: {REPO_ROOT}\")\nprint(f\"Metadata TSV: {METADATA_TSV} ({'personal' if METADATA_TSV == PERSONAL_TSV else 'shared (read-only)'})\")\nprint(f\"Pandas version: {pd.__version__}\")\nprint(f\"NumPy version: {np.__version__}\")\n"
},
{
"cell_type": "markdown",