Make data paths visible and explicit in flies_analysis notebooks

Define METADATA_TSV and TRACKED_DBS up front in cell 1, assert they exist before doing anything else, and pass the loaded metadata to load_roi_data() explicitly. Surfaces path problems immediately with a readable message instead of failing deep inside the loader. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-01 09:01:55 +01:00 · 2026-05-01 09:01:55 +01:00 · 723d1f3682
commit 723d1f3682
parent 231c7a437f
2 changed files with 4 additions and 4 deletions
--- a/notebooks/flies_analysis.ipynb
+++ b/notebooks/flies_analysis.ipynb
@ -14,7 +14,7 @@
   "execution_count": null,
   "metadata": {},
   "outputs": [],
-   "source": "import sys\nfrom pathlib import Path\n\nimport pandas as pd\nimport numpy as np\nimport sqlite3\nimport glob\nimport re\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom scipy.spatial.distance import euclidean\nfrom scipy import stats\n\n# Two locations to know about:\n#   - DATA_DIR  : where the project's bulky data lives (mounted into the container)\n#   - REPO_ROOT : where the code repository is checked out (your home directory)\n# Path.home() expands to the current user's home, so this works for any\n# user running the container (no hard-coded usernames).\nDATA_DIR  = Path(\"/mnt/data/projects/cupido\")\nREPO_ROOT = Path.home() / \"cupido\"\n\n# Pull every other path constant from scripts/config.py so this notebook\n# stays in sync with the rest of the codebase. (Reason: avoids drift when\n# paths change — config.py is the single source of truth.)\nsys.path.insert(0, str(REPO_ROOT / \"scripts\"))\nfrom config import DATA_RAW, DATA_METADATA, DATA_PROCESSED, FIGURES\n\n# Plotting style\nplt.style.use('seaborn-v0_8')\nsns.set_palette(\"husl\")\n"
+   "source": "import sys\nfrom pathlib import Path\n\nimport pandas as pd\nimport numpy as np\nimport sqlite3\nimport glob\nimport re\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom scipy.spatial.distance import euclidean\nfrom scipy import stats\n\n# ─── Where the data lives ────────────────────────────────────────────────\n# DATA_DIR holds everything bulky/regenerable: the metadata TSV and the\n# tracking SQLite DBs. It's mounted into the container at this fixed path.\n# REPO_ROOT is your checkout of the cupido repo, in your home directory.\n# Path.home() expands to /home/<your-username>, so this works for any\n# user (no hard-coded usernames).\nDATA_DIR  = Path(\"/mnt/data/projects/cupido\")\nREPO_ROOT = Path.home() / \"cupido\"\n\nMETADATA_TSV = DATA_DIR / \"all_video_info_merged.tsv\"\nTRACKED_DBS  = DATA_DIR / \"tracked\"\n\n# Sanity-check the data location up front so any failure here points at\n# the obvious thing — rather than crashing inside load_roi_data later.\nassert METADATA_TSV.exists(), f\"Metadata TSV not found at {METADATA_TSV}\"\nassert TRACKED_DBS.is_dir(),  f\"Tracked-DB directory not found at {TRACKED_DBS}\"\n\n# Pull the in-repo path constants (DATA_RAW, DATA_METADATA, DATA_PROCESSED,\n# FIGURES) from scripts/config.py — single source of truth.\nsys.path.insert(0, str(REPO_ROOT / \"scripts\"))\nfrom config import DATA_RAW, DATA_METADATA, DATA_PROCESSED, FIGURES\n\n# Plotting style\nplt.style.use('seaborn-v0_8')\nsns.set_palette(\"husl\")\n"
  },
  {
   "cell_type": "markdown",
@ -28,7 +28,7 @@
   "execution_count": null,
   "metadata": {},
   "outputs": [],
-   "source": "# Load tracking data via the unified loader (driven by all_video_info_merged.tsv).\nfrom load_roi_data import load_roi_data\n\ndata = load_roi_data()\ntrained_data   = data[data['male'] == 'trained'].copy()\nuntrained_data = data[data['male'] == 'naive'].copy()\n\nprint(f\"all data:  {data.shape}\")\nprint(f\"trained:   {trained_data.shape}\")\nprint(f\"naive:     {untrained_data.shape}\")\n"
+   "source": "# Load the metadata explicitly from METADATA_TSV (no hidden defaults), then\n# hand it to load_roi_data which opens each referenced tracking DB.\nfrom load_roi_data import load_roi_data\n\nmeta = pd.read_csv(METADATA_TSV, sep=\"\\t\")\nprint(f\"metadata rows: {len(meta)}\")\n\ndata = load_roi_data(meta)\ntrained_data   = data[data['male'] == 'trained'].copy()\nuntrained_data = data[data['male'] == 'naive'].copy()\n\nprint(f\"all data:  {data.shape}\")\nprint(f\"trained:   {trained_data.shape}\")\nprint(f\"naive:     {untrained_data.shape}\")\n"
  },
  {
   "cell_type": "markdown",
--- a/notebooks/flies_analysis_simple.ipynb
+++ b/notebooks/flies_analysis_simple.ipynb
@ -14,7 +14,7 @@
   "execution_count": null,
   "metadata": {},
   "outputs": [],
-   "source": "import sys\nfrom pathlib import Path\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom scipy.spatial.distance import euclidean\nfrom scipy import stats\n\n# Two locations to know about:\n#   - DATA_DIR  : where the project's bulky data lives (mounted into the container)\n#   - REPO_ROOT : where the code repository is checked out (your home directory)\n# Path.home() expands to the current user's home, so this works for any\n# user running the container (no hard-coded usernames).\nDATA_DIR  = Path(\"/mnt/data/projects/cupido\")\nREPO_ROOT = Path.home() / \"cupido\"\n\n# Pull every other path constant from scripts/config.py so this notebook\n# stays in sync with the rest of the codebase. (Reason: avoids drift when\n# paths change — config.py is the single source of truth.)\nsys.path.insert(0, str(REPO_ROOT / \"scripts\"))\nfrom config import DATA_RAW, DATA_METADATA, DATA_PROCESSED, FIGURES\n\n# Plotting style\nplt.style.use('seaborn-v0_8')\nsns.set_palette(\"husl\")\n\nprint(f\"Data directory: {DATA_DIR}\")\nprint(f\"Repo root:      {REPO_ROOT}\")\nprint(f\"Pandas version: {pd.__version__}\")\nprint(f\"NumPy version:  {np.__version__}\")\n"
+   "source": "import sys\nfrom pathlib import Path\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom scipy.spatial.distance import euclidean\nfrom scipy import stats\n\n# ─── Where the data lives ────────────────────────────────────────────────\n# DATA_DIR holds everything bulky/regenerable: the metadata TSV and the\n# tracking SQLite DBs. It's mounted into the container at this fixed path.\n# REPO_ROOT is your checkout of the cupido repo, in your home directory.\n# Path.home() expands to /home/<your-username>, so this works for any\n# user (no hard-coded usernames).\nDATA_DIR  = Path(\"/mnt/data/projects/cupido\")\nREPO_ROOT = Path.home() / \"cupido\"\n\nMETADATA_TSV = DATA_DIR / \"all_video_info_merged.tsv\"\nTRACKED_DBS  = DATA_DIR / \"tracked\"\n\n# Sanity-check the data location up front so any failure here points at\n# the obvious thing — rather than crashing inside load_roi_data later.\nassert METADATA_TSV.exists(), f\"Metadata TSV not found at {METADATA_TSV}\"\nassert TRACKED_DBS.is_dir(),  f\"Tracked-DB directory not found at {TRACKED_DBS}\"\n\n# Pull the in-repo path constants (DATA_RAW, DATA_METADATA, DATA_PROCESSED,\n# FIGURES) from scripts/config.py — single source of truth.\nsys.path.insert(0, str(REPO_ROOT / \"scripts\"))\nfrom config import DATA_RAW, DATA_METADATA, DATA_PROCESSED, FIGURES\n\n# Plotting style\nplt.style.use('seaborn-v0_8')\nsns.set_palette(\"husl\")\n\nprint(f\"Data directory: {DATA_DIR}\")\nprint(f\"Repo root:      {REPO_ROOT}\")\nprint(f\"Metadata TSV:   {METADATA_TSV}\")\nprint(f\"Pandas version: {pd.__version__}\")\nprint(f\"NumPy version:  {np.__version__}\")\n"
  },
  {
   "cell_type": "markdown",
@ -28,7 +28,7 @@
   "execution_count": null,
   "metadata": {},
   "outputs": [],
-   "source": "# Load tracking data via the unified loader (driven by all_video_info_merged.tsv).\nfrom load_roi_data import load_roi_data\n\ndata = load_roi_data()\ntrained_data   = data[data['male'] == 'trained'].copy()\nuntrained_data = data[data['male'] == 'naive'].copy()\n\nprint(f\"all data shape:    {data.shape}\")\nprint(f\"Trained data:      {trained_data.shape}\")\nprint(f\"Naive data:        {untrained_data.shape}\")\nprint(f\"Columns:           {list(trained_data.columns)}\")\n"
+   "source": "# Load the metadata explicitly from METADATA_TSV (no hidden defaults), then\n# hand it to load_roi_data which opens each referenced tracking DB.\nfrom load_roi_data import load_roi_data\n\nmeta = pd.read_csv(METADATA_TSV, sep=\"\\t\")\nprint(f\"metadata rows: {len(meta)}\")\n\ndata = load_roi_data(meta)\ntrained_data   = data[data['male'] == 'trained'].copy()\nuntrained_data = data[data['male'] == 'naive'].copy()\n\nprint(f\"all data shape:    {data.shape}\")\nprint(f\"Trained data:      {trained_data.shape}\")\nprint(f\"Naive data:        {untrained_data.shape}\")\nprint(f\"Columns:           {list(trained_data.columns)}\")\n"
  },
  {
   "cell_type": "markdown",