Move metadata xlsx/TSV to /mnt/data/projects/cupido/

Consolidates everything bulky (tracking DBs, targets, metadata spreadsheet) under a single DATA_VOLUME root outside the ownCloud-synced repo. Notebooks now use a visible DATA_DIR = Path(...) idiom rather than walking up the filesystem with PROJECT_ROOT.parent — easier for students with no Python background to follow.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-01 08:47:15 +01:00 · 2026-05-01 08:47:15 +01:00 · f176224150
commit f176224150
parent ec56e51bf9
8 changed files with 102 additions and 160 deletions
--- a/scripts/config.py
+++ b/scripts/config.py
@@ -2,21 +2,26 @@

 from pathlib import Path

+# Where this code repository lives (the directory containing scripts/, notebooks/, ...).
 PROJECT_ROOT = Path(__file__).resolve().parent.parent
 DATA_RAW = PROJECT_ROOT / "data" / "raw"
 DATA_METADATA = PROJECT_ROOT / "data" / "metadata"
 DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"
 FIGURES = PROJECT_ROOT / "figures"
-
-# Offline-tracking pipeline paths
-VIDEOS_ROOT = Path("/mnt/ethoscope_data/videos")
-VIDEO_INFO_XLSX = PROJECT_ROOT.parent / "all_video_info_merged.xlsx"
-INVENTORY_CSV = DATA_METADATA / "video_inventory.csv"
-# Reason: kept on the local data volume alongside the tracking DBs (out of
-# ownCloud sync). See TRACKING_OUTPUT_DIR comment below.
-TARGETS_DIR = Path("/mnt/data/projects/cupido/targets")
-# Reason: tracking DBs are large binary files that don't belong in
-# ownCloud-synced storage (sync conflicts + bandwidth). They live on the
-# local data volume instead. Regenerable from videos + target JSONs.
-TRACKING_OUTPUT_DIR = Path("/mnt/data/projects/cupido/tracked")
 LOGS_DIR = PROJECT_ROOT / "data" / "logs"
+
+# Where the source videos live (read-only NFS mount).
+VIDEOS_ROOT = Path("/mnt/ethoscope_data/videos")
+
+# Where the project's bulky data lives — outside the ownCloud-synced repo so
+# it doesn't churn the cloud sync. This single root holds everything that's
+# big or regenerable: tracking DBs, target-point JSONs, and the metadata
+# spreadsheet (xlsx + TSV).
+DATA_VOLUME = Path("/mnt/data/projects/cupido")
+TARGETS_DIR = DATA_VOLUME / "targets"
+TRACKING_OUTPUT_DIR = DATA_VOLUME / "tracked"
+VIDEO_INFO_XLSX = DATA_VOLUME / "all_video_info_merged.xlsx"
+VIDEO_INFO_TSV = DATA_VOLUME / "all_video_info_merged.tsv"
+
+# A small CSV listing every video file we know about (built locally).
+INVENTORY_CSV = DATA_METADATA / "video_inventory.csv"
--- a/scripts/export_video_db_index.py
+++ b/scripts/export_video_db_index.py
@@ -26,7 +26,7 @@ from pathlib import Path

 import pandas as pd

-from config import INVENTORY_CSV, TRACKING_OUTPUT_DIR, VIDEO_INFO_XLSX
+from config import INVENTORY_CSV, TRACKING_OUTPUT_DIR, VIDEO_INFO_TSV, VIDEO_INFO_XLSX


 _TIME_RE = re.compile(r"^(\d{8})_(\d{1,2})(\d{2})?(AM|PM)$", re.IGNORECASE)
@@ -138,7 +138,7 @@ def main() -> None:
    parser.add_argument(
        "--out",
        type=Path,
-        default=VIDEO_INFO_XLSX.with_suffix(".tsv"),
+        default=VIDEO_INFO_TSV,
        help="output TSV path (default: alongside the xlsx)",
    )
    args = parser.parse_args()
--- a/scripts/load_roi_data.py
+++ b/scripts/load_roi_data.py
@@ -13,7 +13,7 @@ from pathlib import Path

 import pandas as pd

-from config import VIDEO_INFO_XLSX
+from config import VIDEO_INFO_TSV


 # Metadata columns to copy onto every tracking sample. These are the xlsx
@@ -68,7 +68,7 @@ def load_roi_data(meta: pd.DataFrame | None = None) -> pd.DataFrame:
        sample. Empty if nothing could be loaded.
    """
    if meta is None:
-        meta = pd.read_csv(VIDEO_INFO_XLSX.with_suffix(".tsv"), sep="\t")
+        meta = pd.read_csv(VIDEO_INFO_TSV, sep="\t")

    db_cache: dict = {}
    chunks: list[pd.DataFrame] = []