Move metadata xlsx/TSV to /mnt/data/projects/cupido/

Consolidates everything bulky (tracking DBs, target JSONs, and the metadata
spreadsheet) under a single DATA_VOLUME root outside the ownCloud-synced
repo. Notebooks now use a visible DATA_DIR = Path(...) idiom rather than
walking up the filesystem with PROJECT_ROOT.parent, which is easier for
students with no Python background to follow.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Giorgio Gilestro 2026-05-01 08:47:15 +01:00
parent ec56e51bf9
commit f176224150
8 changed files with 102 additions and 160 deletions

View file

@ -2,21 +2,26 @@
from pathlib import Path
# Where this code repository lives (the directory containing scripts/, notebooks/, ...).
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DATA_RAW = PROJECT_ROOT / "data" / "raw"
DATA_METADATA = PROJECT_ROOT / "data" / "metadata"
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"
FIGURES = PROJECT_ROOT / "figures"
# Offline-tracking pipeline paths
VIDEOS_ROOT = Path("/mnt/ethoscope_data/videos")
VIDEO_INFO_XLSX = PROJECT_ROOT.parent / "all_video_info_merged.xlsx"
INVENTORY_CSV = DATA_METADATA / "video_inventory.csv"
# Reason: kept on the local data volume alongside the tracking DBs (out of
# ownCloud sync). See TRACKING_OUTPUT_DIR comment below.
TARGETS_DIR = Path("/mnt/data/projects/cupido/targets")
# Reason: tracking DBs are large binary files that don't belong in
# ownCloud-synced storage (sync conflicts + bandwidth). They live on the
# local data volume instead. Regenerable from videos + target JSONs.
TRACKING_OUTPUT_DIR = Path("/mnt/data/projects/cupido/tracked")
LOGS_DIR = PROJECT_ROOT / "data" / "logs"
# Where the source videos live (read-only NFS mount).
VIDEOS_ROOT = Path("/mnt/ethoscope_data/videos")
# Where the project's bulky data lives — outside the ownCloud-synced repo so
# it doesn't churn the cloud sync. This single root holds everything that's
# big or regenerable: tracking DBs, target-point JSONs, and the metadata
# spreadsheet (xlsx + TSV).
DATA_VOLUME = Path("/mnt/data/projects/cupido")
TARGETS_DIR = DATA_VOLUME / "targets"
TRACKING_OUTPUT_DIR = DATA_VOLUME / "tracked"
VIDEO_INFO_XLSX = DATA_VOLUME / "all_video_info_merged.xlsx"
VIDEO_INFO_TSV = DATA_VOLUME / "all_video_info_merged.tsv"
# A small CSV listing every video file we know about (built locally).
INVENTORY_CSV = DATA_METADATA / "video_inventory.csv"

View file

@ -26,7 +26,7 @@ from pathlib import Path
import pandas as pd
from config import INVENTORY_CSV, TRACKING_OUTPUT_DIR, VIDEO_INFO_XLSX
from config import INVENTORY_CSV, TRACKING_OUTPUT_DIR, VIDEO_INFO_TSV, VIDEO_INFO_XLSX
_TIME_RE = re.compile(r"^(\d{8})_(\d{1,2})(\d{2})?(AM|PM)$", re.IGNORECASE)
@ -138,7 +138,7 @@ def main() -> None:
parser.add_argument(
"--out",
type=Path,
default=VIDEO_INFO_XLSX.with_suffix(".tsv"),
default=VIDEO_INFO_TSV,
help="output TSV path (default: alongside the xlsx)",
)
args = parser.parse_args()

View file

@ -13,7 +13,7 @@ from pathlib import Path
import pandas as pd
from config import VIDEO_INFO_XLSX
from config import VIDEO_INFO_TSV
# Metadata columns to copy onto every tracking sample. These are the xlsx
@ -68,7 +68,7 @@ def load_roi_data(meta: pd.DataFrame | None = None) -> pd.DataFrame:
sample. Empty if nothing could be loaded.
"""
if meta is None:
meta = pd.read_csv(VIDEO_INFO_XLSX.with_suffix(".tsv"), sep="\t")
meta = pd.read_csv(VIDEO_INFO_TSV, sep="\t")
db_cache: dict = {}
chunks: list[pd.DataFrame] = []