Unify analysis pipeline around the TSV; move tracked DBs out of cloud sync

- Tracked DBs now live at /mnt/data/projects/cupido/tracked/ (out of ownCloud to avoid sync conflicts and bandwidth churn). config.py TRACKING_OUTPUT_DIR points there; the docker-compose for ethoscope-lab mounts it world-readable for JupyterHub users.
- New scripts/export_video_db_index.py joins all_video_info_merged.xlsx with the video inventory and the on-disk DBs, producing a TSV that has one row per fly/ROI plus training/testing video and DB paths. Handles approximate xlsx times, cross-day training/testing, the 12 AM/PM ambiguity, and date typos.
- scripts/load_roi_data.py rewritten as a TSV-driven loader returning a single DataFrame with session and metadata columns. calculate_distances and the two flies_analysis notebooks migrated to use it; downstream trained/naive splits remain available via simple equality filters.
- Metadata vocabulary canonicalized: {naïve, niave, untrained, test} all resolve to {trained, naive}. Normalization happens at the TSV-export boundary (idempotent); the xlsx and the 2025-07-15 legacy CSV were edited in place to remove the worst variants.
- scripts/monitor_tracking.py rate calculation fixed: with N parallel workers, completions arrive in bursts; the old formula divided by burst width and reported nonsense rates. Now uses a 6 h window denominator.
- scripts/track_videos.py: BGRMovieCamera retries cv2.read on transient NFS hiccups and a post-tracking completeness gate (≥ 90 % of expected duration via MAX(t) across all 6 ROIs) deletes silent partial DBs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-30 15:20:14 +01:00 · 2026-04-30 15:20:14 +01:00 · f60a9d0530
commit f60a9d0530
parent e4da7691d5
13 changed files with 569 additions and 237 deletions
--- a/scripts/load_roi_data.py
+++ b/scripts/load_roi_data.py
@@ -1,90 +1,113 @@
-import pandas as pd
+"""Load ROI tracking data from all sessions into one DataFrame.
+
+Drives off the merged TSV (one row per ROI/fly across training + testing
+phases). For each TSV row, opens the corresponding tracking DB and pulls
+the matching ROI table, then attaches the experimental metadata.
+
+The TSV is the single source of truth for what data exists and how it
+maps to flies and conditions.
+"""
+
 import sqlite3
-import re
+from pathlib import Path

-from config import DATA_RAW, DATA_METADATA, DATA_PROCESSED
+import pandas as pd
+
+from config import VIDEO_INFO_XLSX


-def load_roi_data():
-    """Load ROI data from SQLite databases and group by trained/untrained.
+# Metadata columns to copy onto every tracking sample. These are the xlsx
+# fields that describe the experimental condition behind each fly/ROI.
+# "ROI" is not listed: load_roi_data() sets it separately, in uppercase, for
+# backwards compatibility with calculate_distances.py and the notebooks.
+_META_COLS = (
+    "date",
+    "machine_name",
+    "species",
+    "male",
+    "training_date_time",
+    "testing_date_time",
+    "training_length_hr",
+    "consolidation_length_hr",
+    "memory",
+    "age",
+)
+
+
+def _open_ro(db_path: str, cache: dict) -> sqlite3.Connection | None:
+    """Cached read-only connection for db_path; None if empty/non-str or unopenable."""
+    if not isinstance(db_path, str) or not db_path:
+        return None
+    if db_path not in cache:
+        try:
+            cache[db_path] = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
+        except sqlite3.Error as e:
+            print(f"failed to open {Path(db_path).name}: {e}")
+            cache[db_path] = None
+    return cache[db_path]
+
+
+def load_roi_data(meta: pd.DataFrame | None = None) -> pd.DataFrame:
+    """Load ROI tracking data joined with experimental metadata.
+
+    For each row in ``meta``, reads the matching ROI table from both the
+    training DB and the testing DB (whichever exist), and stamps every
+    sample with the row's metadata plus a ``session`` column
+    (``"training"`` or ``"testing"``). Rows with empty DB paths (unusable
+    videos, or videos that didn't pass the completeness gate) are skipped.
+
+    Args:
+        meta: optional DataFrame with the same schema as
+            ``all_video_info_merged.tsv``. Pass a filtered slice to load a
+            subset (e.g. ``meta[meta.species == 'Melanogaster/CS']``).
+            Defaults to the full TSV.

    Returns:
-        tuple: (trained_df, untrained_df) DataFrames with tracking data.
+        DataFrame with columns ``id, t, x, y, w, h, phi, is_inferred,
+        has_interacted, session, ROI, <metadata>`` — one row per tracking
+        sample. Empty if nothing could be loaded.
    """
-    metadata = pd.read_csv(DATA_METADATA / '2025_07_15_metadata_fixed.csv')
-    metadata['machine_name'] = metadata['machine_name'].astype(str)
+    if meta is None:
+        meta = pd.read_csv(VIDEO_INFO_XLSX.with_suffix(".tsv"), sep="\t")

-    trained_rois = metadata[metadata['group'] == 'trained']
-    untrained_rois = metadata[metadata['group'] == 'untrained']
+    db_cache: dict = {}
+    chunks: list[pd.DataFrame] = []

-    db_files = list(DATA_RAW.glob('*_tracking.db'))
-
-    trained_df = pd.DataFrame()
-    untrained_df = pd.DataFrame()
-
-    for db_file in db_files:
-        print(f"Processing {db_file.name}")
-
-        pattern = r'_([0-9a-f]{32})__'
-        match = re.search(pattern, db_file.name)
-
-        if not match:
-            print(f"Could not extract UUID from {db_file.name}")
-            continue
-
-        uuid = match.group(1)
-        metadata_matches = metadata[metadata['path'].str.contains(uuid, na=False)]
-
-        if metadata_matches.empty:
-            print(f"No metadata matches found for UUID {uuid} from {db_file.name}")
-            continue
-
-        machine_id = metadata_matches.iloc[0]['machine_name']
-        print(f"Matched to machine ID: {machine_id}")
-
-        conn = sqlite3.connect(str(db_file))
-
-        machine_trained = trained_rois[trained_rois['machine_name'] == machine_id]
-        machine_untrained = untrained_rois[untrained_rois['machine_name'] == machine_id]
-
-        for _, row in machine_trained.iterrows():
-            roi = row['ROI']
+    for row in meta.itertuples(index=False):
+        for session in ("training", "testing"):
+            conn = _open_ro(getattr(row, f"{session}_db_path"), db_cache)
+            if conn is None:
+                continue
            try:
-                query = f"SELECT * FROM ROI_{roi}"
-                roi_data = pd.read_sql_query(query, conn)
-                roi_data['machine_name'] = machine_id
-                roi_data['ROI'] = roi
-                roi_data['group'] = 'trained'
-                trained_df = pd.concat([trained_df, roi_data], ignore_index=True)
+                df = pd.read_sql_query(
+                    f"SELECT * FROM ROI_{int(row.roi)}", conn
+                )
            except Exception as e:
-                print(f"Error loading ROI_{roi} from {db_file.name}: {e}")
+                # Reason: a DB may be missing a ROI table if tracking was
+                # partial — skip rather than abort the whole batch.
+                print(f"  ROI_{row.roi} from {session} DB: {e}")
+                continue
+            df["session"] = session
+            df["ROI"] = int(row.roi)
+            for col in _META_COLS:
+                df[col] = getattr(row, col)
+            chunks.append(df)

-        for _, row in machine_untrained.iterrows():
-            roi = row['ROI']
-            try:
-                query = f"SELECT * FROM ROI_{roi}"
-                roi_data = pd.read_sql_query(query, conn)
-                roi_data['machine_name'] = machine_id
-                roi_data['ROI'] = roi
-                roi_data['group'] = 'untrained'
-                untrained_df = pd.concat([untrained_df, roi_data], ignore_index=True)
-            except Exception as e:
-                print(f"Error loading ROI_{roi} from {db_file.name}: {e}")
+    for conn in db_cache.values():
+        if conn is not None:
+            conn.close()

-        conn.close()
-
-    return trained_df, untrained_df
+    return pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()


 if __name__ == "__main__":
-    trained_data, untrained_data = load_roi_data()
-    print(f"Trained data shape: {trained_data.shape}")
-    print(f"Untrained data shape: {untrained_data.shape}")
-    if not trained_data.empty:
-        print("Trained data columns:", trained_data.columns.tolist())
-    if not untrained_data.empty:
-        print("Untrained data columns:", untrained_data.columns.tolist())
-
-    trained_data.to_csv(DATA_PROCESSED / 'trained_roi_data.csv', index=False)
-    untrained_data.to_csv(DATA_PROCESSED / 'untrained_roi_data.csv', index=False)
-    print("Data saved to trained_roi_data.csv and untrained_roi_data.csv")
+    data = load_roi_data()
+    print(f"shape: {data.shape}")
+    if not data.empty:
+        print(f"columns: {list(data.columns)}")
+        print(f"sessions: {data['session'].value_counts().to_dict()}")
+        print(f"unique machines: {data['machine_name'].nunique()}")
+        # load_roi_data() stores the ROI index in the uppercase "ROI" column;
+        # the frame has no lowercase "roi" key, so grouping on it raises KeyError.
+        n_flies = data.groupby(["date", "machine_name", "ROI"]).ngroups
+        print(f"unique flies (date,machine,roi): {n_flies}")