Unify analysis pipeline around the TSV; move tracked DBs out of cloud sync

- Tracked DBs now live at /mnt/data/projects/cupido/tracked/ (out of
  ownCloud to avoid sync conflicts and bandwidth churn). config.py
  TRACKING_OUTPUT_DIR points there; the docker-compose for ethoscope-lab
  mounts it world-readable for JupyterHub users.
- New scripts/export_video_db_index.py joins all_video_info_merged.xlsx
  with the video inventory and the on-disk DBs, producing a TSV that has
  one row per fly/ROI plus training/testing video and DB paths. Handles
  approximate xlsx times, cross-day training/testing, the 12 AM/PM
  ambiguity, and date typos.
- scripts/load_roi_data.py rewritten as a TSV-driven loader returning a
  single DataFrame with session and metadata columns. calculate_distances
  and the two flies_analysis notebooks migrated to use it; downstream
  trained/naive splits remain available via simple equality filters.
- Metadata vocabulary canonicalized: the variants {naïve, niave, untrained,
  test} each resolve to one of the canonical values {trained, naive}.
  Normalization happens at the TSV-export boundary (idempotent); the xlsx
  and the 2025-07-15 legacy CSV were edited in place to remove the worst
  variants.
- scripts/monitor_tracking.py rate calculation fixed: with N parallel
  workers, completions arrive in bursts; the old formula divided by burst
  width and reported nonsense rates. Now uses a 6 h window denominator.
- scripts/track_videos.py: BGRMovieCamera retries cv2.read on transient
  NFS hiccups, and a post-tracking completeness gate (≥ 90 % of expected
  duration via MAX(t) across all 6 ROIs) deletes silent partial DBs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Giorgio Gilestro 2026-04-30 15:20:14 +01:00
parent e4da7691d5
commit f60a9d0530
13 changed files with 569 additions and 237 deletions

View file

@ -1,90 +1,113 @@
import pandas as pd
"""Load ROI tracking data from all sessions into one DataFrame.
Drives off the merged TSV (one row per ROI/fly across training + testing
phases). For each TSV row, opens the corresponding tracking DB and pulls
the matching ROI table, then attaches the experimental metadata.
The TSV is the single source of truth for what data exists and how it
maps to flies and conditions.
"""
import sqlite3
import re
from pathlib import Path
from config import DATA_RAW, DATA_METADATA, DATA_PROCESSED
import pandas as pd
from config import VIDEO_INFO_XLSX
def load_roi_data():
"""Load ROI data from SQLite databases and group by trained/untrained.
# Metadata columns copied from each TSV row onto every tracking sample
# loaded for that row (see load_roi_data). These are the xlsx fields that
# describe the experimental condition behind each fly/ROI.
# Reason: "roi" is deliberately absent here — load_roi_data stamps it
# separately as an uppercase "ROI" column, for backwards compatibility
# with the existing analysis pipeline (calculate_distances.py, notebooks).
_META_COLS = (
    "date",
    "machine_name",
    "species",
    "male",
    "training_date_time",
    "testing_date_time",
    "training_length_hr",
    "consolidation_length_hr",
    "memory",
    "age",
)
def _open_ro(db_path: str, cache: dict) -> sqlite3.Connection | None:
"""Cached read-only sqlite connection. Returns None on failure."""
if not isinstance(db_path, str) or not db_path:
return None
if db_path not in cache:
try:
cache[db_path] = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
except sqlite3.Error as e:
print(f"failed to open {Path(db_path).name}: {e}")
cache[db_path] = None
return cache[db_path]
def load_roi_data(meta: pd.DataFrame | None = None) -> pd.DataFrame:
    """Load ROI tracking data joined with experimental metadata.

    For each row in ``meta``, reads the matching ROI table from both the
    training DB and the testing DB (whichever exist), and stamps every
    sample with the row's metadata plus a ``session`` column
    (``"training"`` or ``"testing"``). Rows with empty DB paths (unusable
    videos, or videos that didn't pass the completeness gate) are skipped.

    Args:
        meta: optional DataFrame with the same schema as
            ``all_video_info_merged.tsv``. Pass a filtered slice to load a
            subset (e.g. ``meta[meta.species == 'Melanogaster/CS']``).
            Defaults to the full TSV.

    Returns:
        DataFrame with columns ``id, t, x, y, w, h, phi, is_inferred,
        has_interacted, session, ROI, <metadata>`` one row per tracking
        sample. Empty if nothing could be loaded.
    """
    if meta is None:
        meta = pd.read_csv(VIDEO_INFO_XLSX.with_suffix(".tsv"), sep="\t")

    # Path -> connection (or None after a failed open), so a DB shared by
    # several ROI rows is opened only once.
    db_cache: dict = {}
    chunks: list[pd.DataFrame] = []

    try:
        for row in meta.itertuples(index=False):
            for session in ("training", "testing"):
                conn = _open_ro(getattr(row, f"{session}_db_path"), db_cache)
                if conn is None:
                    continue
                try:
                    roi = int(row.roi)
                    df = pd.read_sql_query(f"SELECT * FROM ROI_{roi}", conn)
                except Exception as e:
                    # Reason: a DB may be missing a ROI table if tracking was
                    # partial — skip rather than abort the whole batch.
                    print(f" ROI_{row.roi} from {session} DB: {e}")
                    continue
                df["session"] = session
                df["ROI"] = roi
                for col in _META_COLS:
                    df[col] = getattr(row, col)
                chunks.append(df)
    finally:
        # Close every cached connection even when the loop raises (e.g. a
        # ``meta`` slice lacking an expected column), so no handle leaks.
        for conn in db_cache.values():
            if conn is not None:
                conn.close()

    return pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()
if __name__ == "__main__":
    # Smoke test: load everything listed in the TSV and print a summary.
    data = load_roi_data()
    print(f"shape: {data.shape}")
    if not data.empty:
        print(f"columns: {list(data.columns)}")
        print(f"sessions: {data['session'].value_counts().to_dict()}")
        print(f"unique machines: {data['machine_name'].nunique()}")
        # Reason: load_roi_data stamps the ROI number as an uppercase "ROI"
        # column and its documented output has no lowercase "roi" column,
        # so grouping on "roi" would fail with KeyError.
        print(
            f"unique flies (date,machine,roi): "
            f"{data.groupby(['date', 'machine_name', 'ROI']).ngroups}"
        )