- Tracked DBs now live at /mnt/data/projects/cupido/tracked/ (out of
ownCloud to avoid sync conflicts and bandwidth churn). config.py
TRACKING_OUTPUT_DIR points there; the docker-compose for ethoscope-lab
mounts it world-readable for JupyterHub users.
- New scripts/export_video_db_index.py joins all_video_info_merged.xlsx
with the video inventory and the on-disk DBs, producing a TSV that has
one row per fly/ROI plus training/testing video and DB paths. Handles
approximate xlsx times, cross-day training/testing, the 12 AM/PM
ambiguity, and date typos.
- scripts/load_roi_data.py rewritten as a TSV-driven loader returning a
single DataFrame with session and metadata columns. calculate_distances
and the two flies_analysis notebooks migrated to use it; downstream
trained/naive splits remain available via simple equality filters.
- Metadata vocabulary canonicalized: {naïve, niave, untrained, test} all
resolve to {trained, naive}. Normalization happens at the TSV-export
boundary (idempotent); the xlsx and the 2025-07-15 legacy CSV were
edited in place to remove the worst variants.
- scripts/monitor_tracking.py rate calculation fixed: with N parallel
workers, completions arrive in bursts; the old formula divided by burst
width and reported nonsense rates. Now uses a 6 h window denominator.
- scripts/track_videos.py: BGRMovieCamera now retries cv2.read on transient
NFS hiccups, and a post-tracking completeness gate (≥ 90 % of expected
duration via MAX(t) across all 6 ROIs) deletes partial DBs that would
otherwise go unnoticed.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
113 lines
3.9 KiB
Python
113 lines
3.9 KiB
Python
"""Load ROI tracking data from all sessions into one DataFrame.
|
|
|
|
Drives off the merged TSV (one row per ROI/fly across training + testing
phases). For each TSV row, opens the corresponding tracking DB and pulls
the matching ROI table, then attaches the experimental metadata.

The TSV is the single source of truth for what data exists and how it
maps to flies and conditions.
|
|
"""
|
|
|
|
import sqlite3
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
|
|
from config import VIDEO_INFO_XLSX
|
|
|
|
|
|
# Experimental-condition fields (originating from the xlsx) that get stamped
# onto every tracking sample loaded for a given fly/ROI.
# Reason: the ROI number itself is deliberately not listed here — it is
# written to an uppercase "ROI" column for backwards compatibility with the
# existing analysis pipeline (calculate_distances.py, notebooks).
_META_COLS = (
    # which recording / which animal
    "date", "machine_name", "species", "male",
    # when each phase was recorded
    "training_date_time", "testing_date_time",
    # protocol parameters
    "training_length_hr", "consolidation_length_hr", "memory", "age",
)
|
|
|
|
|
|
def _open_ro(db_path: str, cache: dict) -> sqlite3.Connection | None:
|
|
"""Cached read-only sqlite connection. Returns None on failure."""
|
|
if not isinstance(db_path, str) or not db_path:
|
|
return None
|
|
if db_path not in cache:
|
|
try:
|
|
cache[db_path] = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
|
|
except sqlite3.Error as e:
|
|
print(f"failed to open {Path(db_path).name}: {e}")
|
|
cache[db_path] = None
|
|
return cache[db_path]
|
|
|
|
|
|
def load_roi_data(meta: pd.DataFrame | None = None) -> pd.DataFrame:
    """Load ROI tracking data joined with experimental metadata.

    For each row in ``meta``, reads the matching ROI table from both the
    training DB and the testing DB (whichever exist), and stamps every
    sample with the row's metadata plus a ``session`` column
    (``"training"`` or ``"testing"``). Rows with empty DB paths (unusable
    videos, or videos that didn't pass the completeness gate) are skipped.
    A ROI table that cannot be read is reported on stdout and skipped.

    Args:
        meta: optional DataFrame with the same schema as
            ``all_video_info_merged.tsv``. Pass a filtered slice to load a
            subset (e.g. ``meta[meta.species == 'Melanogaster/CS']``).
            Defaults to the full TSV.

    Returns:
        DataFrame with columns ``id, t, x, y, w, h, phi, is_inferred,
        has_interacted, session, ROI, <metadata>`` — one row per tracking
        sample. Empty if nothing could be loaded.
    """
    if meta is None:
        meta = pd.read_csv(VIDEO_INFO_XLSX.with_suffix(".tsv"), sep="\t")

    db_cache: dict = {}
    chunks: list[pd.DataFrame] = []

    try:
        for row in meta.itertuples(index=False):
            for session in ("training", "testing"):
                conn = _open_ro(getattr(row, f"{session}_db_path"), db_cache)
                if conn is None:
                    continue
                try:
                    df = pd.read_sql_query(
                        f"SELECT * FROM ROI_{int(row.roi)}", conn
                    )
                except Exception as e:
                    # Reason: a DB may be missing a ROI table if tracking was
                    # partial — skip rather than abort the whole batch.
                    print(f" ROI_{row.roi} from {session} DB: {e}")
                    continue
                df["session"] = session
                # Uppercase "ROI" is what the downstream pipeline expects.
                df["ROI"] = int(row.roi)
                for col in _META_COLS:
                    df[col] = getattr(row, col)
                chunks.append(df)
    finally:
        # Reason: close every cached connection even when the loop raises
        # (e.g. ``meta`` lacks an expected column, or the user interrupts a
        # long load in a notebook); otherwise the handles stay open for the
        # lifetime of the process.
        for conn in db_cache.values():
            if conn is not None:
                conn.close()

    return pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()
|
|
|
|
|
|
# Smoke test: load everything listed in the TSV and print a short summary.
if __name__ == "__main__":
    data = load_roi_data()
    print(f"shape: {data.shape}")
    if not data.empty:
        print(f"columns: {list(data.columns)}")
        print(f"sessions: {data['session'].value_counts().to_dict()}")
        print(f"unique machines: {data['machine_name'].nunique()}")
        # Reason: load_roi_data stamps the ROI number under the uppercase
        # "ROI" column and does not copy the TSV's lowercase "roi" onto the
        # samples, so grouping by "roi" raised KeyError.
        print(
            f"unique flies (date,machine,roi): "
            f"{data.groupby(['date','machine_name','ROI']).ngroups}"
        )
|