diff --git a/.gitignore b/.gitignore
index 02d5434..07f3445 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,11 +2,8 @@
 data/raw/*.db
 data/processed/*.csv
 
-# Offline-tracking outputs (reproducible from videos + target JSONs)
-data/tracked/*.db
-data/tracked/*.db-wal
-data/tracked/*.db-shm
-data/tracked/*.db-journal
+# Offline-tracking outputs (regenerable from videos + target JSONs)
+# DBs live outside the repo at /mnt/data/projects/cupido/tracked/
 data/targets/*.json
 data/metadata/video_inventory.csv
 data/logs/*.log
diff --git a/README.md b/README.md
index 9d9ff17..5644fea 100644
--- a/README.md
+++ b/README.md
@@ -66,7 +66,7 @@ python scripts/pick_targets.py --redo      # re-pick already-picked videos
 
 # 3) batch tracking (idempotent, can run in background)
 python scripts/track_videos.py --jobs 4    # parallel
-# output → data/tracked/*_tracking.db (SQLite, same schema as data/raw/)
+# output → /mnt/data/projects/cupido/tracked/*_tracking.db (SQLite, same schema as data/raw/)
 ```
 
 See `tasks/todo.md` "Offline Tracking" section for the full plan, and
diff --git a/data/metadata/2025_07_15_metadata_fixed.csv b/data/metadata/2025_07_15_metadata_fixed.csv
index 36d07c5..bce7bcc 100644
--- a/data/metadata/2025_07_15_metadata_fixed.csv
+++ b/data/metadata/2025_07_15_metadata_fixed.csv
@@ -1,37 +1,37 @@
-﻿date,HHMMSS,machine_name,ROI,genotype,group,path,filesize_mb
+date,HHMMSS,machine_name,ROI,genotype,group,path,filesize_mb
 15/07/2025,16-03-10,76,6,CS,trained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-03-10/2025-07-15_16-03-10_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,59.4
-15/07/2025,16-03-10,76,4,CS,untrained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-03-10/2025-07-15_16-03-10_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,59.4
+15/07/2025,16-03-10,76,4,CS,naive,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-03-10/2025-07-15_16-03-10_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,59.4
 15/07/2025,16-03-10,76,2,CS,trained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-03-10/2025-07-15_16-03-10_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,59.4
-15/07/2025,16-03-10,76,5,CS,untrained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-03-10/2025-07-15_16-03-10_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,59.4
+15/07/2025,16-03-10,76,5,CS,naive,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-03-10/2025-07-15_16-03-10_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,59.4
 15/07/2025,16-03-10,76,3,CS,trained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-03-10/2025-07-15_16-03-10_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,59.4
-15/07/2025,16-03-10,76,1,CS,untrained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-03-10/2025-07-15_16-03-10_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,59.4
+15/07/2025,16-03-10,76,1,CS,naive,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-03-10/2025-07-15_16-03-10_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,59.4
 15/07/2025,16-31-34,76,6,CS,trained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-31-34/2025-07-15_16-31-34_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,78.98
 15/07/2025,16-31-34,76,4,CS,trained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-31-34/2025-07-15_16-31-34_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,78.98
 15/07/2025,16-31-34,76,2,CS,trained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-31-34/2025-07-15_16-31-34_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,78.98
-15/07/2025,16-31-34,76,5,CS,untrained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-31-34/2025-07-15_16-31-34_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,78.98
-15/07/2025,16-31-34,76,3,CS,untrained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-31-34/2025-07-15_16-31-34_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,78.98
-15/07/2025,16-31-34,76,1,CS,untrained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-31-34/2025-07-15_16-31-34_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,78.98
+15/07/2025,16-31-34,76,5,CS,naive,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-31-34/2025-07-15_16-31-34_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,78.98
+15/07/2025,16-31-34,76,3,CS,naive,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-31-34/2025-07-15_16-31-34_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,78.98
+15/07/2025,16-31-34,76,1,CS,naive,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-31-34/2025-07-15_16-31-34_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,78.98
 15/07/2025,16-03-27,145,6,CS,trained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-03-27/2025-07-15_16-03-27_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,78.72
 15/07/2025,16-03-27,145,4,CS,trained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-03-27/2025-07-15_16-03-27_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,78.72
 15/07/2025,16-03-27,145,2,CS,trained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-03-27/2025-07-15_16-03-27_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,78.72
-15/07/2025,16-03-27,145,5,CS,untrained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-03-27/2025-07-15_16-03-27_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,78.72
-15/07/2025,16-03-27,145,3,CS,untrained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-03-27/2025-07-15_16-03-27_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,78.72
-15/07/2025,16-03-27,145,1,CS,untrained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-03-27/2025-07-15_16-03-27_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,78.72
+15/07/2025,16-03-27,145,5,CS,naive,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-03-27/2025-07-15_16-03-27_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,78.72
+15/07/2025,16-03-27,145,3,CS,naive,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-03-27/2025-07-15_16-03-27_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,78.72
+15/07/2025,16-03-27,145,1,CS,naive,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-03-27/2025-07-15_16-03-27_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,78.72
 15/07/2025,16-31-41,145,6,CS,trained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-31-41/2025-07-15_16-31-41_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,90.9
 15/07/2025,16-31-41,145,4,CS,trained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-31-41/2025-07-15_16-31-41_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,90.9
 15/07/2025,16-31-41,145,2,CS,trained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-31-41/2025-07-15_16-31-41_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,90.9
-15/07/2025,16-31-41,145,5,CS,untrained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-31-41/2025-07-15_16-31-41_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,90.9
-15/07/2025,16-31-41,145,3,CS,untrained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-31-41/2025-07-15_16-31-41_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,90.9
-15/07/2025,16-31-41,145,1,CS,untrained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-31-41/2025-07-15_16-31-41_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,90.9
+15/07/2025,16-31-41,145,5,CS,naive,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-31-41/2025-07-15_16-31-41_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,90.9
+15/07/2025,16-31-41,145,3,CS,naive,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-31-41/2025-07-15_16-31-41_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,90.9
+15/07/2025,16-31-41,145,1,CS,naive,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-31-41/2025-07-15_16-31-41_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,90.9
 15/07/2025,16-31-52,139,6,CS,trained,/mnt/ethoscope_data/videos/13924be2046d49f4a641cef2a5559852/ETHOSCOPE_139/2025-07-15_16-31-52/2025-07-15_16-31-52_13924be2046d49f4a641cef2a5559852__1920x1088@25fps-28q_merged.mp4,73.4
 15/07/2025,16-31-52,139,4,CS,trained,/mnt/ethoscope_data/videos/13924be2046d49f4a641cef2a5559852/ETHOSCOPE_139/2025-07-15_16-31-52/2025-07-15_16-31-52_13924be2046d49f4a641cef2a5559852__1920x1088@25fps-28q_merged.mp4,73.4
 15/07/2025,16-31-52,139,2,CS,trained,/mnt/ethoscope_data/videos/13924be2046d49f4a641cef2a5559852/ETHOSCOPE_139/2025-07-15_16-31-52/2025-07-15_16-31-52_13924be2046d49f4a641cef2a5559852__1920x1088@25fps-28q_merged.mp4,73.4
-15/07/2025,16-31-52,139,5,CS,untrained,/mnt/ethoscope_data/videos/13924be2046d49f4a641cef2a5559852/ETHOSCOPE_139/2025-07-15_16-31-52/2025-07-15_16-31-52_13924be2046d49f4a641cef2a5559852__1920x1088@25fps-28q_merged.mp4,73.4
-15/07/2025,16-31-52,139,3,CS,untrained,/mnt/ethoscope_data/videos/13924be2046d49f4a641cef2a5559852/ETHOSCOPE_139/2025-07-15_16-31-52/2025-07-15_16-31-52_13924be2046d49f4a641cef2a5559852__1920x1088@25fps-28q_merged.mp4,73.4
-15/07/2025,16-31-52,139,1,CS,untrained,/mnt/ethoscope_data/videos/13924be2046d49f4a641cef2a5559852/ETHOSCOPE_139/2025-07-15_16-31-52/2025-07-15_16-31-52_13924be2046d49f4a641cef2a5559852__1920x1088@25fps-28q_merged.mp4,73.4
-15/07/2025,16-32-05,268,6,CS,untrained,/mnt/ethoscope_data/videos/268102f92f51486f995200c29d980477/ETHOSCOPE_268/2025-07-15_16-32-05/2025-07-15_16-32-05_268102f92f51486f995200c29d980477__1920x1088@25fps-28q_merged.mp4,43.72
-15/07/2025,16-32-05,268,4,CS,untrained,/mnt/ethoscope_data/videos/268102f92f51486f995200c29d980477/ETHOSCOPE_268/2025-07-15_16-32-05/2025-07-15_16-32-05_268102f92f51486f995200c29d980477__1920x1088@25fps-28q_merged.mp4,43.72
-15/07/2025,16-32-05,268,2,CS,untrained,/mnt/ethoscope_data/videos/268102f92f51486f995200c29d980477/ETHOSCOPE_268/2025-07-15_16-32-05/2025-07-15_16-32-05_268102f92f51486f995200c29d980477__1920x1088@25fps-28q_merged.mp4,43.72
+15/07/2025,16-31-52,139,5,CS,naive,/mnt/ethoscope_data/videos/13924be2046d49f4a641cef2a5559852/ETHOSCOPE_139/2025-07-15_16-31-52/2025-07-15_16-31-52_13924be2046d49f4a641cef2a5559852__1920x1088@25fps-28q_merged.mp4,73.4
+15/07/2025,16-31-52,139,3,CS,naive,/mnt/ethoscope_data/videos/13924be2046d49f4a641cef2a5559852/ETHOSCOPE_139/2025-07-15_16-31-52/2025-07-15_16-31-52_13924be2046d49f4a641cef2a5559852__1920x1088@25fps-28q_merged.mp4,73.4
+15/07/2025,16-31-52,139,1,CS,naive,/mnt/ethoscope_data/videos/13924be2046d49f4a641cef2a5559852/ETHOSCOPE_139/2025-07-15_16-31-52/2025-07-15_16-31-52_13924be2046d49f4a641cef2a5559852__1920x1088@25fps-28q_merged.mp4,73.4
+15/07/2025,16-32-05,268,6,CS,naive,/mnt/ethoscope_data/videos/268102f92f51486f995200c29d980477/ETHOSCOPE_268/2025-07-15_16-32-05/2025-07-15_16-32-05_268102f92f51486f995200c29d980477__1920x1088@25fps-28q_merged.mp4,43.72
+15/07/2025,16-32-05,268,4,CS,naive,/mnt/ethoscope_data/videos/268102f92f51486f995200c29d980477/ETHOSCOPE_268/2025-07-15_16-32-05/2025-07-15_16-32-05_268102f92f51486f995200c29d980477__1920x1088@25fps-28q_merged.mp4,43.72
+15/07/2025,16-32-05,268,2,CS,naive,/mnt/ethoscope_data/videos/268102f92f51486f995200c29d980477/ETHOSCOPE_268/2025-07-15_16-32-05/2025-07-15_16-32-05_268102f92f51486f995200c29d980477__1920x1088@25fps-28q_merged.mp4,43.72
 15/07/2025,16-32-05,268,5,CS,trained,/mnt/ethoscope_data/videos/268102f92f51486f995200c29d980477/ETHOSCOPE_268/2025-07-15_16-32-05/2025-07-15_16-32-05_268102f92f51486f995200c29d980477__1920x1088@25fps-28q_merged.mp4,43.72
 15/07/2025,16-32-05,268,3,CS,trained,/mnt/ethoscope_data/videos/268102f92f51486f995200c29d980477/ETHOSCOPE_268/2025-07-15_16-32-05/2025-07-15_16-32-05_268102f92f51486f995200c29d980477__1920x1088@25fps-28q_merged.mp4,43.72
 15/07/2025,16-32-05,268,1,CS,trained,/mnt/ethoscope_data/videos/268102f92f51486f995200c29d980477/ETHOSCOPE_268/2025-07-15_16-32-05/2025-07-15_16-32-05_268102f92f51486f995200c29d980477__1920x1088@25fps-28q_merged.mp4,43.72
diff --git a/data/processed/README.md b/data/processed/README.md
index 97d2e82..d934460 100644
--- a/data/processed/README.md
+++ b/data/processed/README.md
@@ -1,39 +1,47 @@
 # Processed Data
 
-Large CSV files generated from the analysis pipeline. All files are gitignored (~370MB total) and can be regenerated.
+CSVs derived from the tracking DBs (`/mnt/data/projects/cupido/tracked/`)
+and the merged TSV (`../../all_video_info_merged.tsv`). All files are
+gitignored and regenerable.
 
 ## Files and Regeneration
 
 | File | Description | Generated By |
 |------|-------------|--------------|
-| `trained_roi_data.csv` | Raw tracking data for trained ROIs | `scripts/load_roi_data.py` or notebook step 1 |
-| `untrained_roi_data.csv` | Raw tracking data for untrained ROIs | `scripts/load_roi_data.py` or notebook step 1 |
-| `trained_distances.csv` | Pairwise distances (unaligned) | `scripts/calculate_distances.py` |
-| `untrained_distances.csv` | Pairwise distances (unaligned) | `scripts/calculate_distances.py` |
-| `trained_distances_aligned.csv` | Distances aligned to barrier opening | Notebook step 4 |
-| `untrained_distances_aligned.csv` | Distances aligned to barrier opening | Notebook step 4 |
-| `trained_tracked.csv` | Identity-tracked fly positions | Notebook step 7 |
-| `untrained_tracked.csv` | Identity-tracked fly positions | Notebook step 7 |
-| `trained_max_velocity.csv` | Max velocity over 10s windows | Notebook step 7 |
-| `untrained_max_velocity.csv` | Max velocity over 10s windows | Notebook step 7 |
+| `distances.csv` | Per-frame inter-fly distances for every (date, machine, ROI, session). Includes metadata columns to filter trained vs naïve, training phase, species, etc. | `scripts/calculate_distances.py` |
+| `*_distances_aligned.csv` | (legacy, 2025-07-15 only) distances aligned to barrier opening | `notebooks/flies_analysis*.ipynb` |
+| `*_tracked.csv` | (legacy) identity-tracked fly positions | `notebooks/flies_analysis_simple.ipynb` |
+| `*_max_velocity.csv` | (legacy) max velocity over 10 s windows | `notebooks/flies_analysis_simple.ipynb` |
 
-## To Regenerate All Data
+## Loading the data
 
-Run the full notebook `notebooks/flies_analysis_simple.ipynb` with:
 ```python
-recalculate_distances = True
-recalculate_tracking = True
+import sys
+sys.path.insert(0, "../scripts")
+from load_roi_data import load_roi_data
+
+data = load_roi_data()              # full batch as one DataFrame
+# Or filter the metadata first:
+import pandas as pd
+tsv = pd.read_csv("../../all_video_info_merged.tsv", sep="\t")
+data = load_roi_data(tsv[tsv.species.str.contains("Melanogaster")])
 ```
 
-**Warning**: Identity tracking and velocity calculations take significant time (~30+ minutes).
+The returned DataFrame has columns:
+`id, t, x, y, w, h, phi, is_inferred, has_interacted, session, ROI, date,
+machine_name, species, male, training_date_time, testing_date_time,
+training_length_hr, consolidation_length_hr, memory, age`.
 
-## Column Reference
+`session` is `"training"` or `"testing"`; `male` is `"trained"` or
+`"naive"` (canonical — variants like `"naïve"` and `"niave"` are normalized
+at the TSV-export step).
 
-### Distance CSVs (`*_distances_aligned.csv`)
-- `machine_name`: Ethoscope machine ID (string)
-- `ROI`: ROI number (1-6)
-- `aligned_time`: Time in ms relative to barrier opening (0 = opening)
-- `distance`: Euclidean distance between flies in pixels
-- `n_flies`: Number of flies detected at this time point
-- `area_fly1`, `area_fly2`: Bounding box areas (w*h) in pixels^2
-- `group`: "trained" or "untrained"
+## Column Reference (`distances.csv`)
+
+- `date`, `machine_name`, `ROI`, `session`: together identify one track (one ROI within one session)
+- `t`: time in ms within that session
+- `distance`: Euclidean distance between the two flies in pixels
+- `n_flies`: number of fly detections at this time point (1 or more; `distance` uses only the first two by `id`)
+- `area_fly1`, `area_fly2`: bounding-box areas (`w * h`) in pixels²
+- `male`: `trained` or `naive` (carried from the xlsx; normalized)
+- `species`, `memory`, `age`: experimental metadata
diff --git a/notebooks/flies_analysis.ipynb b/notebooks/flies_analysis.ipynb
index d9c24e3..9bf3a30 100644
--- a/notebooks/flies_analysis.ipynb
+++ b/notebooks/flies_analysis.ipynb
@@ -28,7 +28,22 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": "def load_roi_data():\n    \"\"\"Load ROI data from SQLite databases and group by trained/untrained\"\"\"\n    metadata = pd.read_csv(DATA_METADATA / '2025_07_15_metadata_fixed.csv')\n    metadata['machine_name'] = metadata['machine_name'].astype(str)\n    \n    trained_rois = metadata[metadata['group'] == 'trained']\n    untrained_rois = metadata[metadata['group'] == 'untrained']\n    \n    db_files = list(DATA_RAW.glob('*_tracking.db'))\n    \n    trained_df = pd.DataFrame()\n    untrained_df = pd.DataFrame()\n    \n    for db_file in db_files:\n        print(f\"Processing {db_file.name}\")\n        \n        pattern = r'_([0-9a-f]{32})__'\n        match = re.search(pattern, db_file.name)\n        \n        if not match:\n            print(f\"Could not extract UUID from {db_file.name}\")\n            continue\n            \n        uuid = match.group(1)\n        metadata_matches = metadata[metadata['path'].str.contains(uuid, na=False)]\n        \n        if metadata_matches.empty:\n            print(f\"No metadata matches found for UUID {uuid}\")\n            continue\n            \n        machine_id = metadata_matches.iloc[0]['machine_name']\n        print(f\"Matched to machine ID: {machine_id}\")\n        \n        conn = sqlite3.connect(str(db_file))\n        \n        machine_trained = trained_rois[trained_rois['machine_name'] == machine_id]\n        machine_untrained = untrained_rois[untrained_rois['machine_name'] == machine_id]\n        \n        for _, row in machine_trained.iterrows():\n            roi = row['ROI']\n            try:\n                roi_data = pd.read_sql_query(f\"SELECT * FROM ROI_{roi}\", conn)\n                roi_data['machine_name'] = machine_id\n                roi_data['ROI'] = roi\n                roi_data['group'] = 'trained'\n                trained_df = pd.concat([trained_df, roi_data], ignore_index=True)\n            except Exception as e:\n                print(f\"Error loading ROI_{roi}: {e}\")\n        \n     
   for _, row in machine_untrained.iterrows():\n            roi = row['ROI']\n            try:\n                roi_data = pd.read_sql_query(f\"SELECT * FROM ROI_{roi}\", conn)\n                roi_data['machine_name'] = machine_id\n                roi_data['ROI'] = roi\n                roi_data['group'] = 'untrained'\n                untrained_df = pd.concat([untrained_df, roi_data], ignore_index=True)\n            except Exception as e:\n                print(f\"Error loading ROI_{roi}: {e}\")\n        \n        conn.close()\n    \n    return trained_df, untrained_df\n\ntrained_data, untrained_data = load_roi_data()\nprint(f\"Trained data shape: {trained_data.shape}\")\nprint(f\"Untrained data shape: {untrained_data.shape}\")\n\ntrained_data.to_csv(DATA_PROCESSED / 'trained_roi_data.csv', index=False)\nuntrained_data.to_csv(DATA_PROCESSED / 'untrained_roi_data.csv', index=False)\nprint(\"Data saved to CSV files\")"
+   "source": [
+    "# Load tracking data via the unified loader (driven by all_video_info_merged.tsv).\n",
+    "# Reason: replaces the old data/raw + 2025_07_15_metadata_fixed.csv path with\n",
+    "# the TSV-based loader that covers the entire batch (2025-07-15 + 2024).\n",
+    "sys.path.insert(0, str(PROJECT_ROOT / 'scripts'))\n",
+    "from load_roi_data import load_roi_data\n",
+    "\n",
+    "data = load_roi_data()\n",
+    "# Backwards-compat slices for the rest of the notebook.\n",
+    "trained_data   = data[data['male'] == 'trained'].copy()\n",
+    "untrained_data = data[data['male'] == 'naive'].copy()\n",
+    "\n",
+    "print(f\"all data:  {data.shape}\")\n",
+    "print(f\"trained:   {trained_data.shape}\")\n",
+    "print(f\"naive:     {untrained_data.shape}\")\n"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -219,4 +234,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
\ No newline at end of file
+}
diff --git a/notebooks/flies_analysis_simple.ipynb b/notebooks/flies_analysis_simple.ipynb
index 1663b10..7072c73 100644
--- a/notebooks/flies_analysis_simple.ipynb
+++ b/notebooks/flies_analysis_simple.ipynb
@@ -28,7 +28,22 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": "# Load the pre-processed data\ntrained_data = pd.read_csv(DATA_PROCESSED / 'trained_roi_data.csv')\nuntrained_data = pd.read_csv(DATA_PROCESSED / 'untrained_roi_data.csv')\n\nprint(f\"Trained data shape: {trained_data.shape}\")\nprint(f\"Untrained data shape: {untrained_data.shape}\")\nprint(f\"Trained data columns: {list(trained_data.columns)}\")\nprint(f\"Untrained data columns: {list(untrained_data.columns)}\")"
+   "source": [
+    "# Load tracking data via the unified loader (driven by all_video_info_merged.tsv).\n",
+    "# Reason: replaces reads of trained_roi_data.csv / untrained_roi_data.csv with\n",
+    "# the live loader so the notebook always sees the current batch.\n",
+    "sys.path.insert(0, str(PROJECT_ROOT / 'scripts'))\n",
+    "from load_roi_data import load_roi_data\n",
+    "\n",
+    "data = load_roi_data()\n",
+    "trained_data   = data[data['male'] == 'trained'].copy()\n",
+    "untrained_data = data[data['male'] == 'naive'].copy()\n",
+    "\n",
+    "print(f\"all data shape:    {data.shape}\")\n",
+    "print(f\"Trained data:      {trained_data.shape}\")\n",
+    "print(f\"Naive data:        {untrained_data.shape}\")\n",
+    "print(f\"Columns:           {list(trained_data.columns)}\")\n"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -418,4 +433,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
\ No newline at end of file
+}
diff --git a/scripts/calculate_distances.py b/scripts/calculate_distances.py
index 09eff9a..75e7a1a 100644
--- a/scripts/calculate_distances.py
+++ b/scripts/calculate_distances.py
@@ -1,117 +1,99 @@
-import pandas as pd
+"""Compute per-frame inter-fly distances for every (date, machine, ROI, session).
+
+Reads tracking data via :func:`load_roi_data.load_roi_data` (which is driven
+by ``all_video_info_merged.tsv``) and produces one distances DataFrame
+spanning every fly/session in the batch. Group membership (``trained`` /
+``naive``) is preserved from the ``male`` column.
+"""
+
 import numpy as np
+import pandas as pd
 from scipy.spatial.distance import euclidean
 
 from config import DATA_PROCESSED
+from load_roi_data import load_roi_data
 
 
-def calculate_fly_distances(trained_file=None, untrained_file=None):
-    """Calculate distances between flies at each time point.
+def calculate_fly_distances(data: pd.DataFrame | None = None) -> pd.DataFrame:
+    """Compute inter-fly distances over time for every fly/session.
 
-    For each time point:
-    - If two flies are detected: calculate Cartesian distance between them
-    - If one fly is detected: set distance to 0 if area > average area, otherwise NaN
+    For each time point inside one (date, machine, ROI, session) trajectory:
+    - 2+ flies detected: Euclidean distance between the first two by id
+    - 1 fly detected: distance = 0 if its bbox area exceeds the global
+      mean (likely a single blob containing both flies), else NaN
 
     Args:
-        trained_file (Path): Path to trained ROI data CSV.
-        untrained_file (Path): Path to untrained ROI data CSV.
+        data: optional pre-loaded DataFrame from :func:`load_roi_data`. If
+            None, the full batch is loaded.
 
     Returns:
-        tuple: (trained_distances, untrained_distances) DataFrames.
+        DataFrame with one row per (track, time) pair, including ``distance``,
+        ``n_flies``, ``area_fly1``, ``area_fly2``, plus the metadata columns
+        propagated from the source row (``date``, ``machine_name``, ``ROI``,
+        ``session``, ``male``, ``species``, ``memory``, ``age``).
     """
-    if trained_file is None:
-        trained_file = DATA_PROCESSED / 'trained_roi_data.csv'
-    if untrained_file is None:
-        untrained_file = DATA_PROCESSED / 'untrained_roi_data.csv'
+    if data is None:
+        data = load_roi_data()
+    if data.empty:
+        return pd.DataFrame()
 
-    trained_df = pd.read_csv(trained_file)
-    untrained_df = pd.read_csv(untrained_file)
-
-    trained_df['area'] = trained_df['w'] * trained_df['h']
-    untrained_df['area'] = untrained_df['w'] * untrained_df['h']
-
-    avg_area = np.mean([trained_df['area'].mean(), untrained_df['area'].mean()])
+    data = data.copy()
+    data["area"] = data["w"] * data["h"]
+    avg_area = data["area"].mean()
     print(f"Average area across all data: {avg_area:.2f}")
 
-    trained_distances = process_distance_data(trained_df, avg_area)
-    untrained_distances = process_distance_data(untrained_df, avg_area)
+    # Carry these onto every output row (constant within a track).
+    keep_meta = ["date", "machine_name", "ROI", "session", "male",
+                 "species", "memory", "age"]
 
-    return trained_distances, untrained_distances
-
-
-def process_distance_data(df, avg_area):
-    """Process a DataFrame to calculate distances between flies at each time point.
-
-    Args:
-        df (pd.DataFrame): Input tracking data.
-        avg_area (float): Average area threshold for single-fly detection.
-
-    Returns:
-        pd.DataFrame: Distance data with columns for machine, ROI, time, distance.
-    """
-    results = []
-
-    for (machine_name, roi), group in df.groupby(['machine_name', 'ROI']):
-        for t, time_group in group.groupby('t'):
-            time_group = time_group.sort_values('id').reset_index(drop=True)
+    rows: list[dict] = []
+    track_keys = ["date", "machine_name", "ROI", "session"]
+    for track, track_df in data.groupby(track_keys, sort=False):
+        meta_row = {k: v for k, v in zip(track_keys, track)}
+        # Carry the rest of the metadata from any sample (constant per track).
+        sample = track_df.iloc[0]
+        for col in keep_meta:
+            if col not in meta_row:
+                meta_row[col] = sample[col]
 
+        for t, time_group in track_df.groupby("t", sort=False):
+            time_group = time_group.sort_values("id").reset_index(drop=True)
+            row = dict(meta_row)
+            row["t"] = t
             if len(time_group) >= 2:
-                fly1 = time_group.iloc[0]
-                fly2 = time_group.iloc[1]
-                distance = euclidean([fly1['x'], fly1['y']], [fly2['x'], fly2['y']])
+                f1, f2 = time_group.iloc[0], time_group.iloc[1]
+                row["distance"] = euclidean([f1["x"], f1["y"]], [f2["x"], f2["y"]])
+                row["n_flies"] = len(time_group)
+                row["area_fly1"] = f1["area"]
+                row["area_fly2"] = f2["area"]
+            else:
+                f = time_group.iloc[0]
+                row["distance"] = 0.0 if f["area"] > avg_area else np.nan
+                row["n_flies"] = 1
+                row["area_fly1"] = f["area"]
+                row["area_fly2"] = np.nan
+            rows.append(row)
 
-                results.append({
-                    'machine_name': machine_name,
-                    'ROI': roi,
-                    't': t,
-                    'distance': distance,
-                    'n_flies': len(time_group),
-                    'area_fly1': fly1['area'],
-                    'area_fly2': fly2['area']
-                })
-            elif len(time_group) == 1:
-                fly = time_group.iloc[0]
-                area = fly['area']
-
-                if area > avg_area:
-                    distance = 0.0
-                else:
-                    distance = np.nan
-
-                results.append({
-                    'machine_name': machine_name,
-                    'ROI': roi,
-                    't': t,
-                    'distance': distance,
-                    'n_flies': 1,
-                    'area_fly1': area,
-                    'area_fly2': np.nan
-                })
-
-    return pd.DataFrame(results)
+    return pd.DataFrame(rows)
 
 
-def main():
-    """Run distance calculations and save results."""
-    trained_distances, untrained_distances = calculate_fly_distances()
+def main() -> None:
+    distances = calculate_fly_distances()
 
-    print(f"Trained data distance summary:")
-    print(f"  Shape: {trained_distances.shape}")
-    print(f"  Distance stats:")
-    print(f"    Count: {trained_distances['distance'].count()}")
-    print(f"    Mean: {trained_distances['distance'].mean():.2f}")
-    print(f"    Std: {trained_distances['distance'].std():.2f}")
+    print("\nDistance summary:")
+    print(f"  Shape: {distances.shape}")
+    if not distances.empty:
+        print(f"  Distance count: {distances['distance'].count()}")
+        print(f"  Distance mean:  {distances['distance'].mean():.2f}")
+        print(f"  Distance std:   {distances['distance'].std():.2f}")
+        male = distances["male"]
+        print(f"  Trained tracks: {(male == 'trained').sum()}")
+        print(f"  Naive tracks:   {(male == 'naive').sum()}")
 
-    print(f"\nUntrained data distance summary:")
-    print(f"  Shape: {untrained_distances.shape}")
-    print(f"  Distance stats:")
-    print(f"    Count: {untrained_distances['distance'].count()}")
-    print(f"    Mean: {untrained_distances['distance'].mean():.2f}")
-    print(f"    Std: {untrained_distances['distance'].std():.2f}")
-
-    trained_distances.to_csv(DATA_PROCESSED / 'trained_distances.csv', index=False)
-    untrained_distances.to_csv(DATA_PROCESSED / 'untrained_distances.csv', index=False)
-    print("\nDistance data saved")
+    DATA_PROCESSED.mkdir(parents=True, exist_ok=True)
+    out = DATA_PROCESSED / "distances.csv"
+    distances.to_csv(out, index=False)
+    print(f"\nSaved {out}")
 
 
 if __name__ == "__main__":
diff --git a/scripts/config.py b/scripts/config.py
index a3462b2..447cee3 100644
--- a/scripts/config.py
+++ b/scripts/config.py
@@ -13,5 +13,8 @@ VIDEOS_ROOT = Path("/mnt/ethoscope_data/videos")
 VIDEO_INFO_XLSX = PROJECT_ROOT.parent / "all_video_info_merged.xlsx"
 INVENTORY_CSV = DATA_METADATA / "video_inventory.csv"
 TARGETS_DIR = PROJECT_ROOT / "data" / "targets"
-TRACKING_OUTPUT_DIR = PROJECT_ROOT / "data" / "tracked"
+# Reason: tracking DBs are large binary files that don't belong in
+# ownCloud-synced storage (sync conflicts + bandwidth). They live on the
+# local data volume instead. Regenerable from videos + target JSONs.
+TRACKING_OUTPUT_DIR = Path("/mnt/data/projects/cupido/tracked")
 LOGS_DIR = PROJECT_ROOT / "data" / "logs"
diff --git a/scripts/export_video_db_index.py b/scripts/export_video_db_index.py
new file mode 100644
index 0000000..723108c
--- /dev/null
+++ b/scripts/export_video_db_index.py
@@ -0,0 +1,181 @@
+"""Augment all_video_info_merged.xlsx with the input video + tracking DB paths.
+
+Each xlsx row represents one fly (date, machine_name, ROI), observed across a
+training session and a testing session. We resolve those two sessions to the
+on-disk video files (via the inventory CSV) and to their tracking DBs (under
+TRACKING_OUTPUT_DIR), then write the result as TSV.
+
+Output columns added:
+    training_video_path, training_db_path,
+    testing_video_path,  testing_db_path
+
+Empty values mean either no video matched (rare — unparseable time, missing
+inventory entry, or no session within the time tolerance) or no DB exists
+yet (e.g. the one video the completeness gate rejected).
+
+Usage:
+    python export_video_db_index.py
+    python export_video_db_index.py --out path/to/output.tsv
+"""
+
+from __future__ import annotations
+
+import argparse
+import re
+from pathlib import Path
+
+import pandas as pd
+
+from config import INVENTORY_CSV, TRACKING_OUTPUT_DIR, VIDEO_INFO_XLSX
+
+
+_TIME_RE = re.compile(r"^(\d{8})_(\d{1,2})(\d{2})?(AM|PM)$", re.IGNORECASE)
+
+
+def parse_xlsx_time(value: str) -> tuple[str, int] | None:
+    """Convert '20241021_11AM' / '20240918_1030AM' to (YYYY-MM-DD, minutes24).
+
+    Resolution is hour-only when no minutes are given (e.g. '11AM' → 11:00).
+    Returns minutes-from-midnight so we can do nearest-neighbor matching.
+    """
+    if not isinstance(value, str):
+        return None
+    m = _TIME_RE.match(value.strip())
+    if not m:
+        return None
+    ymd, hh, mm, ampm = m.groups()
+    date = f"{ymd[:4]}-{ymd[4:6]}-{ymd[6:8]}"
+    hour = int(hh)
+    minute = int(mm) if mm else 0
+    if ampm.upper() == "PM" and hour != 12:
+        hour += 12
+    if ampm.upper() == "AM" and hour == 12:
+        hour = 0
+    return date, hour * 60 + minute
+
+
+def build_session_index(inventory: pd.DataFrame) -> dict[tuple[str, str], list[dict]]:
+    """Index inventory rows by (date, machine_name) → list of session dicts."""
+    idx: dict[tuple[str, str], list[dict]] = {}
+    for row in inventory.itertuples(index=False):
+        h, m, _s = (int(p) for p in str(row.session_time).split("-"))
+        key = (row.session_date, row.machine_name)
+        idx.setdefault(key, []).append({
+            "mp4_path": row.mp4_path,
+            "session_datetime": row.session_datetime,
+            "minutes": h * 60 + m,
+        })
+    return idx
+
+
+def db_path_for_video(mp4_path: str) -> Path | None:
+    """Tracker's <video_stem>_tracking.db under TRACKING_OUTPUT_DIR; None if missing."""
+    stem = Path(mp4_path).stem
+    db = TRACKING_OUTPUT_DIR / f"{stem}_tracking.db"
+    return db if db.exists() else None
+
+
+_TIME_TOLERANCE_MIN = 90  # xlsx labels are approximate ("11AM" → 10:51 is fine)
+
+
+def resolve_session(
+    machine_name: str,
+    when: str,
+    fallback_date: str | None,
+    index: dict[tuple[str, str], list[dict]],
+) -> tuple[str, str]:
+    """Look up the video + db whose start time is closest to `when`.
+
+    Match strategy:
+    1. Use the date embedded in `when` (training/testing can fall on a
+       different calendar day from the row's ``date`` column).
+    2. If no candidates exist for that date, fall back to ``fallback_date``
+       (the xlsx row's ``date`` column). Reason: the xlsx contains
+       date typos like '20240110_11AM' for an Oct 1 experiment.
+
+    Among candidates, pick the video whose start minute is closest to the
+    xlsx-claimed time, within ±_TIME_TOLERANCE_MIN.
+    """
+    parsed = parse_xlsx_time(when)
+    if parsed is None:
+        return "", ""
+    date, target_min = parsed
+    candidates = index.get((date, machine_name), [])
+    if not candidates and fallback_date:
+        candidates = index.get((fallback_date, machine_name), [])
+    if not candidates:
+        return "", ""
+
+    def _gap(target: int, c: dict) -> int:
+        # Reason: xlsx times like '1230AM' are ambiguous (12 AM vs 12 PM).
+        # We try both the literal time AND a +12-hour shift, picking the
+        # interpretation that brings us closest to a real session.
+        return min(abs(c["minutes"] - target), abs(c["minutes"] - (target + 720) % 1440))
+
+    best = min(candidates, key=lambda c: _gap(target_min, c))
+    if _gap(target_min, best) > _TIME_TOLERANCE_MIN:
+        return "", ""
+    db = db_path_for_video(best["mp4_path"])
+    return best["mp4_path"], (str(db) if db else "")
+
+
+# Labels for control flies the metadata has accumulated: 'naïve', 'niave',
+# 'untrained' (whitespace is stripped first). All collapse to 'naive'.
+_MALE_NAIVE_VARIANTS = {"naïve", "niave", "untrained", "naive"}
+
+
+def _normalize_metadata(df: pd.DataFrame) -> None:
+    """Strip whitespace and canonicalize the ``male`` column in place."""
+    for col in df.select_dtypes(include=("object", "string")).columns:
+        df[col] = df[col].astype(str).str.strip()
+    df["male"] = df["male"].apply(
+        lambda v: "naive" if v.lower() in _MALE_NAIVE_VARIANTS else v
+    )
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--out",
+        type=Path,
+        default=VIDEO_INFO_XLSX.with_suffix(".tsv"),
+        help="output TSV path (default: alongside the xlsx)",
+    )
+    args = parser.parse_args()
+
+    inv = pd.read_csv(INVENTORY_CSV)
+    inv = inv[inv["in_xlsx"]].copy()
+    index = build_session_index(inv)
+
+    df = pd.read_excel(VIDEO_INFO_XLSX)
+    _normalize_metadata(df)
+    date_iso = pd.to_datetime(df["date"]).dt.strftime("%Y-%m-%d")
+
+    train_videos, train_dbs, test_videos, test_dbs = [], [], [], []
+    for fallback, row in zip(date_iso, df.itertuples(index=False)):
+        tv, td = resolve_session(row.machine_name, row.training_date_time, fallback, index)
+        sv, sd = resolve_session(row.machine_name, row.testing_date_time, fallback, index)
+        train_videos.append(tv)
+        train_dbs.append(td)
+        test_videos.append(sv)
+        test_dbs.append(sd)
+
+    df["training_video_path"] = train_videos
+    df["training_db_path"] = train_dbs
+    df["testing_video_path"] = test_videos
+    df["testing_db_path"] = test_dbs
+
+    df.to_csv(args.out, sep="\t", index=False)
+
+    n_rows = len(df)
+    n_train_video = sum(bool(v) for v in train_videos)
+    n_train_db = sum(bool(v) for v in train_dbs)
+    n_test_video = sum(bool(v) for v in test_videos)
+    n_test_db = sum(bool(v) for v in test_dbs)
+    print(f"wrote {args.out}  ({n_rows} rows)")
+    print(f"  training:  {n_train_video} with video,  {n_train_db} with DB")
+    print(f"  testing:   {n_test_video} with video,  {n_test_db} with DB")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/load_roi_data.py b/scripts/load_roi_data.py
index 5cf3cc6..84b00eb 100644
--- a/scripts/load_roi_data.py
+++ b/scripts/load_roi_data.py
@@ -1,90 +1,113 @@
-import pandas as pd
+"""Load ROI tracking data from all sessions into one DataFrame.
+
+Drives off the merged TSV (one row per ROI/fly across training + testing
+phases). For each TSV row, opens the corresponding tracking DB and pulls
+the matching ROI table, then attaches the experimental metadata.
+
+The TSV is the single source of truth for what data exists and how it
+maps to flies and conditions.
+"""
+
 import sqlite3
-import re
+from pathlib import Path
 
-from config import DATA_RAW, DATA_METADATA, DATA_PROCESSED
+import pandas as pd
+
+from config import VIDEO_INFO_XLSX
 
 
-def load_roi_data():
-    """Load ROI data from SQLite databases and group by trained/untrained.
+# Metadata columns to copy onto every tracking sample. These are the xlsx
+# fields that describe the experimental condition behind each fly/ROI.
+# Reason: ROI is absent here — it is stamped separately, uppercase ("ROI"),
+# for backwards compatibility with calculate_distances.py and the notebooks.
+_META_COLS = (
+    "date",
+    "machine_name",
+    "species",
+    "male",
+    "training_date_time",
+    "testing_date_time",
+    "training_length_hr",
+    "consolidation_length_hr",
+    "memory",
+    "age",
+)
+
+
+def _open_ro(db_path: str, cache: dict) -> sqlite3.Connection | None:
+    """Cached read-only sqlite connection. Returns None on failure."""
+    if not isinstance(db_path, str) or not db_path:
+        return None
+    if db_path not in cache:
+        try:
+            cache[db_path] = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
+        except sqlite3.Error as e:
+            print(f"failed to open {Path(db_path).name}: {e}")
+            cache[db_path] = None
+    return cache[db_path]
+
+
+def load_roi_data(meta: pd.DataFrame | None = None) -> pd.DataFrame:
+    """Load ROI tracking data joined with experimental metadata.
+
+    For each row in ``meta``, reads the matching ROI table from both the
+    training DB and the testing DB (whichever exist), and stamps every
+    sample with the row's metadata plus a ``session`` column
+    (``"training"`` or ``"testing"``). Rows with empty DB paths (unusable
+    videos, or videos that didn't pass the completeness gate) are skipped.
+
+    Args:
+        meta: optional DataFrame with the same schema as
+            ``all_video_info_merged.tsv``. Pass a filtered slice to load a
+            subset (e.g. ``meta[meta.species == 'Melanogaster/CS']``).
+            Defaults to the full TSV.
 
     Returns:
-        tuple: (trained_df, untrained_df) DataFrames with tracking data.
+        DataFrame with columns ``id, t, x, y, w, h, phi, is_inferred,
+        has_interacted, session, ROI, <metadata>`` — one row per tracking
+        sample. Empty if nothing could be loaded.
     """
-    metadata = pd.read_csv(DATA_METADATA / '2025_07_15_metadata_fixed.csv')
-    metadata['machine_name'] = metadata['machine_name'].astype(str)
+    if meta is None:
+        meta = pd.read_csv(VIDEO_INFO_XLSX.with_suffix(".tsv"), sep="\t")
 
-    trained_rois = metadata[metadata['group'] == 'trained']
-    untrained_rois = metadata[metadata['group'] == 'untrained']
+    db_cache: dict = {}
+    chunks: list[pd.DataFrame] = []
 
-    db_files = list(DATA_RAW.glob('*_tracking.db'))
-
-    trained_df = pd.DataFrame()
-    untrained_df = pd.DataFrame()
-
-    for db_file in db_files:
-        print(f"Processing {db_file.name}")
-
-        pattern = r'_([0-9a-f]{32})__'
-        match = re.search(pattern, db_file.name)
-
-        if not match:
-            print(f"Could not extract UUID from {db_file.name}")
-            continue
-
-        uuid = match.group(1)
-        metadata_matches = metadata[metadata['path'].str.contains(uuid, na=False)]
-
-        if metadata_matches.empty:
-            print(f"No metadata matches found for UUID {uuid} from {db_file.name}")
-            continue
-
-        machine_id = metadata_matches.iloc[0]['machine_name']
-        print(f"Matched to machine ID: {machine_id}")
-
-        conn = sqlite3.connect(str(db_file))
-
-        machine_trained = trained_rois[trained_rois['machine_name'] == machine_id]
-        machine_untrained = untrained_rois[untrained_rois['machine_name'] == machine_id]
-
-        for _, row in machine_trained.iterrows():
-            roi = row['ROI']
+    for row in meta.itertuples(index=False):
+        for session in ("training", "testing"):
+            conn = _open_ro(getattr(row, f"{session}_db_path"), db_cache)
+            if conn is None:
+                continue
             try:
-                query = f"SELECT * FROM ROI_{roi}"
-                roi_data = pd.read_sql_query(query, conn)
-                roi_data['machine_name'] = machine_id
-                roi_data['ROI'] = roi
-                roi_data['group'] = 'trained'
-                trained_df = pd.concat([trained_df, roi_data], ignore_index=True)
+                df = pd.read_sql_query(
+                    f"SELECT * FROM ROI_{int(row.roi)}", conn
+                )
             except Exception as e:
-                print(f"Error loading ROI_{roi} from {db_file.name}: {e}")
+                # Reason: a DB may be missing a ROI table if tracking was
+                # partial — skip rather than abort the whole batch.
+                print(f"  ROI_{row.roi} from {session} DB: {e}")
+                continue
+            df["session"] = session
+            df["ROI"] = int(row.roi)
+            for col in _META_COLS:
+                df[col] = getattr(row, col)
+            chunks.append(df)
 
-        for _, row in machine_untrained.iterrows():
-            roi = row['ROI']
-            try:
-                query = f"SELECT * FROM ROI_{roi}"
-                roi_data = pd.read_sql_query(query, conn)
-                roi_data['machine_name'] = machine_id
-                roi_data['ROI'] = roi
-                roi_data['group'] = 'untrained'
-                untrained_df = pd.concat([untrained_df, roi_data], ignore_index=True)
-            except Exception as e:
-                print(f"Error loading ROI_{roi} from {db_file.name}: {e}")
+    for conn in db_cache.values():
+        if conn is not None:
+            conn.close()
 
-        conn.close()
-
-    return trained_df, untrained_df
+    return pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()
 
 
 if __name__ == "__main__":
-    trained_data, untrained_data = load_roi_data()
-    print(f"Trained data shape: {trained_data.shape}")
-    print(f"Untrained data shape: {untrained_data.shape}")
-    if not trained_data.empty:
-        print("Trained data columns:", trained_data.columns.tolist())
-    if not untrained_data.empty:
-        print("Untrained data columns:", untrained_data.columns.tolist())
-
-    trained_data.to_csv(DATA_PROCESSED / 'trained_roi_data.csv', index=False)
-    untrained_data.to_csv(DATA_PROCESSED / 'untrained_roi_data.csv', index=False)
-    print("Data saved to trained_roi_data.csv and untrained_roi_data.csv")
+    data = load_roi_data()
+    print(f"shape: {data.shape}")
+    if not data.empty:
+        print(f"columns: {list(data.columns)}")
+        print(f"sessions: {data['session'].value_counts().to_dict()}")
+        print(f"unique machines: {data['machine_name'].nunique()}")
+        print(
+            f"unique flies (date,machine,roi): "
+            f"{data.groupby(['date','machine_name','ROI']).ngroups}"
+        )
diff --git a/scripts/monitor_tracking.py b/scripts/monitor_tracking.py
index 9ffa891..991798f 100644
--- a/scripts/monitor_tracking.py
+++ b/scripts/monitor_tracking.py
@@ -97,13 +97,32 @@ def snapshot() -> str:
     )
     lines.append(f"  errors in log:     {len(errors)}")
 
-    # Rate from the last 10 completions, when available.
-    if len(history) >= 2:
-        window = history[-min(10, len(history)) :]
-        span = window[-1] - window[0]
-        if span > 0:
-            rate_per_hour = (len(window) - 1) / span * 3600
-            lines.append(f"  rate (last {len(window) - 1}):    {rate_per_hour:.1f} videos/hour")
+    # Rate from completions in the last 6 h — robust to gaps from killed /
+    # restarted runs, while wide enough to span multiple parallel-worker
+    # completion bursts. Reason: with 8 workers all started together on
+    # multi-hour videos, completions arrive in tight bursts every ~video-
+    # length apart; a 30-min window catches one burst and overestimates by
+    # ~10×. 6 h spans at least one full burst cycle for typical videos.
+    now_ts = time.time()
+    window_secs = 6 * 3600
+    recent = [t for t in history if t >= now_ts - window_secs]
+    if len(recent) >= 2:
+        # Reason: with N parallel workers, completions arrive in clumps
+        # (all workers finish near-simultaneously). Dividing N by the *burst*
+        # span gives nonsense rates. Use the full window as the denominator
+        # once the batch has been running long enough to fill it; otherwise
+        # use elapsed-since-first-DB. Detection: if every DB on disk also
+        # falls inside the window, the batch is younger than the window.
+        if len(recent) == len(history):
+            elapsed = max(1.0, now_ts - history[0])
+        else:
+            elapsed = float(window_secs)
+        if elapsed > 0:
+            rate_per_hour = len(recent) / elapsed * 3600
+            lines.append(
+                f"  rate (last {len(recent)} in {int(window_secs/3600)} h):"
+                f"    {rate_per_hour:.1f} videos/hour"
+            )
             remaining = max(0, pickable - tracked)
             if rate_per_hour > 0 and remaining > 0:
                 eta_sec = remaining * 3600 / rate_per_hour
@@ -112,6 +131,8 @@ def snapshot() -> str:
                     f"  ETA remaining:     {fmt_duration(eta_sec)}  "
                     f"(done by {eta_at:%H:%M %a})"
                 )
+    else:
+        lines.append("  rate:              (warming up — check again in a few min)")
 
     if last_mtime is not None and last_name is not None:
         ago = (datetime.now() - last_mtime).total_seconds()
diff --git a/scripts/track_videos.py b/scripts/track_videos.py
index d9bd197..cb65292 100644
--- a/scripts/track_videos.py
+++ b/scripts/track_videos.py
@@ -3,7 +3,7 @@
 Reads target JSONs produced by `pick_targets.py`, builds the 6 ROIs of the
 HD mating arena from the L-shape reference points, runs ethoscope's
 `MultiFlyTracker` against the merged.mp4 file via `MovieVirtualCamera`, and
-writes a SQLite DB to `data/tracked/<video_basename>_tracking.db`.
+writes a SQLite DB to `TRACKING_OUTPUT_DIR/<video_basename>_tracking.db`.
 
 Idempotent: skips videos whose tracking DB already exists (unless --redo).
 
@@ -58,17 +58,46 @@ def track_one(json_path: Path, output_dir: Path, max_duration: float | None,
     from ethoscope.io.sqlite import SQLiteResultWriter
     from ethoscope.trackers.multi_fly_tracker import MultiFlyTracker
 
-    class BGRMovieCamera(MovieVirtualCamera):
-        """MovieVirtualCamera variant that keeps BGR frames.
+    import time as _time
 
-        MultiFlyTracker calls cv2.cvtColor(img, COLOR_BGR2GRAY) without checking
-        whether img is already grayscale, so we must feed it 3-channel input.
+    class BGRMovieCamera(MovieVirtualCamera):
+        """MovieVirtualCamera that keeps BGR frames AND retries on transient
+        read failures.
+
+        Two reasons for the override:
+
+        1. MultiFlyTracker calls cv2.cvtColor(img, COLOR_BGR2GRAY) without
+           checking whether img is already grayscale, so we must feed it
+           3-channel input.
+
+        2. cv2.VideoCapture.read() can return False on transient I/O hiccups
+           (NFS contention when 8 workers pull big mp4s in parallel) without
+           the file actually being at EOF. A naive "False -> StopIteration"
+           handling makes the tracker silently exit mid-video and write a
+           short, lying DB. We retry a few times and only treat persistent
+           failures within the *interior* of the video as real EOF.
         """
+
+        _retry_count = 5
+        _retry_backoff_s = 0.25
+        _eof_safety_frames = 50  # near end-of-file, treat False as legitimate
+
         def _next_image(self):
-            ret, frame = self.capture.read()
-            if not ret or frame is None:
-                return None
-            return frame  # BGR, untouched
+            for attempt in range(self._retry_count):
+                ret, frame = self.capture.read()
+                if ret and frame is not None:
+                    return frame  # BGR, untouched
+                # If we're near the genuine end of the file, accept it.
+                if (
+                    self._has_end_of_file
+                    and self._frame_idx >= self._total_n_frames - self._eof_safety_frames
+                ):
+                    return None
+                # Otherwise, this is a suspected transient hiccup — back off
+                # and try again. The capture is still open; cv2 will pick up
+                # the next decoded frame.
+                _time.sleep(self._retry_backoff_s)
+            return None  # truly persistent failure
 
     payload = json.loads(json_path.read_text())
     if payload.get("unusable"):
@@ -146,6 +175,42 @@ def track_one(json_path: Path, output_dir: Path, max_duration: float | None,
 
     if not out_db.exists():
         return "error", "tracking finished but DB was not created"
+
+    # Post-tracking sanity check: did we cover most of the source video?
+    # If not (cv2 retry exhausted, codec corruption, etc.), reject the DB so
+    # it doesn't get cached as "done" — better an explicit failure than a
+    # silent partial write.
+    expected_ms = (cam._total_n_frames / 25.0) * 1000.0
+    if max_duration is not None:
+        expected_ms = min(expected_ms, max_duration * 1000.0)
+    completeness_threshold = 0.90  # require ≥ 90 % of expected duration
+
+    # Use MAX(t) across all ROIs — a single ROI can run dry early if its fly
+    # stops moving, so the latest detection anywhere in the arena is the
+    # better signal of how far the iterator actually got.
+    import sqlite3 as _sqlite3
+    try:
+        _con = _sqlite3.connect(f"file:{out_db}?mode=ro", uri=True)
+        t_max = 0
+        for _i in range(1, 7):
+            _v = _con.execute(f"SELECT MAX(t) FROM ROI_{_i}").fetchone()[0]
+            if _v and _v > t_max:
+                t_max = _v
+        _con.close()
+    except Exception:
+        t_max = 0
+
+    if expected_ms > 0 and t_max < expected_ms * completeness_threshold:
+        out_db.unlink()
+        for sidecar in (str(out_db) + "-wal", str(out_db) + "-shm"):
+            Path(sidecar).unlink(missing_ok=True)
+        ratio = t_max / expected_ms if expected_ms else 0
+        return (
+            "error",
+            f"short output: t_max={t_max} ms vs expected {int(expected_ms)} ms "
+            f"({ratio*100:.0f}%); DB removed",
+        )
+
     return "ok", str(out_db)
 
 
diff --git a/tasks/todo.md b/tasks/todo.md
index f86bd65..30b473c 100644
--- a/tasks/todo.md
+++ b/tasks/todo.md
@@ -115,4 +115,26 @@ all targets are picked, tracking can run in the background.
 
 ## Discovered During Work
 
-(Add new items here as they come up during analysis)
+### Barrier-opening annotation for the 2024 batch (added 2026-04-30)
+The current `flies_analysis*.ipynb` aligns trajectories to a barrier-opening
+event sourced from `data/metadata/2025_07_15_barrier_opening.csv`. That file
+covers only the 5 machines in the 2025-07-15 experiment. The 2024 batch
+(`/mnt/data/projects/cupido/tracked/`, 113 DBs) has no equivalent annotation
+yet, so all post-alignment cells silently exclude that data.
+
+- [ ] Build a small picker that lets the user scrub through each tracking
+      DB / video and mark the barrier-opening frame, writing a row to a new
+      `data/metadata/barrier_opening_2024.csv` (or extend the existing
+      file with a date column).
+- [ ] Once the 2024 entries exist, update `align_to_opening_time` so it
+      pulls from a unified `barrier_opening` table keyed by
+      `(date, machine_name)` rather than `machine_name` alone.
+
+### Metadata vocabulary normalization (done 2026-04-30)
+The xlsx had inconsistent labels for control flies (`'naïve'`, `'niave'`,
+`'untrained'` plus trailing whitespace). All sources now use a single
+canonical `'naive'`. Normalization happens in
+`scripts/export_video_db_index.py` so re-running it from the xlsx always
+produces a clean TSV. The 2025-07-15 legacy CSV
+(`data/metadata/2025_07_15_metadata_fixed.csv`) was edited in place from
+`'untrained'` → `'naive'`.