diff --git a/.gitignore b/.gitignore index 02d5434..07f3445 100644 --- a/.gitignore +++ b/.gitignore @@ -2,11 +2,8 @@ data/raw/*.db data/processed/*.csv -# Offline-tracking outputs (reproducible from videos + target JSONs) -data/tracked/*.db -data/tracked/*.db-wal -data/tracked/*.db-shm -data/tracked/*.db-journal +# Offline-tracking outputs (regenerable from videos + target JSONs) +# DBs live outside the repo at /mnt/data/projects/cupido/tracked/ data/targets/*.json data/metadata/video_inventory.csv data/logs/*.log diff --git a/README.md b/README.md index 9d9ff17..5644fea 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ python scripts/pick_targets.py --redo # re-pick already-picked videos # 3) batch tracking (idempotent, can run in background) python scripts/track_videos.py --jobs 4 # parallel -# output → data/tracked/*_tracking.db (SQLite, same schema as data/raw/) +# output → /mnt/data/projects/cupido/tracked/*_tracking.db (SQLite, same schema as data/raw/) ``` See `tasks/todo.md` "Offline Tracking" section for the full plan, and diff --git a/data/metadata/2025_07_15_metadata_fixed.csv b/data/metadata/2025_07_15_metadata_fixed.csv index 36d07c5..bce7bcc 100644 --- a/data/metadata/2025_07_15_metadata_fixed.csv +++ b/data/metadata/2025_07_15_metadata_fixed.csv @@ -1,37 +1,37 @@ -date,HHMMSS,machine_name,ROI,genotype,group,path,filesize_mb +date,HHMMSS,machine_name,ROI,genotype,group,path,filesize_mb 15/07/2025,16-03-10,76,6,CS,trained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-03-10/2025-07-15_16-03-10_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,59.4 -15/07/2025,16-03-10,76,4,CS,untrained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-03-10/2025-07-15_16-03-10_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,59.4 
+15/07/2025,16-03-10,76,4,CS,naive,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-03-10/2025-07-15_16-03-10_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,59.4 15/07/2025,16-03-10,76,2,CS,trained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-03-10/2025-07-15_16-03-10_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,59.4 -15/07/2025,16-03-10,76,5,CS,untrained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-03-10/2025-07-15_16-03-10_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,59.4 +15/07/2025,16-03-10,76,5,CS,naive,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-03-10/2025-07-15_16-03-10_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,59.4 15/07/2025,16-03-10,76,3,CS,trained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-03-10/2025-07-15_16-03-10_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,59.4 -15/07/2025,16-03-10,76,1,CS,untrained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-03-10/2025-07-15_16-03-10_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,59.4 +15/07/2025,16-03-10,76,1,CS,naive,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-03-10/2025-07-15_16-03-10_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,59.4 15/07/2025,16-31-34,76,6,CS,trained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-31-34/2025-07-15_16-31-34_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,78.98 15/07/2025,16-31-34,76,4,CS,trained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-31-34/2025-07-15_16-31-34_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,78.98 
15/07/2025,16-31-34,76,2,CS,trained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-31-34/2025-07-15_16-31-34_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,78.98 -15/07/2025,16-31-34,76,5,CS,untrained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-31-34/2025-07-15_16-31-34_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,78.98 -15/07/2025,16-31-34,76,3,CS,untrained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-31-34/2025-07-15_16-31-34_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,78.98 -15/07/2025,16-31-34,76,1,CS,untrained,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-31-34/2025-07-15_16-31-34_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,78.98 +15/07/2025,16-31-34,76,5,CS,naive,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-31-34/2025-07-15_16-31-34_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,78.98 +15/07/2025,16-31-34,76,3,CS,naive,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-31-34/2025-07-15_16-31-34_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,78.98 +15/07/2025,16-31-34,76,1,CS,naive,/mnt/ethoscope_data/videos/076e2825a7274661bd0697c42d6fa4c0/ETHOSCOPE_076/2025-07-15_16-31-34/2025-07-15_16-31-34_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged.mp4,78.98 15/07/2025,16-03-27,145,6,CS,trained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-03-27/2025-07-15_16-03-27_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,78.72 15/07/2025,16-03-27,145,4,CS,trained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-03-27/2025-07-15_16-03-27_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,78.72 
15/07/2025,16-03-27,145,2,CS,trained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-03-27/2025-07-15_16-03-27_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,78.72 -15/07/2025,16-03-27,145,5,CS,untrained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-03-27/2025-07-15_16-03-27_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,78.72 -15/07/2025,16-03-27,145,3,CS,untrained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-03-27/2025-07-15_16-03-27_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,78.72 -15/07/2025,16-03-27,145,1,CS,untrained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-03-27/2025-07-15_16-03-27_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,78.72 +15/07/2025,16-03-27,145,5,CS,naive,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-03-27/2025-07-15_16-03-27_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,78.72 +15/07/2025,16-03-27,145,3,CS,naive,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-03-27/2025-07-15_16-03-27_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,78.72 +15/07/2025,16-03-27,145,1,CS,naive,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-03-27/2025-07-15_16-03-27_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,78.72 15/07/2025,16-31-41,145,6,CS,trained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-31-41/2025-07-15_16-31-41_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,90.9 
15/07/2025,16-31-41,145,4,CS,trained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-31-41/2025-07-15_16-31-41_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,90.9 15/07/2025,16-31-41,145,2,CS,trained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-31-41/2025-07-15_16-31-41_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,90.9 -15/07/2025,16-31-41,145,5,CS,untrained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-31-41/2025-07-15_16-31-41_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,90.9 -15/07/2025,16-31-41,145,3,CS,untrained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-31-41/2025-07-15_16-31-41_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,90.9 -15/07/2025,16-31-41,145,1,CS,untrained,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-31-41/2025-07-15_16-31-41_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,90.9 +15/07/2025,16-31-41,145,5,CS,naive,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-31-41/2025-07-15_16-31-41_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,90.9 +15/07/2025,16-31-41,145,3,CS,naive,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-31-41/2025-07-15_16-31-41_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,90.9 +15/07/2025,16-31-41,145,1,CS,naive,/mnt/ethoscope_data/videos/145bb573497a4e15b0690206748a3af6/ETHOSCOPE_145/2025-07-15_16-31-41/2025-07-15_16-31-41_145bb573497a4e15b0690206748a3af6__1920x1088@25fps-28q_merged.mp4,90.9 15/07/2025,16-31-52,139,6,CS,trained,/mnt/ethoscope_data/videos/13924be2046d49f4a641cef2a5559852/ETHOSCOPE_139/2025-07-15_16-31-52/2025-07-15_16-31-52_13924be2046d49f4a641cef2a5559852__1920x1088@25fps-28q_merged.mp4,73.4 
15/07/2025,16-31-52,139,4,CS,trained,/mnt/ethoscope_data/videos/13924be2046d49f4a641cef2a5559852/ETHOSCOPE_139/2025-07-15_16-31-52/2025-07-15_16-31-52_13924be2046d49f4a641cef2a5559852__1920x1088@25fps-28q_merged.mp4,73.4 15/07/2025,16-31-52,139,2,CS,trained,/mnt/ethoscope_data/videos/13924be2046d49f4a641cef2a5559852/ETHOSCOPE_139/2025-07-15_16-31-52/2025-07-15_16-31-52_13924be2046d49f4a641cef2a5559852__1920x1088@25fps-28q_merged.mp4,73.4 -15/07/2025,16-31-52,139,5,CS,untrained,/mnt/ethoscope_data/videos/13924be2046d49f4a641cef2a5559852/ETHOSCOPE_139/2025-07-15_16-31-52/2025-07-15_16-31-52_13924be2046d49f4a641cef2a5559852__1920x1088@25fps-28q_merged.mp4,73.4 -15/07/2025,16-31-52,139,3,CS,untrained,/mnt/ethoscope_data/videos/13924be2046d49f4a641cef2a5559852/ETHOSCOPE_139/2025-07-15_16-31-52/2025-07-15_16-31-52_13924be2046d49f4a641cef2a5559852__1920x1088@25fps-28q_merged.mp4,73.4 -15/07/2025,16-31-52,139,1,CS,untrained,/mnt/ethoscope_data/videos/13924be2046d49f4a641cef2a5559852/ETHOSCOPE_139/2025-07-15_16-31-52/2025-07-15_16-31-52_13924be2046d49f4a641cef2a5559852__1920x1088@25fps-28q_merged.mp4,73.4 -15/07/2025,16-32-05,268,6,CS,untrained,/mnt/ethoscope_data/videos/268102f92f51486f995200c29d980477/ETHOSCOPE_268/2025-07-15_16-32-05/2025-07-15_16-32-05_268102f92f51486f995200c29d980477__1920x1088@25fps-28q_merged.mp4,43.72 -15/07/2025,16-32-05,268,4,CS,untrained,/mnt/ethoscope_data/videos/268102f92f51486f995200c29d980477/ETHOSCOPE_268/2025-07-15_16-32-05/2025-07-15_16-32-05_268102f92f51486f995200c29d980477__1920x1088@25fps-28q_merged.mp4,43.72 -15/07/2025,16-32-05,268,2,CS,untrained,/mnt/ethoscope_data/videos/268102f92f51486f995200c29d980477/ETHOSCOPE_268/2025-07-15_16-32-05/2025-07-15_16-32-05_268102f92f51486f995200c29d980477__1920x1088@25fps-28q_merged.mp4,43.72 
+15/07/2025,16-31-52,139,5,CS,naive,/mnt/ethoscope_data/videos/13924be2046d49f4a641cef2a5559852/ETHOSCOPE_139/2025-07-15_16-31-52/2025-07-15_16-31-52_13924be2046d49f4a641cef2a5559852__1920x1088@25fps-28q_merged.mp4,73.4 +15/07/2025,16-31-52,139,3,CS,naive,/mnt/ethoscope_data/videos/13924be2046d49f4a641cef2a5559852/ETHOSCOPE_139/2025-07-15_16-31-52/2025-07-15_16-31-52_13924be2046d49f4a641cef2a5559852__1920x1088@25fps-28q_merged.mp4,73.4 +15/07/2025,16-31-52,139,1,CS,naive,/mnt/ethoscope_data/videos/13924be2046d49f4a641cef2a5559852/ETHOSCOPE_139/2025-07-15_16-31-52/2025-07-15_16-31-52_13924be2046d49f4a641cef2a5559852__1920x1088@25fps-28q_merged.mp4,73.4 +15/07/2025,16-32-05,268,6,CS,naive,/mnt/ethoscope_data/videos/268102f92f51486f995200c29d980477/ETHOSCOPE_268/2025-07-15_16-32-05/2025-07-15_16-32-05_268102f92f51486f995200c29d980477__1920x1088@25fps-28q_merged.mp4,43.72 +15/07/2025,16-32-05,268,4,CS,naive,/mnt/ethoscope_data/videos/268102f92f51486f995200c29d980477/ETHOSCOPE_268/2025-07-15_16-32-05/2025-07-15_16-32-05_268102f92f51486f995200c29d980477__1920x1088@25fps-28q_merged.mp4,43.72 +15/07/2025,16-32-05,268,2,CS,naive,/mnt/ethoscope_data/videos/268102f92f51486f995200c29d980477/ETHOSCOPE_268/2025-07-15_16-32-05/2025-07-15_16-32-05_268102f92f51486f995200c29d980477__1920x1088@25fps-28q_merged.mp4,43.72 15/07/2025,16-32-05,268,5,CS,trained,/mnt/ethoscope_data/videos/268102f92f51486f995200c29d980477/ETHOSCOPE_268/2025-07-15_16-32-05/2025-07-15_16-32-05_268102f92f51486f995200c29d980477__1920x1088@25fps-28q_merged.mp4,43.72 15/07/2025,16-32-05,268,3,CS,trained,/mnt/ethoscope_data/videos/268102f92f51486f995200c29d980477/ETHOSCOPE_268/2025-07-15_16-32-05/2025-07-15_16-32-05_268102f92f51486f995200c29d980477__1920x1088@25fps-28q_merged.mp4,43.72 15/07/2025,16-32-05,268,1,CS,trained,/mnt/ethoscope_data/videos/268102f92f51486f995200c29d980477/ETHOSCOPE_268/2025-07-15_16-32-05/2025-07-15_16-32-05_268102f92f51486f995200c29d980477__1920x1088@25fps-28q_merged.mp4,43.72 diff --git 
a/data/processed/README.md b/data/processed/README.md index 97d2e82..d934460 100644 --- a/data/processed/README.md +++ b/data/processed/README.md @@ -1,39 +1,47 @@ # Processed Data -Large CSV files generated from the analysis pipeline. All files are gitignored (~370MB total) and can be regenerated. +CSVs derived from the tracking DBs (`/mnt/data/projects/cupido/tracked/`) +and the merged TSV (`../../all_video_info_merged.tsv`). All files are +gitignored and regenerable. ## Files and Regeneration | File | Description | Generated By | |------|-------------|--------------| -| `trained_roi_data.csv` | Raw tracking data for trained ROIs | `scripts/load_roi_data.py` or notebook step 1 | -| `untrained_roi_data.csv` | Raw tracking data for untrained ROIs | `scripts/load_roi_data.py` or notebook step 1 | -| `trained_distances.csv` | Pairwise distances (unaligned) | `scripts/calculate_distances.py` | -| `untrained_distances.csv` | Pairwise distances (unaligned) | `scripts/calculate_distances.py` | -| `trained_distances_aligned.csv` | Distances aligned to barrier opening | Notebook step 4 | -| `untrained_distances_aligned.csv` | Distances aligned to barrier opening | Notebook step 4 | -| `trained_tracked.csv` | Identity-tracked fly positions | Notebook step 7 | -| `untrained_tracked.csv` | Identity-tracked fly positions | Notebook step 7 | -| `trained_max_velocity.csv` | Max velocity over 10s windows | Notebook step 7 | -| `untrained_max_velocity.csv` | Max velocity over 10s windows | Notebook step 7 | +| `distances.csv` | Per-frame inter-fly distances for every (date, machine, ROI, session). Includes metadata columns to filter trained vs naïve, training phase, species, etc. 
| `scripts/calculate_distances.py` | +| `*_distances_aligned.csv` | (legacy, 2025-07-15 only) distances aligned to barrier opening | `notebooks/flies_analysis*.ipynb` | +| `*_tracked.csv` | (legacy) identity-tracked fly positions | `notebooks/flies_analysis_simple.ipynb` | +| `*_max_velocity.csv` | (legacy) max velocity over 10 s windows | `notebooks/flies_analysis_simple.ipynb` | -## To Regenerate All Data +## Loading the data -Run the full notebook `notebooks/flies_analysis_simple.ipynb` with: ```python -recalculate_distances = True -recalculate_tracking = True +import sys +sys.path.insert(0, "../scripts") +from load_roi_data import load_roi_data + +data = load_roi_data() # full batch as one DataFrame +# Or filter the metadata first: +import pandas as pd +tsv = pd.read_csv("../../all_video_info_merged.tsv", sep="\t") +data = load_roi_data(tsv[tsv.species.str.contains("Melanogaster")]) ``` -**Warning**: Identity tracking and velocity calculations take significant time (~30+ minutes). +The returned DataFrame has columns: +`id, t, x, y, w, h, phi, is_inferred, has_interacted, session, ROI, date, +machine_name, species, male, training_date_time, testing_date_time, +training_length_hr, consolidation_length_hr, memory, age`. -## Column Reference +`session` is `"training"` or `"testing"`; `male` is `"trained"` or +`"naive"` (canonical — variants like `"naïve"` and `"niave"` are normalized +at the TSV-export step). 
-### Distance CSVs (`*_distances_aligned.csv`) -- `machine_name`: Ethoscope machine ID (string) -- `ROI`: ROI number (1-6) -- `aligned_time`: Time in ms relative to barrier opening (0 = opening) -- `distance`: Euclidean distance between flies in pixels -- `n_flies`: Number of flies detected at this time point -- `area_fly1`, `area_fly2`: Bounding box areas (w*h) in pixels^2 -- `group`: "trained" or "untrained" +## Column Reference (`distances.csv`) + +- `date`, `machine_name`, `ROI`, `session`: identifies one fly trajectory +- `t`: time in ms within that session +- `distance`: Euclidean distance between the two flies in pixels +- `n_flies`: number of fly detections at this frame (1 or 2) +- `area_fly1`, `area_fly2`: bounding-box areas (`w * h`) in pixels² +- `male`: `trained` or `naive` (carried from the xlsx; normalized) +- `species`, `memory`, `age`: experimental metadata diff --git a/notebooks/flies_analysis.ipynb b/notebooks/flies_analysis.ipynb index d9c24e3..9bf3a30 100644 --- a/notebooks/flies_analysis.ipynb +++ b/notebooks/flies_analysis.ipynb @@ -28,7 +28,22 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "def load_roi_data():\n \"\"\"Load ROI data from SQLite databases and group by trained/untrained\"\"\"\n metadata = pd.read_csv(DATA_METADATA / '2025_07_15_metadata_fixed.csv')\n metadata['machine_name'] = metadata['machine_name'].astype(str)\n \n trained_rois = metadata[metadata['group'] == 'trained']\n untrained_rois = metadata[metadata['group'] == 'untrained']\n \n db_files = list(DATA_RAW.glob('*_tracking.db'))\n \n trained_df = pd.DataFrame()\n untrained_df = pd.DataFrame()\n \n for db_file in db_files:\n print(f\"Processing {db_file.name}\")\n \n pattern = r'_([0-9a-f]{32})__'\n match = re.search(pattern, db_file.name)\n \n if not match:\n print(f\"Could not extract UUID from {db_file.name}\")\n continue\n \n uuid = match.group(1)\n metadata_matches = metadata[metadata['path'].str.contains(uuid, na=False)]\n \n if 
metadata_matches.empty:\n print(f\"No metadata matches found for UUID {uuid}\")\n continue\n \n machine_id = metadata_matches.iloc[0]['machine_name']\n print(f\"Matched to machine ID: {machine_id}\")\n \n conn = sqlite3.connect(str(db_file))\n \n machine_trained = trained_rois[trained_rois['machine_name'] == machine_id]\n machine_untrained = untrained_rois[untrained_rois['machine_name'] == machine_id]\n \n for _, row in machine_trained.iterrows():\n roi = row['ROI']\n try:\n roi_data = pd.read_sql_query(f\"SELECT * FROM ROI_{roi}\", conn)\n roi_data['machine_name'] = machine_id\n roi_data['ROI'] = roi\n roi_data['group'] = 'trained'\n trained_df = pd.concat([trained_df, roi_data], ignore_index=True)\n except Exception as e:\n print(f\"Error loading ROI_{roi}: {e}\")\n \n for _, row in machine_untrained.iterrows():\n roi = row['ROI']\n try:\n roi_data = pd.read_sql_query(f\"SELECT * FROM ROI_{roi}\", conn)\n roi_data['machine_name'] = machine_id\n roi_data['ROI'] = roi\n roi_data['group'] = 'untrained'\n untrained_df = pd.concat([untrained_df, roi_data], ignore_index=True)\n except Exception as e:\n print(f\"Error loading ROI_{roi}: {e}\")\n \n conn.close()\n \n return trained_df, untrained_df\n\ntrained_data, untrained_data = load_roi_data()\nprint(f\"Trained data shape: {trained_data.shape}\")\nprint(f\"Untrained data shape: {untrained_data.shape}\")\n\ntrained_data.to_csv(DATA_PROCESSED / 'trained_roi_data.csv', index=False)\nuntrained_data.to_csv(DATA_PROCESSED / 'untrained_roi_data.csv', index=False)\nprint(\"Data saved to CSV files\")" + "source": [ + "# Load tracking data via the unified loader (driven by all_video_info_merged.tsv).\n", + "# Reason: replaces the old data/raw + 2025_07_15_metadata_fixed.csv path with\n", + "# the TSV-based loader that covers the entire batch (2025-07-15 + 2024).\n", + "sys.path.insert(0, str(PROJECT_ROOT / 'scripts'))\n", + "from load_roi_data import load_roi_data\n", + "\n", + "data = load_roi_data()\n", + "# Backwards-compat 
slices for the rest of the notebook.\n", + "trained_data = data[data['male'] == 'trained'].copy()\n", + "untrained_data = data[data['male'] == 'naive'].copy()\n", + "\n", + "print(f\"all data: {data.shape}\")\n", + "print(f\"trained: {trained_data.shape}\")\n", + "print(f\"naive: {untrained_data.shape}\")\n" + ] }, { "cell_type": "markdown", @@ -219,4 +234,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/notebooks/flies_analysis_simple.ipynb b/notebooks/flies_analysis_simple.ipynb index 1663b10..7072c73 100644 --- a/notebooks/flies_analysis_simple.ipynb +++ b/notebooks/flies_analysis_simple.ipynb @@ -28,7 +28,22 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Load the pre-processed data\ntrained_data = pd.read_csv(DATA_PROCESSED / 'trained_roi_data.csv')\nuntrained_data = pd.read_csv(DATA_PROCESSED / 'untrained_roi_data.csv')\n\nprint(f\"Trained data shape: {trained_data.shape}\")\nprint(f\"Untrained data shape: {untrained_data.shape}\")\nprint(f\"Trained data columns: {list(trained_data.columns)}\")\nprint(f\"Untrained data columns: {list(untrained_data.columns)}\")" + "source": [ + "# Load tracking data via the unified loader (driven by all_video_info_merged.tsv).\n", + "# Reason: replaces reads of trained_roi_data.csv / untrained_roi_data.csv with\n", + "# the live loader so the notebook always sees the current batch.\n", + "sys.path.insert(0, str(PROJECT_ROOT / 'scripts'))\n", + "from load_roi_data import load_roi_data\n", + "\n", + "data = load_roi_data()\n", + "trained_data = data[data['male'] == 'trained'].copy()\n", + "untrained_data = data[data['male'] == 'naive'].copy()\n", + "\n", + "print(f\"all data shape: {data.shape}\")\n", + "print(f\"Trained data: {trained_data.shape}\")\n", + "print(f\"Naive data: {untrained_data.shape}\")\n", + "print(f\"Columns: {list(trained_data.columns)}\")\n" + ] }, { "cell_type": "markdown", @@ -418,4 +433,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No 
# Columns that together identify one trajectory (one ROI in one session).
_TRACK_KEYS = ["date", "machine_name", "ROI", "session"]

# Experimental metadata copied onto every output row when the input has it.
_TRACK_META = ["male", "species", "memory", "age"]


def calculate_fly_distances(data: pd.DataFrame | None = None) -> pd.DataFrame:
    """Compute inter-fly distances over time for every fly/session.

    For each time point inside one (date, machine, ROI, session) trajectory:
    - 2+ detections: Euclidean distance between the first two by ``id``
      (``n_flies`` still reports the real number of detections, which may
      exceed 2).
    - 1 detection: distance = 0 if its bbox area exceeds the global mean
      (likely a single blob containing both flies), else NaN.

    Args:
        data: optional pre-loaded tracking DataFrame. It must provide
            ``id``, ``t``, ``x``, ``y``, ``w``, ``h`` and the four track
            key columns (``date``, ``machine_name``, ``ROI``, ``session``).
            If None, the full batch is loaded with ``load_roi_data()``.

    Returns:
        DataFrame with one row per (track, time) pair: the track keys, any
        of ``male`` / ``species`` / ``memory`` / ``age`` present in the
        input, then ``t``, ``distance``, ``n_flies``, ``area_fly1`` and
        ``area_fly2``. An empty input yields an empty DataFrame.
    """
    if data is None:
        data = load_roi_data()
    if data.empty:
        return pd.DataFrame()

    # Work on a copy so the caller's frame does not gain an ``area`` column.
    data = data.copy()
    data["area"] = data["w"] * data["h"]
    avg_area = data["area"].mean()
    print(f"Average area across all data: {avg_area:.2f}")

    # Only carry metadata the input actually has, so frames that lack the
    # optional xlsx columns do not raise KeyError.
    meta_cols = [col for col in _TRACK_META if col in data.columns]

    rows: list[dict] = []
    # NOTE: pandas groupby drops rows whose key is NaN by default, so samples
    # without a complete track key do not contribute any output row.
    for track, track_df in data.groupby(_TRACK_KEYS, sort=False):
        meta_row = dict(zip(_TRACK_KEYS, track))
        # Metadata is taken from the first sample of the track.
        # NOTE(review): assumes it is constant within a track; confirm.
        sample = track_df.iloc[0]
        for col in meta_cols:
            meta_row[col] = sample[col]

        for t, time_group in track_df.groupby("t", sort=False):
            flies = time_group.sort_values("id").reset_index(drop=True)
            row = dict(meta_row)
            row["t"] = t
            if len(flies) >= 2:
                f1, f2 = flies.iloc[0], flies.iloc[1]
                row["distance"] = euclidean([f1["x"], f1["y"]], [f2["x"], f2["y"]])
                row["n_flies"] = len(flies)
                row["area_fly1"] = f1["area"]
                row["area_fly2"] = f2["area"]
            else:
                only = flies.iloc[0]
                # A larger-than-average blob is read as two touching flies.
                row["distance"] = 0.0 if only["area"] > avg_area else np.nan
                row["n_flies"] = 1
                row["area_fly1"] = only["area"]
                row["area_fly2"] = np.nan
            rows.append(row)

    return pd.DataFrame(rows)


def main() -> None:
    """Compute distances for the full batch, print a summary, and save the CSV."""
    distances = calculate_fly_distances()

    print("\nDistance summary:")
    print(f"  Shape: {distances.shape}")
    if not distances.empty:
        print(f"  Distance count: {distances['distance'].count()}")
        print(f"  Distance mean:  {distances['distance'].mean():.2f}")
        print(f"  Distance std:   {distances['distance'].std():.2f}")
        if "male" in distances.columns:
            # Count distinct tracks, not per-frame rows: the labels say "tracks".
            tracks = distances[_TRACK_KEYS + ["male"]].drop_duplicates()
            print(f"  Trained tracks: {(tracks['male'] == 'trained').sum()}")
            print(f"  Naive tracks:   {(tracks['male'] == 'naive').sum()}")

    DATA_PROCESSED.mkdir(parents=True, exist_ok=True)
    out = DATA_PROCESSED / "distances.csv"
    distances.to_csv(out, index=False)
    print(f"\nSaved {out}")


if __name__ == "__main__":
    main()
# Matches xlsx session labels such as '20241021_11AM' or '20240918_1030AM':
# 8-digit date, underscore, 1-2 digit hour, optional 2-digit minute, AM/PM.
_TIME_RE = re.compile(r"^(\d{8})_(\d{1,2})(\d{2})?(AM|PM)$", re.IGNORECASE)

# xlsx labels are approximate ("11AM" → 10:51 is fine), so a session is
# accepted when it starts within this many minutes of the labelled time.
_TIME_TOLERANCE_MIN = 90

# Variants of "naive" the xlsx has accumulated: 'naïve', 'niave', plus
# trailing whitespace. All collapse to a single canonical 'naive'.
# NOTE(review): tasks/todo.md also lists 'untrained' as a legacy control
# label; it is deliberately NOT folded here — confirm whether the xlsx can
# still contain it.
_MALE_NAIVE_VARIANTS = {"naïve", "niave", "naive"}


def parse_xlsx_time(value: str) -> tuple[str, int] | None:
    """Convert '20241021_11AM' / '20240918_1030AM' to (YYYY-MM-DD, minutes24).

    Resolution is hour-only when no minutes are given (e.g. '11AM' → 11:00).
    The second element is minutes-from-midnight so callers can do
    nearest-neighbor matching against session start times.

    Args:
        value: the xlsx label. Surrounding whitespace and AM/PM case are
            ignored.

    Returns:
        ``(date, minutes)`` or ``None`` when ``value`` is not a string or
        does not match the expected pattern. The hour/minute digits are not
        range-checked; whatever the label says is converted as-is.
    """
    if not isinstance(value, str):
        return None
    match = _TIME_RE.match(value.strip())
    if match is None:
        return None
    ymd, hh, mm, ampm = match.groups()
    date = f"{ymd[:4]}-{ymd[4:6]}-{ymd[6:8]}"
    hour = int(hh)
    minute = int(mm) if mm else 0
    meridiem = ampm.upper()
    # 12-hour → 24-hour: 12AM is midnight, 12PM is noon.
    if meridiem == "PM" and hour != 12:
        hour += 12
    elif meridiem == "AM" and hour == 12:
        hour = 0
    return date, hour * 60 + minute


def build_session_index(inventory: pd.DataFrame) -> dict[tuple[str, str], list[dict]]:
    """Index inventory rows by (date, machine_name) → list of session dicts.

    Args:
        inventory: frame with columns ``session_date``, ``session_time``
            (formatted ``HH-MM-SS``), ``machine_name``, ``mp4_path`` and
            ``session_datetime``.

    Returns:
        Mapping from ``(session_date, machine_name)`` to the sessions
        recorded for that key, each a dict with ``mp4_path``,
        ``session_datetime`` and ``minutes`` (start time as
        minutes-from-midnight; seconds are dropped).

    Raises:
        ValueError: if a ``session_time`` value is not three
            dash-separated integers.
    """
    idx: dict[tuple[str, str], list[dict]] = {}
    for row in inventory.itertuples(index=False):
        hours, minutes, _seconds = (int(p) for p in str(row.session_time).split("-"))
        key = (row.session_date, row.machine_name)
        idx.setdefault(key, []).append({
            "mp4_path": row.mp4_path,
            "session_datetime": row.session_datetime,
            "minutes": hours * 60 + minutes,
        })
    return idx


def db_path_for_video(mp4_path: str, output_dir: Path | None = None) -> Path | None:
    """Return the tracking DB that belongs to ``mp4_path``, if it exists.

    The tracker writes ``<video stem>_tracking.db`` into the tracking output
    directory.

    Args:
        mp4_path: path of the source video; only its stem is used.
        output_dir: directory to look in. Defaults to ``TRACKING_OUTPUT_DIR``.

    Returns:
        The DB path, or ``None`` when no such file exists.
    """
    base = TRACKING_OUTPUT_DIR if output_dir is None else Path(output_dir)
    db = base / f"{Path(mp4_path).stem}_tracking.db"
    return db if db.exists() else None


def _session_gap(target_min: int, session_min: int) -> int:
    """Distance in minutes between a labelled time and a session start.

    Reason: xlsx times like '1230AM' are ambiguous (12 AM vs 12 PM). We try
    both the literal time AND a +12-hour shift (modulo one day), and keep
    whichever interpretation lands closer to the real session.
    """
    shifted = (target_min + 720) % 1440
    return min(abs(session_min - target_min), abs(session_min - shifted))


def resolve_session(
    machine_name: str,
    when: str,
    fallback_date: str | None,
    index: dict[tuple[str, str], list[dict]],
    output_dir: Path | None = None,
) -> tuple[str, str]:
    """Look up the video + db whose start time is closest to `when`.

    Match strategy:
      1. Use the date embedded in `when` (training/testing can fall on a
         different calendar day from the row's ``date`` column).
      2. If no candidates exist for that date, fall back to ``fallback_date``
         (the xlsx row's ``date`` column). Reason: the xlsx contains
         date typos like '20240110_11AM' for an Oct 1 experiment.

    Among candidates, pick the video whose start minute is closest to the
    xlsx-claimed time, within ±_TIME_TOLERANCE_MIN.

    Args:
        machine_name: machine identifier, as used in the index keys.
        when: xlsx time label (see ``parse_xlsx_time``).
        fallback_date: ``YYYY-MM-DD`` date to try when the label's own date
            has no sessions; non-string values (e.g. NaN) are ignored.
        index: mapping produced by ``build_session_index``.
        output_dir: where tracking DBs live; defaults to
            ``TRACKING_OUTPUT_DIR``.

    Returns:
        ``(mp4_path, db_path)``. Both are ``""`` when the label cannot be
        parsed or no session is close enough; ``db_path`` alone is ``""``
        when the video matched but has no tracking DB.
    """
    parsed = parse_xlsx_time(when)
    if parsed is None:
        return "", ""
    date, target_min = parsed

    candidates = index.get((date, machine_name), [])
    if not candidates and isinstance(fallback_date, str) and fallback_date:
        candidates = index.get((fallback_date, machine_name), [])
    if not candidates:
        return "", ""

    best = min(candidates, key=lambda c: _session_gap(target_min, c["minutes"]))
    if _session_gap(target_min, best["minutes"]) > _TIME_TOLERANCE_MIN:
        return "", ""
    db = db_path_for_video(best["mp4_path"], output_dir)
    return best["mp4_path"], (str(db) if db else "")


def _strip_if_str(value):
    """Strip surrounding whitespace from strings; pass anything else through."""
    return value.strip() if isinstance(value, str) else value


def _normalize_metadata(df: pd.DataFrame) -> None:
    """Strip whitespace and canonicalize the ``male`` column in place.

    Only actual string cells are touched. Missing values stay missing:
    casting whole columns to ``str`` would turn them into the literal text
    ``"nan"`` / ``"None"``, which would then be written to the TSV as if it
    were data.

    Args:
        df: metadata frame; must contain a ``male`` column.
    """
    for col in df.select_dtypes(include=("object", "string")).columns:
        df[col] = df[col].map(_strip_if_str)
    df["male"] = df["male"].map(
        lambda v: "naive"
        if isinstance(v, str) and v.lower() in _MALE_NAIVE_VARIANTS
        else v
    )


def main() -> None:
    """Resolve every xlsx row to its videos/DBs and write the merged TSV."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--out",
        type=Path,
        default=VIDEO_INFO_XLSX.with_suffix(".tsv"),
        help="output TSV path (default: alongside the xlsx)",
    )
    args = parser.parse_args()

    # Only inventory entries flagged as belonging to the xlsx are candidates.
    inv = pd.read_csv(INVENTORY_CSV)
    inv = inv[inv["in_xlsx"]].copy()
    index = build_session_index(inv)

    df = pd.read_excel(VIDEO_INFO_XLSX)
    _normalize_metadata(df)
    date_iso = pd.to_datetime(df["date"]).dt.strftime("%Y-%m-%d")

    # One (video_path, db_path) pair per xlsx row and per session.
    resolved: dict[str, list[tuple[str, str]]] = {"training": [], "testing": []}
    for fallback, row in zip(date_iso, df.itertuples(index=False)):
        for session, pairs in resolved.items():
            when = getattr(row, f"{session}_date_time")
            pairs.append(resolve_session(row.machine_name, when, fallback, index))

    for session, pairs in resolved.items():
        df[f"{session}_video_path"] = [video for video, _ in pairs]
        df[f"{session}_db_path"] = [db for _, db in pairs]

    df.to_csv(args.out, sep="\t", index=False)

    print(f"wrote {args.out} ({len(df)} rows)")
    for session, pairs in resolved.items():
        n_video = sum(bool(video) for video, _ in pairs)
        n_db = sum(bool(db) for _, db in pairs)
        print(f"  {session}: {n_video} with video, {n_db} with DB")


if __name__ == "__main__":
    main()
# Metadata columns to copy onto every tracking sample. These are the xlsx
# fields that describe the experimental condition behind each fly/ROI.
# Reason: the ROI column is uppercase ("ROI") for backwards compatibility
# with the existing analysis pipeline (calculate_distances.py, notebooks);
# it is added separately by load_roi_data, which is why it is not listed here.
_META_COLS = (
    "date",
    "machine_name",
    "species",
    "male",
    "training_date_time",
    "testing_date_time",
    "training_length_hr",
    "consolidation_length_hr",
    "memory",
    "age",
)


def _open_ro(db_path: str, cache: dict) -> sqlite3.Connection | None:
    """Cached read-only sqlite connection. Returns None on failure.

    Args:
        db_path: filesystem path of the tracking DB. Anything that is not a
            non-empty string (e.g. NaN from an empty TSV cell) yields None.
        cache: dict mapping path → connection (or None for a path that
            already failed to open); updated in place so each DB is opened
            at most once.
    """
    if not isinstance(db_path, str) or not db_path:
        return None
    if db_path not in cache:
        try:
            cache[db_path] = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
        except sqlite3.Error as e:
            print(f"failed to open {Path(db_path).name}: {e}")
            # Remember the failure so the same DB is not retried per row.
            cache[db_path] = None
    return cache[db_path]


def load_roi_data(meta: pd.DataFrame | None = None) -> pd.DataFrame:
    """Load ROI tracking data joined with experimental metadata.

    For each row in ``meta``, reads the matching ROI table from both the
    training DB and the testing DB (whichever exist), and stamps every
    sample with the row's metadata plus a ``session`` column
    (``"training"`` or ``"testing"``). Rows with empty DB paths (unusable
    videos, or videos that didn't pass the completeness gate) are skipped.

    Args:
        meta: optional DataFrame with the same schema as
            ``all_video_info_merged.tsv`` (needs ``roi``,
            ``training_db_path``, ``testing_db_path`` and every column in
            ``_META_COLS``). Pass a filtered slice to load a subset
            (e.g. ``meta[meta.species == 'Melanogaster/CS']``).
            Defaults to the full TSV.

    Returns:
        DataFrame with the columns of the ROI tables plus ``session``,
        ``ROI`` (uppercase) and the ``_META_COLS`` metadata — one row per
        tracking sample. Empty if nothing could be loaded.
    """
    if meta is None:
        meta = pd.read_csv(VIDEO_INFO_XLSX.with_suffix(".tsv"), sep="\t")

    db_cache: dict = {}
    chunks: list[pd.DataFrame] = []

    try:
        for row in meta.itertuples(index=False):
            for session in ("training", "testing"):
                conn = _open_ro(getattr(row, f"{session}_db_path"), db_cache)
                if conn is None:
                    continue
                try:
                    # int() also guards the table name that is interpolated
                    # into the SQL text (table names cannot be bound).
                    roi = int(row.roi)
                    df = pd.read_sql_query(f"SELECT * FROM ROI_{roi}", conn)
                except Exception as e:
                    # Reason: a DB may be missing a ROI table if tracking was
                    # partial — skip rather than abort the whole batch.
                    print(f"  ROI_{row.roi} from {session} DB: {e}")
                    continue
                df["session"] = session
                df["ROI"] = roi
                for col in _META_COLS:
                    df[col] = getattr(row, col)
                chunks.append(df)
    finally:
        # Close every cached connection even if a row blows up mid-loop.
        for conn in db_cache.values():
            if conn is not None:
                conn.close()

    return pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()


if __name__ == "__main__":
    data = load_roi_data()
    print(f"shape: {data.shape}")
    if not data.empty:
        print(f"columns: {list(data.columns)}")
        print(f"sessions: {data['session'].value_counts().to_dict()}")
        print(f"unique machines: {data['machine_name'].nunique()}")
        # The loaded frame carries the ROI id in the uppercase "ROI" column;
        # there is no lowercase "roi" column to group on.
        print(
            f"unique flies (date,machine,roi): "
            f"{data.groupby(['date', 'machine_name', 'ROI']).ngroups}"
        )
@@ def snapshot() -> str: ) lines.append(f" errors in log: {len(errors)}") - # Rate from the last 10 completions, when available. - if len(history) >= 2: - window = history[-min(10, len(history)) :] - span = window[-1] - window[0] - if span > 0: - rate_per_hour = (len(window) - 1) / span * 3600 - lines.append(f" rate (last {len(window) - 1}): {rate_per_hour:.1f} videos/hour") + # Rate from completions in the last 6 h — robust to gaps from killed / + # restarted runs, while wide enough to span multiple parallel-worker + # completion bursts. Reason: with 8 workers all started together on + # multi-hour videos, completions arrive in tight bursts every ~video- + # length apart; a 30-min window catches one burst and overestimates by + # ~10×. 6 h spans at least one full burst cycle for typical videos. + now_ts = time.time() + window_secs = 6 * 3600 + recent = [t for t in history if t >= now_ts - window_secs] + if len(recent) >= 2: + # Reason: with N parallel workers, completions arrive in clumps + # (all workers finish near-simultaneously). Dividing N by the *burst* + # span gives nonsense rates. Use the full window as the denominator + # once the batch has been running long enough to fill it; otherwise + # use elapsed-since-first-DB. Detection: if every DB on disk also + # falls inside the window, the batch is younger than the window. 
+ if len(recent) == len(history): + elapsed = max(1.0, now_ts - history[0]) + else: + elapsed = float(window_secs) + if elapsed > 0: + rate_per_hour = len(recent) / elapsed * 3600 + lines.append( + f" rate (last {len(recent)} in {int(window_secs/3600)} h):" + f" {rate_per_hour:.1f} videos/hour" + ) remaining = max(0, pickable - tracked) if rate_per_hour > 0 and remaining > 0: eta_sec = remaining * 3600 / rate_per_hour @@ -112,6 +131,8 @@ def snapshot() -> str: f" ETA remaining: {fmt_duration(eta_sec)} " f"(done by {eta_at:%H:%M %a})" ) + else: + lines.append(" rate: (warming up — check again in a few min)") if last_mtime is not None and last_name is not None: ago = (datetime.now() - last_mtime).total_seconds() diff --git a/scripts/track_videos.py b/scripts/track_videos.py index d9bd197..cb65292 100644 --- a/scripts/track_videos.py +++ b/scripts/track_videos.py @@ -3,7 +3,7 @@ Reads target JSONs produced by `pick_targets.py`, builds the 6 ROIs of the HD mating arena from the L-shape reference points, runs ethoscope's `MultiFlyTracker` against the merged.mp4 file via `MovieVirtualCamera`, and -writes a SQLite DB to `data/tracked/_tracking.db`. +writes a SQLite DB to `TRACKING_OUTPUT_DIR/_tracking.db`. Idempotent: skips videos whose tracking DB already exists (unless --redo). @@ -58,17 +58,46 @@ def track_one(json_path: Path, output_dir: Path, max_duration: float | None, from ethoscope.io.sqlite import SQLiteResultWriter from ethoscope.trackers.multi_fly_tracker import MultiFlyTracker - class BGRMovieCamera(MovieVirtualCamera): - """MovieVirtualCamera variant that keeps BGR frames. + import time as _time - MultiFlyTracker calls cv2.cvtColor(img, COLOR_BGR2GRAY) without checking - whether img is already grayscale, so we must feed it 3-channel input. + class BGRMovieCamera(MovieVirtualCamera): + """MovieVirtualCamera that keeps BGR frames AND retries on transient + read failures. + + Two reasons for the override: + + 1. 
MultiFlyTracker calls cv2.cvtColor(img, COLOR_BGR2GRAY) without + checking whether img is already grayscale, so we must feed it + 3-channel input. + + 2. cv2.VideoCapture.read() can return False on transient I/O hiccups + (NFS contention when 8 workers pull big mp4s in parallel) without + the file actually being at EOF. A naive "False -> StopIteration" + handling makes the tracker silently exit mid-video and write a + short, lying DB. We retry a few times and only treat persistent + failures within the *interior* of the video as real EOF. """ + + _retry_count = 5 + _retry_backoff_s = 0.25 + _eof_safety_frames = 50 # near end-of-file, treat False as legitimate + def _next_image(self): - ret, frame = self.capture.read() - if not ret or frame is None: - return None - return frame # BGR, untouched + for attempt in range(self._retry_count): + ret, frame = self.capture.read() + if ret and frame is not None: + return frame # BGR, untouched + # If we're near the genuine end of the file, accept it. + if ( + self._has_end_of_file + and self._frame_idx >= self._total_n_frames - self._eof_safety_frames + ): + return None + # Otherwise, this is a suspected transient hiccup — back off + # and try again. The capture is still open; cv2 will pick up + # the next decoded frame. + _time.sleep(self._retry_backoff_s) + return None # truly persistent failure payload = json.loads(json_path.read_text()) if payload.get("unusable"): @@ -146,6 +175,42 @@ def track_one(json_path: Path, output_dir: Path, max_duration: float | None, if not out_db.exists(): return "error", "tracking finished but DB was not created" + + # Post-tracking sanity check: did we cover most of the source video? + # If not (cv2 retry exhausted, codec corruption, etc.), reject the DB so + # it doesn't get cached as "done" — better an explicit failure than a + # silent partial write. 
+ expected_ms = (cam._total_n_frames / 25.0) * 1000.0 + if max_duration is not None: + expected_ms = min(expected_ms, max_duration * 1000.0) + completeness_threshold = 0.90 # require ≥ 90 % of expected duration + + # Use MAX(t) across all ROIs — a single ROI can run dry early if its fly + # stops moving, so the latest detection anywhere in the arena is the + # better signal of how far the iterator actually got. + import sqlite3 as _sqlite3 + try: + _con = _sqlite3.connect(f"file:{out_db}?mode=ro", uri=True) + t_max = 0 + for _i in range(1, 7): + _v = _con.execute(f"SELECT MAX(t) FROM ROI_{_i}").fetchone()[0] + if _v and _v > t_max: + t_max = _v + _con.close() + except Exception: + t_max = 0 + + if expected_ms > 0 and t_max < expected_ms * completeness_threshold: + out_db.unlink() + for sidecar in (str(out_db) + "-wal", str(out_db) + "-shm"): + Path(sidecar).unlink(missing_ok=True) + ratio = t_max / expected_ms if expected_ms else 0 + return ( + "error", + f"short output: t_max={t_max} ms vs expected {int(expected_ms)} ms " + f"({ratio*100:.0f}%); DB removed", + ) + return "ok", str(out_db) diff --git a/tasks/todo.md b/tasks/todo.md index f86bd65..30b473c 100644 --- a/tasks/todo.md +++ b/tasks/todo.md @@ -115,4 +115,26 @@ all targets are picked, tracking can run in the background. ## Discovered During Work -(Add new items here as they come up during analysis) +### Barrier-opening annotation for the 2024 batch (added 2026-04-30) +The current `flies_analysis*.ipynb` aligns trajectories to a barrier-opening +event sourced from `data/metadata/2025_07_15_barrier_opening.csv`. That file +covers only the 5 machines in the 2025-07-15 experiment. The 2024 batch +(`/mnt/data/projects/cupido/tracked/`, 113 DBs) has no equivalent annotation +yet, so all post-alignment cells silently exclude that data. 
+ +- [ ] Build a small picker that lets the user scrub through each tracking + DB / video and mark the barrier-opening frame, writing a row to a new + `data/metadata/barrier_opening_2024.csv` (or extend the existing + file with a date column). +- [ ] Once the 2024 entries exist, update `align_to_opening_time` so it + pulls from a unified `barrier_opening` table keyed by + `(date, machine_name)` rather than `machine_name` alone. + +### Metadata vocabulary normalization (done 2026-04-30) +The xlsx had inconsistent labels for control flies (`'naïve'`, `'niave'`, +`'untrained'` plus trailing whitespace). All sources now use a single +canonical `'naive'`. Normalization happens in +`scripts/export_video_db_index.py` so re-running it from the xlsx always +produces a clean TSV. The 2025-07-15 legacy CSV +(`data/metadata/2025_07_15_metadata_fixed.csv`) was edited in place from +`'untrained'` → `'naive'`.