diff --git a/.gitignore b/.gitignore index 54331af..b094370 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,7 @@ -# Large data files (reproducible from raw DBs) -data/raw/*.db +# Generated CSVs (regenerable from the tracking DBs + the merged TSV) data/processed/*.csv -# Offline-tracking outputs (regenerable from videos + target JSONs) -# DBs and target JSONs live outside the repo at /mnt/data/projects/cupido/ +# Tracking DBs and target JSONs live outside the repo at /mnt/data/projects/cupido/ data/metadata/video_inventory.csv data/logs/*.log diff --git a/PLANNING.md b/PLANNING.md index 92c711b..b1b8052 100644 --- a/PLANNING.md +++ b/PLANNING.md @@ -30,14 +30,19 @@ Drosophila behavioral tracking analysis for the Cupido project. Compares social ``` tracking/ -├── data/raw/ # SQLite DBs (gitignored) -├── data/metadata/ # Small CSVs (tracked) +├── data/metadata/ # Small hand-curated CSVs (tracked in git) ├── data/processed/ # Large generated CSVs (gitignored) +├── data/logs/ # Tracker logs (gitignored) ├── scripts/ # Python scripts with config.py imports ├── notebooks/ # Jupyter analysis notebooks ├── figures/ # Generated plots (gitignored) ├── docs/ # Scientific documentation └── tasks/ # Task tracking + +# All bulky data lives outside the repo at /mnt/data/projects/cupido/: +# tracked/ # SQLite tracking DBs +# targets/ # Target-point JSON sidecars +# all_video_info_merged.{xlsx,tsv} # Metadata spreadsheet ``` ## Next Direction diff --git a/README.md b/README.md index 6c54237..9782d73 100644 --- a/README.md +++ b/README.md @@ -14,9 +14,11 @@ python -m venv .venv source .venv/bin/activate pip install -r requirements.txt -# Get the data files (not in git - ask lab for copies) -# Place .db files in data/raw/ -# Place large .csv files in data/processed/ +# Project data lives outside the repo at /mnt/data/projects/cupido/: +# tracked/ → SQLite tracking DBs +# targets/ → target-point JSONs +# all_video_info_merged.{xlsx,tsv} → metadata spreadsheet +# Generated CSVs land 
in data/processed/ (gitignored). # Run the main analysis notebook jupyter notebook notebooks/flies_analysis_simple.ipynb @@ -66,7 +68,7 @@ python scripts/pick_targets.py --redo # re-pick already-picked videos # 3) batch tracking (idempotent, can run in background) python scripts/track_videos.py --jobs 4 # parallel -# output → /mnt/data/projects/cupido/tracked/*_tracking.db (SQLite, same schema as data/raw/) +# output → /mnt/data/projects/cupido/tracked/*_tracking.db (SQLite) ``` See `tasks/todo.md` "Offline Tracking" section for the full plan, and @@ -80,9 +82,9 @@ tracking/ ├── PLANNING.md # Architecture & conventions ├── requirements.txt # Python dependencies ├── data/ -│ ├── raw/ # SQLite tracking databases (gitignored) -│ ├── metadata/ # Experiment metadata CSVs -│ └── processed/ # Generated analysis CSVs (gitignored) +│ ├── metadata/ # Experiment metadata CSVs (small, hand-curated) +│ ├── processed/ # Generated analysis CSVs (gitignored) +│ └── logs/ # Tracker logs (gitignored) ├── scripts/ # Python analysis scripts │ ├── config.py # Shared path constants │ ├── load_roi_data.py # Extract data from DBs @@ -107,13 +109,13 @@ tracking/ ## Data Pipeline ``` -SQLite DBs (data/raw/) +SQLite DBs (/mnt/data/projects/cupido/tracked/) + merged TSV │ - ▼ load_roi_data.py / notebook step 1 -ROI CSVs (data/processed/*_roi_data.csv) + ▼ scripts/load_roi_data.py +single DataFrame stamped with experimental metadata │ - ▼ notebook steps 2-4 -Aligned Distance CSVs (data/processed/*_distances_aligned.csv) + ▼ notebooks/flies_analysis_simple.ipynb (steps 2–4) +Aligned distance CSVs (data/processed/*_distances_aligned.csv) │ ├──▶ Plots (figures/) ├──▶ Statistical tests diff --git a/data/raw/README.md b/data/raw/README.md deleted file mode 100644 index 2f9694d..0000000 --- a/data/raw/README.md +++ /dev/null @@ -1,37 +0,0 @@ -# Raw Data - -SQLite databases containing fly tracking data from ethoscope recordings. 
- -## Files - -| File | Machine | Session | Size | -|------|---------|---------|------| -| `2025-07-15_16-03-10_076e...tracking.db` | ETHOSCOPE_076 | 16:03:10 | ~6.5MB | -| `2025-07-15_16-03-27_145b...tracking.db` | ETHOSCOPE_145 | 16:03:27 | ~6.1MB | -| `2025-07-15_16-31-34_076e...tracking.db` | ETHOSCOPE_076 | 16:31:34 | ~6.6MB | -| `2025-07-15_16-31-41_145b...tracking.db` | ETHOSCOPE_145 | 16:31:41 | ~6.6MB | -| `2025-07-15_16-32-05_268...tracking.db` | ETHOSCOPE_268 | 16:32:05 | ~7.0MB | - -**Note**: Machine 139 has metadata but no tracking database. See `docs/experimental_design.md`. - -## Schema - -Each database contains tables `ROI_1` through `ROI_6`: - -| Column | Type | Description | -|--------|------|-------------| -| `id` | int | Detection ID within frame | -| `t` | int | Time in **milliseconds** from recording start | -| `x` | float | X position in pixels | -| `y` | float | Y position in pixels | -| `w` | float | Bounding box width in pixels | -| `h` | float | Bounding box height in pixels | -| `phi` | float | Orientation angle | -| `is_inferred` | int | Whether position was inferred (0/1) | -| `has_interacted` | int | Whether interaction detected (0/1) | - -## Provenance - -Data recorded on 2025-07-15 using ethoscope platform. -Resolution: 1920x1088 @ 25fps, H.264 28q quality. -These files are gitignored (binary, ~33MB total). 
diff --git a/notebooks/flies_analysis.ipynb b/notebooks/flies_analysis.ipynb index ea89468..f81feb2 100644 --- a/notebooks/flies_analysis.ipynb +++ b/notebooks/flies_analysis.ipynb @@ -14,7 +14,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "import sys\nfrom pathlib import Path\n\nimport pandas as pd\nimport numpy as np\nimport sqlite3\nimport glob\nimport re\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom scipy.spatial.distance import euclidean\nfrom scipy import stats\n\n# ─── Where the data lives ────────────────────────────────────────────────\n# DATA_DIR holds everything bulky/regenerable: the metadata TSV and the\n# tracking SQLite DBs. It's mounted into the container at this fixed path.\n# REPO_ROOT is your checkout of the cupido repo, in your home directory.\n# Path.home() expands to /home/, so this works for any\n# user (no hard-coded usernames).\nDATA_DIR = Path(\"/mnt/data/projects/cupido\")\nREPO_ROOT = Path.home() / \"cupido\"\n\nMETADATA_TSV = DATA_DIR / \"all_video_info_merged.tsv\"\nTRACKED_DBS = DATA_DIR / \"tracked\"\n\n# Sanity-check the data location up front so any failure here points at\n# the obvious thing — rather than crashing inside load_roi_data later.\nassert METADATA_TSV.exists(), f\"Metadata TSV not found at {METADATA_TSV}\"\nassert TRACKED_DBS.is_dir(), f\"Tracked-DB directory not found at {TRACKED_DBS}\"\n\n# Pull the in-repo path constants (DATA_RAW, DATA_METADATA, DATA_PROCESSED,\n# FIGURES) from scripts/config.py — single source of truth.\nsys.path.insert(0, str(REPO_ROOT / \"scripts\"))\nfrom config import DATA_RAW, DATA_METADATA, DATA_PROCESSED, FIGURES\n\n# Plotting style\nplt.style.use('seaborn-v0_8')\nsns.set_palette(\"husl\")\n" + "source": "import sys\nfrom pathlib import Path\n\nimport pandas as pd\nimport numpy as np\nimport sqlite3\nimport glob\nimport re\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom scipy.spatial.distance import euclidean\nfrom scipy import 
stats\n\n# ─── Where the data lives ────────────────────────────────────────────────\n# DATA_DIR holds everything bulky/regenerable: the metadata TSV and the\n# tracking SQLite DBs. It's mounted into the container at this fixed path.\n# REPO_ROOT is your checkout of the cupido repo, in your home directory.\n# Path.home() expands to the current user's home directory, so this works\n# for any user (no hard-coded usernames).\nDATA_DIR = Path(\"/mnt/data/projects/cupido\")\nREPO_ROOT = Path.home() / \"cupido\"\n\nMETADATA_TSV = DATA_DIR / \"all_video_info_merged.tsv\"\nTRACKED_DBS = DATA_DIR / \"tracked\"\n\n# Sanity-check the data location up front so any failure here points at\n# the obvious thing — rather than crashing inside load_roi_data later.\nassert METADATA_TSV.exists(), f\"Metadata TSV not found at {METADATA_TSV}\"\nassert TRACKED_DBS.is_dir(), f\"Tracked-DB directory not found at {TRACKED_DBS}\"\n\n# Pull the in-repo path constants (DATA_METADATA, DATA_PROCESSED, FIGURES)\n# from scripts/config.py — single source of truth.\nsys.path.insert(0, str(REPO_ROOT / \"scripts\"))\nfrom config import DATA_METADATA, DATA_PROCESSED, FIGURES\n\n# Plotting style\nplt.style.use('seaborn-v0_8')\nsns.set_palette(\"husl\")\n" }, { "cell_type": "markdown", diff --git a/notebooks/flies_analysis_simple.ipynb b/notebooks/flies_analysis_simple.ipynb index 264997b..d8ba164 100644 --- a/notebooks/flies_analysis_simple.ipynb +++ b/notebooks/flies_analysis_simple.ipynb @@ -10,7 +10,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "import sys\nfrom pathlib import Path\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom scipy.spatial.distance import euclidean\nfrom scipy import stats\n\n# ─── Where the data lives ────────────────────────────────────────────────\n# DATA_DIR holds everything bulky/regenerable: the metadata TSV and the\n# tracking SQLite DBs. 
It's mounted into the container at this fixed path.\n# REPO_ROOT is your checkout of the cupido repo, in your home directory.\n# Path.home() expands to /home/, so this works for any\n# user (no hard-coded usernames).\nDATA_DIR = Path(\"/mnt/data/projects/cupido\")\nREPO_ROOT = Path.home() / \"cupido\"\n\nMETADATA_TSV = DATA_DIR / \"all_video_info_merged.tsv\"\nTRACKED_DBS = DATA_DIR / \"tracked\"\n\n# Sanity-check the data location up front so any failure here points at\n# the obvious thing — rather than crashing inside load_roi_data later.\nassert METADATA_TSV.exists(), f\"Metadata TSV not found at {METADATA_TSV}\"\nassert TRACKED_DBS.is_dir(), f\"Tracked-DB directory not found at {TRACKED_DBS}\"\n\n# Pull the in-repo path constants (DATA_RAW, DATA_METADATA, DATA_PROCESSED,\n# FIGURES) from scripts/config.py — single source of truth.\nsys.path.insert(0, str(REPO_ROOT / \"scripts\"))\nfrom config import DATA_RAW, DATA_METADATA, DATA_PROCESSED, FIGURES\n\n# Plotting style\nplt.style.use('seaborn-v0_8')\nsns.set_palette(\"husl\")\n\nprint(f\"Data directory: {DATA_DIR}\")\nprint(f\"Repo root: {REPO_ROOT}\")\nprint(f\"Metadata TSV: {METADATA_TSV}\")\nprint(f\"Pandas version: {pd.__version__}\")\nprint(f\"NumPy version: {np.__version__}\")\n" + "source": "import sys\nfrom pathlib import Path\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom scipy.spatial.distance import euclidean\nfrom scipy import stats\n\n# ─── Where the data lives ────────────────────────────────────────────────\n# DATA_DIR holds everything bulky/regenerable: the metadata TSV and the\n# tracking SQLite DBs. 
It's mounted into the container at this fixed path.\n# REPO_ROOT is your checkout of the cupido repo, in your home directory.\n# Path.home() expands to the current user's home directory, so this works\n# for any user (no hard-coded usernames).\nDATA_DIR = Path(\"/mnt/data/projects/cupido\")\nREPO_ROOT = Path.home() / \"cupido\"\n\nMETADATA_TSV = DATA_DIR / \"all_video_info_merged.tsv\"\nTRACKED_DBS = DATA_DIR / \"tracked\"\n\n# Sanity-check the data location up front so any failure here points at\n# the obvious thing — rather than crashing inside load_roi_data later.\nassert METADATA_TSV.exists(), f\"Metadata TSV not found at {METADATA_TSV}\"\nassert TRACKED_DBS.is_dir(), f\"Tracked-DB directory not found at {TRACKED_DBS}\"\n\n# Pull the in-repo path constants (DATA_METADATA, DATA_PROCESSED, FIGURES)\n# from scripts/config.py — single source of truth.\nsys.path.insert(0, str(REPO_ROOT / \"scripts\"))\nfrom config import DATA_METADATA, DATA_PROCESSED, FIGURES\n\n# Plotting style\nplt.style.use('seaborn-v0_8')\nsns.set_palette(\"husl\")\n\nprint(f\"Data directory: {DATA_DIR}\")\nprint(f\"Repo root: {REPO_ROOT}\")\nprint(f\"Metadata TSV: {METADATA_TSV}\")\nprint(f\"Pandas version: {pd.__version__}\")\nprint(f\"NumPy version: {np.__version__}\")\n" }, { "cell_type": "markdown", diff --git a/scripts/build_video_inventory.py b/scripts/build_video_inventory.py index 3c083e7..e931137 100644 --- a/scripts/build_video_inventory.py +++ b/scripts/build_video_inventory.py @@ -16,7 +16,7 @@ from pathlib import Path import pandas as pd -from config import DATA_RAW, INVENTORY_CSV, VIDEO_INFO_XLSX, VIDEOS_ROOT +from config import INVENTORY_CSV, TRACKING_OUTPUT_DIR, VIDEO_INFO_XLSX, VIDEOS_ROOT SESSION_RE = re.compile(r"^(\d{4}-\d{2}-\d{2})_(\d{2}-\d{2}-\d{2})$") @@ -64,14 +64,14 @@ def scan_videos(videos_root: Path) -> pd.DataFrame: return pd.DataFrame(rows) -def already_tracked_set(data_raw: Path) -> set[tuple[str, str]]: +def already_tracked_set(tracked_dir: Path) -> set[tuple[str, str]]: """Return the set 
of (date, time) sessions for which a tracking DB exists. DBs are named like: 2025-07-15_16-03-10___1920x1088@25fps-28q_merged_tracking.db """ out = set() - for db in data_raw.glob("*_tracking.db"): + for db in tracked_dir.glob("*_tracking.db"): m = re.match(r"^(\d{4}-\d{2}-\d{2})_(\d{2}-\d{2}-\d{2})_", db.name) if m: out.add((m.group(1), m.group(2))) @@ -99,8 +99,8 @@ def main() -> None: lambda r: (r["session_date"], r["machine_name"]) in xlsx_keys, axis=1 ) - # Mark which already have tracking DBs in data/raw/ - tracked = already_tracked_set(DATA_RAW) + # Mark which already have tracking DBs in TRACKING_OUTPUT_DIR + tracked = already_tracked_set(TRACKING_OUTPUT_DIR) videos_df["already_tracked"] = videos_df.apply( lambda r: (r["session_date"], r["session_time"]) in tracked, axis=1 ) diff --git a/scripts/config.py b/scripts/config.py index 2e4bda0..18e89ef 100644 --- a/scripts/config.py +++ b/scripts/config.py @@ -5,7 +5,6 @@ from pathlib import Path # Where this code repository lives (the directory containing scripts/, notebooks/, ...). PROJECT_ROOT = Path(__file__).resolve().parent.parent -DATA_RAW = PROJECT_ROOT / "data" / "raw" DATA_METADATA = PROJECT_ROOT / "data" / "metadata" DATA_PROCESSED = PROJECT_ROOT / "data" / "processed" FIGURES = PROJECT_ROOT / "figures" diff --git a/tasks/todo.md b/tasks/todo.md index 30b473c..c114f9e 100644 --- a/tasks/todo.md +++ b/tasks/todo.md @@ -55,14 +55,14 @@ See `docs/bimodal_hypothesis.md` for detailed methodology. ### Recap -Tracked so far (5 sessions, all from 2025-07-15, machines 076/145/268). The DBs in -`data/raw/` use tracker `ConstrainedMultiFlyTracker` and template -`HD_Mating_Arena_6_ROIS.json` (2 flies × 6 ROIs per video). +Tracked so far (5 sessions, all from 2025-07-15, machines 076/145/268). Those +were re-tracked through the unified pipeline and now live at +`/mnt/data/projects/cupido/tracked/` (no separate `data/raw/` anymore — the +old pre-pipeline copies were deleted on 2026-05-01). 
-The metadata file `../all_video_info_merged.xlsx` indexes a different set of -experiments: 7 dates from 2024-09-17 → 2024-10-21, 16 ethoscope machines, -63 unique (date, machine) sessions = 484 ROI-rows. **None of the already-tracked -sessions are in this xlsx — these are fresh recordings to track.** +The metadata file `/mnt/data/projects/cupido/all_video_info_merged.xlsx` +indexes a different set of experiments: 7 dates from 2024-09-17 → 2024-10-21, +16 ethoscope machines, 63 unique (date, machine) sessions = 484 ROI-rows. Inventory: see `data/metadata/video_inventory.csv` (built by `scripts/build_video_inventory.py`).