cupido/scripts/load_roi_data.py
Giorgio e7e4db264d Initial commit: organized project structure for student handoff
Reorganized flat 41-file directory into structured layout with:
- scripts/ for Python analysis code with shared config.py
- notebooks/ for Jupyter analysis notebooks
- data/ split into raw/, metadata/, processed/
- docs/ with analysis summary, experimental design, and bimodal hypothesis tutorial
- tasks/ with todo checklist and lessons learned
- Comprehensive README, PLANNING.md, and .gitignore

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-05 16:08:36 +00:00

90 lines
3.2 KiB
Python

import pandas as pd
import sqlite3
import re
from config import DATA_RAW, DATA_METADATA, DATA_PROCESSED
def load_roi_data():
"""Load ROI data from SQLite databases and group by trained/untrained.
Returns:
tuple: (trained_df, untrained_df) DataFrames with tracking data.
"""
metadata = pd.read_csv(DATA_METADATA / '2025_07_15_metadata_fixed.csv')
metadata['machine_name'] = metadata['machine_name'].astype(str)
trained_rois = metadata[metadata['group'] == 'trained']
untrained_rois = metadata[metadata['group'] == 'untrained']
db_files = list(DATA_RAW.glob('*_tracking.db'))
trained_df = pd.DataFrame()
untrained_df = pd.DataFrame()
for db_file in db_files:
print(f"Processing {db_file.name}")
pattern = r'_([0-9a-f]{32})__'
match = re.search(pattern, db_file.name)
if not match:
print(f"Could not extract UUID from {db_file.name}")
continue
uuid = match.group(1)
metadata_matches = metadata[metadata['path'].str.contains(uuid, na=False)]
if metadata_matches.empty:
print(f"No metadata matches found for UUID {uuid} from {db_file.name}")
continue
machine_id = metadata_matches.iloc[0]['machine_name']
print(f"Matched to machine ID: {machine_id}")
conn = sqlite3.connect(str(db_file))
machine_trained = trained_rois[trained_rois['machine_name'] == machine_id]
machine_untrained = untrained_rois[untrained_rois['machine_name'] == machine_id]
for _, row in machine_trained.iterrows():
roi = row['ROI']
try:
query = f"SELECT * FROM ROI_{roi}"
roi_data = pd.read_sql_query(query, conn)
roi_data['machine_name'] = machine_id
roi_data['ROI'] = roi
roi_data['group'] = 'trained'
trained_df = pd.concat([trained_df, roi_data], ignore_index=True)
except Exception as e:
print(f"Error loading ROI_{roi} from {db_file.name}: {e}")
for _, row in machine_untrained.iterrows():
roi = row['ROI']
try:
query = f"SELECT * FROM ROI_{roi}"
roi_data = pd.read_sql_query(query, conn)
roi_data['machine_name'] = machine_id
roi_data['ROI'] = roi
roi_data['group'] = 'untrained'
untrained_df = pd.concat([untrained_df, roi_data], ignore_index=True)
except Exception as e:
print(f"Error loading ROI_{roi} from {db_file.name}: {e}")
conn.close()
return trained_df, untrained_df
if __name__ == "__main__":
trained_data, untrained_data = load_roi_data()
print(f"Trained data shape: {trained_data.shape}")
print(f"Untrained data shape: {untrained_data.shape}")
if not trained_data.empty:
print("Trained data columns:", trained_data.columns.tolist())
if not untrained_data.empty:
print("Untrained data columns:", untrained_data.columns.tolist())
trained_data.to_csv(DATA_PROCESSED / 'trained_roi_data.csv', index=False)
untrained_data.to_csv(DATA_PROCESSED / 'untrained_roi_data.csv', index=False)
print("Data saved to trained_roi_data.csv and untrained_roi_data.csv")