Reorganized flat 41-file directory into structured layout with: - scripts/ for Python analysis code with shared config.py - notebooks/ for Jupyter analysis notebooks - data/ split into raw/, metadata/, processed/ - docs/ with analysis summary, experimental design, and bimodal hypothesis tutorial - tasks/ with todo checklist and lessons learned - Comprehensive README, PLANNING.md, and .gitignore Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
90 lines · 3.2 KiB · Python
import pandas as pd
|
|
import sqlite3
|
|
import re
|
|
|
|
from config import DATA_RAW, DATA_METADATA, DATA_PROCESSED
|
|
|
|
|
|
def _read_group_rois(conn, rois, machine_id, group_label, db_name):
    """Read every ROI table for one group from an open SQLite connection.

    Args:
        conn: Open ``sqlite3`` connection to a tracking database.
        rois: Metadata rows whose ``'ROI'`` column names the tables to read.
        machine_id: Machine name stamped onto every returned row.
        group_label: ``'trained'`` or ``'untrained'``, stamped onto every row.
        db_name: Database file name, used only in error messages.

    Returns:
        list: One DataFrame per ROI table that loaded successfully; ROIs
        that fail to load are reported and skipped.
    """
    frames = []
    for _, row in rois.iterrows():
        roi = row['ROI']
        try:
            # Table names cannot be bound as SQL parameters; ``roi`` comes
            # from our own metadata CSV, not untrusted input.
            roi_data = pd.read_sql_query(f"SELECT * FROM ROI_{roi}", conn)
        except Exception as e:
            print(f"Error loading ROI_{roi} from {db_name}: {e}")
            continue
        roi_data['machine_name'] = machine_id
        roi_data['ROI'] = roi
        roi_data['group'] = group_label
        frames.append(roi_data)
    return frames


def load_roi_data():
    """Load ROI data from SQLite databases and group by trained/untrained.

    Matches each ``*_tracking.db`` file in ``DATA_RAW`` to a machine via the
    32-hex-char UUID embedded in its file name, then reads every trained and
    untrained ROI table recorded for that machine in the metadata CSV.

    Returns:
        tuple: (trained_df, untrained_df) DataFrames with tracking data.
        Either may be empty if no matching ROI tables were loaded.
    """
    metadata = pd.read_csv(DATA_METADATA / '2025_07_15_metadata_fixed.csv')
    metadata['machine_name'] = metadata['machine_name'].astype(str)

    trained_rois = metadata[metadata['group'] == 'trained']
    untrained_rois = metadata[metadata['group'] == 'untrained']

    # Collect per-ROI frames and concatenate once at the end; calling
    # pd.concat inside the loop would copy the accumulated data each
    # iteration (quadratic).
    trained_frames = []
    untrained_frames = []

    # Hoisted out of the loop: the UUID is the 32-hex-digit token between
    # '_' and '__' in the database file name.
    uuid_pattern = re.compile(r'_([0-9a-f]{32})__')

    for db_file in DATA_RAW.glob('*_tracking.db'):
        print(f"Processing {db_file.name}")

        match = uuid_pattern.search(db_file.name)
        if not match:
            print(f"Could not extract UUID from {db_file.name}")
            continue

        uuid = match.group(1)
        metadata_matches = metadata[metadata['path'].str.contains(uuid, na=False)]
        if metadata_matches.empty:
            print(f"No metadata matches found for UUID {uuid} from {db_file.name}")
            continue

        machine_id = metadata_matches.iloc[0]['machine_name']
        print(f"Matched to machine ID: {machine_id}")

        conn = sqlite3.connect(str(db_file))
        try:
            trained_frames.extend(_read_group_rois(
                conn,
                trained_rois[trained_rois['machine_name'] == machine_id],
                machine_id, 'trained', db_file.name))
            untrained_frames.extend(_read_group_rois(
                conn,
                untrained_rois[untrained_rois['machine_name'] == machine_id],
                machine_id, 'untrained', db_file.name))
        finally:
            # Close the connection even if an unexpected error escapes
            # the ROI reads; the original leaked it in that case.
            conn.close()

    trained_df = (pd.concat(trained_frames, ignore_index=True)
                  if trained_frames else pd.DataFrame())
    untrained_df = (pd.concat(untrained_frames, ignore_index=True)
                    if untrained_frames else pd.DataFrame())

    return trained_df, untrained_df
|
|
|
|
|
|
if __name__ == "__main__":
    # Load both groups, report their shapes/columns, and persist to CSV.
    trained_data, untrained_data = load_roi_data()

    datasets = (("Trained", trained_data), ("Untrained", untrained_data))

    for label, frame in datasets:
        print(f"{label} data shape: {frame.shape}")

    for label, frame in datasets:
        if not frame.empty:
            print(f"{label} data columns:", frame.columns.tolist())

    trained_data.to_csv(DATA_PROCESSED / 'trained_roi_data.csv', index=False)
    untrained_data.to_csv(DATA_PROCESSED / 'untrained_roi_data.csv', index=False)
    print("Data saved to trained_roi_data.csv and untrained_roi_data.csv")
|