import re
import sqlite3
from contextlib import closing

import pandas as pd

from config import DATA_RAW, DATA_METADATA, DATA_PROCESSED

# Tracking DB file names embed a 32-hex-digit UUID between "_" and "__".
_UUID_PATTERN = re.compile(r'_([0-9a-f]{32})__')


def _load_group_rois(conn, group_rois, machine_id, group, db_name):
    """Read each ROI table listed in ``group_rois`` from an open connection.

    Args:
        conn: Open ``sqlite3`` connection to one tracking database.
        group_rois: Metadata rows (already filtered to one machine and one
            group); only the ``ROI`` column is read.
        machine_id: Value written to the ``machine_name`` column of every
            loaded frame.
        group: Value written to the ``group`` column of every loaded frame.
        db_name: Database file name, used only in error messages.

    Returns:
        list: One DataFrame per ROI table that loaded successfully.
    """
    frames = []
    for roi in group_rois['ROI']:
        try:
            # Table names cannot be bound as SQL parameters, so the ROI id
            # (taken from the metadata CSV) is interpolated into the query.
            roi_data = pd.read_sql_query(f"SELECT * FROM ROI_{roi}", conn)
        except Exception as e:
            # Best effort: a missing or unreadable ROI table is reported and
            # skipped so the remaining ROIs are still loaded.
            print(f"Error loading ROI_{roi} from {db_name}: {e}")
            continue
        roi_data['machine_name'] = machine_id
        roi_data['ROI'] = roi
        roi_data['group'] = group
        frames.append(roi_data)
    return frames


def _concat_or_empty(frames):
    """Concatenate ``frames`` once; return an empty DataFrame if there are none."""
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


def load_roi_data():
    """Load ROI data from SQLite databases and group by trained/untrained.

    Every ``*_tracking.db`` file in ``DATA_RAW`` is matched to a metadata row
    through the UUID embedded in its file name (looked up in the metadata
    ``path`` column). For the matched machine, each ROI listed in the metadata
    is read from the table ``ROI_<roi>`` and tagged with ``machine_name``,
    ``ROI`` and ``group`` columns. Files without a UUID or without a metadata
    match are reported and skipped.

    Returns:
        tuple: (trained_df, untrained_df) DataFrames with tracking data.
            Either is an empty DataFrame when nothing was loaded for it.
    """
    metadata = pd.read_csv(DATA_METADATA / '2025_07_15_metadata_fixed.csv')
    metadata['machine_name'] = metadata['machine_name'].astype(str)

    rois_by_group = {
        group: metadata[metadata['group'] == group]
        for group in ('trained', 'untrained')
    }
    # Collect frames and concatenate once at the end; concatenating inside
    # the loop recopies all previously loaded rows on every iteration.
    frames_by_group = {group: [] for group in rois_by_group}

    # Sorted so that the row order of the result is reproducible across runs.
    for db_file in sorted(DATA_RAW.glob('*_tracking.db')):
        print(f"Processing {db_file.name}")

        match = _UUID_PATTERN.search(db_file.name)
        if not match:
            print(f"Could not extract UUID from {db_file.name}")
            continue
        uuid = match.group(1)

        # Literal substring match; the UUID is not meant to be a regex.
        path_hits = metadata['path'].str.contains(uuid, na=False, regex=False)
        metadata_matches = metadata[path_hits]
        if metadata_matches.empty:
            print(f"No metadata matches found for UUID {uuid} from {db_file.name}")
            continue

        machine_id = metadata_matches.iloc[0]['machine_name']
        print(f"Matched to machine ID: {machine_id}")

        # closing() guarantees the connection is released even if an
        # unexpected error escapes while reading this database.
        with closing(sqlite3.connect(str(db_file))) as conn:
            for group, group_rois in rois_by_group.items():
                machine_rois = group_rois[group_rois['machine_name'] == machine_id]
                frames_by_group[group].extend(
                    _load_group_rois(conn, machine_rois, machine_id, group, db_file.name)
                )

    return (
        _concat_or_empty(frames_by_group['trained']),
        _concat_or_empty(frames_by_group['untrained']),
    )


def main():
    """Load both groups, print a short summary and write them to CSV files."""
    trained_data, untrained_data = load_roi_data()

    print(f"Trained data shape: {trained_data.shape}")
    print(f"Untrained data shape: {untrained_data.shape}")

    if not trained_data.empty:
        print("Trained data columns:", trained_data.columns.tolist())
    if not untrained_data.empty:
        print("Untrained data columns:", untrained_data.columns.tolist())

    trained_data.to_csv(DATA_PROCESSED / 'trained_roi_data.csv', index=False)
    untrained_data.to_csv(DATA_PROCESSED / 'untrained_roi_data.csv', index=False)
    print("Data saved to trained_roi_data.csv and untrained_roi_data.csv")


if __name__ == "__main__":
    main()