cupido/notebooks/flies_analysis.ipynb
Giorgio e7e4db264d Initial commit: organized project structure for student handoff
Reorganized flat 41-file directory into structured layout with:
- scripts/ for Python analysis code with shared config.py
- notebooks/ for Jupyter analysis notebooks
- data/ split into raw/, metadata/, processed/
- docs/ with analysis summary, experimental design, and bimodal hypothesis tutorial
- tasks/ with todo checklist and lessons learned
- Comprehensive README, PLANNING.md, and .gitignore

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-05 16:08:36 +00:00

222 lines
No EOL
14 KiB
Text

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Flies Behavior Analysis Pipeline\n",
"\n",
"This notebook implements the complete analysis pipeline for discriminating between trained and untrained flies based on their distance behavior."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "import pandas as pd\nimport numpy as np\nimport sqlite3\nimport glob\nimport re\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom scipy.spatial.distance import euclidean\nfrom scipy import stats\nfrom pathlib import Path\nimport sys\n\n# Set up paths relative to notebook location\nPROJECT_ROOT = Path(\"..\").resolve()\nDATA_RAW = PROJECT_ROOT / \"data\" / \"raw\"\nDATA_METADATA = PROJECT_ROOT / \"data\" / \"metadata\"\nDATA_PROCESSED = PROJECT_ROOT / \"data\" / \"processed\"\nFIGURES = PROJECT_ROOT / \"figures\"\n\nsys.path.insert(0, str(PROJECT_ROOT / \"scripts\"))\n\n# Set plotting style\nplt.style.use('seaborn-v0_8')\nsns.set_palette(\"husl\")"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Load data from DB and save as CSV grouped by trained/untrained"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "def load_roi_data():\n \"\"\"Load ROI data from SQLite databases and group by trained/untrained\"\"\"\n metadata = pd.read_csv(DATA_METADATA / '2025_07_15_metadata_fixed.csv')\n metadata['machine_name'] = metadata['machine_name'].astype(str)\n \n trained_rois = metadata[metadata['group'] == 'trained']\n untrained_rois = metadata[metadata['group'] == 'untrained']\n \n db_files = list(DATA_RAW.glob('*_tracking.db'))\n \n trained_df = pd.DataFrame()\n untrained_df = pd.DataFrame()\n \n for db_file in db_files:\n print(f\"Processing {db_file.name}\")\n \n pattern = r'_([0-9a-f]{32})__'\n match = re.search(pattern, db_file.name)\n \n if not match:\n print(f\"Could not extract UUID from {db_file.name}\")\n continue\n \n uuid = match.group(1)\n metadata_matches = metadata[metadata['path'].str.contains(uuid, na=False)]\n \n if metadata_matches.empty:\n print(f\"No metadata matches found for UUID {uuid}\")\n continue\n \n machine_id = metadata_matches.iloc[0]['machine_name']\n print(f\"Matched to machine ID: {machine_id}\")\n \n conn = sqlite3.connect(str(db_file))\n \n machine_trained = trained_rois[trained_rois['machine_name'] == machine_id]\n machine_untrained = untrained_rois[untrained_rois['machine_name'] == machine_id]\n \n for _, row in machine_trained.iterrows():\n roi = row['ROI']\n try:\n roi_data = pd.read_sql_query(f\"SELECT * FROM ROI_{roi}\", conn)\n roi_data['machine_name'] = machine_id\n roi_data['ROI'] = roi\n roi_data['group'] = 'trained'\n trained_df = pd.concat([trained_df, roi_data], ignore_index=True)\n except Exception as e:\n print(f\"Error loading ROI_{roi}: {e}\")\n \n for _, row in machine_untrained.iterrows():\n roi = row['ROI']\n try:\n roi_data = pd.read_sql_query(f\"SELECT * FROM ROI_{roi}\", conn)\n roi_data['machine_name'] = machine_id\n roi_data['ROI'] = roi\n roi_data['group'] = 'untrained'\n untrained_df = pd.concat([untrained_df, roi_data], ignore_index=True)\n except Exception as e:\n print(f\"Error loading ROI_{roi}: {e}\")\n \n conn.close()\n \n return trained_df, untrained_df\n\ntrained_data, untrained_data = load_roi_data()\nprint(f\"Trained data shape: {trained_data.shape}\")\nprint(f\"Untrained data shape: {untrained_data.shape}\")\n\ntrained_data.to_csv(DATA_PROCESSED / 'trained_roi_data.csv', index=False)\nuntrained_data.to_csv(DATA_PROCESSED / 'untrained_roi_data.csv', index=False)\nprint(\"Data saved to CSV files\")"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Align data using barrier opening time as time 0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "barrier_data = pd.read_csv(DATA_METADATA / '2025_07_15_barrier_opening.csv')\nbarrier_data['opening_time_ms'] = barrier_data['opening_time'] * 1000\nopening_times = dict(zip(barrier_data['machine'], barrier_data['opening_time_ms']))\nprint(\"Barrier opening times:\")\nprint(barrier_data)"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def align_to_opening_time(df, opening_times):\n",
" \"\"\"Align data to barrier opening time\"\"\"\n",
" # Add aligned time column\n",
" df_aligned = df.copy()\n",
" df_aligned['aligned_time'] = np.nan\n",
" \n",
" # Align each machine's data\n",
" for machine in df['machine_name'].unique():\n",
" if machine in opening_times:\n",
" opening_time = opening_times[machine]\n",
" mask = df['machine_name'] == machine\n",
" df_aligned.loc[mask, 'aligned_time'] = df.loc[mask, 't'] - opening_time\n",
" \n",
" # Remove rows where aligned_time is NaN\n",
" df_aligned = df_aligned.dropna(subset=['aligned_time'])\n",
" \n",
" return df_aligned\n",
"\n",
"# Align the data\n",
"trained_aligned = align_to_opening_time(trained_data, opening_times)\n",
"untrained_aligned = align_to_opening_time(untrained_data, opening_times)\n",
"\n",
"print(f\"Trained aligned data shape: {trained_aligned.shape}\")\n",
"print(f\"Untrained aligned data shape: {untrained_aligned.shape}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Calculate median area size in rows where two flies are being tracked"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def calculate_areas_with_two_flies(df):\n",
" \"\"\"Calculate median area size for time points with two flies\"\"\"\n",
" # Calculate area for each row\n",
" df['area'] = df['w'] * df['h']\n",
" \n",
" # Group by machine_name, ROI, and time to count flies per time point\n",
" fly_counts = df.groupby(['machine_name', 'ROI', 't']).size().reset_index(name='fly_count')\n",
" \n",
" # Filter for time points with exactly 2 flies\n",
" two_fly_times = fly_counts[fly_counts['fly_count'] == 2]\n",
" \n",
" # Merge back with original data to get areas for these time points\n",
" two_fly_data = pd.merge(df, two_fly_times[['machine_name', 'ROI', 't']], \n",
" on=['machine_name', 'ROI', 't'])\n",
" \n",
" # Calculate median area\n",
" median_area = two_fly_data['area'].median()\n",
" \n",
" return median_area, two_fly_data\n",
"\n",
"# Combine trained and untrained data for area calculation\n",
"combined_data = pd.concat([trained_aligned, untrained_aligned], ignore_index=True)\n",
"\n",
"# Calculate median area for time points with two flies\n",
"median_area, two_fly_data = calculate_areas_with_two_flies(combined_data)\n",
"print(f\"Median area size for time points with two flies: {median_area:.2f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Calculate distances taking into account area size"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "def calculate_distances_with_area(df, median_area_threshold):\n \"\"\"Calculate distances between flies, setting to 0 for large single-fly detections\"\"\"\n df['area'] = df['w'] * df['h']\n results = []\n \n for (machine_name, roi, t), group in df.groupby(['machine_name', 'ROI', 'aligned_time']):\n group = group.sort_values('id').reset_index(drop=True)\n \n if len(group) >= 2:\n fly1 = group.iloc[0]\n fly2 = group.iloc[1]\n distance = euclidean([fly1['x'], fly1['y']], [fly2['x'], fly2['y']])\n results.append({\n 'machine_name': machine_name, 'ROI': roi, 'aligned_time': t,\n 'distance': distance, 'n_flies': len(group),\n 'area_fly1': fly1['area'], 'area_fly2': fly2['area'],\n 'group': fly1['group']\n })\n elif len(group) == 1:\n fly = group.iloc[0]\n area = fly['area']\n distance = 0.0 if area > 1.5 * median_area_threshold else np.nan\n results.append({\n 'machine_name': machine_name, 'ROI': roi, 'aligned_time': t,\n 'distance': distance, 'n_flies': 1,\n 'area_fly1': area, 'area_fly2': np.nan,\n 'group': fly['group']\n })\n \n return pd.DataFrame(results)\n\ntrained_distances = calculate_distances_with_area(trained_aligned, median_area)\nuntrained_distances = calculate_distances_with_area(untrained_aligned, median_area)\n\nprint(f\"Trained distances shape: {trained_distances.shape}\")\nprint(f\"Untrained distances shape: {untrained_distances.shape}\")\n\ntrained_distances.to_csv(DATA_PROCESSED / 'trained_distances_aligned.csv', index=False)\nuntrained_distances.to_csv(DATA_PROCESSED / 'untrained_distances_aligned.csv', index=False)\nprint(\"Distance data saved to CSV files\")"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Plot averaged lines of trained vs untrained for the entire experiment"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "trained_clean = trained_distances.dropna(subset=['distance'])\nuntrained_clean = untrained_distances.dropna(subset=['distance'])\n\ntrained_avg = trained_clean.groupby('aligned_time')['distance'].mean()\nuntrained_avg = untrained_clean.groupby('aligned_time')['distance'].mean()\n\nwindow_size = 50\ntrained_smooth = trained_avg.rolling(window=window_size, center=True).mean()\nuntrained_smooth = untrained_avg.rolling(window=window_size, center=True).mean()\n\nplt.figure(figsize=(15, 8))\nplt.plot(trained_smooth.index/1000, trained_smooth.values, label='Trained (smoothed)', color='blue', linewidth=2)\nplt.plot(untrained_smooth.index/1000, untrained_smooth.values, label='Untrained (smoothed)', color='red', linewidth=2)\nplt.axvline(x=0, color='black', linestyle='--', alpha=0.7, label='Barrier Opening')\nplt.xlabel('Time (seconds relative to barrier opening)')\nplt.ylabel('Average Distance')\nplt.title('Average Distance Between Flies Over Entire Experiment')\nplt.legend()\nplt.grid(True, alpha=0.3)\nplt.tight_layout()\nplt.savefig(FIGURES / 'avg_distance_entire_experiment.png', dpi=300, bbox_inches='tight')\nplt.show()"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6. Same plot but ending at time +300 seconds"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "trained_filtered = trained_clean[trained_clean['aligned_time'] <= 300000]\nuntrained_filtered = untrained_clean[untrained_clean['aligned_time'] <= 300000]\n\ntrained_avg_300 = trained_filtered.groupby('aligned_time')['distance'].mean()\nuntrained_avg_300 = untrained_filtered.groupby('aligned_time')['distance'].mean()\n\ntrained_smooth_300 = trained_avg_300.rolling(window=window_size, center=True).mean()\nuntrained_smooth_300 = untrained_avg_300.rolling(window=window_size, center=True).mean()\n\nplt.figure(figsize=(15, 8))\nplt.plot(trained_smooth_300.index/1000, trained_smooth_300.values, label='Trained (smoothed)', color='blue', linewidth=2)\nplt.plot(untrained_smooth_300.index/1000, untrained_smooth_300.values, label='Untrained (smoothed)', color='red', linewidth=2)\nplt.axvline(x=0, color='black', linestyle='--', alpha=0.7, label='Barrier Opening')\nplt.xlabel('Time (seconds relative to barrier opening)')\nplt.ylabel('Average Distance')\nplt.title('Average Distance Between Flies (First 300 Seconds Post-Opening)')\nplt.legend()\nplt.grid(True, alpha=0.3)\nplt.xlim(-150, 300)\nplt.tight_layout()\nplt.savefig(FIGURES / 'avg_distance_300_seconds.png', dpi=300, bbox_inches='tight')\nplt.show()"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Summary Statistics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"=== SUMMARY STATISTICS ===\")\n",
"print(f\"Median area size for two-fly detections: {median_area:.2f}\")\n",
"\n",
"print(\"\\nPre-opening period (t < 0):\")\n",
"trained_pre = trained_clean[trained_clean['aligned_time'] < 0]['distance']\n",
"untrained_pre = untrained_clean[untrained_clean['aligned_time'] < 0]['distance']\n",
"print(f\" Trained mean distance: {trained_pre.mean():.2f}\")\n",
"print(f\" Untrained mean distance: {untrained_pre.mean():.2f}\")\n",
"\n",
"print(\"\\nPost-opening period (t > 0):\")\n",
"trained_post = trained_clean[trained_clean['aligned_time'] > 0]['distance']\n",
"untrained_post = untrained_clean[untrained_clean['aligned_time'] > 0]['distance']\n",
"print(f\" Trained mean distance: {trained_post.mean():.2f}\")\n",
"print(f\" Untrained mean distance: {untrained_post.mean():.2f}\")\n",
"\n",
"# Statistical test\n",
"t_stat, p_val = stats.ttest_ind(trained_post, untrained_post)\n",
"cohens_d = (trained_post.mean() - untrained_post.mean()) / np.sqrt(((len(trained_post)-1)*trained_post.var() + (len(untrained_post)-1)*untrained_post.var()) / (len(trained_post) + len(untrained_post) - 2))\n",
"\n",
"print(f\"\\nPost-opening comparison (trained vs untrained):\")\n",
"print(f\" T-statistic: {t_stat:.4f}\")\n",
"print(f\" P-value: {p_val:.2e}\")\n",
"print(f\" Cohen's d: {cohens_d:.4f}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}