{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Flies Behavior Analysis Pipeline\n", "\n", "This notebook implements the complete analysis pipeline for discriminating between trained and untrained flies based on their distance behavior." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "import pandas as pd\nimport numpy as np\nimport sqlite3\nimport glob\nimport re\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom scipy.spatial.distance import euclidean\nfrom scipy import stats\nfrom pathlib import Path\nimport sys\n\n# Set up paths relative to notebook location\nPROJECT_ROOT = Path(\"..\").resolve()\nDATA_RAW = PROJECT_ROOT / \"data\" / \"raw\"\nDATA_METADATA = PROJECT_ROOT / \"data\" / \"metadata\"\nDATA_PROCESSED = PROJECT_ROOT / \"data\" / \"processed\"\nFIGURES = PROJECT_ROOT / \"figures\"\n\nsys.path.insert(0, str(PROJECT_ROOT / \"scripts\"))\n\n# Set plotting style\nplt.style.use('seaborn-v0_8')\nsns.set_palette(\"husl\")" }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Load data from DB and save as CSV grouped by trained/untrained" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "def load_roi_data():\n \"\"\"Load ROI data from SQLite databases and group by trained/untrained\"\"\"\n metadata = pd.read_csv(DATA_METADATA / '2025_07_15_metadata_fixed.csv')\n metadata['machine_name'] = metadata['machine_name'].astype(str)\n \n trained_rois = metadata[metadata['group'] == 'trained']\n untrained_rois = metadata[metadata['group'] == 'untrained']\n \n db_files = list(DATA_RAW.glob('*_tracking.db'))\n \n trained_df = pd.DataFrame()\n untrained_df = pd.DataFrame()\n \n for db_file in db_files:\n print(f\"Processing {db_file.name}\")\n \n pattern = r'_([0-9a-f]{32})__'\n match = re.search(pattern, db_file.name)\n \n if not match:\n print(f\"Could not extract UUID from {db_file.name}\")\n continue\n \n uuid = match.group(1)\n metadata_matches = metadata[metadata['path'].str.contains(uuid, na=False)]\n \n if metadata_matches.empty:\n print(f\"No metadata matches found for UUID {uuid}\")\n continue\n \n machine_id = metadata_matches.iloc[0]['machine_name']\n print(f\"Matched to machine ID: {machine_id}\")\n \n conn = sqlite3.connect(str(db_file))\n \n machine_trained = trained_rois[trained_rois['machine_name'] == machine_id]\n machine_untrained = untrained_rois[untrained_rois['machine_name'] == machine_id]\n \n for _, row in machine_trained.iterrows():\n roi = row['ROI']\n try:\n roi_data = pd.read_sql_query(f\"SELECT * FROM ROI_{roi}\", conn)\n roi_data['machine_name'] = machine_id\n roi_data['ROI'] = roi\n roi_data['group'] = 'trained'\n trained_df = pd.concat([trained_df, roi_data], ignore_index=True)\n except Exception as e:\n print(f\"Error loading ROI_{roi}: {e}\")\n \n for _, row in machine_untrained.iterrows():\n roi = row['ROI']\n try:\n roi_data = pd.read_sql_query(f\"SELECT * FROM ROI_{roi}\", conn)\n roi_data['machine_name'] = machine_id\n roi_data['ROI'] = roi\n roi_data['group'] = 'untrained'\n untrained_df = pd.concat([untrained_df, roi_data], ignore_index=True)\n except Exception as e:\n print(f\"Error loading ROI_{roi}: {e}\")\n \n conn.close()\n \n return trained_df, untrained_df\n\ntrained_data, untrained_data = load_roi_data()\nprint(f\"Trained data shape: {trained_data.shape}\")\nprint(f\"Untrained data shape: {untrained_data.shape}\")\n\ntrained_data.to_csv(DATA_PROCESSED / 'trained_roi_data.csv', index=False)\nuntrained_data.to_csv(DATA_PROCESSED / 'untrained_roi_data.csv', index=False)\nprint(\"Data saved to CSV files\")" }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Align data using barrier opening time as time 0" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "barrier_data = pd.read_csv(DATA_METADATA / '2025_07_15_barrier_opening.csv')\nbarrier_data['opening_time_ms'] = barrier_data['opening_time'] * 1000\nopening_times = dict(zip(barrier_data['machine'], barrier_data['opening_time_ms']))\nprint(\"Barrier opening times:\")\nprint(barrier_data)" }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def align_to_opening_time(df, opening_times):\n", " \"\"\"Align data to barrier opening time\"\"\"\n", " # Add aligned time column\n", " df_aligned = df.copy()\n", " df_aligned['aligned_time'] = np.nan\n", " \n", " # Align each machine's data\n", " for machine in df['machine_name'].unique():\n", " if machine in opening_times:\n", " opening_time = opening_times[machine]\n", " mask = df['machine_name'] == machine\n", " df_aligned.loc[mask, 'aligned_time'] = df.loc[mask, 't'] - opening_time\n", " \n", " # Remove rows where aligned_time is NaN\n", " df_aligned = df_aligned.dropna(subset=['aligned_time'])\n", " \n", " return df_aligned\n", "\n", "# Align the data\n", "trained_aligned = align_to_opening_time(trained_data, opening_times)\n", "untrained_aligned = align_to_opening_time(untrained_data, opening_times)\n", "\n", "print(f\"Trained aligned data shape: {trained_aligned.shape}\")\n", "print(f\"Untrained aligned data shape: {untrained_aligned.shape}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. Calculate median area size in rows where two flies are being tracked" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def calculate_areas_with_two_flies(df):\n", " \"\"\"Calculate median area size for time points with two flies\"\"\"\n", " # Calculate area for each row\n", " df['area'] = df['w'] * df['h']\n", " \n", " # Group by machine_name, ROI, and time to count flies per time point\n", " fly_counts = df.groupby(['machine_name', 'ROI', 't']).size().reset_index(name='fly_count')\n", " \n", " # Filter for time points with exactly 2 flies\n", " two_fly_times = fly_counts[fly_counts['fly_count'] == 2]\n", " \n", " # Merge back with original data to get areas for these time points\n", " two_fly_data = pd.merge(df, two_fly_times[['machine_name', 'ROI', 't']], \n", " on=['machine_name', 'ROI', 't'])\n", " \n", " # Calculate median area\n", " median_area = two_fly_data['area'].median()\n", " \n", " return median_area, two_fly_data\n", "\n", "# Combine trained and untrained data for area calculation\n", "combined_data = pd.concat([trained_aligned, untrained_aligned], ignore_index=True)\n", "\n", "# Calculate median area for time points with two flies\n", "median_area, two_fly_data = calculate_areas_with_two_flies(combined_data)\n", "print(f\"Median area size for time points with two flies: {median_area:.2f}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. Calculate distances taking into account area size" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "def calculate_distances_with_area(df, median_area_threshold):\n \"\"\"Calculate distances between flies, setting to 0 for large single-fly detections\"\"\"\n df['area'] = df['w'] * df['h']\n results = []\n \n for (machine_name, roi, t), group in df.groupby(['machine_name', 'ROI', 'aligned_time']):\n group = group.sort_values('id').reset_index(drop=True)\n \n if len(group) >= 2:\n fly1 = group.iloc[0]\n fly2 = group.iloc[1]\n distance = euclidean([fly1['x'], fly1['y']], [fly2['x'], fly2['y']])\n results.append({\n 'machine_name': machine_name, 'ROI': roi, 'aligned_time': t,\n 'distance': distance, 'n_flies': len(group),\n 'area_fly1': fly1['area'], 'area_fly2': fly2['area'],\n 'group': fly1['group']\n })\n elif len(group) == 1:\n fly = group.iloc[0]\n area = fly['area']\n distance = 0.0 if area > 1.5 * median_area_threshold else np.nan\n results.append({\n 'machine_name': machine_name, 'ROI': roi, 'aligned_time': t,\n 'distance': distance, 'n_flies': 1,\n 'area_fly1': area, 'area_fly2': np.nan,\n 'group': fly['group']\n })\n \n return pd.DataFrame(results)\n\ntrained_distances = calculate_distances_with_area(trained_aligned, median_area)\nuntrained_distances = calculate_distances_with_area(untrained_aligned, median_area)\n\nprint(f\"Trained distances shape: {trained_distances.shape}\")\nprint(f\"Untrained distances shape: {untrained_distances.shape}\")\n\ntrained_distances.to_csv(DATA_PROCESSED / 'trained_distances_aligned.csv', index=False)\nuntrained_distances.to_csv(DATA_PROCESSED / 'untrained_distances_aligned.csv', index=False)\nprint(\"Distance data saved to CSV files\")" }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. Plot averaged lines of trained vs untrained for the entire experiment" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "trained_clean = trained_distances.dropna(subset=['distance'])\nuntrained_clean = untrained_distances.dropna(subset=['distance'])\n\ntrained_avg = trained_clean.groupby('aligned_time')['distance'].mean()\nuntrained_avg = untrained_clean.groupby('aligned_time')['distance'].mean()\n\nwindow_size = 50\ntrained_smooth = trained_avg.rolling(window=window_size, center=True).mean()\nuntrained_smooth = untrained_avg.rolling(window=window_size, center=True).mean()\n\nplt.figure(figsize=(15, 8))\nplt.plot(trained_smooth.index/1000, trained_smooth.values, label='Trained (smoothed)', color='blue', linewidth=2)\nplt.plot(untrained_smooth.index/1000, untrained_smooth.values, label='Untrained (smoothed)', color='red', linewidth=2)\nplt.axvline(x=0, color='black', linestyle='--', alpha=0.7, label='Barrier Opening')\nplt.xlabel('Time (seconds relative to barrier opening)')\nplt.ylabel('Average Distance')\nplt.title('Average Distance Between Flies Over Entire Experiment')\nplt.legend()\nplt.grid(True, alpha=0.3)\nplt.tight_layout()\nplt.savefig(FIGURES / 'avg_distance_entire_experiment.png', dpi=300, bbox_inches='tight')\nplt.show()" }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6. Same plot but ending at time +300 seconds" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": "trained_filtered = trained_clean[trained_clean['aligned_time'] <= 300000]\nuntrained_filtered = untrained_clean[untrained_clean['aligned_time'] <= 300000]\n\ntrained_avg_300 = trained_filtered.groupby('aligned_time')['distance'].mean()\nuntrained_avg_300 = untrained_filtered.groupby('aligned_time')['distance'].mean()\n\ntrained_smooth_300 = trained_avg_300.rolling(window=window_size, center=True).mean()\nuntrained_smooth_300 = untrained_avg_300.rolling(window=window_size, center=True).mean()\n\nplt.figure(figsize=(15, 8))\nplt.plot(trained_smooth_300.index/1000, trained_smooth_300.values, label='Trained (smoothed)', color='blue', linewidth=2)\nplt.plot(untrained_smooth_300.index/1000, untrained_smooth_300.values, label='Untrained (smoothed)', color='red', linewidth=2)\nplt.axvline(x=0, color='black', linestyle='--', alpha=0.7, label='Barrier Opening')\nplt.xlabel('Time (seconds relative to barrier opening)')\nplt.ylabel('Average Distance')\nplt.title('Average Distance Between Flies (First 300 Seconds Post-Opening)')\nplt.legend()\nplt.grid(True, alpha=0.3)\nplt.xlim(-150, 300)\nplt.tight_layout()\nplt.savefig(FIGURES / 'avg_distance_300_seconds.png', dpi=300, bbox_inches='tight')\nplt.show()" }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Summary Statistics" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"=== SUMMARY STATISTICS ===\")\n", "print(f\"Median area size for two-fly detections: {median_area:.2f}\")\n", "\n", "print(\"\\nPre-opening period (t < 0):\")\n", "trained_pre = trained_clean[trained_clean['aligned_time'] < 0]['distance']\n", "untrained_pre = untrained_clean[untrained_clean['aligned_time'] < 0]['distance']\n", "print(f\" Trained mean distance: {trained_pre.mean():.2f}\")\n", "print(f\" Untrained mean distance: {untrained_pre.mean():.2f}\")\n", "\n", "print(\"\\nPost-opening period (t > 0):\")\n", "trained_post = trained_clean[trained_clean['aligned_time'] > 0]['distance']\n", "untrained_post = untrained_clean[untrained_clean['aligned_time'] > 0]['distance']\n", "print(f\" Trained mean distance: {trained_post.mean():.2f}\")\n", "print(f\" Untrained mean distance: {untrained_post.mean():.2f}\")\n", "\n", "# Statistical test\n", "t_stat, p_val = stats.ttest_ind(trained_post, untrained_post)\n", "cohens_d = (trained_post.mean() - untrained_post.mean()) / np.sqrt(((len(trained_post)-1)*trained_post.var() + (len(untrained_post)-1)*untrained_post.var()) / (len(trained_post) + len(untrained_post) - 2))\n", "\n", "print(f\"\\nPost-opening comparison (trained vs untrained):\")\n", "print(f\" T-statistic: {t_stat:.4f}\")\n", "print(f\" P-value: {p_val:.2e}\")\n", "print(f\" Cohen's d: {cohens_d:.4f}\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }