cupido/notebooks/flies_analysis_simple.ipynb
Giorgio e7e4db264d Initial commit: organized project structure for student handoff
Reorganized flat 41-file directory into structured layout with:
- scripts/ for Python analysis code with shared config.py
- notebooks/ for Jupyter analysis notebooks
- data/ split into raw/, metadata/, processed/
- docs/ with analysis summary, experimental design, and bimodal hypothesis tutorial
- tasks/ with todo checklist and lessons learned
- Comprehensive README, PLANNING.md, and .gitignore

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-05 16:08:36 +00:00

421 lines
No EOL
28 KiB
Text

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Flies Behavior Analysis Pipeline\n",
"\n",
"This notebook analyzes the behavior of trained vs untrained flies based on their distance patterns."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom scipy.spatial.distance import euclidean\nfrom scipy import stats\nfrom pathlib import Path\nimport sys\nimport os\n\n# Set up paths relative to notebook location\nPROJECT_ROOT = Path(\"..\").resolve()\nDATA_RAW = PROJECT_ROOT / \"data\" / \"raw\"\nDATA_METADATA = PROJECT_ROOT / \"data\" / \"metadata\"\nDATA_PROCESSED = PROJECT_ROOT / \"data\" / \"processed\"\nFIGURES = PROJECT_ROOT / \"figures\"\n\n# Add scripts to path for imports\nsys.path.insert(0, str(PROJECT_ROOT / \"scripts\"))\n\n# Set plotting style\nplt.style.use('seaborn-v0_8')\nsns.set_palette(\"husl\")\n\nprint(f\"Project root: {PROJECT_ROOT}\")\nprint(f\"Pandas version: {pd.__version__}\")\nprint(f\"NumPy version: {np.__version__}\")"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Load existing CSV data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "# Load the pre-processed data\ntrained_data = pd.read_csv(DATA_PROCESSED / 'trained_roi_data.csv')\nuntrained_data = pd.read_csv(DATA_PROCESSED / 'untrained_roi_data.csv')\n\nprint(f\"Trained data shape: {trained_data.shape}\")\nprint(f\"Untrained data shape: {untrained_data.shape}\")\nprint(f\"Trained data columns: {list(trained_data.columns)}\")\nprint(f\"Untrained data columns: {list(untrained_data.columns)}\")"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Align data using barrier opening time as time 0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "# Load barrier opening data\nbarrier_data = pd.read_csv(DATA_METADATA / '2025_07_15_barrier_opening.csv')\nbarrier_data['opening_time_ms'] = barrier_data['opening_time'] * 1000\n\n# Create a dictionary mapping machine_name to opening time\nopening_times = dict(zip(barrier_data['machine'], barrier_data['opening_time_ms']))\nprint(\"Barrier opening times:\")\nprint(barrier_data)"
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Trained aligned data shape: (1166318, 13)\n",
"Untrained aligned data shape: (1130333, 13)\n"
]
}
],
"source": [
"def align_to_opening_time(df, opening_times):\n",
" \"\"\"Align data to barrier opening time\"\"\"\n",
" # Add aligned time column\n",
" df_aligned = df.copy()\n",
" df_aligned['aligned_time'] = np.nan\n",
" \n",
" # Align each machine's data\n",
" for machine in df['machine_name'].unique():\n",
" if machine in opening_times:\n",
" opening_time = opening_times[machine]\n",
" mask = df['machine_name'] == machine\n",
" df_aligned.loc[mask, 'aligned_time'] = df.loc[mask, 't'] - opening_time\n",
" \n",
" # Remove rows where aligned_time is NaN\n",
" df_aligned = df_aligned.dropna(subset=['aligned_time'])\n",
" \n",
" return df_aligned\n",
"\n",
"# Align the data\n",
"trained_aligned = align_to_opening_time(trained_data, opening_times)\n",
"untrained_aligned = align_to_opening_time(untrained_data, opening_times)\n",
"\n",
"print(f\"Trained aligned data shape: {trained_aligned.shape}\")\n",
"print(f\"Untrained aligned data shape: {untrained_aligned.shape}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Calculate median area size in rows where two flies are being tracked"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Median area size for time points with two flies: 1749.00\n"
]
}
],
"source": [
"def calculate_areas_with_two_flies(df):\n",
" \"\"\"Calculate median area size for time points with two flies\"\"\"\n",
" # Calculate area for each row\n",
" df['area'] = df['w'] * df['h']\n",
" \n",
" # Group by machine_name, ROI, and time to count flies per time point\n",
" fly_counts = df.groupby(['machine_name', 'ROI', 't']).size().reset_index(name='fly_count')\n",
" \n",
" # Filter for time points with exactly 2 flies\n",
" two_fly_times = fly_counts[fly_counts['fly_count'] == 2]\n",
" \n",
" # Merge back with original data to get areas for these time points\n",
" two_fly_data = pd.merge(df, two_fly_times[['machine_name', 'ROI', 't']], \n",
" on=['machine_name', 'ROI', 't'])\n",
" \n",
" # Calculate median area\n",
" median_area = two_fly_data['area'].median()\n",
" \n",
" return median_area, two_fly_data\n",
"\n",
"# Combine trained and untrained data for area calculation\n",
"combined_data = pd.concat([trained_aligned, untrained_aligned], ignore_index=True)\n",
"\n",
"# Calculate median area for time points with two flies\n",
"median_area, two_fly_data = calculate_areas_with_two_flies(combined_data)\n",
"print(f\"Median area size for time points with two flies: {median_area:.2f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Calculate distances taking into account area size"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "recalculate_distances = False # Set to True if you want to recalculate\n\ntrained_dist_file = DATA_PROCESSED / 'trained_distances_aligned.csv'\nuntrained_dist_file = DATA_PROCESSED / 'untrained_distances_aligned.csv'\n\nif not recalculate_distances and trained_dist_file.exists() and untrained_dist_file.exists():\n print(\"Loading pre-calculated distance data from CSV files...\")\n trained_distances = pd.read_csv(trained_dist_file)\n untrained_distances = pd.read_csv(untrained_dist_file)\n print(f\"Trained distances shape: {trained_distances.shape}\")\n print(f\"Untrained distances shape: {untrained_distances.shape}\")\nelse:\n print(\"Calculating distances from scratch...\")\n def calculate_distances_with_area(df, median_area_threshold):\n \"\"\"Calculate distances between flies, setting to 0 for large single-fly detections\"\"\"\n df['area'] = df['w'] * df['h']\n results = []\n \n for (machine_name, roi, t), group in df.groupby(['machine_name', 'ROI', 'aligned_time']):\n group = group.sort_values('id').reset_index(drop=True)\n \n if len(group) >= 2:\n fly1 = group.iloc[0]\n fly2 = group.iloc[1]\n distance = euclidean([fly1['x'], fly1['y']], [fly2['x'], fly2['y']])\n \n results.append({\n 'machine_name': machine_name, 'ROI': roi, 'aligned_time': t,\n 'distance': distance, 'n_flies': len(group),\n 'area_fly1': fly1['area'], 'area_fly2': fly2['area'],\n 'group': fly1['group']\n })\n elif len(group) == 1:\n fly = group.iloc[0]\n area = fly['area']\n distance = 0.0 if area > 1.5 * median_area_threshold else np.nan\n \n results.append({\n 'machine_name': machine_name, 'ROI': roi, 'aligned_time': t,\n 'distance': distance, 'n_flies': 1,\n 'area_fly1': area, 'area_fly2': np.nan,\n 'group': fly['group']\n })\n \n return pd.DataFrame(results)\n \n trained_distances = calculate_distances_with_area(trained_aligned, median_area)\n untrained_distances = calculate_distances_with_area(untrained_aligned, median_area)\n \n print(f\"Trained distances shape: {trained_distances.shape}\")\n print(f\"Untrained distances shape: {untrained_distances.shape}\")\n \n trained_distances.to_csv(trained_dist_file, index=False)\n untrained_distances.to_csv(untrained_dist_file, index=False)\n print(\"Distance data saved to CSV files\")"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Plot averaged lines of trained vs untrained for the entire experiment"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "# Remove NaN distances for plotting\ntrained_clean = trained_distances.dropna(subset=['distance'])\nuntrained_clean = untrained_distances.dropna(subset=['distance'])\n\n# Calculate average distance over time for each group\ntrained_avg = trained_clean.groupby('aligned_time')['distance'].mean()\nuntrained_avg = untrained_clean.groupby('aligned_time')['distance'].mean()\n\n# Apply smoothing using a rolling average\nwindow_size = 50\ntrained_smooth = trained_avg.rolling(window=window_size, center=True).mean()\nuntrained_smooth = untrained_avg.rolling(window=window_size, center=True).mean()\n\n# Create the plot\nplt.figure(figsize=(15, 8))\n\nplt.plot(trained_smooth.index/1000, trained_smooth.values, \n label='Trained (smoothed)', color='blue', linewidth=2)\nplt.plot(untrained_smooth.index/1000, untrained_smooth.values, \n label='Untrained (smoothed)', color='red', linewidth=2)\n\nplt.axvline(x=0, color='black', linestyle='--', alpha=0.7, label='Barrier Opening')\n\nplt.xlabel('Time (seconds relative to barrier opening)')\nplt.ylabel('Average Distance')\nplt.title('Average Distance Between Flies Over Entire Experiment')\nplt.legend()\nplt.grid(True, alpha=0.3)\n\nplt.tight_layout()\nplt.savefig(FIGURES / 'avg_distance_entire_experiment.png', dpi=300, bbox_inches='tight')\nplt.show()"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6. Same plot but ending at time +300 seconds"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "# Filter data to +300 seconds\ntrained_filtered = trained_clean[trained_clean['aligned_time'] <= 300000]\nuntrained_filtered = untrained_clean[untrained_clean['aligned_time'] <= 300000]\n\n# Calculate average distance over time for each group\ntrained_avg_300 = trained_filtered.groupby('aligned_time')['distance'].mean()\nuntrained_avg_300 = untrained_filtered.groupby('aligned_time')['distance'].mean()\n\n# Apply smoothing using a rolling average\ntrained_smooth_300 = trained_avg_300.rolling(window=window_size, center=True).mean()\nuntrained_smooth_300 = untrained_avg_300.rolling(window=window_size, center=True).mean()\n\n# Create the plot\nplt.figure(figsize=(15, 8))\n\nplt.plot(trained_smooth_300.index/1000, trained_smooth_300.values, \n label='Trained (smoothed)', color='blue', linewidth=2)\nplt.plot(untrained_smooth_300.index/1000, untrained_smooth_300.values, \n label='Untrained (smoothed)', color='red', linewidth=2)\n\nplt.axvline(x=0, color='black', linestyle='--', alpha=0.7, label='Barrier Opening')\n\nplt.xlabel('Time (seconds relative to barrier opening)')\nplt.ylabel('Average Distance')\nplt.title('Average Distance Between Flies (First 300 Seconds Post-Opening)')\nplt.legend()\nplt.grid(True, alpha=0.3)\nplt.xlim(-150, 300)\n\nplt.tight_layout()\nplt.savefig(FIGURES / 'avg_distance_300_seconds.png', dpi=300, bbox_inches='tight')\nplt.show()"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 7. Track fly identities and calculate meaningful velocity"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "def track_fly_identities_vectorized(df):\n \"\"\"Efficiently track fly identities across frames using vectorized operations\"\"\"\n from scipy.optimize import linear_sum_assignment\n \n df = df.sort_values(['machine_name', 'ROI', 'aligned_time']).reset_index(drop=True)\n tracked_flies = []\n \n for (machine_name, roi), group in df.groupby(['machine_name', 'ROI']):\n group = group.sort_values('aligned_time').reset_index(drop=True)\n time_points = sorted(group['aligned_time'].unique())\n \n if len(time_points) < 2:\n continue\n \n fly_id_counter = 0\n fly_identities = {}\n \n first_time_flies = group[group['aligned_time'] == time_points[0]]\n for i in range(len(first_time_flies)):\n fly_identities[(time_points[0], i)] = fly_id_counter\n fly_id_counter += 1\n \n for t_idx in range(len(time_points) - 1):\n t1, t2 = time_points[t_idx], time_points[t_idx + 1]\n flies_t1 = group[group['aligned_time'] == t1].reset_index(drop=True)\n flies_t2 = group[group['aligned_time'] == t2].reset_index(drop=True)\n \n if len(flies_t1) == 0 or len(flies_t2) == 0:\n continue\n \n if len(flies_t1) == len(flies_t2):\n pos_t1 = flies_t1[['x', 'y']].values\n pos_t2 = flies_t2[['x', 'y']].values\n distances = np.sqrt(np.sum((pos_t1[:, np.newaxis] - pos_t2[np.newaxis, :])**2, axis=2))\n \n row_ind, col_ind = linear_sum_assignment(distances)\n \n for i, j in zip(row_ind, col_ind):\n prev_id = fly_identities[(t1, i)]\n fly_identities[(t2, j)] = prev_id\n else:\n min_flies = min(len(flies_t1), len(flies_t2))\n max_flies = max(len(flies_t1), len(flies_t2))\n \n for i in range(min_flies):\n prev_id = fly_identities[(t1, i)]\n fly_identities[(t2, i)] = prev_id\n \n for i in range(min_flies, max_flies):\n fly_identities[(t2, i)] = fly_id_counter\n fly_id_counter += 1\n \n for t in time_points:\n flies_at_t = group[group['aligned_time'] == t].reset_index(drop=True)\n for i, (idx, fly) in enumerate(flies_at_t.iterrows()):\n if (t, i) in fly_identities:\n fly_copy = fly.copy()\n fly_copy['fly_id'] = fly_identities[(t, i)]\n tracked_flies.append(fly_copy)\n \n return pd.DataFrame(tracked_flies)\n\n# Check if pre-calculated tracked identity files exist\nrecalculate_tracking = False # Set to True if you want to recalculate\n\ntrained_tracked_file = DATA_PROCESSED / 'trained_tracked.csv'\nuntrained_tracked_file = DATA_PROCESSED / 'untrained_tracked.csv'\n\nif not recalculate_tracking and trained_tracked_file.exists() and untrained_tracked_file.exists():\n print(\"Loading pre-calculated tracked identity data from CSV files...\")\n trained_tracked = pd.read_csv(trained_tracked_file)\n untrained_tracked = pd.read_csv(untrained_tracked_file)\n print(f\"Trained tracked data shape: {trained_tracked.shape}\")\n print(f\"Untrained tracked data shape: {untrained_tracked.shape}\")\nelse:\n print(\"Tracking fly identities (this may take a while...)\")\n trained_tracked = track_fly_identities_vectorized(trained_aligned)\n untrained_tracked = track_fly_identities_vectorized(untrained_aligned)\n \n trained_tracked.to_csv(trained_tracked_file, index=False)\n untrained_tracked.to_csv(untrained_tracked_file, index=False)\n print(\"Tracked identity data saved to CSV files\")\n \n print(f\"Trained tracked data shape: {trained_tracked.shape}\")\n print(f\"Untrained tracked data shape: {untrained_tracked.shape}\")"
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Calculating velocities based on tracked identities...\n",
"Trained velocity data shape: (1160573, 15)\n",
"Untrained velocity data shape: (1125288, 15)\n"
]
}
],
"source": [
"def calculate_velocity_with_identities_vectorized(df):\n",
" \"\"\"Efficiently calculate velocity for each fly based on tracked identities\"\"\"\n",
" if df.empty:\n",
" return pd.DataFrame()\n",
" \n",
" # Sort by fly_id and time\n",
" df = df.sort_values(['machine_name', 'ROI', 'fly_id', 'aligned_time']).reset_index(drop=True)\n",
" \n",
" # Calculate velocities using groupby and vectorized operations\n",
" def calculate_group_velocity(group):\n",
" if len(group) < 2:\n",
" return pd.DataFrame()\n",
" \n",
" # Sort by time\n",
" group = group.sort_values('aligned_time').reset_index(drop=True)\n",
" \n",
" # Calculate differences using vectorized operations\n",
" time_diff = group['aligned_time'].diff() / 1000.0 # Convert ms to seconds\n",
" x_diff = group['x'].diff()\n",
" y_diff = group['y'].diff()\n",
" \n",
" # Calculate Euclidean distance (in pixels)\n",
" distance = np.sqrt(x_diff**2 + y_diff**2)\n",
" \n",
" # Calculate velocity (pixels per second)\n",
" velocity = distance / time_diff\n",
" \n",
" # Add to results (skip first row which has NaN velocity)\n",
" result = group.iloc[1:].copy()\n",
" result['velocity'] = velocity.iloc[1:].values\n",
" \n",
" return result\n",
" \n",
" # Apply the function to each group\n",
" velocity_groups = []\n",
" for (machine_name, roi, fly_id), group in df.groupby(['machine_name', 'ROI', 'fly_id']):\n",
" velocity_group = calculate_group_velocity(group)\n",
" if not velocity_group.empty:\n",
" velocity_groups.append(velocity_group)\n",
" \n",
" if velocity_groups:\n",
" return pd.concat(velocity_groups, ignore_index=True)\n",
" else:\n",
" return pd.DataFrame()\n",
"\n",
"# Calculate velocities for both groups\n",
"print(\"Calculating velocities based on tracked identities...\")\n",
"trained_velocity = calculate_velocity_with_identities_vectorized(trained_tracked)\n",
"untrained_velocity = calculate_velocity_with_identities_vectorized(untrained_tracked)\n",
"\n",
"print(f\"Trained velocity data shape: {trained_velocity.shape}\")\n",
"print(f\"Untrained velocity data shape: {untrained_velocity.shape}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "# Analyze maximal velocity over a moving window of 10 seconds\ndef calculate_max_velocity_window_vectorized(df, window_size_seconds=10):\n \"\"\"Efficiently calculate maximal velocity over a moving window\"\"\"\n df_clean = df.dropna(subset=['velocity'])\n \n if df_clean.empty:\n return pd.DataFrame()\n \n window_size_ms = window_size_seconds * 1000\n results = []\n \n for (machine_name, roi), group in df_clean.groupby(['machine_name', 'ROI']):\n group = group.sort_values('aligned_time').reset_index(drop=True)\n \n if len(group) == 0:\n continue\n \n times = group['aligned_time'].values\n velocities = group['velocity'].values\n \n for i, current_time in enumerate(times):\n window_end = current_time + window_size_ms\n window_mask = (times >= current_time) & (times < window_end)\n window_velocities = velocities[window_mask]\n \n if len(window_velocities) > 0:\n max_velocity = np.max(window_velocities)\n results.append({\n 'machine_name': machine_name,\n 'ROI': roi,\n 'aligned_time': current_time,\n 'max_velocity': max_velocity,\n 'group': group.iloc[i]['group']\n })\n \n return pd.DataFrame(results)\n\n# Calculate max velocity over 10-second windows\ntrained_max_velocity = calculate_max_velocity_window_vectorized(trained_velocity, 10)\nuntrained_max_velocity = calculate_max_velocity_window_vectorized(untrained_velocity, 10)\n\nprint(f\"Trained max velocity data shape: {trained_max_velocity.shape}\")\nprint(f\"Untrained max velocity data shape: {untrained_max_velocity.shape}\")\n\n# Save velocity data to CSV\ntrained_max_velocity.to_csv(DATA_PROCESSED / 'trained_max_velocity.csv', index=False)\nuntrained_max_velocity.to_csv(DATA_PROCESSED / 'untrained_max_velocity.csv', index=False)\nprint(\"Max velocity data saved to CSV files\")"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "# Plot averaged max velocity over time for each group\n# Focus on the time period between 50 and 200 seconds where differences are largest\ntrained_velocity_filtered = trained_max_velocity[\n (trained_max_velocity['aligned_time'] >= 50000) & \n (trained_max_velocity['aligned_time'] <= 200000)\n]\nuntrained_velocity_filtered = untrained_max_velocity[\n (untrained_max_velocity['aligned_time'] >= 50000) & \n (untrained_max_velocity['aligned_time'] <= 200000)\n]\n\nif not trained_velocity_filtered.empty and not untrained_velocity_filtered.empty:\n trained_velocity_avg = trained_velocity_filtered.groupby('aligned_time')['max_velocity'].mean()\n untrained_velocity_avg = untrained_velocity_filtered.groupby('aligned_time')['max_velocity'].mean()\n\n velocity_window_size = 30\n trained_velocity_smooth = trained_velocity_avg.rolling(window=velocity_window_size, center=True).mean()\n untrained_velocity_smooth = untrained_velocity_avg.rolling(window=velocity_window_size, center=True).mean()\n\n plt.figure(figsize=(15, 8))\n\n plt.plot(trained_velocity_smooth.index/1000, trained_velocity_smooth.values, \n label='Trained (smoothed)', color='blue', linewidth=2)\n plt.plot(untrained_velocity_smooth.index/1000, untrained_velocity_smooth.values, \n label='Untrained (smoothed)', color='red', linewidth=2)\n\n plt.axvline(x=0, color='black', linestyle='--', alpha=0.7, label='Barrier Opening')\n\n plt.xlabel('Time (seconds relative to barrier opening)')\n plt.ylabel('Average Max Velocity (pixels/second)')\n plt.title('Average Max Velocity Over 10-Second Windows (50-200 Seconds)')\n plt.legend()\n plt.grid(True, alpha=0.3)\n plt.xlim(50, 200)\n\n plt.tight_layout()\n plt.savefig(FIGURES / 'avg_max_velocity_50_200_seconds.png', dpi=300, bbox_inches='tight')\n plt.show()\nelse:\n print(\"Not enough data to plot velocity differences\")"
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== MAX VELOCITY STATISTICS (50-200 seconds) ===\n",
"Trained flies:\n",
" Mean max velocity: 6120.19 pixels/second\n",
" Std max velocity: 2913.85 pixels/second\n",
" Median max velocity: 7426.51 pixels/second\n",
"\n",
"Untrained flies:\n",
" Mean max velocity: 5710.33 pixels/second\n",
" Std max velocity: 2784.93 pixels/second\n",
" Median max velocity: 6876.77 pixels/second\n",
"\n",
"Statistical comparison (trained vs untrained):\n",
" T-statistic: 43.8194\n",
" P-value: 0.00e+00\n",
" Cohen's d: 0.1438\n"
]
}
],
"source": [
"# Summary statistics for max velocity in the 50-200 second window\n",
"if not trained_velocity_filtered.empty and not untrained_velocity_filtered.empty:\n",
" print(\"=== MAX VELOCITY STATISTICS (50-200 seconds) ===\")\n",
" trained_velocity_50_200 = trained_velocity_filtered['max_velocity']\n",
" untrained_velocity_50_200 = untrained_velocity_filtered['max_velocity']\n",
"\n",
" print(f\"Trained flies:\")\n",
" print(f\" Mean max velocity: {trained_velocity_50_200.mean():.2f} pixels/second\")\n",
" print(f\" Std max velocity: {trained_velocity_50_200.std():.2f} pixels/second\")\n",
" print(f\" Median max velocity: {trained_velocity_50_200.median():.2f} pixels/second\")\n",
"\n",
" print(f\"\\nUntrained flies:\")\n",
" print(f\" Mean max velocity: {untrained_velocity_50_200.mean():.2f} pixels/second\")\n",
" print(f\" Std max velocity: {untrained_velocity_50_200.std():.2f} pixels/second\")\n",
" print(f\" Median max velocity: {untrained_velocity_50_200.median():.2f} pixels/second\")\n",
"\n",
" # Statistical test\n",
" if len(trained_velocity_50_200) > 1 and len(untrained_velocity_50_200) > 1:\n",
" t_stat_vel, p_val_vel = stats.ttest_ind(trained_velocity_50_200, untrained_velocity_50_200)\n",
" cohens_d_vel = (trained_velocity_50_200.mean() - untrained_velocity_50_200.mean()) / \\\n",
" np.sqrt(((len(trained_velocity_50_200)-1)*trained_velocity_50_200.var() + \\\n",
" (len(untrained_velocity_50_200)-1)*untrained_velocity_50_200.var()) / \\\n",
" (len(trained_velocity_50_200) + len(untrained_velocity_50_200) - 2))\n",
"\n",
" print(f\"\\nStatistical comparison (trained vs untrained):\")\n",
" print(f\" T-statistic: {t_stat_vel:.4f}\")\n",
" print(f\" P-value: {p_val_vel:.2e}\")\n",
" print(f\" Cohen's d: {cohens_d_vel:.4f}\")\n",
" else:\n",
" print(\"\\nNot enough data for statistical test\")\n",
"else:\n",
" print(\"Not enough data for velocity statistics\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Summary Statistics"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== SUMMARY STATISTICS ===\n",
"Median area size for two-fly detections: 1749.00\n",
"\n",
"Pre-opening period (t < 0):\n",
" Trained mean distance: 156.05\n",
" Untrained mean distance: 147.69\n",
"\n",
"Post-opening period (t > 0):\n",
" Trained mean distance: 72.60\n",
" Untrained mean distance: 64.00\n",
"\n",
"Post-opening comparison (trained vs untrained):\n",
" T-statistic: 30.2455\n",
" P-value: 9.57e-201\n",
" Cohen's d: 0.0908\n"
]
}
],
"source": [
"print(\"=== SUMMARY STATISTICS ===\")\n",
"print(f\"Median area size for two-fly detections: {median_area:.2f}\")\n",
"\n",
"print(\"\\nPre-opening period (t < 0):\")\n",
"trained_pre = trained_clean[trained_clean['aligned_time'] < 0]['distance']\n",
"untrained_pre = untrained_clean[untrained_clean['aligned_time'] < 0]['distance']\n",
"print(f\" Trained mean distance: {trained_pre.mean():.2f}\")\n",
"print(f\" Untrained mean distance: {untrained_pre.mean():.2f}\")\n",
"\n",
"print(\"\\nPost-opening period (t > 0):\")\n",
"trained_post = trained_clean[trained_clean['aligned_time'] > 0]['distance']\n",
"untrained_post = untrained_clean[untrained_clean['aligned_time'] > 0]['distance']\n",
"print(f\" Trained mean distance: {trained_post.mean():.2f}\")\n",
"print(f\" Untrained mean distance: {untrained_post.mean():.2f}\")\n",
"\n",
"# Statistical test\n",
"t_stat, p_val = stats.ttest_ind(trained_post, untrained_post)\n",
"cohens_d = (trained_post.mean() - untrained_post.mean()) / np.sqrt(((len(trained_post)-1)*trained_post.var() + (len(untrained_post)-1)*untrained_post.var()) / (len(trained_post) + len(untrained_post) - 2))\n",
"\n",
"print(f\"\\nPost-opening comparison (trained vs untrained):\")\n",
"print(f\" T-statistic: {t_stat:.4f}\")\n",
"print(f\" P-value: {p_val:.2e}\")\n",
"print(f\" Cohen's d: {cohens_d:.4f}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}