Initial commit: organized project structure for student handoff

Reorganized flat 41-file directory into structured layout with:
- scripts/ for Python analysis code with shared config.py
- notebooks/ for Jupyter analysis notebooks
- data/ split into raw/, metadata/, processed/
- docs/ with analysis summary, experimental design, and bimodal hypothesis tutorial
- tasks/ with todo checklist and lessons learned
- Comprehensive README, PLANNING.md, and .gitignore

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Giorgio Gilestro 2026-03-05 16:08:36 +00:00
commit e7e4db264d
27 changed files with 3105 additions and 0 deletions

0
scripts/__init__.py Normal file
View file

View file

@ -0,0 +1,240 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import warnings
# NOTE(review): blanket suppression hides every runtime warning, including
# pandas/sklearn deprecations — consider narrowing to specific categories.
warnings.filterwarnings('ignore')
from config import DATA_PROCESSED, FIGURES
def load_and_combine_data():
    """Read the trained/untrained distance CSVs and merge them.

    Returns:
        pd.DataFrame: Rows from both files tagged with a 'group' column
        ('trained' or 'untrained'), with NaN distances dropped.
    """
    frames = []
    for label in ('trained', 'untrained'):
        frame = pd.read_csv(DATA_PROCESSED / f'{label}_distances.csv')
        frame['group'] = label
        frames.append(frame)
    combined_data = pd.concat(frames, ignore_index=True)
    combined_data = combined_data.dropna(subset=['distance'])
    print(f"Combined data shape: {combined_data.shape}")
    for label in ('trained', 'untrained'):
        n_rows = len(combined_data[combined_data['group'] == label])
        print(f"{label.capitalize()} samples: {n_rows}")
    return combined_data
def basic_statistics(combined_data):
    """Print per-group summary statistics, a two-sample t-test and Cohen's d.

    Args:
        combined_data (pd.DataFrame): Distance data with 'group' and
            'distance' columns.
    """
    print("\n=== BASIC STATISTICS ===")
    for group in ('trained', 'untrained'):
        values = combined_data.loc[combined_data['group'] == group, 'distance']
        print(f"\n{group.capitalize()} flies:")
        print(f" Count: {len(values)}")
        print(f" Mean distance: {values.mean():.2f}")
        print(f" Std distance: {values.std():.2f}")
        print(f" Median distance: {values.median():.2f}")
        print(f" Min distance: {values.min():.2f}")
        print(f" Max distance: {values.max():.2f}")
    trained_values = combined_data.loc[combined_data['group'] == 'trained', 'distance']
    untrained_values = combined_data.loc[combined_data['group'] == 'untrained', 'distance']
    t_stat, p_value = stats.ttest_ind(trained_values, untrained_values)
    print("\nT-test between groups:")
    print(f" T-statistic: {t_stat:.4f}")
    print(f" P-value: {p_value:.2e}")
    # Cohen's d with the pooled (sample, ddof=1) standard deviation.
    n_trained, n_untrained = len(trained_values), len(untrained_values)
    pooled_var = (
        (n_trained - 1) * trained_values.std() ** 2
        + (n_untrained - 1) * untrained_values.std() ** 2
    ) / (n_trained + n_untrained - 2)
    cohens_d = (trained_values.mean() - untrained_values.mean()) / np.sqrt(pooled_var)
    print(f" Cohen's d (effect size): {cohens_d:.4f}")
def distance_distribution_analysis(combined_data):
    """Analyze distance distributions and create plots.

    Produces a 2x2 figure (density histogram, box plot, empirical CDF,
    violin plot) comparing trained vs untrained distances, and saves it
    to FIGURES / 'distance_analysis.png'.

    Args:
        combined_data (pd.DataFrame): Combined distance data.
    """
    print("\n=== DISTANCE DISTRIBUTION ANALYSIS ===")
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Distance Distribution Analysis', fontsize=16)
    # Top-left: overlaid density histograms per group.
    axes[0, 0].hist(combined_data[combined_data['group'] == 'trained']['distance'],
                    alpha=0.7, label='Trained', bins=50, density=True)
    axes[0, 0].hist(combined_data[combined_data['group'] == 'untrained']['distance'],
                    alpha=0.7, label='Untrained', bins=50, density=True)
    axes[0, 0].set_xlabel('Distance')
    axes[0, 0].set_ylabel('Density')
    axes[0, 0].set_title('Distance Distribution by Group')
    axes[0, 0].legend()
    # Top-right: box plot by training condition.
    combined_data.boxplot(column='distance', by='group', ax=axes[0, 1])
    axes[0, 1].set_title('Distance Box Plot by Group')
    axes[0, 1].set_xlabel('Group')
    axes[0, 1].set_ylabel('Distance')
    # Bottom-left: empirical cumulative distribution functions.
    trained_dist = combined_data[combined_data['group'] == 'trained']['distance']
    untrained_dist = combined_data[combined_data['group'] == 'untrained']['distance']
    trained_sorted = np.sort(trained_dist)
    untrained_sorted = np.sort(untrained_dist)
    trained_cumulative = np.arange(1, len(trained_sorted) + 1) / len(trained_sorted)
    untrained_cumulative = np.arange(1, len(untrained_sorted) + 1) / len(untrained_sorted)
    axes[1, 0].plot(trained_sorted, trained_cumulative, label='Trained', alpha=0.7)
    axes[1, 0].plot(untrained_sorted, untrained_cumulative, label='Untrained', alpha=0.7)
    axes[1, 0].set_xlabel('Distance')
    axes[1, 0].set_ylabel('Cumulative Probability')
    axes[1, 0].set_title('Cumulative Distribution of Distances')
    axes[1, 0].legend()
    # Bottom-right: violin plot for distribution shape.
    sns.violinplot(data=combined_data, x='group', y='distance', ax=axes[1, 1])
    axes[1, 1].set_title('Distance Violin Plot by Group')
    axes[1, 1].set_xlabel('Group')
    axes[1, 1].set_ylabel('Distance')
    plt.tight_layout()
    plt.savefig(FIGURES / 'distance_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()
    print("Distance distribution plots saved")
def clustering_analysis(combined_data):
    """Perform clustering analysis on distance data.

    Runs KMeans for k in 2..5 on the standardized features, plots the
    elbow/silhouette diagnostics, then fits a final k=2 model and
    compares cluster membership against the actual trained/untrained
    labels.

    Args:
        combined_data (pd.DataFrame): Combined distance data.

    Returns:
        tuple: (clustered_data, kmeans_model, scaler).
    """
    print("\n=== CLUSTERING ANALYSIS ===")
    features = ['distance', 'n_flies', 'area_fly1', 'area_fly2']
    # Rows with any missing feature (e.g. single-fly frames with NaN
    # area_fly2) are excluded from clustering.
    X = combined_data[features].dropna()
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    k_range = range(2, 6)
    inertias = []
    sil_scores = []
    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(X_scaled)
        inertias.append(kmeans.inertia_)
        sil_scores.append(silhouette_score(X_scaled, kmeans.labels_))
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
    ax1.plot(k_range, inertias, 'bo-')
    ax1.set_xlabel('Number of Clusters (k)')
    ax1.set_ylabel('Inertia')
    ax1.set_title('Elbow Method for Optimal k')
    ax2.plot(k_range, sil_scores, 'ro-')
    ax2.set_xlabel('Number of Clusters (k)')
    ax2.set_ylabel('Silhouette Score')
    ax2.set_title('Silhouette Score for Different k')
    plt.tight_layout()
    plt.savefig(FIGURES / 'clustering_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()
    # TODO(review): k is fixed at 2 (presumably to mirror the two
    # experimental groups) rather than chosen from the silhouette scores
    # computed above — confirm this is intentional.
    optimal_k = 2
    kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(X_scaled)
    X_clustered = X.copy()
    X_clustered['cluster'] = cluster_labels
    # X kept the original index through dropna, so this realigns labels.
    X_clustered['actual_group'] = combined_data.loc[X_clustered.index, 'group'].values
    confusion = pd.crosstab(X_clustered['cluster'], X_clustered['actual_group'])
    print(f"Clustering results (k={optimal_k}):")
    print(confusion)
    c0t = len(X_clustered[(X_clustered['cluster'] == 0) & (X_clustered['actual_group'] == 'trained')])
    c0u = len(X_clustered[(X_clustered['cluster'] == 0) & (X_clustered['actual_group'] == 'untrained')])
    c1t = len(X_clustered[(X_clustered['cluster'] == 1) & (X_clustered['actual_group'] == 'trained')])
    c1u = len(X_clustered[(X_clustered['cluster'] == 1) & (X_clustered['actual_group'] == 'untrained')])
    # Cluster labels are arbitrary, so score the better of the two
    # possible cluster->group assignments.
    accuracy = max((c0t + c1u) / len(X_clustered), (c0u + c1t) / len(X_clustered))
    print(f"\nClustering accuracy: {accuracy:.4f}")
    print("\nCluster characteristics:")
    for i in range(optimal_k):
        cluster_data = X_clustered[X_clustered['cluster'] == i]
        print(f"\nCluster {i}:")
        print(f" Size: {len(cluster_data)}")
        print(f" Distance - Mean: {cluster_data['distance'].mean():.2f}, Std: {cluster_data['distance'].std():.2f}")
        print(f" N_flies - Mean: {cluster_data['n_flies'].mean():.2f}")
        print(f" Area_fly1 - Mean: {cluster_data['area_fly1'].mean():.2f}")
    return X_clustered, kmeans, scaler
def simple_classification_rule(combined_data):
    """Evaluate threshold classifiers at the 25/50/75 distance percentiles.

    A row is predicted 'trained' when its distance exceeds the threshold;
    accuracy, sensitivity and specificity are reported per threshold.

    Args:
        combined_data (pd.DataFrame): Distance data with 'group' and
            'distance' columns.
    """
    print("\n=== SIMPLE RULE-BASED CLASSIFICATION ===")
    clean_data = combined_data.dropna(subset=['distance'])
    thresholds = np.percentile(clean_data['distance'], [25, 50, 75])
    print(f"Distance percentiles: 25%={thresholds[0]:.2f}, 50%={thresholds[1]:.2f}, 75%={thresholds[2]:.2f}")
    distances = clean_data['distance'].to_numpy()
    is_trained = (clean_data['group'] == 'trained').to_numpy()
    for threshold in thresholds:
        predicted_trained = distances > threshold
        accuracy = np.mean(predicted_trained == is_trained)
        tp = int(np.sum(predicted_trained & is_trained))
        tn = int(np.sum(~predicted_trained & ~is_trained))
        fp = int(np.sum(predicted_trained & ~is_trained))
        fn = int(np.sum(~predicted_trained & is_trained))
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        print(f"\nThreshold = {threshold:.2f}:")
        print(f" Accuracy: {accuracy:.4f}")
        print(f" Sensitivity: {sensitivity:.4f}, Specificity: {specificity:.4f}")
def main():
    """Run the full distance analysis pipeline.

    Loads the distance CSVs, prints statistics, generates distribution
    and clustering figures, evaluates simple threshold classifiers, and
    saves the clustered rows to clustered_distance_data.csv.
    """
    combined_data = load_and_combine_data()
    basic_statistics(combined_data)
    distance_distribution_analysis(combined_data)
    clustered_data, kmeans_model, scaler = clustering_analysis(combined_data)
    simple_classification_rule(combined_data)
    clustered_data.to_csv(DATA_PROCESSED / 'clustered_distance_data.csv', index=False)
    print("\n=== ANALYSIS COMPLETE ===")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,118 @@
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean
from config import DATA_PROCESSED
def calculate_fly_distances(trained_file=None, untrained_file=None):
    """Calculate distances between flies at each time point.

    For each time point:
    - If two flies are detected: calculate Cartesian distance between them
    - If one fly is detected: set distance to 0 if area > average area, otherwise NaN

    Args:
        trained_file (Path): Path to trained ROI data CSV. Defaults to
            DATA_PROCESSED / 'trained_roi_data.csv'.
        untrained_file (Path): Path to untrained ROI data CSV. Defaults to
            DATA_PROCESSED / 'untrained_roi_data.csv'.

    Returns:
        tuple: (trained_distances, untrained_distances) DataFrames.
    """
    if trained_file is None:
        trained_file = DATA_PROCESSED / 'trained_roi_data.csv'
    if untrained_file is None:
        untrained_file = DATA_PROCESSED / 'untrained_roi_data.csv'
    trained_df = pd.read_csv(trained_file)
    untrained_df = pd.read_csv(untrained_file)
    # Bounding-box area (w*h) proxies fly size; a single detection with
    # above-average area is treated as two touching flies downstream.
    trained_df['area'] = trained_df['w'] * trained_df['h']
    untrained_df['area'] = untrained_df['w'] * untrained_df['h']
    # One shared threshold: the mean of the two groups' mean areas.
    avg_area = np.mean([trained_df['area'].mean(), untrained_df['area'].mean()])
    print(f"Average area across all data: {avg_area:.2f}")
    trained_distances = process_distance_data(trained_df, avg_area)
    untrained_distances = process_distance_data(untrained_df, avg_area)
    return trained_distances, untrained_distances
def process_distance_data(df, avg_area):
    """Compute per-timepoint inter-fly distances for each machine/ROI.

    Two or more detections -> Euclidean distance between the first two
    (ordered by id). Exactly one detection -> 0.0 if its area exceeds
    avg_area (flies presumed touching), NaN otherwise.

    Args:
        df (pd.DataFrame): Tracking rows with machine_name, ROI, t, id,
            x, y and area columns.
        avg_area (float): Area threshold for single-detection frames.

    Returns:
        pd.DataFrame: One row per (machine, ROI, t) with columns for
        machine, ROI, time, distance, fly count and areas.
    """
    records = []
    # A single three-key groupby visits the same (machine, ROI, t) cells,
    # in the same sorted order, as nested groupby calls would.
    for (machine_name, roi, t), frame in df.groupby(['machine_name', 'ROI', 't']):
        frame = frame.sort_values('id').reset_index(drop=True)
        n_detected = len(frame)
        if n_detected >= 2:
            first = frame.iloc[0]
            second = frame.iloc[1]
            records.append({
                'machine_name': machine_name,
                'ROI': roi,
                't': t,
                'distance': euclidean([first['x'], first['y']],
                                      [second['x'], second['y']]),
                'n_flies': n_detected,
                'area_fly1': first['area'],
                'area_fly2': second['area']
            })
        elif n_detected == 1:
            only = frame.iloc[0]
            records.append({
                'machine_name': machine_name,
                'ROI': roi,
                't': t,
                'distance': 0.0 if only['area'] > avg_area else np.nan,
                'n_flies': 1,
                'area_fly1': only['area'],
                'area_fly2': np.nan
            })
    return pd.DataFrame(records)
def main():
    """Run distance calculations and save results.

    Computes per-timepoint distances for both groups, prints summary
    statistics, and writes trained_distances.csv / untrained_distances.csv
    into DATA_PROCESSED for the downstream analysis scripts.
    """
    trained_distances, untrained_distances = calculate_fly_distances()
    print(f"Trained data distance summary:")
    print(f" Shape: {trained_distances.shape}")
    print(f" Distance stats:")
    print(f" Count: {trained_distances['distance'].count()}")
    print(f" Mean: {trained_distances['distance'].mean():.2f}")
    print(f" Std: {trained_distances['distance'].std():.2f}")
    print(f"\nUntrained data distance summary:")
    print(f" Shape: {untrained_distances.shape}")
    print(f" Distance stats:")
    print(f" Count: {untrained_distances['distance'].count()}")
    print(f" Mean: {untrained_distances['distance'].mean():.2f}")
    print(f" Std: {untrained_distances['distance'].std():.2f}")
    trained_distances.to_csv(DATA_PROCESSED / 'trained_distances.csv', index=False)
    untrained_distances.to_csv(DATA_PROCESSED / 'untrained_distances.csv', index=False)
    print("\nDistance data saved")


if __name__ == "__main__":
    main()

9
scripts/config.py Normal file
View file

@ -0,0 +1,9 @@
"""Shared path constants for the Cupido tracking project."""
from pathlib import Path
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DATA_RAW = PROJECT_ROOT / "data" / "raw"
DATA_METADATA = PROJECT_ROOT / "data" / "metadata"
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"
FIGURES = PROJECT_ROOT / "figures"

90
scripts/load_roi_data.py Normal file
View file

@ -0,0 +1,90 @@
import pandas as pd
import sqlite3
import re
from config import DATA_RAW, DATA_METADATA, DATA_PROCESSED
def load_roi_data():
    """Load ROI data from SQLite databases and group by trained/untrained.

    Matches each *_tracking.db file to a machine via the 32-hex-char UUID
    embedded in its filename, then reads every ROI table listed for that
    machine in the metadata CSV.

    Returns:
        tuple: (trained_df, untrained_df) DataFrames with tracking data
        (empty DataFrames when nothing could be loaded).
    """
    metadata = pd.read_csv(DATA_METADATA / '2025_07_15_metadata_fixed.csv')
    metadata['machine_name'] = metadata['machine_name'].astype(str)
    trained_rois = metadata[metadata['group'] == 'trained']
    untrained_rois = metadata[metadata['group'] == 'untrained']
    db_files = list(DATA_RAW.glob('*_tracking.db'))
    # Collect frames and concatenate once at the end instead of growing a
    # DataFrame inside the loop (repeated pd.concat is quadratic).
    trained_frames = []
    untrained_frames = []
    for db_file in db_files:
        print(f"Processing {db_file.name}")
        # Tracking db filenames embed a 32-char hex UUID between underscores.
        pattern = r'_([0-9a-f]{32})__'
        match = re.search(pattern, db_file.name)
        if not match:
            print(f"Could not extract UUID from {db_file.name}")
            continue
        uuid = match.group(1)
        metadata_matches = metadata[metadata['path'].str.contains(uuid, na=False)]
        if metadata_matches.empty:
            print(f"No metadata matches found for UUID {uuid} from {db_file.name}")
            continue
        machine_id = metadata_matches.iloc[0]['machine_name']
        print(f"Matched to machine ID: {machine_id}")
        conn = sqlite3.connect(str(db_file))
        try:
            _read_roi_tables(conn, db_file,
                             trained_rois[trained_rois['machine_name'] == machine_id],
                             machine_id, 'trained', trained_frames)
            _read_roi_tables(conn, db_file,
                             untrained_rois[untrained_rois['machine_name'] == machine_id],
                             machine_id, 'untrained', untrained_frames)
        finally:
            # Close the connection even if a query fails unexpectedly.
            conn.close()
    trained_df = pd.concat(trained_frames, ignore_index=True) if trained_frames else pd.DataFrame()
    untrained_df = pd.concat(untrained_frames, ignore_index=True) if untrained_frames else pd.DataFrame()
    return trained_df, untrained_df


def _read_roi_tables(conn, db_file, roi_rows, machine_id, group_label, out_frames):
    """Read each ROI table listed in roi_rows, label it, and append to out_frames."""
    for _, row in roi_rows.iterrows():
        roi = row['ROI']
        try:
            # Table names cannot be parameterized in SQL; roi comes from our
            # own metadata CSV, not untrusted input.
            query = f"SELECT * FROM ROI_{roi}"
            roi_data = pd.read_sql_query(query, conn)
            roi_data['machine_name'] = machine_id
            roi_data['ROI'] = roi
            roi_data['group'] = group_label
            out_frames.append(roi_data)
        except Exception as e:
            print(f"Error loading ROI_{roi} from {db_file.name}: {e}")
if __name__ == "__main__":
trained_data, untrained_data = load_roi_data()
print(f"Trained data shape: {trained_data.shape}")
print(f"Untrained data shape: {untrained_data.shape}")
if not trained_data.empty:
print("Trained data columns:", trained_data.columns.tolist())
if not untrained_data.empty:
print("Untrained data columns:", untrained_data.columns.tolist())
trained_data.to_csv(DATA_PROCESSED / 'trained_roi_data.csv', index=False)
untrained_data.to_csv(DATA_PROCESSED / 'untrained_roi_data.csv', index=False)
print("Data saved to trained_roi_data.csv and untrained_roi_data.csv")

View file

@ -0,0 +1,97 @@
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
from config import DATA_PROCESSED, FIGURES
# Load data
trained_distances = pd.read_csv(DATA_PROCESSED / 'trained_distances.csv')
untrained_distances = pd.read_csv(DATA_PROCESSED / 'untrained_distances.csv')
# Add group labels
trained_distances['group'] = 'trained'
untrained_distances['group'] = 'untrained'
# Combine data
combined_data = pd.concat([trained_distances, untrained_distances], ignore_index=True)
combined_data = combined_data.dropna(subset=['group'])
# Prepare features and target
features = ['distance', 'n_flies', 'area_fly1', 'area_fly2']
X = combined_data[features]
y = combined_data['group']
# Handle missing values in features
# NOTE(review): the imputer is fit on ALL rows before the train/test split,
# so test-set statistics leak into training — fit on X_train only.
imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=features)
# Split data
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)
# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("=== MACHINE LEARNING CLASSIFICATION ===")
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")
# 1. Logistic Regression (scaled features)
print("\n1. Logistic Regression:")
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train_scaled, y_train)
lr_predictions = lr_model.predict(X_test_scaled)
lr_accuracy = accuracy_score(y_test, lr_predictions)
print(f"Accuracy: {lr_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, lr_predictions))
# 2. Random Forest (unscaled features; tree splits are scale-invariant)
print("\n2. Random Forest:")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print(f"Accuracy: {rf_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, rf_predictions))
# Feature importance
print("\nFeature Importance (Random Forest):")
feature_importance = pd.DataFrame({
    'feature': features,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)
print(feature_importance)
# Confusion matrix for the best model (by test accuracy; LR wins ties)
best_model_name = "Random Forest" if rf_accuracy > lr_accuracy else "Logistic Regression"
best_predictions = rf_predictions if rf_accuracy > lr_accuracy else lr_predictions
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, best_predictions)
# NOTE(review): confusion_matrix orders classes alphabetically
# ('trained', 'untrained'), which should match these tick labels — verify.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Trained', 'Untrained'],
            yticklabels=['Trained', 'Untrained'])
plt.title(f'Confusion Matrix - {best_model_name}')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.savefig(FIGURES / 'confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()
# Cross-validation scores (5-fold, on the training split only)
print("\n=== CROSS-VALIDATION SCORES ===")
lr_cv_scores = cross_val_score(LogisticRegression(random_state=42), X_train_scaled, y_train, cv=5)
rf_cv_scores = cross_val_score(RandomForestClassifier(n_estimators=100, random_state=42), X_train, y_train, cv=5)
print(f"Logistic Regression CV Score: {lr_cv_scores.mean():.4f} (+/- {lr_cv_scores.std() * 2:.4f})")
print(f"Random Forest CV Score: {rf_cv_scores.mean():.4f} (+/- {rf_cv_scores.std() * 2:.4f})")

View file

@ -0,0 +1,101 @@
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from config import DATA_PROCESSED, DATA_METADATA, FIGURES
# Load data
trained_distances = pd.read_csv(DATA_PROCESSED / 'trained_distances.csv')
untrained_distances = pd.read_csv(DATA_PROCESSED / 'untrained_distances.csv')
barrier_data = pd.read_csv(DATA_METADATA / '2025_07_15_barrier_opening.csv')
# Convert opening_time to milliseconds (the 't' column it is compared
# against below is in ms) and build a machine -> opening-time mapping.
barrier_data['opening_time_ms'] = barrier_data['opening_time'] * 1000
opening_times = dict(zip(barrier_data['machine'], barrier_data['opening_time_ms']))
def align_to_opening_time(df, opening_times, max_time=300000, window_ms=150000):
    """Align distance data to barrier opening time.

    Args:
        df (pd.DataFrame): Distance data with 'machine_name' and 't' (ms).
        opening_times (dict): Machine name -> opening time in ms.
        max_time (int): Maximum raw time in ms to include.
        window_ms (int): Half-width, in ms, of the window kept around the
            opening (default 150000, i.e. +/-150 s — the previously
            hard-coded value, so existing callers are unaffected).

    Returns:
        pd.DataFrame: Rows within +/-window_ms of the opening, with an
        'aligned_time' column (t minus the machine's opening time).
        Machines without an opening-time entry are dropped.
    """
    df_aligned = df.copy()
    df_aligned['aligned_time'] = np.nan
    for machine in df['machine_name'].unique():
        if machine in opening_times:
            opening_time = opening_times[machine]
            # Restrict to the recorded window before shifting the clock;
            # rows past max_time keep NaN and are dropped below.
            mask = (df['machine_name'] == machine) & (df['t'] <= max_time)
            df_aligned.loc[mask, 'aligned_time'] = df.loc[mask, 't'] - opening_time
    df_aligned = df_aligned.dropna(subset=['aligned_time'])
    df_aligned = df_aligned[(df_aligned['aligned_time'] >= -window_ms) &
                            (df_aligned['aligned_time'] <= window_ms)]
    return df_aligned
# Align the data
trained_aligned = align_to_opening_time(trained_distances, opening_times)
untrained_aligned = align_to_opening_time(untrained_distances, opening_times)
# Calculate average distance over aligned time (mean of all rows sharing
# the same aligned timestamp, pooled across machines and ROIs)
trained_avg = trained_aligned.groupby('aligned_time')['distance'].mean()
untrained_avg = untrained_aligned.groupby('aligned_time')['distance'].mean()
# Apply smoothing: centered rolling mean over 50 consecutive timestamps
window_size = 50
trained_smooth = trained_avg.rolling(window=window_size, center=True).mean()
untrained_smooth = untrained_avg.rolling(window=window_size, center=True).mean()
# Create the plot (x-axis converted from ms to seconds)
plt.figure(figsize=(12, 6))
plt.plot(trained_smooth.index/1000, trained_smooth.values,
         label='Trained (smoothed)', color='blue', linewidth=2)
plt.plot(untrained_smooth.index/1000, untrained_smooth.values,
         label='Untrained (smoothed)', color='red', linewidth=2)
plt.axvline(x=0, color='black', linestyle='--', alpha=0.7, label='Barrier Opening')
plt.xlabel('Time (seconds relative to barrier opening)')
plt.ylabel('Average Distance')
plt.title('Average Distance Between Flies Aligned to Barrier Opening Time')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xlim(-150, 150)
plt.tight_layout()
plt.savefig(FIGURES / 'avg_distance_aligned_to_opening.png', dpi=300, bbox_inches='tight')
plt.show()
# Print statistics
print("Trained flies (aligned to barrier opening):")
print(f" Data points: {len(trained_aligned)}")
print(f" Mean distance: {trained_aligned['distance'].mean():.2f}")
print(f" Std distance: {trained_aligned['distance'].std():.2f}")
print("\nUntrained flies (aligned to barrier opening):")
print(f" Data points: {len(untrained_aligned)}")
print(f" Mean distance: {untrained_aligned['distance'].mean():.2f}")
print(f" Std distance: {untrained_aligned['distance'].std():.2f}")
# Pre/post analysis (rows exactly at aligned_time == 0 fall in neither bin)
trained_pre = trained_aligned[trained_aligned['aligned_time'] < 0]
trained_post = trained_aligned[trained_aligned['aligned_time'] > 0]
untrained_pre = untrained_aligned[untrained_aligned['aligned_time'] < 0]
untrained_post = untrained_aligned[untrained_aligned['aligned_time'] > 0]
print("\nPre-opening period (t < 0):")
print(f" Trained mean distance: {trained_pre['distance'].mean():.2f}")
print(f" Untrained mean distance: {untrained_pre['distance'].mean():.2f}")
print("\nPost-opening period (t > 0):")
print(f" Trained mean distance: {trained_post['distance'].mean():.2f}")
print(f" Untrained mean distance: {untrained_post['distance'].mean():.2f}")

View file

@ -0,0 +1,51 @@
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from config import DATA_PROCESSED, FIGURES
# Load data
trained_distances = pd.read_csv(DATA_PROCESSED / 'trained_distances.csv')
untrained_distances = pd.read_csv(DATA_PROCESSED / 'untrained_distances.csv')
# Remove NaN distances and filter for first 200 seconds
# ('t' is compared against 200000, so timestamps are treated as ms)
trained_clean = trained_distances.dropna(subset=['distance'])
untrained_clean = untrained_distances.dropna(subset=['distance'])
trained_filtered = trained_clean[trained_clean['t'] <= 200000]
untrained_filtered = untrained_clean[untrained_clean['t'] <= 200000]
# Calculate average distance over time (mean of all rows per timestamp)
trained_avg = trained_filtered.groupby('t')['distance'].mean()
untrained_avg = untrained_filtered.groupby('t')['distance'].mean()
# Apply smoothing: centered rolling mean over 50 consecutive timestamps
window_size = 50
trained_smooth = trained_avg.rolling(window=window_size, center=True).mean()
untrained_smooth = untrained_avg.rolling(window=window_size, center=True).mean()
# Create the plot (x-axis converted from ms to seconds)
plt.figure(figsize=(12, 6))
plt.plot(trained_smooth.index/1000, trained_smooth.values,
         label='Trained (smoothed)', color='blue', linewidth=2)
plt.plot(untrained_smooth.index/1000, untrained_smooth.values,
         label='Untrained (smoothed)', color='red', linewidth=2)
plt.xlabel('Time (seconds)')
plt.ylabel('Average Distance')
plt.title('Average Distance Between Flies Over Time (First 200 Seconds)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(FIGURES / 'avg_distance_over_time_first_200s.png', dpi=300, bbox_inches='tight')
plt.show()
print("Trained flies (first 200 seconds):")
print(f" Mean distance: {trained_filtered['distance'].mean():.2f}")
print(f" Std distance: {trained_filtered['distance'].std():.2f}")
print("\nUntrained flies (first 200 seconds):")
print(f" Mean distance: {untrained_filtered['distance'].mean():.2f}")
print(f" Std distance: {untrained_filtered['distance'].std():.2f}")

View file

@ -0,0 +1,43 @@
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from config import DATA_PROCESSED, FIGURES
# Load data
trained_distances = pd.read_csv(DATA_PROCESSED / 'trained_distances.csv')
untrained_distances = pd.read_csv(DATA_PROCESSED / 'untrained_distances.csv')
# Remove NaN distances
trained_clean = trained_distances.dropna(subset=['distance'])
untrained_clean = untrained_distances.dropna(subset=['distance'])
# Calculate average distance over time (mean of all rows per timestamp,
# pooled across machines and ROIs)
trained_avg = trained_clean.groupby('t')['distance'].mean()
untrained_avg = untrained_clean.groupby('t')['distance'].mean()
# Create the plot (unsmoothed, full time range)
plt.figure(figsize=(12, 6))
plt.plot(trained_avg.index, trained_avg.values,
         label='Trained (avg)', color='blue', linewidth=1)
plt.plot(untrained_avg.index, untrained_avg.values,
         label='Untrained (avg)', color='red', linewidth=1)
plt.xlabel('Time')
plt.ylabel('Average Distance')
plt.title('Average Distance Between Flies Over Time by Group')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(FIGURES / 'avg_distance_over_time.png', dpi=300, bbox_inches='tight')
plt.show()
print("Trained flies:")
print(f" Mean distance: {trained_clean['distance'].mean():.2f}")
print(f" Std distance: {trained_clean['distance'].std():.2f}")
print("\nUntrained flies:")
print(f" Mean distance: {untrained_clean['distance'].mean():.2f}")
print(f" Std distance: {untrained_clean['distance'].std():.2f}")

View file

@ -0,0 +1,50 @@
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from config import DATA_PROCESSED, FIGURES
# Load data
trained_distances = pd.read_csv(DATA_PROCESSED / 'trained_distances.csv')
untrained_distances = pd.read_csv(DATA_PROCESSED / 'untrained_distances.csv')
# Remove NaN distances
trained_clean = trained_distances.dropna(subset=['distance'])
untrained_clean = untrained_distances.dropna(subset=['distance'])
# Create the plot
plt.figure(figsize=(12, 6))
# Sample 1000 points from each group to avoid overcrowding
# (fixed random_state keeps the figure reproducible)
if len(trained_clean) > 1000:
    trained_sample = trained_clean.sample(1000, random_state=42)
else:
    trained_sample = trained_clean
if len(untrained_clean) > 1000:
    untrained_sample = untrained_clean.sample(1000, random_state=42)
else:
    untrained_sample = untrained_clean
plt.scatter(trained_sample['t'], trained_sample['distance'],
            alpha=0.5, s=1, label='Trained', color='blue')
plt.scatter(untrained_sample['t'], untrained_sample['distance'],
            alpha=0.5, s=1, label='Untrained', color='red')
plt.xlabel('Time')
plt.ylabel('Distance')
plt.title('Distance Between Flies Over Time')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(FIGURES / 'distance_over_time.png', dpi=300, bbox_inches='tight')
plt.show()
# Statistics below use the full cleaned data, not the plotted sample.
print("Trained flies:")
print(f" Mean distance: {trained_clean['distance'].mean():.2f}")
print(f" Std distance: {trained_clean['distance'].std():.2f}")
print("\nUntrained flies:")
print(f" Mean distance: {untrained_clean['distance'].mean():.2f}")
print(f" Std distance: {untrained_clean['distance'].std():.2f}")

View file

@ -0,0 +1,90 @@
import pandas as pd
import numpy as np
from scipy import stats
from config import DATA_PROCESSED, DATA_METADATA
# Load data
trained_distances = pd.read_csv(DATA_PROCESSED / 'trained_distances.csv')
untrained_distances = pd.read_csv(DATA_PROCESSED / 'untrained_distances.csv')
barrier_data = pd.read_csv(DATA_METADATA / '2025_07_15_barrier_opening.csv')
# Convert opening_time to milliseconds (matching the 't' column it is
# subtracted from below) and create a machine -> opening-time mapping
barrier_data['opening_time_ms'] = barrier_data['opening_time'] * 1000
opening_times = dict(zip(barrier_data['machine'], barrier_data['opening_time_ms']))
def align_to_opening_time(df, opening_times):
    """Shift each machine's timestamps so the barrier opening is t=0.

    Args:
        df (pd.DataFrame): Distance data with machine_name and t columns.
        opening_times (dict): Mapping of machine ID to opening time in ms.

    Returns:
        pd.DataFrame: Copy of df with an 'aligned_time' column
        (t minus the machine's opening time); rows for machines without
        an opening-time entry are dropped.
    """
    aligned = df.copy()
    # Series.map yields NaN for machines absent from opening_times, so
    # their aligned_time is NaN and the dropna below removes those rows.
    offsets = aligned['machine_name'].map(opening_times)
    aligned['aligned_time'] = aligned['t'] - offsets
    return aligned.dropna(subset=['aligned_time'])
# Align the data
trained_aligned = align_to_opening_time(trained_distances, opening_times)
untrained_aligned = align_to_opening_time(untrained_distances, opening_times)
# Remove NaN distances
trained_clean = trained_aligned.dropna(subset=['distance'])
untrained_clean = untrained_aligned.dropna(subset=['distance'])
# Split into pre- and post-opening periods (aligned_time == 0 excluded)
trained_pre = trained_clean[trained_clean['aligned_time'] < 0]['distance']
trained_post = trained_clean[trained_clean['aligned_time'] > 0]['distance']
untrained_pre = untrained_clean[untrained_clean['aligned_time'] < 0]['distance']
untrained_post = untrained_clean[untrained_clean['aligned_time'] > 0]['distance']
print("=== STATISTICAL TESTS ===")
# NOTE(review): rows are repeated observations over time of the same fly
# pairs, so the independent-samples t-tests below likely overstate the
# effective N — confirm whether per-ROI aggregates or a mixed model are
# more appropriate.
# Pre-opening period comparison (Cohen's d via pooled variance)
t_stat_pre, p_val_pre = stats.ttest_ind(trained_pre, untrained_pre)
cohens_d_pre = (trained_pre.mean() - untrained_pre.mean()) / np.sqrt(((len(trained_pre)-1)*trained_pre.var() + (len(untrained_pre)-1)*untrained_pre.var()) / (len(trained_pre) + len(untrained_pre) - 2))
print(f"Pre-opening period:")
print(f" Trained mean: {trained_pre.mean():.2f}, Untrained mean: {untrained_pre.mean():.2f}")
print(f" T-statistic: {t_stat_pre:.4f}, P-value: {p_val_pre:.2e}")
print(f" Cohen's d: {cohens_d_pre:.4f}")
# Post-opening period comparison
t_stat_post, p_val_post = stats.ttest_ind(trained_post, untrained_post)
cohens_d_post = (trained_post.mean() - untrained_post.mean()) / np.sqrt(((len(trained_post)-1)*trained_post.var() + (len(untrained_post)-1)*untrained_post.var()) / (len(trained_post) + len(untrained_post) - 2))
print(f"\nPost-opening period:")
print(f" Trained mean: {trained_post.mean():.2f}, Untrained mean: {untrained_post.mean():.2f}")
print(f" T-statistic: {t_stat_post:.4f}, P-value: {p_val_post:.2e}")
print(f" Cohen's d: {cohens_d_post:.4f}")
# Within-group comparisons (pre vs post)
t_stat_trained, p_val_trained = stats.ttest_ind(trained_pre, trained_post)
cohens_d_trained = (trained_post.mean() - trained_pre.mean()) / np.sqrt(((len(trained_post)-1)*trained_post.var() + (len(trained_pre)-1)*trained_pre.var()) / (len(trained_post) + len(trained_pre) - 2))
t_stat_untrained, p_val_untrained = stats.ttest_ind(untrained_pre, untrained_post)
cohens_d_untrained = (untrained_post.mean() - untrained_pre.mean()) / np.sqrt(((len(untrained_post)-1)*untrained_post.var() + (len(untrained_pre)-1)*untrained_pre.var()) / (len(untrained_post) + len(untrained_pre) - 2))
print(f"\nWithin-group changes:")
print(f" Trained flies - Pre vs Post:")
print(f" Mean change: {trained_post.mean() - trained_pre.mean():.2f}")
print(f" T-statistic: {t_stat_trained:.4f}, P-value: {p_val_trained:.2e}")
print(f" Cohen's d: {cohens_d_trained:.4f}")
print(f" Untrained flies - Pre vs Post:")
print(f" Mean change: {untrained_post.mean() - untrained_pre.mean():.2f}")
print(f" T-statistic: {t_stat_untrained:.4f}, P-value: {p_val_untrained:.2e}")
print(f" Cohen's d: {cohens_d_untrained:.4f}")