Initial commit: organized project structure for student handoff
Reorganized flat 41-file directory into structured layout with: - scripts/ for Python analysis code with shared config.py - notebooks/ for Jupyter analysis notebooks - data/ split into raw/, metadata/, processed/ - docs/ with analysis summary, experimental design, and bimodal hypothesis tutorial - tasks/ with todo checklist and lessons learned - Comprehensive README, PLANNING.md, and .gitignore Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
commit
e7e4db264d
27 changed files with 3105 additions and 0 deletions
0
scripts/__init__.py
Normal file
0
scripts/__init__.py
Normal file
240
scripts/analyze_distances.py
Normal file
240
scripts/analyze_distances.py
Normal file
|
|
@ -0,0 +1,240 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from scipy import stats
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.metrics import silhouette_score
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
from config import DATA_PROCESSED, FIGURES
|
||||
|
||||
|
||||
def load_and_combine_data():
    """Load the trained and untrained distance CSVs and merge them.

    Returns:
        pd.DataFrame: Combined distance data with a 'group' label column,
        restricted to rows where 'distance' is present.
    """
    trained = pd.read_csv(DATA_PROCESSED / 'trained_distances.csv')
    untrained = pd.read_csv(DATA_PROCESSED / 'untrained_distances.csv')

    # Tag each frame so the groups stay distinguishable after merging.
    trained['group'] = 'trained'
    untrained['group'] = 'untrained'

    combined = pd.concat([trained, untrained], ignore_index=True)
    combined = combined.dropna(subset=['distance'])

    print(f"Combined data shape: {combined.shape}")
    n_trained = (combined['group'] == 'trained').sum()
    n_untrained = (combined['group'] == 'untrained').sum()
    print(f"Trained samples: {n_trained}")
    print(f"Untrained samples: {n_untrained}")

    return combined
|
||||
|
||||
|
||||
def basic_statistics(combined_data):
    """Report per-group summary stats, a two-sample t-test, and Cohen's d.

    Args:
        combined_data (pd.DataFrame): Combined distance data with 'group'
            and 'distance' columns.
    """
    print("\n=== BASIC STATISTICS ===")

    for group in ['trained', 'untrained']:
        values = combined_data.loc[combined_data['group'] == group, 'distance']
        print(f"\n{group.capitalize()} flies:")
        print(f" Count: {len(values)}")
        print(f" Mean distance: {values.mean():.2f}")
        print(f" Std distance: {values.std():.2f}")
        print(f" Median distance: {values.median():.2f}")
        print(f" Min distance: {values.min():.2f}")
        print(f" Max distance: {values.max():.2f}")

    trained = combined_data.loc[combined_data['group'] == 'trained', 'distance']
    untrained = combined_data.loc[combined_data['group'] == 'untrained', 'distance']

    # Equal-variance t-test, matching the pooled-SD effect size below.
    t_stat, p_value = stats.ttest_ind(trained, untrained)
    print(f"\nT-test between groups:")
    print(f" T-statistic: {t_stat:.4f}")
    print(f" P-value: {p_value:.2e}")

    # Cohen's d using the classic pooled standard deviation.
    n_t, n_u = len(trained), len(untrained)
    pooled_var = (((n_t - 1) * trained.std() ** 2 +
                   (n_u - 1) * untrained.std() ** 2) /
                  (n_t + n_u - 2))
    cohens_d = (trained.mean() - untrained.mean()) / np.sqrt(pooled_var)
    print(f" Cohen's d (effect size): {cohens_d:.4f}")
|
||||
|
||||
|
||||
def distance_distribution_analysis(combined_data):
    """Plot histogram, box, CDF, and violin views of the distance data.

    Args:
        combined_data (pd.DataFrame): Combined distance data with 'group'
            and 'distance' columns.

    Side effects:
        Saves 'distance_analysis.png' under FIGURES and shows the figure.
    """
    print("\n=== DISTANCE DISTRIBUTION ANALYSIS ===")

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Distance Distribution Analysis', fontsize=16)

    trained = combined_data.loc[combined_data['group'] == 'trained', 'distance']
    untrained = combined_data.loc[combined_data['group'] == 'untrained', 'distance']

    # Top-left: overlaid density histograms.
    hist_ax = axes[0, 0]
    hist_ax.hist(trained, alpha=0.7, label='Trained', bins=50, density=True)
    hist_ax.hist(untrained, alpha=0.7, label='Untrained', bins=50, density=True)
    hist_ax.set_xlabel('Distance')
    hist_ax.set_ylabel('Density')
    hist_ax.set_title('Distance Distribution by Group')
    hist_ax.legend()

    # Top-right: box plot per group.
    box_ax = axes[0, 1]
    combined_data.boxplot(column='distance', by='group', ax=box_ax)
    box_ax.set_title('Distance Box Plot by Group')
    box_ax.set_xlabel('Group')
    box_ax.set_ylabel('Distance')

    # Bottom-left: empirical cumulative distributions.
    cdf_ax = axes[1, 0]
    for series, label in ((trained, 'Trained'), (untrained, 'Untrained')):
        ordered = np.sort(series)
        cumulative = np.arange(1, len(ordered) + 1) / len(ordered)
        cdf_ax.plot(ordered, cumulative, label=label, alpha=0.7)
    cdf_ax.set_xlabel('Distance')
    cdf_ax.set_ylabel('Cumulative Probability')
    cdf_ax.set_title('Cumulative Distribution of Distances')
    cdf_ax.legend()

    # Bottom-right: violin plot.
    violin_ax = axes[1, 1]
    sns.violinplot(data=combined_data, x='group', y='distance', ax=violin_ax)
    violin_ax.set_title('Distance Violin Plot by Group')
    violin_ax.set_xlabel('Group')
    violin_ax.set_ylabel('Distance')

    plt.tight_layout()
    plt.savefig(FIGURES / 'distance_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("Distance distribution plots saved")
|
||||
|
||||
|
||||
def clustering_analysis(combined_data):
    """Cluster the distance features with k-means and compare to group labels.

    Args:
        combined_data (pd.DataFrame): Combined distance data.

    Returns:
        tuple: (clustered_data, kmeans_model, scaler).

    Side effects:
        Saves 'clustering_analysis.png' under FIGURES and shows diagnostics.
    """
    print("\n=== CLUSTERING ANALYSIS ===")

    feature_cols = ['distance', 'n_flies', 'area_fly1', 'area_fly2']
    X = combined_data[feature_cols].dropna()

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Sweep k to produce elbow and silhouette diagnostics.
    k_values = range(2, 6)
    inertias = []
    silhouettes = []
    for k in k_values:
        model = KMeans(n_clusters=k, random_state=42, n_init=10)
        model.fit(X_scaled)
        inertias.append(model.inertia_)
        silhouettes.append(silhouette_score(X_scaled, model.labels_))

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
    ax1.plot(k_values, inertias, 'bo-')
    ax1.set_xlabel('Number of Clusters (k)')
    ax1.set_ylabel('Inertia')
    ax1.set_title('Elbow Method for Optimal k')

    ax2.plot(k_values, silhouettes, 'ro-')
    ax2.set_xlabel('Number of Clusters (k)')
    ax2.set_ylabel('Silhouette Score')
    ax2.set_title('Silhouette Score for Different k')

    plt.tight_layout()
    plt.savefig(FIGURES / 'clustering_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Final fit at k=2 — fixed to match the two experimental groups,
    # independent of the diagnostic sweep above.
    optimal_k = 2
    kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X_scaled)

    X_clustered = X.copy()
    X_clustered['cluster'] = labels
    X_clustered['actual_group'] = combined_data.loc[X_clustered.index, 'group'].values

    confusion = pd.crosstab(X_clustered['cluster'], X_clustered['actual_group'])
    print(f"Clustering results (k={optimal_k}):")
    print(confusion)

    def cell_count(cluster, group):
        # Rows falling in a given (cluster, actual_group) cell.
        return len(X_clustered[(X_clustered['cluster'] == cluster) &
                               (X_clustered['actual_group'] == group)])

    # Accuracy under the better of the two possible cluster->group labelings.
    c0t, c0u = cell_count(0, 'trained'), cell_count(0, 'untrained')
    c1t, c1u = cell_count(1, 'trained'), cell_count(1, 'untrained')
    total = len(X_clustered)
    accuracy = max((c0t + c1u) / total, (c0u + c1t) / total)
    print(f"\nClustering accuracy: {accuracy:.4f}")

    print("\nCluster characteristics:")
    for i in range(optimal_k):
        members = X_clustered[X_clustered['cluster'] == i]
        print(f"\nCluster {i}:")
        print(f" Size: {len(members)}")
        print(f" Distance - Mean: {members['distance'].mean():.2f}, Std: {members['distance'].std():.2f}")
        print(f" N_flies - Mean: {members['n_flies'].mean():.2f}")
        print(f" Area_fly1 - Mean: {members['area_fly1'].mean():.2f}")

    return X_clustered, kmeans, scaler
|
||||
|
||||
|
||||
def simple_classification_rule(combined_data):
    """Evaluate threshold classifiers at the 25th/50th/75th distance percentiles.

    Args:
        combined_data (pd.DataFrame): Combined distance data with 'group'
            and 'distance' columns.
    """
    print("\n=== SIMPLE RULE-BASED CLASSIFICATION ===")

    clean = combined_data.dropna(subset=['distance'])
    thresholds = np.percentile(clean['distance'], [25, 50, 75])
    print(f"Distance percentiles: 25%={thresholds[0]:.2f}, 50%={thresholds[1]:.2f}, 75%={thresholds[2]:.2f}")

    for threshold in thresholds:
        # Rule: predict 'trained' whenever the distance exceeds the threshold.
        predicted = np.where(clean['distance'] > threshold, 'trained', 'untrained')
        actual = clean['group'].to_numpy()

        accuracy = np.mean(predicted == actual)
        tp = int(np.sum((predicted == 'trained') & (actual == 'trained')))
        tn = int(np.sum((predicted == 'untrained') & (actual == 'untrained')))
        fp = int(np.sum((predicted == 'trained') & (actual == 'untrained')))
        fn = int(np.sum((predicted == 'untrained') & (actual == 'trained')))

        # Guard against empty classes so we never divide by zero.
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

        print(f"\nThreshold = {threshold:.2f}:")
        print(f" Accuracy: {accuracy:.4f}")
        print(f" Sensitivity: {sensitivity:.4f}, Specificity: {specificity:.4f}")
|
||||
|
||||
|
||||
def main():
    """Run the full distance analysis pipeline and save clustered output."""
    data = load_and_combine_data()
    basic_statistics(data)
    distance_distribution_analysis(data)
    clustered, _model, _scaler = clustering_analysis(data)
    simple_classification_rule(data)

    # Persist the clustered feature table for downstream use.
    clustered.to_csv(DATA_PROCESSED / 'clustered_distance_data.csv', index=False)
    print("\n=== ANALYSIS COMPLETE ===")


if __name__ == "__main__":
    main()
|
||||
118
scripts/calculate_distances.py
Normal file
118
scripts/calculate_distances.py
Normal file
|
|
@ -0,0 +1,118 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
from scipy.spatial.distance import euclidean
|
||||
|
||||
from config import DATA_PROCESSED
|
||||
|
||||
|
||||
def calculate_fly_distances(trained_file=None, untrained_file=None):
    """Compute pairwise fly distances for the trained and untrained datasets.

    For each time point:
    - Two flies detected: Cartesian distance between them.
    - One fly detected: distance 0 if the blob's area exceeds the global
      average area (presumably two touching flies — confirm with tracking
      output), otherwise NaN.

    Args:
        trained_file (Path): Path to trained ROI data CSV.
        untrained_file (Path): Path to untrained ROI data CSV.

    Returns:
        tuple: (trained_distances, untrained_distances) DataFrames.
    """
    if trained_file is None:
        trained_file = DATA_PROCESSED / 'trained_roi_data.csv'
    if untrained_file is None:
        untrained_file = DATA_PROCESSED / 'untrained_roi_data.csv'

    trained_df = pd.read_csv(trained_file)
    untrained_df = pd.read_csv(untrained_file)

    # Bounding-box area (w * h) per detection.
    for frame in (trained_df, untrained_df):
        frame['area'] = frame['w'] * frame['h']

    # Mean of the two per-group means (not a pooled mean over all rows).
    avg_area = np.mean([trained_df['area'].mean(), untrained_df['area'].mean()])
    print(f"Average area across all data: {avg_area:.2f}")

    return (process_distance_data(trained_df, avg_area),
            process_distance_data(untrained_df, avg_area))
|
||||
|
||||
|
||||
def process_distance_data(df, avg_area):
    """Compute the between-fly distance at every (machine, ROI, time) point.

    Args:
        df (pd.DataFrame): Tracking data with 'machine_name', 'ROI', 't',
            'id', 'x', 'y', and 'area' columns.
        avg_area (float): Area threshold used when only one blob is seen;
            a single blob larger than this is scored as distance 0,
            otherwise the distance is unknown (NaN).

    Returns:
        pd.DataFrame: One row per time point with distance and area columns.
    """
    rows = []

    for (machine_name, roi), roi_group in df.groupby(['machine_name', 'ROI']):
        for t, frame in roi_group.groupby('t'):
            # Sort by detection id so fly1/fly2 assignment is deterministic.
            frame = frame.sort_values('id').reset_index(drop=True)
            n_detected = len(frame)

            if n_detected >= 2:
                # Two or more detections: distance between the first two ids.
                first, second = frame.iloc[0], frame.iloc[1]
                rows.append({
                    'machine_name': machine_name,
                    'ROI': roi,
                    't': t,
                    'distance': euclidean([first['x'], first['y']],
                                          [second['x'], second['y']]),
                    'n_flies': n_detected,
                    'area_fly1': first['area'],
                    'area_fly2': second['area'],
                })
            elif n_detected == 1:
                only = frame.iloc[0]
                rows.append({
                    'machine_name': machine_name,
                    'ROI': roi,
                    't': t,
                    # Large single blob -> flies touching (distance 0);
                    # small single blob -> one fly missing (unknown).
                    'distance': 0.0 if only['area'] > avg_area else np.nan,
                    'n_flies': 1,
                    'area_fly1': only['area'],
                    'area_fly2': np.nan,
                })

    return pd.DataFrame(rows)
|
||||
|
||||
|
||||
def main():
    """Run distance calculations, print summaries, and save the results."""
    trained, untrained = calculate_fly_distances()

    def summarize(frame, header):
        # Compact per-frame distance summary for quick sanity checks.
        print(header)
        print(f" Shape: {frame.shape}")
        print(f" Distance stats:")
        print(f" Count: {frame['distance'].count()}")
        print(f" Mean: {frame['distance'].mean():.2f}")
        print(f" Std: {frame['distance'].std():.2f}")

    summarize(trained, "Trained data distance summary:")
    summarize(untrained, "\nUntrained data distance summary:")

    trained.to_csv(DATA_PROCESSED / 'trained_distances.csv', index=False)
    untrained.to_csv(DATA_PROCESSED / 'untrained_distances.csv', index=False)
    print("\nDistance data saved")


if __name__ == "__main__":
    main()
|
||||
9
scripts/config.py
Normal file
9
scripts/config.py
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
"""Shared path constants for the Cupido tracking project."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
||||
DATA_RAW = PROJECT_ROOT / "data" / "raw"
|
||||
DATA_METADATA = PROJECT_ROOT / "data" / "metadata"
|
||||
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"
|
||||
FIGURES = PROJECT_ROOT / "figures"
|
||||
90
scripts/load_roi_data.py
Normal file
90
scripts/load_roi_data.py
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
import pandas as pd
|
||||
import sqlite3
|
||||
import re
|
||||
|
||||
from config import DATA_RAW, DATA_METADATA, DATA_PROCESSED
|
||||
|
||||
|
||||
def _load_rois_for_machine(conn, roi_rows, machine_id, group_label, db_name, frames):
    """Read each ROI table for one machine and append labelled frames.

    Args:
        conn (sqlite3.Connection): Open connection to the tracking database.
        roi_rows (pd.DataFrame): Metadata rows (with 'ROI') for this
            machine and group.
        machine_id (str): Machine identifier stamped onto each frame.
        group_label (str): 'trained' or 'untrained'.
        db_name (str): Database file name, used only in error messages.
        frames (list): Output list that loaded DataFrames are appended to.
    """
    for _, row in roi_rows.iterrows():
        roi = row['ROI']
        try:
            # Table names cannot be bound as SQL parameters; ROI values come
            # from our own metadata CSV, not untrusted input.
            roi_data = pd.read_sql_query(f"SELECT * FROM ROI_{roi}", conn)
        except Exception as e:
            # Best-effort load: report the failing table and keep going.
            print(f"Error loading ROI_{roi} from {db_name}: {e}")
            continue
        roi_data['machine_name'] = machine_id
        roi_data['ROI'] = roi
        roi_data['group'] = group_label
        frames.append(roi_data)


def load_roi_data():
    """Load ROI data from SQLite databases and group by trained/untrained.

    Returns:
        tuple: (trained_df, untrained_df) DataFrames with tracking data;
        either may be empty if no matching tables were found.
    """
    metadata = pd.read_csv(DATA_METADATA / '2025_07_15_metadata_fixed.csv')
    metadata['machine_name'] = metadata['machine_name'].astype(str)

    trained_rois = metadata[metadata['group'] == 'trained']
    untrained_rois = metadata[metadata['group'] == 'untrained']

    db_files = list(DATA_RAW.glob('*_tracking.db'))

    # Accumulate frames in lists and concatenate once at the end;
    # pd.concat inside the loop is quadratic in the number of tables.
    trained_frames = []
    untrained_frames = []

    for db_file in db_files:
        print(f"Processing {db_file.name}")

        # A 32-hex-digit UUID embedded in the file name links the database
        # to its metadata row (matched against the metadata 'path' column).
        match = re.search(r'_([0-9a-f]{32})__', db_file.name)
        if not match:
            print(f"Could not extract UUID from {db_file.name}")
            continue

        uuid = match.group(1)
        metadata_matches = metadata[metadata['path'].str.contains(uuid, na=False)]
        if metadata_matches.empty:
            print(f"No metadata matches found for UUID {uuid} from {db_file.name}")
            continue

        machine_id = metadata_matches.iloc[0]['machine_name']
        print(f"Matched to machine ID: {machine_id}")

        conn = sqlite3.connect(str(db_file))
        try:
            _load_rois_for_machine(
                conn, trained_rois[trained_rois['machine_name'] == machine_id],
                machine_id, 'trained', db_file.name, trained_frames)
            _load_rois_for_machine(
                conn, untrained_rois[untrained_rois['machine_name'] == machine_id],
                machine_id, 'untrained', db_file.name, untrained_frames)
        finally:
            # Close the connection even if an unexpected error escapes
            # the per-ROI handler (the original leaked it in that case).
            conn.close()

    trained_df = (pd.concat(trained_frames, ignore_index=True)
                  if trained_frames else pd.DataFrame())
    untrained_df = (pd.concat(untrained_frames, ignore_index=True)
                    if untrained_frames else pd.DataFrame())
    return trained_df, untrained_df
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
trained_data, untrained_data = load_roi_data()
|
||||
print(f"Trained data shape: {trained_data.shape}")
|
||||
print(f"Untrained data shape: {untrained_data.shape}")
|
||||
if not trained_data.empty:
|
||||
print("Trained data columns:", trained_data.columns.tolist())
|
||||
if not untrained_data.empty:
|
||||
print("Untrained data columns:", untrained_data.columns.tolist())
|
||||
|
||||
trained_data.to_csv(DATA_PROCESSED / 'trained_roi_data.csv', index=False)
|
||||
untrained_data.to_csv(DATA_PROCESSED / 'untrained_roi_data.csv', index=False)
|
||||
print("Data saved to trained_roi_data.csv and untrained_roi_data.csv")
|
||||
97
scripts/ml_classification.py
Normal file
97
scripts/ml_classification.py
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

from config import DATA_PROCESSED, FIGURES

# Load the per-time-point distance tables produced by calculate_distances.py.
trained_distances = pd.read_csv(DATA_PROCESSED / 'trained_distances.csv')
untrained_distances = pd.read_csv(DATA_PROCESSED / 'untrained_distances.csv')

# Label each frame with its experimental group (this is the target class).
trained_distances['group'] = 'trained'
untrained_distances['group'] = 'untrained'

# Combine both groups into one dataset.
# NOTE(review): 'group' is assigned above and never NaN, so this dropna
# looks like a no-op — confirm whether it was meant to drop NaN distances.
combined_data = pd.concat([trained_distances, untrained_distances], ignore_index=True)
combined_data = combined_data.dropna(subset=['group'])

# Feature matrix and target vector.
features = ['distance', 'n_flies', 'area_fly1', 'area_fly2']
X = combined_data[features]
y = combined_data['group']

# Mean-impute missing feature values (e.g. NaN distance/area_fly2 rows).
imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=features)

# 80/20 train/test split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Standardize features (fit on train only; used by logistic regression below).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("=== MACHINE LEARNING CLASSIFICATION ===")
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

# 1. Logistic regression on the scaled features.
print("\n1. Logistic Regression:")
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train_scaled, y_train)
lr_predictions = lr_model.predict(X_test_scaled)
lr_accuracy = accuracy_score(y_test, lr_predictions)
print(f"Accuracy: {lr_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, lr_predictions))

# 2. Random forest on the raw (unscaled) features — trees don't need scaling.
print("\n2. Random Forest:")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print(f"Accuracy: {rf_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, rf_predictions))

# Rank features by the forest's impurity-based importances.
print("\nFeature Importance (Random Forest):")
feature_importance = pd.DataFrame({
    'feature': features,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)
print(feature_importance)

# Confusion matrix for whichever model scored higher on the test set.
best_model_name = "Random Forest" if rf_accuracy > lr_accuracy else "Logistic Regression"
best_predictions = rf_predictions if rf_accuracy > lr_accuracy else lr_predictions

plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, best_predictions)
# Labels follow sklearn's sorted class order: 'trained' < 'untrained'.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Trained', 'Untrained'],
            yticklabels=['Trained', 'Untrained'])
plt.title(f'Confusion Matrix - {best_model_name}')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.savefig(FIGURES / 'confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# 5-fold cross-validation on the training set for both models.
print("\n=== CROSS-VALIDATION SCORES ===")
lr_cv_scores = cross_val_score(LogisticRegression(random_state=42), X_train_scaled, y_train, cv=5)
rf_cv_scores = cross_val_score(RandomForestClassifier(n_estimators=100, random_state=42), X_train, y_train, cv=5)

print(f"Logistic Regression CV Score: {lr_cv_scores.mean():.4f} (+/- {lr_cv_scores.std() * 2:.4f})")
print(f"Random Forest CV Score: {rf_cv_scores.mean():.4f} (+/- {rf_cv_scores.std() * 2:.4f})")
|
||||
101
scripts/plot_avg_distance_aligned.py
Normal file
101
scripts/plot_avg_distance_aligned.py
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from config import DATA_PROCESSED, DATA_METADATA, FIGURES

# Load the distance tables and the per-machine barrier opening times.
trained_distances = pd.read_csv(DATA_PROCESSED / 'trained_distances.csv')
untrained_distances = pd.read_csv(DATA_PROCESSED / 'untrained_distances.csv')
barrier_data = pd.read_csv(DATA_METADATA / '2025_07_15_barrier_opening.csv')

# Convert opening_time to milliseconds (tracking 't' is in ms; opening_time
# is presumably recorded in seconds — confirm against the metadata CSV)
# and build a machine -> opening-time lookup.
barrier_data['opening_time_ms'] = barrier_data['opening_time'] * 1000
opening_times = dict(zip(barrier_data['machine'], barrier_data['opening_time_ms']))
|
||||
|
||||
|
||||
def align_to_opening_time(df, opening_times, max_time=300000):
    """Re-reference each machine's timestamps to its barrier opening time.

    Args:
        df (pd.DataFrame): Distance data with 'machine_name' and 't' columns.
        opening_times (dict): Machine name -> opening time (same units as 't').
        max_time (int): Maximum raw time (ms) to include.

    Returns:
        pd.DataFrame: Copy with an 'aligned_time' column, restricted to
        +/-150000 ms around the opening; rows from machines without a
        known opening time are dropped.
    """
    aligned = df.copy()
    aligned['aligned_time'] = np.nan

    # Shift each known machine's clock so that t=0 is the barrier opening.
    for machine, opening_time in opening_times.items():
        mask = (df['machine_name'] == machine) & (df['t'] <= max_time)
        aligned.loc[mask, 'aligned_time'] = df.loc[mask, 't'] - opening_time

    # Rows left NaN (unknown machine or t > max_time) are discarded,
    # then the window is clipped to +/-150 s (inclusive bounds).
    aligned = aligned.dropna(subset=['aligned_time'])
    in_window = aligned['aligned_time'].between(-150000, 150000)
    return aligned[in_window]
|
||||
|
||||
|
||||
# Align both groups to their machine's barrier opening time.
trained_aligned = align_to_opening_time(trained_distances, opening_times)
untrained_aligned = align_to_opening_time(untrained_distances, opening_times)

# Mean distance at each aligned timestamp, per group.
trained_avg = trained_aligned.groupby('aligned_time')['distance'].mean()
untrained_avg = untrained_aligned.groupby('aligned_time')['distance'].mean()

# Smooth with a 50-sample centered rolling mean to suppress frame noise.
window_size = 50
trained_smooth = trained_avg.rolling(window=window_size, center=True).mean()
untrained_smooth = untrained_avg.rolling(window=window_size, center=True).mean()

# Plot with the time axis converted from ms to seconds.
plt.figure(figsize=(12, 6))

plt.plot(trained_smooth.index/1000, trained_smooth.values,
         label='Trained (smoothed)', color='blue', linewidth=2)
plt.plot(untrained_smooth.index/1000, untrained_smooth.values,
         label='Untrained (smoothed)', color='red', linewidth=2)

# Mark the barrier opening (t=0 after alignment).
plt.axvline(x=0, color='black', linestyle='--', alpha=0.7, label='Barrier Opening')

plt.xlabel('Time (seconds relative to barrier opening)')
plt.ylabel('Average Distance')
plt.title('Average Distance Between Flies Aligned to Barrier Opening Time')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xlim(-150, 150)

plt.tight_layout()
plt.savefig(FIGURES / 'avg_distance_aligned_to_opening.png', dpi=300, bbox_inches='tight')
plt.show()

# Summary statistics for the aligned windows.
print("Trained flies (aligned to barrier opening):")
print(f" Data points: {len(trained_aligned)}")
print(f" Mean distance: {trained_aligned['distance'].mean():.2f}")
print(f" Std distance: {trained_aligned['distance'].std():.2f}")

print("\nUntrained flies (aligned to barrier opening):")
print(f" Data points: {len(untrained_aligned)}")
print(f" Mean distance: {untrained_aligned['distance'].mean():.2f}")
print(f" Std distance: {untrained_aligned['distance'].std():.2f}")

# Compare mean distances before vs after the opening.
# NOTE(review): rows with aligned_time == 0 fall in neither window.
trained_pre = trained_aligned[trained_aligned['aligned_time'] < 0]
trained_post = trained_aligned[trained_aligned['aligned_time'] > 0]
untrained_pre = untrained_aligned[untrained_aligned['aligned_time'] < 0]
untrained_post = untrained_aligned[untrained_aligned['aligned_time'] > 0]

print("\nPre-opening period (t < 0):")
print(f" Trained mean distance: {trained_pre['distance'].mean():.2f}")
print(f" Untrained mean distance: {untrained_pre['distance'].mean():.2f}")

print("\nPost-opening period (t > 0):")
print(f" Trained mean distance: {trained_post['distance'].mean():.2f}")
print(f" Untrained mean distance: {untrained_post['distance'].mean():.2f}")
||||
51
scripts/plot_avg_distance_first_200s.py
Normal file
51
scripts/plot_avg_distance_first_200s.py
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from config import DATA_PROCESSED, FIGURES

# Load the per-time-point distance tables for both groups.
trained_distances = pd.read_csv(DATA_PROCESSED / 'trained_distances.csv')
untrained_distances = pd.read_csv(DATA_PROCESSED / 'untrained_distances.csv')

# Drop rows without a computed distance.
trained_clean = trained_distances.dropna(subset=['distance'])
untrained_clean = untrained_distances.dropna(subset=['distance'])

# Keep only the first 200 seconds ('t' is in ms; axis below divides by 1000).
trained_filtered = trained_clean[trained_clean['t'] <= 200000]
untrained_filtered = untrained_clean[untrained_clean['t'] <= 200000]

# Mean distance at each timestamp, per group.
trained_avg = trained_filtered.groupby('t')['distance'].mean()
untrained_avg = untrained_filtered.groupby('t')['distance'].mean()

# Smooth with a 50-sample centered rolling mean.
window_size = 50
trained_smooth = trained_avg.rolling(window=window_size, center=True).mean()
untrained_smooth = untrained_avg.rolling(window=window_size, center=True).mean()

# Plot both smoothed series on a seconds axis.
plt.figure(figsize=(12, 6))

plt.plot(trained_smooth.index/1000, trained_smooth.values,
         label='Trained (smoothed)', color='blue', linewidth=2)
plt.plot(untrained_smooth.index/1000, untrained_smooth.values,
         label='Untrained (smoothed)', color='red', linewidth=2)

plt.xlabel('Time (seconds)')
plt.ylabel('Average Distance')
plt.title('Average Distance Between Flies Over Time (First 200 Seconds)')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(FIGURES / 'avg_distance_over_time_first_200s.png', dpi=300, bbox_inches='tight')
plt.show()

print("Trained flies (first 200 seconds):")
print(f" Mean distance: {trained_filtered['distance'].mean():.2f}")
print(f" Std distance: {trained_filtered['distance'].std():.2f}")

print("\nUntrained flies (first 200 seconds):")
print(f" Mean distance: {untrained_filtered['distance'].mean():.2f}")
print(f" Std distance: {untrained_filtered['distance'].std():.2f}")
|
||||
43
scripts/plot_avg_distance_over_time.py
Normal file
43
scripts/plot_avg_distance_over_time.py
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from config import DATA_PROCESSED, FIGURES

# Load the per-time-point distance tables for both groups.
trained_distances = pd.read_csv(DATA_PROCESSED / 'trained_distances.csv')
untrained_distances = pd.read_csv(DATA_PROCESSED / 'untrained_distances.csv')

# Keep only rows where a distance could be computed.
trained_clean = trained_distances.dropna(subset=['distance'])
untrained_clean = untrained_distances.dropna(subset=['distance'])

# Mean distance at each timestamp, per group.
trained_avg = trained_clean.groupby('t')['distance'].mean()
untrained_avg = untrained_clean.groupby('t')['distance'].mean()

# Plot both averaged series on one figure.
plt.figure(figsize=(12, 6))
for series, series_label, colour in ((trained_avg, 'Trained (avg)', 'blue'),
                                     (untrained_avg, 'Untrained (avg)', 'red')):
    plt.plot(series.index, series.values, label=series_label,
             color=colour, linewidth=1)

plt.xlabel('Time')
plt.ylabel('Average Distance')
plt.title('Average Distance Between Flies Over Time by Group')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(FIGURES / 'avg_distance_over_time.png', dpi=300, bbox_inches='tight')
plt.show()

# Per-group summary statistics on the cleaned data.
print("Trained flies:")
print(f" Mean distance: {trained_clean['distance'].mean():.2f}")
print(f" Std distance: {trained_clean['distance'].std():.2f}")

print("\nUntrained flies:")
print(f" Mean distance: {untrained_clean['distance'].mean():.2f}")
print(f" Std distance: {untrained_clean['distance'].std():.2f}")
|
||||
50
scripts/plot_distance_over_time.py
Normal file
50
scripts/plot_distance_over_time.py
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from config import DATA_PROCESSED, FIGURES

MAX_POINTS = 1000  # scatter at most this many points per group
SAMPLE_SEED = 42   # fixed seed so the downsampled plot is reproducible


def _thin(frame):
    """Return at most MAX_POINTS rows of *frame*, sampled reproducibly."""
    if len(frame) > MAX_POINTS:
        return frame.sample(MAX_POINTS, random_state=SAMPLE_SEED)
    return frame


# One spec per experimental group:
# (console header, input CSV name, legend label, marker colour).
GROUP_SPECS = [
    ("Trained flies:", 'trained_distances.csv', 'Trained', 'blue'),
    ("\nUntrained flies:", 'untrained_distances.csv', 'Untrained', 'red'),
]

plt.figure(figsize=(12, 6))

# Load each group's distance table, drop rows without a distance value,
# and scatter a thinned subset to keep the figure readable. Summary
# statistics below use the full (un-thinned) data.
summaries = []
for header, csv_name, legend_label, colour in GROUP_SPECS:
    frame = pd.read_csv(DATA_PROCESSED / csv_name).dropna(subset=['distance'])
    thinned = _thin(frame)
    plt.scatter(thinned['t'], thinned['distance'],
                alpha=0.5, s=1, label=legend_label, color=colour)
    summaries.append((header, frame['distance']))

plt.xlabel('Time')
plt.ylabel('Distance')
plt.title('Distance Between Flies Over Time')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(FIGURES / 'distance_over_time.png', dpi=300, bbox_inches='tight')
plt.show()

# Console summary of the overall distance distribution per group.
for header, distances in summaries:
    print(header)
    print(f" Mean distance: {distances.mean():.2f}")
    print(f" Std distance: {distances.std():.2f}")
|
||||
90
scripts/statistical_tests.py
Normal file
90
scripts/statistical_tests.py
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
import pandas as pd
import numpy as np
from scipy import stats

from config import DATA_PROCESSED, DATA_METADATA

# Load the per-frame fly distances and the barrier-opening metadata.
trained_distances = pd.read_csv(DATA_PROCESSED / 'trained_distances.csv')
untrained_distances = pd.read_csv(DATA_PROCESSED / 'untrained_distances.csv')
barrier_data = pd.read_csv(DATA_METADATA / '2025_07_15_barrier_opening.csv')

# Build a machine -> opening-time lookup in the same units as the 't'
# column. NOTE(review): the *1000 factor implies opening_time is in
# seconds and 't' in milliseconds — confirm against the metadata file.
barrier_data['opening_time_ms'] = barrier_data['opening_time'] * 1000
opening_times = dict(zip(barrier_data['machine'], barrier_data['opening_time_ms']))


def align_to_opening_time(df, opening_times):
    """Align distance data to barrier opening time.

    Rows from machines without a recorded opening time are dropped.

    Args:
        df (pd.DataFrame): Distance data with machine_name and t columns.
        opening_times (dict): Mapping of machine ID to opening time in ms.

    Returns:
        pd.DataFrame: Copy of *df* with an aligned_time column added
        (t minus the machine's opening time; negative = before opening).
    """
    df_aligned = df.copy()
    df_aligned['aligned_time'] = np.nan

    for machine in df['machine_name'].unique():
        if machine in opening_times:
            mask = df['machine_name'] == machine
            df_aligned.loc[mask, 'aligned_time'] = df.loc[mask, 't'] - opening_times[machine]

    # Machines with no recorded opening time keep NaN and are removed here.
    return df_aligned.dropna(subset=['aligned_time'])


def _cohens_d(a, b):
    """Cohen's d for two independent samples (pooled, ddof=1 variances).

    Positive when *a* has the larger mean. Shared by all four
    comparisons below so the pooled-SD formula exists in one place.
    """
    pooled_var = (((len(a) - 1) * a.var() + (len(b) - 1) * b.var())
                  / (len(a) + len(b) - 2))
    return (a.mean() - b.mean()) / np.sqrt(pooled_var)


# Align both groups to their barrier-opening time, then discard frames
# with no distance measurement.
trained_clean = align_to_opening_time(trained_distances, opening_times).dropna(subset=['distance'])
untrained_clean = align_to_opening_time(untrained_distances, opening_times).dropna(subset=['distance'])

# Split into pre- and post-opening periods. The opening frame itself
# (aligned_time == 0) is excluded from both, as in the original analysis.
trained_pre = trained_clean[trained_clean['aligned_time'] < 0]['distance']
trained_post = trained_clean[trained_clean['aligned_time'] > 0]['distance']
untrained_pre = untrained_clean[untrained_clean['aligned_time'] < 0]['distance']
untrained_post = untrained_clean[untrained_clean['aligned_time'] > 0]['distance']

print("=== STATISTICAL TESTS ===")

# Between-group comparison, pre-opening period.
# NOTE(review): ttest_ind defaults to equal variances; pass
# equal_var=False (Welch) if group variances differ materially.
t_stat_pre, p_val_pre = stats.ttest_ind(trained_pre, untrained_pre)
cohens_d_pre = _cohens_d(trained_pre, untrained_pre)

print(f"Pre-opening period:")
print(f" Trained mean: {trained_pre.mean():.2f}, Untrained mean: {untrained_pre.mean():.2f}")
print(f" T-statistic: {t_stat_pre:.4f}, P-value: {p_val_pre:.2e}")
print(f" Cohen's d: {cohens_d_pre:.4f}")

# Between-group comparison, post-opening period.
t_stat_post, p_val_post = stats.ttest_ind(trained_post, untrained_post)
cohens_d_post = _cohens_d(trained_post, untrained_post)

print(f"\nPost-opening period:")
print(f" Trained mean: {trained_post.mean():.2f}, Untrained mean: {untrained_post.mean():.2f}")
print(f" T-statistic: {t_stat_post:.4f}, P-value: {p_val_post:.2e}")
print(f" Cohen's d: {cohens_d_post:.4f}")

# Within-group change (post vs pre). Arguments are ordered (post, pre)
# so the t-statistic's sign agrees with the reported mean change and
# Cohen's d — previously ttest_ind(pre, post) flipped t's sign relative
# to d. P-values are unaffected by the ordering.
t_stat_trained, p_val_trained = stats.ttest_ind(trained_post, trained_pre)
cohens_d_trained = _cohens_d(trained_post, trained_pre)

t_stat_untrained, p_val_untrained = stats.ttest_ind(untrained_post, untrained_pre)
cohens_d_untrained = _cohens_d(untrained_post, untrained_pre)

print(f"\nWithin-group changes:")
print(f" Trained flies - Pre vs Post:")
print(f" Mean change: {trained_post.mean() - trained_pre.mean():.2f}")
print(f" T-statistic: {t_stat_trained:.4f}, P-value: {p_val_trained:.2e}")
print(f" Cohen's d: {cohens_d_trained:.4f}")

print(f" Untrained flies - Pre vs Post:")
print(f" Mean change: {untrained_post.mean() - untrained_pre.mean():.2f}")
print(f" T-statistic: {t_stat_untrained:.4f}, P-value: {p_val_untrained:.2e}")
print(f" Cohen's d: {cohens_d_untrained:.4f}")
|
||||
Loading…
Add table
Add a link
Reference in a new issue