"""Compute per-frame inter-fly distances for every (date, machine, ROI, session).

Reads tracking data via :func:`load_roi_data.load_roi_data` (which is driven by
``all_video_info_merged.tsv``) and produces one distances DataFrame spanning
every fly/session in the batch. Group membership (``trained`` / ``untrained``)
is preserved from the ``male`` column.
"""

import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean

from config import DATA_PROCESSED
from load_roi_data import load_roi_data

# Columns that together identify one trajectory ("track"). Kept as lists
# because pandas treats a list as several keys.
_TRACK_KEYS = ["date", "machine_name", "ROI", "session"]

# Metadata carried onto every output row. The order here fixes the order of
# the leading output columns.
_KEEP_META = [
    "date",
    "machine_name",
    "ROI",
    "session",
    "male",
    "species",
    "memory",
    "age",
]


def _frame_row(meta_row: dict, t, time_group: pd.DataFrame, avg_area: float) -> dict:
    """Build the output row for one time point of one track.

    Args:
        meta_row: metadata shared by every row of the track; copied, not mutated.
        t: the time value of this group.
        time_group: the detections of this track at time ``t``; must have the
            columns ``id``, ``x``, ``y`` and ``area``.
        avg_area: mean bounding-box area over the whole input, used as the
            threshold for the single-detection case.

    Returns:
        ``meta_row`` extended with ``t``, ``distance``, ``n_flies``,
        ``area_fly1`` and ``area_fly2``.
    """
    # Order by id so "fly 1" / "fly 2" are picked deterministically.
    flies = time_group.sort_values("id").reset_index(drop=True)

    row = dict(meta_row)
    row["t"] = t

    if len(flies) >= 2:
        # Only the two lowest ids are measured; n_flies still records how many
        # detections there were at this time point.
        f1, f2 = flies.iloc[0], flies.iloc[1]
        row["distance"] = euclidean([f1["x"], f1["y"]], [f2["x"], f2["y"]])
        row["n_flies"] = len(flies)
        row["area_fly1"] = f1["area"]
        row["area_fly2"] = f2["area"]
    else:
        f = flies.iloc[0]
        # A single detection larger than the global mean is treated as one
        # merged blob of both flies (distance 0); otherwise the distance is
        # unknown.
        row["distance"] = 0.0 if f["area"] > avg_area else np.nan
        row["n_flies"] = 1
        row["area_fly1"] = f["area"]
        row["area_fly2"] = np.nan

    return row


def calculate_fly_distances(data: pd.DataFrame | None = None) -> pd.DataFrame:
    """Compute inter-fly distances over time for every fly/session.

    For each time point inside one (date, machine, ROI, session) trajectory:
      - 2+ flies detected: Euclidean distance between the first two by id
      - 1 fly detected:    distance = 0 if its bbox area exceeds the global
                           mean (likely a single blob containing both flies),
                           else NaN

    The global mean area is computed over every row of ``data`` (``w * h``)
    and printed to stdout. The input frame is copied, never modified.

    Args:
        data: optional pre-loaded DataFrame from :func:`load_roi_data`.
            If None, the full batch is loaded.

    Returns:
        DataFrame with one row per (track, time) pair, including ``distance``,
        ``n_flies``, ``area_fly1``, ``area_fly2``, plus the metadata columns
        propagated from the source row (``date``, ``machine_name``, ``ROI``,
        ``session``, ``male``, ``species``, ``memory``, ``age``). An empty
        input yields an empty DataFrame with no columns.

    Raises:
        KeyError: if a column read here (``w``, ``h``, ``x``, ``y``, ``id``,
            ``t`` or a metadata column) is missing from ``data``.
    """
    if data is None:
        data = load_roi_data()
    if data.empty:
        return pd.DataFrame()

    data = data.copy()
    data["area"] = data["w"] * data["h"]
    avg_area = data["area"].mean()
    print(f"Average area across all data: {avg_area:.2f}")

    rows: list[dict] = []
    # sort=False keeps tracks (and time points below) in order of first
    # appearance in the input. NOTE: pandas groupby drops rows whose key is
    # NaN by default, so such rows produce no output.
    for track, track_df in data.groupby(_TRACK_KEYS, sort=False):
        meta_row = dict(zip(_TRACK_KEYS, track))

        # The remaining metadata is taken from the first sample of the track
        # (assumed constant within a track).
        sample = track_df.iloc[0]
        for col in _KEEP_META:
            if col not in meta_row:
                meta_row[col] = sample[col]

        for t, time_group in track_df.groupby("t", sort=False):
            rows.append(_frame_row(meta_row, t, time_group, avg_area))

    return pd.DataFrame(rows)


def main() -> None:
    """Compute distances for the full batch, print a summary and save a CSV.

    The result is written to ``distances.csv`` inside ``DATA_PROCESSED``; the
    directory is created if it does not exist.
    """
    distances = calculate_fly_distances()

    print("\nDistance summary:")
    print(f" Shape: {distances.shape}")
    if not distances.empty:
        print(f" Distance count: {distances['distance'].count()}")
        print(f" Distance mean: {distances['distance'].mean():.2f}")
        print(f" Distance std: {distances['distance'].std():.2f}")

        # The frame has one row per (track, time), so collapse it to one row
        # per track before counting; otherwise these lines would report frame
        # counts under a "tracks" label.
        male = distances.drop_duplicates(subset=_TRACK_KEYS)["male"]
        print(f" Trained tracks: {(male == 'trained').sum()}")
        # NOTE(review): the module docstring names the groups
        # trained/untrained, but this counts 'naive' - confirm which label
        # the data actually uses.
        print(f" Naive tracks: {(male == 'naive').sum()}")

    DATA_PROCESSED.mkdir(parents=True, exist_ok=True)
    out = DATA_PROCESSED / "distances.csv"
    distances.to_csv(out, index=False)
    print(f"\nSaved {out}")


if __name__ == "__main__":
    main()