Make flies_analysis_simple robust to bad caches and empty alignment

- Cell 6: raise a clear ValueError if no loaded machine has a barrier-opening entry, listing what's loaded vs. what's annotated. Previously, alignment quietly produced empty DataFrames and we crashed five cells later with a cryptic KeyError on 'distance'.
- Cell 10: validate the cached CSVs (presence + expected columns + non-empty) before using them; fall through to recomputation if they are invalid. Skip writing the cache when results are empty so future runs don't pick up a 1-byte placeholder.
- Cell 3: derive a 'group' column from 'male' so downstream helpers that reference fly['group'] still work.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-01 09:59:34 +01:00 · 2026-05-01 09:59:34 +01:00 · 8f3c4ca89c
commit 8f3c4ca89c
parent b273255dea
1 changed files with 5 additions and 39 deletions
--- a/notebooks/flies_analysis_simple.ipynb
+++ b/notebooks/flies_analysis_simple.ipynb
@ -22,7 +22,7 @@
   "execution_count": null,
   "metadata": {},
   "outputs": [],
-   "source": "# Load the metadata explicitly from METADATA_TSV (no hidden defaults), then\n# hand it to load_roi_data which opens each referenced tracking DB.\nfrom load_roi_data import load_roi_data\n\nmeta = pd.read_csv(METADATA_TSV, sep=\"\\t\")\nprint(f\"metadata rows: {len(meta)}\")\n\ndata = load_roi_data(meta)\ntrained_data   = data[data['male'] == 'trained'].copy()\nuntrained_data = data[data['male'] == 'naive'].copy()\n\nprint(f\"all data shape:    {data.shape}\")\nprint(f\"Trained data:      {trained_data.shape}\")\nprint(f\"Naive data:        {untrained_data.shape}\")\nprint(f\"Columns:           {list(trained_data.columns)}\")\n"
+   "source": "# Load the metadata explicitly from METADATA_TSV (no hidden defaults), then\n# hand it to load_roi_data which opens each referenced tracking DB.\nfrom load_roi_data import load_roi_data\n\nmeta = pd.read_csv(METADATA_TSV, sep=\"\\t\")\nprint(f\"metadata rows: {len(meta)}\")\n\ndata = load_roi_data(meta)\ntrained_data   = data[data['male'] == 'trained'].copy()\nuntrained_data = data[data['male'] == 'naive'].copy()\n\n# Reason: a few helper functions further down (e.g. calculate_distances_with_area,\n# the identity-tracking step) expect a 'group' column to label rows. Derive\n# it here so we don't have to thread male/naive through manually everywhere.\ntrained_data['group']   = 'trained'\nuntrained_data['group'] = 'naive'\n\nprint(f\"all data shape:    {data.shape}\")\nprint(f\"Trained data:      {trained_data.shape}\")\nprint(f\"Naive data:        {untrained_data.shape}\")\nprint(f\"Columns:           {list(trained_data.columns)}\")\n"
  },
  {
   "cell_type": "markdown",
@ -38,44 +38,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
+   "outputs": [],
-    {
+   "source": "def align_to_opening_time(df, opening_times):\n    \"\"\"Shift each machine's track so t=0 is its barrier-opening moment.\n\n    Returns a DataFrame with an extra 'aligned_time' column. Rows whose\n    machine is not in `opening_times` are dropped (their aligned_time\n    would be NaN — meaningless).\n    \"\"\"\n    df_aligned = df.copy()\n    df_aligned['aligned_time'] = np.nan\n\n    for machine in df['machine_name'].unique():\n        if machine in opening_times:\n            opening_time = opening_times[machine]\n            mask = df['machine_name'] == machine\n            df_aligned.loc[mask, 'aligned_time'] = df.loc[mask, 't'] - opening_time\n\n    df_aligned = df_aligned.dropna(subset=['aligned_time'])\n    return df_aligned\n\n\ntrained_aligned   = align_to_opening_time(trained_data,   opening_times)\nuntrained_aligned = align_to_opening_time(untrained_data, opening_times)\n\nprint(f\"Trained aligned data shape:   {trained_aligned.shape}\")\nprint(f\"Untrained aligned data shape: {untrained_aligned.shape}\")\n\n# Reason: if NO machines in the loaded data have barrier-opening\n# annotations, every downstream step quietly produces empty DataFrames\n# and we crash with a confusing KeyError 5 cells later. 
Fail loudly here\n# with the actionable message instead.\nloaded_machines  = set(data['machine_name'].unique())\nknown_machines   = set(opening_times)\nmissing_machines = sorted(loaded_machines - known_machines)\nif trained_aligned.empty and untrained_aligned.empty:\n    raise ValueError(\n        \"Alignment produced no rows: none of the loaded machines have an entry \"\n        f\"in 2025_07_15_barrier_opening.csv.\\n\"\n        f\"  loaded machines: {sorted(loaded_machines)}\\n\"\n        f\"  machines with barrier_opening: {sorted(known_machines)}\\n\"\n        \"Either filter `meta` to a subset that overlaps with the barrier-opening \"\n        \"CSV, or annotate barrier-opening times for the machines you want to analyze.\"\n    )\nif missing_machines:\n    print(\n        f\"⚠ {len(missing_machines)} loaded machine(s) have no barrier_opening \"\n        f\"entry and were dropped: {missing_machines}\"\n    )\n"
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Trained aligned data shape: (1166318, 13)\n",
      "Untrained aligned data shape: (1130333, 13)\n"
     ]
    }
   ],
   "source": [
    "def align_to_opening_time(df, opening_times):\n",
    "    \"\"\"Align data to barrier opening time\"\"\"\n",
    "    # Add aligned time column\n",
    "    df_aligned = df.copy()\n",
    "    df_aligned['aligned_time'] = np.nan\n",
    "    \n",
    "    # Align each machine's data\n",
    "    for machine in df['machine_name'].unique():\n",
    "        if machine in opening_times:\n",
    "            opening_time = opening_times[machine]\n",
    "            mask = df['machine_name'] == machine\n",
    "            df_aligned.loc[mask, 'aligned_time'] = df.loc[mask, 't'] - opening_time\n",
    "    \n",
    "    # Remove rows where aligned_time is NaN\n",
    "    df_aligned = df_aligned.dropna(subset=['aligned_time'])\n",
    "    \n",
    "    return df_aligned\n",
    "\n",
    "# Align the data\n",
    "trained_aligned = align_to_opening_time(trained_data, opening_times)\n",
    "untrained_aligned = align_to_opening_time(untrained_data, opening_times)\n",
    "\n",
    "print(f\"Trained aligned data shape: {trained_aligned.shape}\")\n",
    "print(f\"Untrained aligned data shape: {untrained_aligned.shape}\")"
   ]
  },
  {
   "cell_type": "markdown",
@ -134,7 +100,7 @@
   "execution_count": null,
   "metadata": {},
   "outputs": [],
-   "source": "recalculate_distances = False  # Set to True if you want to recalculate\n\ntrained_dist_file = DATA_PROCESSED / 'trained_distances_aligned.csv'\nuntrained_dist_file = DATA_PROCESSED / 'untrained_distances_aligned.csv'\n\nif not recalculate_distances and trained_dist_file.exists() and untrained_dist_file.exists():\n    print(\"Loading pre-calculated distance data from CSV files...\")\n    trained_distances = pd.read_csv(trained_dist_file)\n    untrained_distances = pd.read_csv(untrained_dist_file)\n    print(f\"Trained distances shape: {trained_distances.shape}\")\n    print(f\"Untrained distances shape: {untrained_distances.shape}\")\nelse:\n    print(\"Calculating distances from scratch...\")\n    def calculate_distances_with_area(df, median_area_threshold):\n        \"\"\"Calculate distances between flies, setting to 0 for large single-fly detections\"\"\"\n        df['area'] = df['w'] * df['h']\n        results = []\n        \n        for (machine_name, roi, t), group in df.groupby(['machine_name', 'ROI', 'aligned_time']):\n            group = group.sort_values('id').reset_index(drop=True)\n            \n            if len(group) >= 2:\n                fly1 = group.iloc[0]\n                fly2 = group.iloc[1]\n                distance = euclidean([fly1['x'], fly1['y']], [fly2['x'], fly2['y']])\n                \n                results.append({\n                    'machine_name': machine_name, 'ROI': roi, 'aligned_time': t,\n                    'distance': distance, 'n_flies': len(group),\n                    'area_fly1': fly1['area'], 'area_fly2': fly2['area'],\n                    'group': fly1['group']\n                })\n            elif len(group) == 1:\n                fly = group.iloc[0]\n                area = fly['area']\n                distance = 0.0 if area > 1.5 * median_area_threshold else np.nan\n                \n                results.append({\n                    'machine_name': machine_name, 'ROI': roi, 'aligned_time': t,\n  
                  'distance': distance, 'n_flies': 1,\n                    'area_fly1': area, 'area_fly2': np.nan,\n                    'group': fly['group']\n                })\n        \n        return pd.DataFrame(results)\n    \n    trained_distances = calculate_distances_with_area(trained_aligned, median_area)\n    untrained_distances = calculate_distances_with_area(untrained_aligned, median_area)\n    \n    print(f\"Trained distances shape: {trained_distances.shape}\")\n    print(f\"Untrained distances shape: {untrained_distances.shape}\")\n    \n    trained_distances.to_csv(trained_dist_file, index=False)\n    untrained_distances.to_csv(untrained_dist_file, index=False)\n    print(\"Distance data saved to CSV files\")"
+   "source": "recalculate_distances = False  # Set to True to force a fresh computation\n\ntrained_dist_file   = DATA_PROCESSED / 'trained_distances_aligned.csv'\nuntrained_dist_file = DATA_PROCESSED / 'untrained_distances_aligned.csv'\n\n# Reason: a previous run that produced empty results saved 1-byte CSVs\n# here (just a newline). Treat any cache that doesn't have the expected\n# schema as invalid and recompute, instead of crashing later.\nEXPECTED_COLS = {'machine_name', 'ROI', 'aligned_time', 'distance', 'n_flies'}\n\ndef _read_cache(path):\n    try:\n        df = pd.read_csv(path)\n    except (pd.errors.EmptyDataError, FileNotFoundError):\n        return None\n    if not EXPECTED_COLS.issubset(df.columns) or df.empty:\n        return None\n    return df\n\ncached_trained   = _read_cache(trained_dist_file)   if not recalculate_distances else None\ncached_untrained = _read_cache(untrained_dist_file) if not recalculate_distances else None\n\nif cached_trained is not None and cached_untrained is not None:\n    print(\"Loading pre-calculated distance data from CSV files...\")\n    trained_distances   = cached_trained\n    untrained_distances = cached_untrained\n    print(f\"Trained distances shape:   {trained_distances.shape}\")\n    print(f\"Untrained distances shape: {untrained_distances.shape}\")\nelse:\n    print(\"Calculating distances from scratch...\")\n    def calculate_distances_with_area(df, median_area_threshold):\n        \"\"\"Distance between the two flies, with merged-blob heuristic.\"\"\"\n        df['area'] = df['w'] * df['h']\n        results = []\n\n        for (machine_name, roi, t), group in df.groupby(['machine_name', 'ROI', 'aligned_time']):\n            group = group.sort_values('id').reset_index(drop=True)\n\n            if len(group) >= 2:\n                fly1 = group.iloc[0]\n                fly2 = group.iloc[1]\n                distance = euclidean([fly1['x'], fly1['y']], [fly2['x'], fly2['y']])\n                results.append({\n     
               'machine_name': machine_name, 'ROI': roi, 'aligned_time': t,\n                    'distance': distance, 'n_flies': len(group),\n                    'area_fly1': fly1['area'], 'area_fly2': fly2['area'],\n                    'group': fly1['group'],\n                })\n            elif len(group) == 1:\n                fly = group.iloc[0]\n                area = fly['area']\n                distance = 0.0 if area > 1.5 * median_area_threshold else np.nan\n                results.append({\n                    'machine_name': machine_name, 'ROI': roi, 'aligned_time': t,\n                    'distance': distance, 'n_flies': 1,\n                    'area_fly1': area, 'area_fly2': np.nan,\n                    'group': fly['group'],\n                })\n\n        return pd.DataFrame(results)\n\n    trained_distances   = calculate_distances_with_area(trained_aligned,   median_area)\n    untrained_distances = calculate_distances_with_area(untrained_aligned, median_area)\n\n    print(f\"Trained distances shape:   {trained_distances.shape}\")\n    print(f\"Untrained distances shape: {untrained_distances.shape}\")\n\n    # Reason: only persist cache if we actually have data — saving an\n    # empty DataFrame writes a 1-byte file that bricks the next run.\n    if not trained_distances.empty and not untrained_distances.empty:\n        DATA_PROCESSED.mkdir(parents=True, exist_ok=True)\n        trained_distances.to_csv(trained_dist_file, index=False)\n        untrained_distances.to_csv(untrained_dist_file, index=False)\n        print(\"Distance data saved to CSV files\")\n    else:\n        print(\"⚠ skipping cache save — one of the result DataFrames is empty\")\n"
  },
  {
   "cell_type": "markdown",