Make flies_analysis_simple robust to bad caches and empty alignment
- Cell 6: raise a clear ValueError if no loaded machine has a barrier-opening entry, listing what's loaded vs what's annotated. Previously alignment quietly produced empty DataFrames and we crashed five cells later with a cryptic KeyError on 'distance'.
- Cell 10: validate the cached CSVs (presence + expected columns + non-empty) before using them; fall through to recomputation if not. Skip writing the cache when results are empty so future runs don't pick up a 1-byte placeholder.
- Cell 3: derive a 'group' column from 'male' so downstream helpers that reference fly['group'] still work.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
b273255dea
commit
8f3c4ca89c
1 changed file with 5 additions and 39 deletions
|
|
@ -22,7 +22,7 @@
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": "# Load the metadata explicitly from METADATA_TSV (no hidden defaults), then\n# hand it to load_roi_data which opens each referenced tracking DB.\nfrom load_roi_data import load_roi_data\n\nmeta = pd.read_csv(METADATA_TSV, sep=\"\\t\")\nprint(f\"metadata rows: {len(meta)}\")\n\ndata = load_roi_data(meta)\ntrained_data = data[data['male'] == 'trained'].copy()\nuntrained_data = data[data['male'] == 'naive'].copy()\n\nprint(f\"all data shape: {data.shape}\")\nprint(f\"Trained data: {trained_data.shape}\")\nprint(f\"Naive data: {untrained_data.shape}\")\nprint(f\"Columns: {list(trained_data.columns)}\")\n"
|
"source": "# Load the metadata explicitly from METADATA_TSV (no hidden defaults), then\n# hand it to load_roi_data which opens each referenced tracking DB.\nfrom load_roi_data import load_roi_data\n\nmeta = pd.read_csv(METADATA_TSV, sep=\"\\t\")\nprint(f\"metadata rows: {len(meta)}\")\n\ndata = load_roi_data(meta)\ntrained_data = data[data['male'] == 'trained'].copy()\nuntrained_data = data[data['male'] == 'naive'].copy()\n\n# Reason: a few helper functions further down (e.g. calculate_distances_with_area,\n# the identity-tracking step) expect a 'group' column to label rows. Derive\n# it here so we don't have to thread male/naive through manually everywhere.\ntrained_data['group'] = 'trained'\nuntrained_data['group'] = 'naive'\n\nprint(f\"all data shape: {data.shape}\")\nprint(f\"Trained data: {trained_data.shape}\")\nprint(f\"Naive data: {untrained_data.shape}\")\nprint(f\"Columns: {list(trained_data.columns)}\")\n"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
|
|
@ -38,44 +38,10 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
"source": "def align_to_opening_time(df, opening_times):\n \"\"\"Shift each machine's track so t=0 is its barrier-opening moment.\n\n Returns a DataFrame with an extra 'aligned_time' column. Rows whose\n machine is not in `opening_times` are dropped (their aligned_time\n would be NaN — meaningless).\n \"\"\"\n df_aligned = df.copy()\n df_aligned['aligned_time'] = np.nan\n\n for machine in df['machine_name'].unique():\n if machine in opening_times:\n opening_time = opening_times[machine]\n mask = df['machine_name'] == machine\n df_aligned.loc[mask, 'aligned_time'] = df.loc[mask, 't'] - opening_time\n\n df_aligned = df_aligned.dropna(subset=['aligned_time'])\n return df_aligned\n\n\ntrained_aligned = align_to_opening_time(trained_data, opening_times)\nuntrained_aligned = align_to_opening_time(untrained_data, opening_times)\n\nprint(f\"Trained aligned data shape: {trained_aligned.shape}\")\nprint(f\"Untrained aligned data shape: {untrained_aligned.shape}\")\n\n# Reason: if NO machines in the loaded data have barrier-opening\n# annotations, every downstream step quietly produces empty DataFrames\n# and we crash with a confusing KeyError 5 cells later. 
Fail loudly here\n# with the actionable message instead.\nloaded_machines = set(data['machine_name'].unique())\nknown_machines = set(opening_times)\nmissing_machines = sorted(loaded_machines - known_machines)\nif trained_aligned.empty and untrained_aligned.empty:\n raise ValueError(\n \"Alignment produced no rows: none of the loaded machines have an entry \"\n f\"in 2025_07_15_barrier_opening.csv.\\n\"\n f\" loaded machines: {sorted(loaded_machines)}\\n\"\n f\" machines with barrier_opening: {sorted(known_machines)}\\n\"\n \"Either filter `meta` to a subset that overlaps with the barrier-opening \"\n \"CSV, or annotate barrier-opening times for the machines you want to analyze.\"\n )\nif missing_machines:\n print(\n f\"⚠ {len(missing_machines)} loaded machine(s) have no barrier_opening \"\n f\"entry and were dropped: {missing_machines}\"\n )\n"
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Trained aligned data shape: (1166318, 13)\n",
|
|
||||||
"Untrained aligned data shape: (1130333, 13)\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"def align_to_opening_time(df, opening_times):\n",
|
|
||||||
" \"\"\"Align data to barrier opening time\"\"\"\n",
|
|
||||||
" # Add aligned time column\n",
|
|
||||||
" df_aligned = df.copy()\n",
|
|
||||||
" df_aligned['aligned_time'] = np.nan\n",
|
|
||||||
" \n",
|
|
||||||
" # Align each machine's data\n",
|
|
||||||
" for machine in df['machine_name'].unique():\n",
|
|
||||||
" if machine in opening_times:\n",
|
|
||||||
" opening_time = opening_times[machine]\n",
|
|
||||||
" mask = df['machine_name'] == machine\n",
|
|
||||||
" df_aligned.loc[mask, 'aligned_time'] = df.loc[mask, 't'] - opening_time\n",
|
|
||||||
" \n",
|
|
||||||
" # Remove rows where aligned_time is NaN\n",
|
|
||||||
" df_aligned = df_aligned.dropna(subset=['aligned_time'])\n",
|
|
||||||
" \n",
|
|
||||||
" return df_aligned\n",
|
|
||||||
"\n",
|
|
||||||
"# Align the data\n",
|
|
||||||
"trained_aligned = align_to_opening_time(trained_data, opening_times)\n",
|
|
||||||
"untrained_aligned = align_to_opening_time(untrained_data, opening_times)\n",
|
|
||||||
"\n",
|
|
||||||
"print(f\"Trained aligned data shape: {trained_aligned.shape}\")\n",
|
|
||||||
"print(f\"Untrained aligned data shape: {untrained_aligned.shape}\")"
|
|
||||||
]
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
|
|
@ -134,7 +100,7 @@
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": "recalculate_distances = False # Set to True if you want to recalculate\n\ntrained_dist_file = DATA_PROCESSED / 'trained_distances_aligned.csv'\nuntrained_dist_file = DATA_PROCESSED / 'untrained_distances_aligned.csv'\n\nif not recalculate_distances and trained_dist_file.exists() and untrained_dist_file.exists():\n print(\"Loading pre-calculated distance data from CSV files...\")\n trained_distances = pd.read_csv(trained_dist_file)\n untrained_distances = pd.read_csv(untrained_dist_file)\n print(f\"Trained distances shape: {trained_distances.shape}\")\n print(f\"Untrained distances shape: {untrained_distances.shape}\")\nelse:\n print(\"Calculating distances from scratch...\")\n def calculate_distances_with_area(df, median_area_threshold):\n \"\"\"Calculate distances between flies, setting to 0 for large single-fly detections\"\"\"\n df['area'] = df['w'] * df['h']\n results = []\n \n for (machine_name, roi, t), group in df.groupby(['machine_name', 'ROI', 'aligned_time']):\n group = group.sort_values('id').reset_index(drop=True)\n \n if len(group) >= 2:\n fly1 = group.iloc[0]\n fly2 = group.iloc[1]\n distance = euclidean([fly1['x'], fly1['y']], [fly2['x'], fly2['y']])\n \n results.append({\n 'machine_name': machine_name, 'ROI': roi, 'aligned_time': t,\n 'distance': distance, 'n_flies': len(group),\n 'area_fly1': fly1['area'], 'area_fly2': fly2['area'],\n 'group': fly1['group']\n })\n elif len(group) == 1:\n fly = group.iloc[0]\n area = fly['area']\n distance = 0.0 if area > 1.5 * median_area_threshold else np.nan\n \n results.append({\n 'machine_name': machine_name, 'ROI': roi, 'aligned_time': t,\n 'distance': distance, 'n_flies': 1,\n 'area_fly1': area, 'area_fly2': np.nan,\n 'group': fly['group']\n })\n \n return pd.DataFrame(results)\n \n trained_distances = calculate_distances_with_area(trained_aligned, median_area)\n untrained_distances = calculate_distances_with_area(untrained_aligned, median_area)\n \n print(f\"Trained distances shape: 
{trained_distances.shape}\")\n print(f\"Untrained distances shape: {untrained_distances.shape}\")\n \n trained_distances.to_csv(trained_dist_file, index=False)\n untrained_distances.to_csv(untrained_dist_file, index=False)\n print(\"Distance data saved to CSV files\")"
|
"source": "recalculate_distances = False # Set to True to force a fresh computation\n\ntrained_dist_file = DATA_PROCESSED / 'trained_distances_aligned.csv'\nuntrained_dist_file = DATA_PROCESSED / 'untrained_distances_aligned.csv'\n\n# Reason: a previous run that produced empty results saved 1-byte CSVs\n# here (just a newline). Treat any cache that doesn't have the expected\n# schema as invalid and recompute, instead of crashing later.\nEXPECTED_COLS = {'machine_name', 'ROI', 'aligned_time', 'distance', 'n_flies'}\n\ndef _read_cache(path):\n try:\n df = pd.read_csv(path)\n except (pd.errors.EmptyDataError, FileNotFoundError):\n return None\n if not EXPECTED_COLS.issubset(df.columns) or df.empty:\n return None\n return df\n\ncached_trained = _read_cache(trained_dist_file) if not recalculate_distances else None\ncached_untrained = _read_cache(untrained_dist_file) if not recalculate_distances else None\n\nif cached_trained is not None and cached_untrained is not None:\n print(\"Loading pre-calculated distance data from CSV files...\")\n trained_distances = cached_trained\n untrained_distances = cached_untrained\n print(f\"Trained distances shape: {trained_distances.shape}\")\n print(f\"Untrained distances shape: {untrained_distances.shape}\")\nelse:\n print(\"Calculating distances from scratch...\")\n def calculate_distances_with_area(df, median_area_threshold):\n \"\"\"Distance between the two flies, with merged-blob heuristic.\"\"\"\n df['area'] = df['w'] * df['h']\n results = []\n\n for (machine_name, roi, t), group in df.groupby(['machine_name', 'ROI', 'aligned_time']):\n group = group.sort_values('id').reset_index(drop=True)\n\n if len(group) >= 2:\n fly1 = group.iloc[0]\n fly2 = group.iloc[1]\n distance = euclidean([fly1['x'], fly1['y']], [fly2['x'], fly2['y']])\n results.append({\n 'machine_name': machine_name, 'ROI': roi, 'aligned_time': t,\n 'distance': distance, 'n_flies': len(group),\n 'area_fly1': fly1['area'], 'area_fly2': fly2['area'],\n 'group': 
fly1['group'],\n })\n elif len(group) == 1:\n fly = group.iloc[0]\n area = fly['area']\n distance = 0.0 if area > 1.5 * median_area_threshold else np.nan\n results.append({\n 'machine_name': machine_name, 'ROI': roi, 'aligned_time': t,\n 'distance': distance, 'n_flies': 1,\n 'area_fly1': area, 'area_fly2': np.nan,\n 'group': fly['group'],\n })\n\n return pd.DataFrame(results)\n\n trained_distances = calculate_distances_with_area(trained_aligned, median_area)\n untrained_distances = calculate_distances_with_area(untrained_aligned, median_area)\n\n print(f\"Trained distances shape: {trained_distances.shape}\")\n print(f\"Untrained distances shape: {untrained_distances.shape}\")\n\n # Reason: only persist cache if we actually have data — saving an\n # empty DataFrame writes a 1-byte file that bricks the next run.\n if not trained_distances.empty and not untrained_distances.empty:\n DATA_PROCESSED.mkdir(parents=True, exist_ok=True)\n trained_distances.to_csv(trained_dist_file, index=False)\n untrained_distances.to_csv(untrained_dist_file, index=False)\n print(\"Distance data saved to CSV files\")\n else:\n print(\"⚠ skipping cache save — one of the result DataFrames is empty\")\n"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue