Merge 2025-07-15 batch into the xlsx; tools to detect & re-track
- merge_2025_07_15_into_xlsx.py: pivot the legacy 2025_07_15_metadata_fixed.csv into the unified xlsx schema (one row per fly, training_date_time + testing_date_time). Backs up the xlsx before writing. 24 new rows across machines 076 / 139 / 145 / 268. - pick_targets.py: --video flag to bypass the inventory's in_xlsx filter, so a specific mp4 can be picked outside the normal flow. - explore_barrier_signal.py: visualises raw y(t), per-frame inter-fly distance, and sliding min/mean distance against a known barrier-opening time. Used for prototyping the detector. - detect_barrier_opening.py: per-ROI sliding-window mean-distance change-point estimator (median across ROIs). Currently noisy on a one-video calibration set; will be re-tuned once the 4 missing 2025-07-15 videos are re-tracked. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
8f3c4ca89c
commit
847d2cbd1b
4 changed files with 480 additions and 1 deletions
195
scripts/detect_barrier_opening.py
Normal file
195
scripts/detect_barrier_opening.py
Normal file
|
|
@ -0,0 +1,195 @@
|
||||||
|
"""Detect the barrier-opening time from tracking data.
|
||||||
|
|
||||||
|
Idea: before the barrier is removed, the two flies in a ROI are stuck on
|
||||||
|
opposite sides of a divider. Their inter-fly distance is bounded below
|
||||||
|
by ~the barrier width (typically 100–250 px). After removal they can
|
||||||
|
walk up to each other and the minimum distance drops near zero. We
|
||||||
|
look for the largest sustained drop in the sliding-window MEAN distance
|
||||||
|
and call that the opening moment.
|
||||||
|
|
||||||
|
Per-ROI estimates are aggregated (median) across the 6 ROIs of one
|
||||||
|
video for a single video-level opening time. Disagreeing ROIs are
|
||||||
|
flagged so the analyst can double-check by eye.
|
||||||
|
|
||||||
|
This module exposes ``detect_opening_time(db_path)`` for callers, and
|
||||||
|
runs as a CLI that prints a per-DB, per-ROI summary to stdout. Use::
|
||||||
|
|
||||||
|
python detect_barrier_opening.py --db <one.db> # single
|
||||||
|
python detect_barrier_opening.py # all DBs in TRACKING_OUTPUT_DIR
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import sqlite3
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from config import TRACKING_OUTPUT_DIR
|
||||||
|
|
||||||
|
# Detector tunables, calibrated on machine 076 / session 16-03-10
# (ground-truth opening at 52 s).
#
# The signal is the windowed MEAN inter-fly distance, not the windowed
# minimum: isolated tracking artifacts in the first few seconds trip the
# minimum too easily, whereas the mean falls cleanly once the flies start
# spending real time near each other instead of being held apart.

# Length, in seconds, of each sliding window over the distance signal.
WINDOW_S = 30.0

# Spacing, in seconds, between consecutive windows.
STEP_S = 1.0

# Only the first 5 minutes are searched; the opening always happens there.
SEARCH_END_S = 300.0
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RoiEstimate:
    """Barrier-opening estimate for a single ROI.

    ``pre_min`` / ``post_min`` keep their historical names, but as filled
    in by ``detect_one_roi`` they hold the MEDIAN of the sliding-window
    MEAN distance before / after the chosen split point. They are NaN when
    no split was evaluated.
    """

    # ROI number (-1 when the input carried no ROI column / rows).
    roi: int
    # Estimated opening time in seconds, or None when no estimate was made.
    opening_s: float | None
    # Number of frames that had exactly two detections.
    n_pairs: int
    # Median windowed-mean distance before the split (sanity check).
    pre_min: float
    # Median windowed-mean distance after the split (sanity check).
    post_min: float
|
||||||
|
|
||||||
|
|
||||||
|
def per_frame_distance(df: pd.DataFrame) -> pd.DataFrame:
    """Compute the inter-fly distance for frames with exactly 2 detections.

    Args:
        df: Detections with columns ``t`` (frame timestamp; divided by 1000
            here to obtain seconds, so presumably milliseconds), ``x``,
            ``y`` and ``id``.

    Returns:
        DataFrame with exactly the columns ``t_s`` and ``dist_px``, one row
        per two-detection frame in ascending ``t`` order. The same two
        columns are returned when there is nothing to report, so callers
        can rely on the schema regardless of the input.
    """
    columns = ["t_s", "dist_px"]
    if df.empty:
        # Previously this path returned the *input* columns plus dist_px
        # (no t_s), which disagreed with every other return path.
        return pd.DataFrame(columns=columns)

    detections_per_frame = df.groupby("t").size()
    two_fly_frames = detections_per_frame[detections_per_frame == 2].index
    # Sorting by id makes "first"/"last" below pick the same fly each frame.
    sub = df[df["t"].isin(two_fly_frames)].sort_values(["t", "id"])
    if sub.empty:
        return pd.DataFrame(columns=columns)

    pairs = sub.groupby("t").agg(
        x1=("x", "first"), y1=("y", "first"),
        x2=("x", "last"), y2=("y", "last"),
        t_s=("t", "first"),
    )
    pairs["t_s"] = pairs["t_s"] / 1000.0
    pairs["dist_px"] = np.hypot(pairs["x1"] - pairs["x2"], pairs["y1"] - pairs["y2"])
    return pairs[columns].reset_index(drop=True)
|
||||||
|
|
||||||
|
|
||||||
|
def sliding_mean(dist: pd.DataFrame, window_s: float, step_s: float,
                 t_max: float) -> pd.DataFrame:
    """Average the distance signal over sliding windows.

    Windows start at 0, ``step_s``, 2 * ``step_s``, ... while the start is
    below ``t_max - window_s``; each covers ``[start, start + window_s)``.
    Windows containing no samples are skipped.

    Args:
        dist: Frame with columns ``t_s`` (seconds) and ``dist_px``.
        window_s: Window length in seconds.
        step_s: Spacing between consecutive window starts, in seconds.
        t_max: Upper bound of the search range, in seconds.

    Returns:
        DataFrame with columns ``mid_t`` (window centre) and ``mean_dist``.
        The two columns are always present, even when no window held data.
    """
    columns = ["mid_t", "mean_dist"]
    if dist.empty:
        return pd.DataFrame(columns=columns)

    rows = []
    for start in np.arange(0, t_max - window_s, step_s):
        in_window = dist[(dist["t_s"] >= start) & (dist["t_s"] < start + window_s)]
        if in_window.empty:
            continue
        rows.append({"mid_t": start + window_s / 2,
                     "mean_dist": in_window["dist_px"].mean()})
    # Passing ``columns`` keeps the schema stable when ``rows`` is empty
    # (e.g. every sample lies beyond t_max); a bare DataFrame([]) would
    # have no columns at all.
    return pd.DataFrame(rows, columns=columns)
|
||||||
|
|
||||||
|
|
||||||
|
def detect_one_roi(df_roi: pd.DataFrame) -> RoiEstimate:
    """Estimate the barrier-opening time for one ROI.

    The windowed mean inter-fly distance is computed over the search range,
    then every admissible split index is scored by
    ``median(before) - median(after)``. The split with the largest drop
    wins (the earliest one on ties) and is reported only if the drop is
    substantive.

    Args:
        df_roi: Detections of a single ROI; an optional ``ROI`` column
            supplies the ROI number (otherwise -1 is reported).

    Returns:
        A ``RoiEstimate``. ``opening_s`` is None when there are fewer than
        100 two-fly frames, too few windows, or the best drop is too weak.
    """
    has_roi = "ROI" in df_roi.columns and not df_roi.empty
    roi_id = int(df_roi["ROI"].iloc[0]) if has_roi else -1

    dist = per_frame_distance(df_roi)
    n_pairs = len(dist)
    if n_pairs < 100:
        return RoiEstimate(roi_id, None, n_pairs, np.nan, np.nan)

    smean = sliding_mean(dist, WINDOW_S, STEP_S, SEARCH_END_S)
    n_windows = len(smean)
    if n_windows < 4:
        return RoiEstimate(roi_id, None, n_pairs, np.nan, np.nan)

    # Keep split points at least one window's worth of samples away from
    # either end so both sides have enough data for a meaningful median.
    pad = max(1, int(WINDOW_S / STEP_S))
    if n_windows < 2 * pad + 1:
        return RoiEstimate(roi_id, None, n_pairs, np.nan, np.nan)

    means = smean["mean_dist"]
    centres = smean["mid_t"]

    # Medians (not means) of the two sides keep the score robust to
    # tracking artifacts at either end of the search range.
    best_drop = -np.inf
    best_t = None
    best_pre = np.nan
    best_post = np.nan
    for split in range(pad, n_windows - pad):
        before = means.iloc[:split].median()
        after = means.iloc[split:].median()
        drop = before - after
        if not drop > best_drop:
            continue
        best_drop = drop
        best_t = float(centres.iloc[split])
        best_pre = float(before)
        best_post = float(after)

    # A usable opening needs a drop of at least 30 px AND a post level at
    # or below 70% of the pre level. A flatter signal probably means the
    # barrier was already open at recording start, or the session is
    # unusable.
    if best_drop < 30 or best_post > 0.7 * best_pre:
        return RoiEstimate(roi_id, None, n_pairs, best_pre, best_post)

    # best_t is the centre of the first post-split window; step back half
    # a window to report the transition moment itself.
    opening_s = max(0.0, best_t - WINDOW_S / 2)
    return RoiEstimate(roi_id, opening_s, n_pairs, best_pre, best_post)
|
||||||
|
|
||||||
|
|
||||||
|
def detect_opening_time(db_path: Path) -> dict:
    """Estimate the barrier-opening time for one tracking DB.

    Tables ``ROI_1`` .. ``ROI_6`` are read from the SQLite file (opened
    read-only) and each is passed through ``detect_one_roi``.

    Args:
        db_path: Path to the tracking SQLite database.

    Returns:
        dict with:
          - ``opening_s``: float | None, median across the ROIs that
            produced an estimate
          - ``per_roi``: list[RoiEstimate], one entry per ROI 1..6
          - ``spread_s``: float | None, max - min of the per-ROI estimates
            (smaller = more agreement)

    Raises:
        sqlite3.Error: if the database itself cannot be opened.
    """
    from contextlib import closing

    estimates: list[RoiEstimate] = []
    # sqlite3's own context manager only commits / rolls back; it does not
    # close the connection. closing() releases the handle, which matters
    # when the CLI loops over every DB in the output directory.
    with closing(sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)) as conn:
        for roi in range(1, 7):
            try:
                df = pd.read_sql_query(
                    f"SELECT t, x, y, id FROM ROI_{roi}", conn
                )
            except Exception:
                # Deliberate best-effort: an ROI table that is missing or
                # unreadable yields an empty estimate instead of aborting
                # the whole video.
                estimates.append(RoiEstimate(roi, None, 0, np.nan, np.nan))
                continue
            df["ROI"] = roi
            estimates.append(detect_one_roi(df))

    valid = [e.opening_s for e in estimates if e.opening_s is not None]
    if not valid:
        return {"opening_s": None, "per_roi": estimates, "spread_s": None}
    return {
        "opening_s": float(np.median(valid)),
        "per_roi": estimates,
        "spread_s": float(np.max(valid) - np.min(valid)),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """CLI entry point: print the opening estimate for one DB or for all.

    With ``--db`` only that database is analysed; otherwise every
    ``*_tracking.db`` under ``TRACKING_OUTPUT_DIR`` is processed in sorted
    order. Per DB, the video-level median and spread are printed, followed
    by one line per ROI.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--db", type=Path, help="single tracking DB to analyze")
    args = parser.parse_args()

    if args.db:
        dbs = [args.db]
    else:
        dbs = sorted(TRACKING_OUTPUT_DIR.glob("*_tracking.db"))
    print(f"analyzing {len(dbs)} DB(s)\n")

    for db in dbs:
        result = detect_opening_time(db)
        median_s = result["opening_s"]
        spread = result["spread_s"]

        median_txt = "no estimate" if median_s is None else f"{median_s:.1f}s"
        spread_txt = "n/a" if spread is None else f"{spread:.1f}s"
        print(f"{db.name}")
        print(f" median opening: {median_txt} spread: {spread_txt}")

        for e in result["per_roi"]:
            opening_txt = "-- " if e.opening_s is None else f"{e.opening_s:5.1f}s"
            print(
                f" ROI {e.roi}: {opening_txt}"
                f" pairs={e.n_pairs:>6d} pre={e.pre_min:5.1f} post={e.post_min:5.1f}"
            )
        print()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
143
scripts/explore_barrier_signal.py
Normal file
143
scripts/explore_barrier_signal.py
Normal file
|
|
@ -0,0 +1,143 @@
|
||||||
|
"""Look at the tracking signal around the known barrier-opening time.
|
||||||
|
|
||||||
|
Loads one tracking DB whose opening time we know (from
|
||||||
|
2025_07_15_barrier_opening.csv) and plots a few candidate signals against
|
||||||
|
time, with a vertical line at the ground-truth opening:
|
||||||
|
|
||||||
|
1. Y position of each detection (raw scatter)
|
||||||
|
2. Per-frame inter-fly distance (frames with exactly 2 detections)
|
||||||
|
3. Sliding-window min / mean inter-fly distance
|
||||||
|
|
||||||
|
The hope is one of these has a clean step-change at t = opening_time
|
||||||
|
that's robustly detectable across ROIs.
|
||||||
|
|
||||||
|
Run:
|
||||||
|
python explore_barrier_signal.py
|
||||||
|
Outputs:
|
||||||
|
figures/barrier_signal_<machine>_<time>.png
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from config import FIGURES, TRACKING_OUTPUT_DIR
|
||||||
|
|
||||||
|
# Calibration recording with a known answer: machine 076, session
# 16-03-10, where the barrier opened 52 s into the video.
DB_NAME = "2025-07-15_16-03-10_076e2825a7274661bd0697c42d6fa4c0__1920x1088@25fps-28q_merged_tracking.db"
KNOWN_OPENING_S = 52.0

# Length, in seconds, of the sliding window behind the derived signals.
WINDOW_S = 10.0
|
||||||
|
|
||||||
|
|
||||||
|
def load_roi(db_path: Path, roi: int) -> pd.DataFrame:
    """Read one ROI table from a tracking DB.

    Args:
        db_path: Path to the SQLite tracking database (opened read-only).
        roi: ROI number; the table read is ``ROI_<roi>``.

    Returns:
        DataFrame with columns ``t, x, y, w, h, id`` plus ``t_s``, which is
        ``t / 1000`` (seconds, presuming ``t`` is stored in milliseconds).

    Raises:
        ValueError / TypeError: if ``roi`` cannot be converted to an int.
    """
    from contextlib import closing

    # Table names cannot be bound as SQL parameters, so coerce to int
    # before interpolating to keep arbitrary text out of the statement.
    table = f"ROI_{int(roi)}"
    # sqlite3's own context manager only handles the transaction; closing()
    # is what actually releases the connection.
    with closing(sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)) as conn:
        df = pd.read_sql_query(f"SELECT t, x, y, w, h, id FROM {table}", conn)
    df["t_s"] = df["t"] / 1000.0
    return df
|
||||||
|
|
||||||
|
|
||||||
|
def per_frame_distance(df: pd.DataFrame) -> pd.DataFrame:
    """Pair up the two detections of each two-fly frame and measure them.

    Only frames (unique ``t``) with exactly 2 detections are kept. Within
    a frame the detection with the lower ``id`` becomes point 1.

    Returns:
        DataFrame with columns ``x1, y1, x2, y2, t_s, dist_px``, one row
        per kept frame in ascending ``t`` order.
    """
    frame_sizes = df.groupby("t").size()
    two_fly_t = frame_sizes.index[(frame_sizes == 2).to_numpy()]
    ordered = df.loc[df["t"].isin(two_fly_t)].sort_values(["t", "id"])

    aggregations = {
        "x1": ("x", "first"),
        "y1": ("y", "first"),
        "x2": ("x", "last"),
        "y2": ("y", "last"),
        "t_s": ("t_s", "first"),
    }
    pairs = ordered.groupby("t").agg(**aggregations)

    dx = pairs["x1"] - pairs["x2"]
    dy = pairs["y1"] - pairs["y2"]
    pairs["dist_px"] = np.hypot(dx, dy)
    return pairs.reset_index(drop=True)
|
||||||
|
|
||||||
|
|
||||||
|
def sliding_signals(df: pd.DataFrame, dist: pd.DataFrame,
                    window_s: float, step_s: float = 1.0) -> pd.DataFrame:
    """Summarise raw detections and pair distances over sliding windows.

    Windows start at the first timestamp in ``df`` and advance by
    ``step_s`` while the start is below ``last timestamp - window_s``.
    Windows with no detections are skipped.

    Args:
        df: Detections with columns ``t_s`` and ``y``.
        dist: Pair distances with columns ``t_s`` and ``dist_px``.
        window_s: Window length in seconds.
        step_s: Spacing between window starts, in seconds.

    Returns:
        One row per non-empty window with ``mid_t``, ``y_range``,
        ``y_mid_dist`` (mean |y - median y of the whole ROI|), ``min_dist``
        and ``mean_dist`` (both NaN when the window has no pair distances).
        A column-less empty frame is returned when ``df`` is empty.
    """
    if df.empty:
        return pd.DataFrame()

    # The median y over the whole recording stands in for the ROI midline.
    midline = df["y"].median()
    first_t = df["t_s"].min()
    last_t = df["t_s"].max()

    def summarise(start):
        stop = start + window_s
        detections = df[(df["t_s"] >= start) & (df["t_s"] < stop)]
        distances = dist[(dist["t_s"] >= start) & (dist["t_s"] < stop)]
        if detections.empty:
            return None
        if distances.empty:
            closest = average = np.nan
        else:
            closest = distances["dist_px"].min()
            average = distances["dist_px"].mean()
        return {
            "mid_t": start + window_s / 2,
            "y_range": detections["y"].max() - detections["y"].min(),
            "y_mid_dist": (detections["y"] - midline).abs().mean(),
            "min_dist": closest,
            "mean_dist": average,
        }

    starts = np.arange(first_t, last_t - window_s, step_s)
    records = [rec for rec in map(summarise, starts) if rec is not None]
    return pd.DataFrame(records)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Plot candidate barrier signals for the ground-truth recording.

    Draws a 6 x 3 grid (one row per ROI): raw y(t), per-frame inter-fly
    distance, and sliding-window min/mean distance. Every panel carries a
    dashed red line at the known opening time. The figure is saved under
    ``FIGURES``.

    Raises:
        FileNotFoundError: if the calibration DB is not in
            ``TRACKING_OUTPUT_DIR``.
    """
    db = TRACKING_OUTPUT_DIR / DB_NAME
    if not db.exists():
        raise FileNotFoundError(db)

    fig, axes = plt.subplots(6, 3, figsize=(16, 22), sharex=True)
    # Zoom: only plot first 200 s — opening is < 90s in all known cases.
    XLIM = (0, 200)
    opening_line = {"color": "red", "lw": 1, "ls": "--"}

    for roi, (ax_raw, ax_dist, ax_min) in enumerate(axes, start=1):
        df = load_roi(db, roi)
        dist = per_frame_distance(df)
        windowed = sliding_signals(df, dist, WINDOW_S)

        # Column 1: raw y-positions, zoomed on the early window.
        ax_raw.scatter(df["t_s"], df["y"], s=0.5, alpha=0.4, c="steelblue")
        ax_raw.axvline(KNOWN_OPENING_S, label=f"opening = {KNOWN_OPENING_S}s",
                       **opening_line)
        ax_raw.set_ylabel(f"ROI {roi}\ny (px)")

        # Column 2: raw inter-fly distance (per frame).
        ax_dist.plot(dist["t_s"], dist["dist_px"], lw=0.4, alpha=0.6, color="steelblue")
        ax_dist.axvline(KNOWN_OPENING_S, **opening_line)
        ax_dist.set_ylabel("dist (px)")

        # Column 3: sliding-window min and mean inter-fly distance.
        ax_min.plot(windowed["mid_t"], windowed["min_dist"], color="darkgreen", label="min")
        ax_min.plot(windowed["mid_t"], windowed["mean_dist"], color="purple", label="mean", lw=0.8)
        ax_min.axvline(KNOWN_OPENING_S, **opening_line)
        ax_min.set_ylabel("dist (px)")

        for ax in (ax_raw, ax_dist, ax_min):
            ax.set_xlim(*XLIM)

        # Column titles and legends only on the top row.
        if roi == 1:
            ax_raw.set_title("Raw y(t)")
            ax_raw.legend(loc="upper right", fontsize=8)
            ax_dist.set_title("Per-frame inter-fly distance")
            ax_min.set_title(f"min/mean inter-fly distance over {WINDOW_S}s window")
            ax_min.legend(loc="upper right", fontsize=8)

    for ax in axes[-1]:
        ax.set_xlabel("time (s)")

    fig.suptitle(
        f"Barrier-opening signal exploration\n"
        f"machine 076, session 16-03-10 · ground truth: {KNOWN_OPENING_S}s",
        fontsize=14,
    )
    fig.tight_layout()

    FIGURES.mkdir(parents=True, exist_ok=True)
    out = FIGURES / "barrier_signal_076_16-03-10.png"
    fig.savefig(out, dpi=120, bbox_inches="tight")
    print(f"saved {out}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
115
scripts/merge_2025_07_15_into_xlsx.py
Normal file
115
scripts/merge_2025_07_15_into_xlsx.py
Normal file
|
|
@ -0,0 +1,115 @@
|
||||||
|
"""One-off: pivot the legacy 2025_07_15_metadata_fixed.csv into the merged xlsx.
|
||||||
|
|
||||||
|
The 2025-07-15 pilot batch was indexed by a separate CSV with one row per
|
||||||
|
(machine, HHMMSS, ROI). The unified xlsx instead has one row per fly
|
||||||
|
(machine, ROI) with both `training_date_time` and `testing_date_time`.
|
||||||
|
This script pivots the CSV to match that schema and appends the result
|
||||||
|
to the xlsx, after backing up the original.
|
||||||
|
|
||||||
|
Idempotent: if any row for date == 2025-07-15 already exists, abort.
|
||||||
|
|
||||||
|
Run:
|
||||||
|
python merge_2025_07_15_into_xlsx.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import shutil
|
||||||
|
import sys
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from config import VIDEO_INFO_XLSX, DATA_METADATA
|
||||||
|
|
||||||
|
LEGACY_CSV = DATA_METADATA / "2025_07_15_metadata_fixed.csv"
|
||||||
|
|
||||||
|
# Maps machine number → (training-session HHMMSS, testing-session HHMMSS).
# Machines with a single recording (268, 139) carry None in the testing
# slot.
SESSION_PAIRS: dict[int, tuple[str, str | None]] = {
    76: ("16-03-10", "16-31-34"),
    145: ("16-03-27", "16-31-41"),
    # only one recording; treat as training
    268: ("16-32-05", None),
    # only one recording; never tracked
    139: ("16-31-52", None),
}
|
||||||
|
|
||||||
|
|
||||||
|
def hhmmss_to_xlsx_time(date: str, hhmmss: str) -> str:
    """Convert a 24-hour ``HH-MM-SS`` stamp to the xlsx date-time format.

    '16-03-10' on date 2025-07-15 → '20250715_403PM'.

    The xlsx uses a 12-hour clock with an AM/PM suffix; the minutes are
    omitted when they are zero (16:00 → '4PM'). Seconds are dropped.

    Args:
        date: Session date as ``YYYY-MM-DD``.
        hhmmss: Session time as ``HH-MM-SS`` on a 24-hour clock.

    Returns:
        ``<YYYYMMDD>_<H>[MM]<AM|PM>``.

    Raises:
        ValueError: if ``hhmmss`` is not three '-'-separated integers.
    """
    hour, minute, _second = (int(part) for part in hhmmss.split("-"))
    suffix = "AM" if hour < 12 else "PM"
    # On a 12-hour clock both midnight (0) and noon (12) read as "12";
    # plain ``hour % 12`` would render midnight as hour 0.
    hour12 = hour % 12 or 12
    ymd = date.replace("-", "")
    minutes = f"{minute:02d}" if minute else ""
    return f"{ymd}_{hour12}{minutes}{suffix}"
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Pivot the legacy 2025-07-15 CSV and append it to the merged xlsx.

    Steps: check both input files exist, refuse to run if the xlsx already
    has rows dated 2025-07-15, collapse the CSV to one row per
    (machine, ROI), build rows in the xlsx schema, back up the xlsx, then
    write the concatenated table back.

    Exits via ``sys.exit`` (no backup, no write) when an input is missing,
    when the batch is already present, or when no CSV row maps to a
    machine listed in ``SESSION_PAIRS``.
    """
    if not LEGACY_CSV.exists():
        sys.exit(f"legacy CSV not found at {LEGACY_CSV}")
    if not VIDEO_INFO_XLSX.exists():
        sys.exit(f"xlsx not found at {VIDEO_INFO_XLSX}")

    legacy = pd.read_csv(LEGACY_CSV)
    xlsx = pd.read_excel(VIDEO_INFO_XLSX)

    # Idempotency check: if 2025-07-15 already in the xlsx, refuse.
    existing_dates = pd.to_datetime(xlsx["date"]).dt.strftime("%Y-%m-%d")
    if (existing_dates == "2025-07-15").any():
        sys.exit("xlsx already contains 2025-07-15 rows; nothing to do.")

    # Build one row per (machine, ROI). The legacy CSV has duplicate rows
    # per session — collapse on (machine, ROI) and pick metadata from any.
    legacy["machine_int"] = legacy["machine_name"].astype(int)
    by_fly = legacy.groupby(["machine_int", "ROI"], as_index=False).agg(
        genotype=("genotype", "first"),
        group=("group", "first"),
    )

    rows = []
    for _, fly in by_fly.iterrows():
        machine_int = int(fly["machine_int"])
        if machine_int not in SESSION_PAIRS:
            print(f" skip machine {machine_int}: no session pairing defined")
            continue
        train_hhmmss, test_hhmmss = SESSION_PAIRS[machine_int]
        rows.append({
            "source_date": "20250715",
            "date": pd.Timestamp("2025-07-15"),
            "machine_name": f"ETHOSCOPE_{machine_int:03d}",
            "roi": int(fly["ROI"]),
            "species": "Melanogaster/CS" if fly["genotype"] == "CS" else fly["genotype"],
            "male": fly["group"],  # 'trained' / 'naive' already canonical
            "collected": pd.NaT,
            "training_date_time": hhmmss_to_xlsx_time("2025-07-15", train_hhmmss),
            "testing_date_time": hhmmss_to_xlsx_time("2025-07-15", test_hhmmss) if test_hhmmss else "",
            "training_length_hr": pd.NA,
            "consolidation_length_hr": pd.NA,
            "memory": pd.NA,
            "age": pd.NA,
        })

    # With no rows the frame below would have no columns and the preview
    # print would raise KeyError; stop before touching the xlsx at all.
    if not rows:
        sys.exit("no rows to add: no CSV row matched a machine in SESSION_PAIRS.")

    new_df = pd.DataFrame(rows)
    print(f"adding {len(new_df)} rows for the 2025-07-15 batch:")
    print(new_df[["machine_name", "roi", "male", "training_date_time", "testing_date_time"]])

    # Back up the xlsx, then append.
    backup = VIDEO_INFO_XLSX.with_suffix(
        f".backup_{datetime.now():%Y%m%d_%H%M%S}.xlsx"
    )
    shutil.copy2(VIDEO_INFO_XLSX, backup)
    print(f"\nbacked up xlsx → {backup}")

    # NOTE(review): read_excel / to_excel round-trip a single sheet of cell
    # values — confirm the workbook has no other sheets or formatting that
    # must survive (the backup above keeps the original either way).
    merged = pd.concat([xlsx, new_df], ignore_index=True)
    merged.to_excel(VIDEO_INFO_XLSX, index=False)
    print(f"wrote {VIDEO_INFO_XLSX} ({len(merged)} rows total)")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -363,6 +363,12 @@ def main() -> None:
|
||||||
"--limit", type=int, default=None,
|
"--limit", type=int, default=None,
|
||||||
help="only process the first N videos",
|
help="only process the first N videos",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--video", action="append", default=[],
|
||||||
|
metavar="MP4_PATH",
|
||||||
|
help="explicit mp4 path to pick targets for (bypasses the inventory's "
|
||||||
|
"in_xlsx filter). Repeat to specify multiple videos.",
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if not INVENTORY_CSV.exists():
|
if not INVENTORY_CSV.exists():
|
||||||
|
|
@ -372,7 +378,27 @@ def main() -> None:
|
||||||
)
|
)
|
||||||
|
|
||||||
inv = pd.read_csv(INVENTORY_CSV)
|
inv = pd.read_csv(INVENTORY_CSV)
|
||||||
todo = inv[inv["in_xlsx"] & ~inv["already_tracked"]].copy()
|
if args.video:
|
||||||
|
# Reason: explicit --video paths skip the in_xlsx filter so we can
|
||||||
|
# re-track recordings that aren't in the merged xlsx (e.g. the
|
||||||
|
# 2025-07-15 multi-recording sessions). Each path must exist in
|
||||||
|
# the inventory so we still get machine_name / session_datetime
|
||||||
|
# for the prompt; build a small synthetic todo from those rows.
|
||||||
|
wanted = {str(Path(p).resolve()) for p in args.video}
|
||||||
|
inv["_resolved"] = inv["mp4_path"].apply(lambda p: str(Path(p).resolve()))
|
||||||
|
todo = inv[inv["_resolved"].isin(wanted)].drop(columns="_resolved").copy()
|
||||||
|
missing = wanted - set(
|
||||||
|
inv["mp4_path"].apply(lambda p: str(Path(p).resolve()))
|
||||||
|
)
|
||||||
|
if missing:
|
||||||
|
print(f"⚠ {len(missing)} requested video(s) not in inventory; "
|
||||||
|
"rebuild it with build_video_inventory.py if needed:")
|
||||||
|
for m in sorted(missing):
|
||||||
|
print(f" {m}")
|
||||||
|
if todo.empty:
|
||||||
|
sys.exit("No matching videos in inventory.")
|
||||||
|
else:
|
||||||
|
todo = inv[inv["in_xlsx"] & ~inv["already_tracked"]].copy()
|
||||||
todo = todo.sort_values(
|
todo = todo.sort_values(
|
||||||
["session_date", "machine_name", "session_time"]
|
["session_date", "machine_name", "session_time"]
|
||||||
).reset_index(drop=True)
|
).reset_index(drop=True)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue