cupido/scripts/barrier_picker_app/app.py
Giorgio Gilestro d0f0e2d443 Picker: render existing annotation as a clickable mark below the video
When a video that already has a barrier_opening row is loaded, a thin
bar now appears below the player with a red vertical line at the
marked opening_s. A tooltip shows the timestamp, the analyst's
initials, and the bad_rois flag. Clicking the line snaps the player
to the exact opening; clicking elsewhere on the bar seeks
proportionally. Bar is hidden for un-annotated videos.

API change: each queue item now carries an `existing` field with the
opening_s, bad_rois, analyst, etc. from barrier_opening.csv when
available, so the frontend can paint the mark without extra requests.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-01 14:27:33 +01:00

358 lines
14 KiB
Python

"""FastAPI server for the web-based barrier-opening picker.
Browse to http://<host>:8000/ and you'll see a video player loaded with
the next un-annotated video from the queue. Use the arrow keys to
scrub (←/→ ±5 s, Shift+←/→ ±1 s, Ctrl+←/→ ±0.1 s), space to pause/play,
or click the seekbar. When the barrier opens, click one of:
[All barriers open] every ROI is usable post-opening
[Upper barriers open] only ROIs 1,3,5 are usable
[Lower barriers open] only ROIs 2,4,6 are usable
The current playhead time is recorded as the barrier-opening moment;
ROI inclusion is set accordingly. There is also a Skip and a Mark
unusable button.
The queue is built from the merged TSV plus the inventory: every
unique (machine, date, time) that has both a tracking DB and an mp4
on disk and is not yet in barrier_opening.csv. Submissions persist
to barrier_opening.csv after every click — refresh-safe.
Configuration (environment variables):
CUPIDO_DATA_VOLUME /mnt/data/projects/cupido (data volume)
CUPIDO_INVENTORY_CSV /cupido/data/metadata/video_inventory.csv
CUPIDO_OUTPUT_CSV /cupido/data/metadata/barrier_opening.csv
"""
from __future__ import annotations
import os
import re
from dataclasses import dataclass
from pathlib import Path
import pandas as pd
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse, Response
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
# ─── Config ──────────────────────────────────────────────────────────────
DATA_VOLUME = Path(os.environ.get("CUPIDO_DATA_VOLUME", "/mnt/data/projects/cupido"))
INVENTORY_CSV = Path(os.environ.get(
"CUPIDO_INVENTORY_CSV", "/cupido/data/metadata/video_inventory.csv"
))
OUTPUT_CSV = Path(os.environ.get(
"CUPIDO_OUTPUT_CSV", "/cupido/data/metadata/barrier_opening.csv"
))
TSV_PATH = DATA_VOLUME / "all_video_info_merged.tsv"
# Reason: the (date, time, machine_uuid) prefix encoded in every tracking
# DB filename and every inventory mp4 filename.
DB_NAME_RE = re.compile(
r"^(\d{4}-\d{2}-\d{2})_(\d{2}-\d{2}-\d{2})_([0-9a-f]{32})__"
)
OUT_COLS = ["machine_name", "session_date", "session_time",
"opening_s", "trim_first_s", "bad_rois", "analyst", "notes"]
# ROI numbering in the HD mating arena (verified via tracking_geometry):
# upper row = ROIs 1, 3, 5 (y ≈ 0.125)
# lower row = ROIs 2, 4, 6 (y ≈ 0.795)
ROIS_UPPER = "1,3,5"
ROIS_LOWER = "2,4,6"
@dataclass(frozen=True)
class QueueItem:
idx: int
machine_name: str
session_date: str
session_time: str
mp4_path: str
duration_s: float | None
done: bool
metadata: dict # experimental fields aggregated from the merged TSV
existing: dict | None # current barrier_opening.csv row if already picked
# ─── Queue building ─────────────────────────────────────────────────────
_META_FIELDS = (
"species", "training_length_hr", "consolidation_length_hr",
"memory", "age", "training_date_time", "testing_date_time",
)
def _aggregate_metadata(rows: pd.DataFrame, db_filename: str) -> dict:
"""Pull the experimental metadata for one video from its TSV rows.
Most fields are uniform across the 6 ROIs of a video so the first-row
value is representative. `male` is a per-fly label, so we summarise
counts. `session_role` flags whether this video was the training or
testing session for the flies in it.
"""
if rows.empty:
return {}
# Reason: the merged xlsx/TSV currently has duplicate rows per
# (date, machine, ROI). De-dup on those keys so the male counts and
# any per-ROI fields aren't doubled.
if {"date", "machine_name", "roi"}.issubset(rows.columns):
rows = rows.drop_duplicates(subset=["date", "machine_name", "roi"])
r0 = rows.iloc[0]
meta = {}
for f in _META_FIELDS:
v = r0.get(f)
if pd.isna(v):
meta[f] = None
else:
meta[f] = v if isinstance(v, str) else (
int(v) if isinstance(v, float) and v.is_integer() else v
)
# Per-ROI tally.
if "male" in rows.columns:
m = rows["male"].dropna()
meta["n_trained"] = int((m == "trained").sum())
meta["n_naive"] = int((m == "naive").sum())
# Was this the training session, the testing session, or both?
is_training = rows["training_db_path"].astype(str).str.endswith(db_filename).any()
is_testing = rows["testing_db_path"].astype(str).str.endswith(db_filename).any()
if is_training and is_testing:
meta["session_role"] = "training+testing"
elif is_training:
meta["session_role"] = "training"
elif is_testing:
meta["session_role"] = "testing"
else:
meta["session_role"] = "?"
return meta
def _build_queue() -> list[QueueItem]:
"""Build the ordered queue of pickable videos."""
if not TSV_PATH.exists():
raise RuntimeError(f"merged TSV not found at {TSV_PATH}")
if not INVENTORY_CSV.exists():
raise RuntimeError(f"inventory not found at {INVENTORY_CSV}")
tsv = pd.read_csv(TSV_PATH, sep="\t")
inv = pd.read_csv(INVENTORY_CSV)
inv_by_key: dict[tuple[str, str, str], dict] = {}
for r in inv.itertuples(index=False):
inv_by_key[(r.machine_name, r.session_date, r.session_time)] = {
"mp4_path": r.mp4_path,
"duration_s": float(r.duration_s) if pd.notna(r.duration_s) else None,
}
existing_by_key: dict[tuple[str, str, str], dict] = {}
if OUTPUT_CSV.exists():
out = pd.read_csv(OUTPUT_CSV)
for _, r in out.iterrows():
key = (r["machine_name"], r["session_date"], r["session_time"])
existing_by_key[key] = {
"opening_s": float(r["opening_s"]) if pd.notna(r["opening_s"]) else None,
"trim_first_s": int(r["trim_first_s"]) if pd.notna(r.get("trim_first_s")) else 0,
"bad_rois": str(r.get("bad_rois", "") or "") if pd.notna(r.get("bad_rois")) else "",
"analyst": str(r.get("analyst", "") or "") if pd.notna(r.get("analyst")) else "",
"notes": str(r.get("notes", "") or "") if pd.notna(r.get("notes")) else "",
}
done_keys = set(existing_by_key)
seen: set[tuple[str, str, str]] = set()
items: list[QueueItem] = []
for col in ("training_db_path", "testing_db_path"):
for row in tsv.itertuples(index=False):
db = getattr(row, col)
if not isinstance(db, str) or not db:
continue
db_path = Path(db)
if not db_path.exists():
continue
m = DB_NAME_RE.match(db_path.name)
if not m:
continue
session_date, session_time = m.group(1), m.group(2)
key = (row.machine_name, session_date, session_time)
if key in seen:
continue
seen.add(key)
inv_row = inv_by_key.get(key)
if inv_row is None or not Path(inv_row["mp4_path"]).exists():
continue
# Reason: gather all TSV rows that reference this video — there
# are typically 6 ROI-rows per session, sometimes also rows
# using it as both training AND testing.
db_filename = db_path.name
related = tsv[
tsv["training_db_path"].astype(str).str.endswith(db_filename)
| tsv["testing_db_path"].astype(str).str.endswith(db_filename)
]
metadata = _aggregate_metadata(related, db_filename)
items.append(QueueItem(
idx=len(items),
machine_name=row.machine_name,
session_date=session_date,
session_time=session_time,
mp4_path=inv_row["mp4_path"],
duration_s=inv_row["duration_s"],
done=key in done_keys,
metadata=metadata,
existing=existing_by_key.get(key),
))
return items
# ─── App ───────────────────────────────────────────────────────────────
app = FastAPI(title="Cupido barrier-opening picker")
STATIC_DIR = Path(__file__).parent / "static"
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
@app.get("/")
async def index() -> FileResponse:
return FileResponse(STATIC_DIR / "index.html")
@app.get("/api/queue")
async def get_queue() -> JSONResponse:
queue = _build_queue()
return JSONResponse([
{
"idx": q.idx,
"machine_name": q.machine_name,
"session_date": q.session_date,
"session_time": q.session_time,
"duration_s": q.duration_s,
"done": q.done,
"metadata": q.metadata,
"existing": q.existing,
}
for q in queue
])
def _stream_video(file_path: Path, request: Request) -> Response:
"""HTTP Range-aware video streaming."""
file_size = file_path.stat().st_size
range_header = request.headers.get("range")
if range_header is None:
return FileResponse(file_path, media_type="video/mp4",
headers={"Accept-Ranges": "bytes"})
# Parse "bytes=START-END" (END optional)
m = re.match(r"bytes=(\d+)-(\d*)", range_header)
if not m:
raise HTTPException(status_code=416, detail="bad Range header")
start = int(m.group(1))
end = int(m.group(2)) if m.group(2) else file_size - 1
end = min(end, file_size - 1)
if start > end:
raise HTTPException(status_code=416, detail="range not satisfiable")
chunk_size = end - start + 1
def iterfile():
with open(file_path, "rb") as f:
f.seek(start)
remaining = chunk_size
while remaining > 0:
buf = f.read(min(64 * 1024, remaining))
if not buf:
break
yield buf
remaining -= len(buf)
return Response(
content=b"".join(iterfile()),
status_code=206,
media_type="video/mp4",
headers={
"Content-Range": f"bytes {start}-{end}/{file_size}",
"Accept-Ranges": "bytes",
"Content-Length": str(chunk_size),
},
)
@app.get("/api/video/{idx}")
async def get_video(idx: int, request: Request) -> Response:
queue = _build_queue()
if not 0 <= idx < len(queue):
raise HTTPException(status_code=404, detail="idx out of range")
return _stream_video(Path(queue[idx].mp4_path), request)
class Submission(BaseModel):
idx: int
time_s: float | None # None when marking unusable
mode: str # "all" | "upper" | "lower" | "unusable" | "skip"
analyst: str = "" # initials of the human picker (required, non-skip)
notes: str = ""
@app.post("/api/submit")
async def submit(payload: Submission) -> dict:
queue = _build_queue()
if not 0 <= payload.idx < len(queue):
raise HTTPException(status_code=404, detail="idx out of range")
item = queue[payload.idx]
if payload.mode == "skip":
return {"status": "skipped"}
analyst = payload.analyst.strip().upper()
if not analyst:
raise HTTPException(status_code=400, detail="analyst initials required")
if payload.mode == "unusable":
row = {
"machine_name": item.machine_name,
"session_date": item.session_date,
"session_time": item.session_time,
"opening_s": float("nan"),
"trim_first_s": 0,
"bad_rois": "",
"analyst": analyst,
"notes": payload.notes or "unusable",
}
else:
if payload.time_s is None:
raise HTTPException(status_code=400, detail="time_s required")
bad_rois = {
"all": "",
"upper": ROIS_LOWER, # upper-only opens → lower row is bad
"lower": ROIS_UPPER, # lower-only opens → upper row is bad
}.get(payload.mode)
if bad_rois is None:
raise HTTPException(status_code=400, detail=f"unknown mode: {payload.mode}")
row = {
"machine_name": item.machine_name,
"session_date": item.session_date,
"session_time": item.session_time,
"opening_s": round(payload.time_s, 1),
"trim_first_s": 0,
"bad_rois": bad_rois,
"analyst": analyst,
"notes": payload.notes,
}
OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
if OUTPUT_CSV.exists():
out = pd.read_csv(OUTPUT_CSV)
else:
out = pd.DataFrame(columns=OUT_COLS)
for col in OUT_COLS:
if col not in out.columns:
out[col] = ""
# Replace any existing row for this key.
mask = ~((out["machine_name"] == row["machine_name"])
& (out["session_date"] == row["session_date"])
& (out["session_time"] == row["session_time"]))
out = pd.concat([out[mask], pd.DataFrame([row])], ignore_index=True)
out[OUT_COLS].to_csv(OUTPUT_CSV, index=False)
return {"status": "saved", "row": row}
if __name__ == "__main__":
import uvicorn
host = os.environ.get("HOST", "0.0.0.0")
port = int(os.environ.get("PORT", "8000"))
uvicorn.run("app:app", host=host, port=port, reload=False)