Dedupe + canonicalise the merged xlsx, then guard the export

108 of 508 rows in all_video_info_merged.xlsx were duplicates left over from merging multiple source spreadsheets — the same (date, machine, ROI) appearing under two source_date values, with otherwise identical data. The `male` column was also using a mix of variants ('naïve', 'niave', 'naive', 'trained'), with the canonical 'naive' in a minority (12/200).

scripts/cleanup_xlsx.py: Idempotent one-off. Backs up the xlsx, dedupes (preferring the row whose source_date matches the experiment date), normalises `male` spellings, and strips whitespace from string columns. Re-running on a clean file is a no-op.

scripts/export_video_db_index.py: New _validate_xlsx() runs in main() as soon as the xlsx has been loaded and aborts the export with an actionable error if duplicates or non-canonical male values are present. This prevents silent regressions when the xlsx is edited or re-merged in the future.

Result: TSV is now 400 rows (was 508), exactly 200 trained / 200 naive, no duplicates.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-01 13:39:57 +01:00 · 2026-05-01 13:39:57 +01:00 · 53b45e373b
commit 53b45e373b
parent 4ed988a617
2 changed files with 149 additions and 0 deletions
--- a/scripts/export_video_db_index.py
+++ b/scripts/export_video_db_index.py
@ -128,6 +128,34 @@ def resolve_session(
 _MALE_NAIVE_VARIANTS = {"naïve", "niave", "naive"}


+def _validate_xlsx(df: pd.DataFrame) -> None:
+    """Refuse to export if the xlsx has duplicates or non-canonical values.
+
+    The export pipeline assumes one row per (date, machine_name, roi). If
+    that ever stops being true (e.g. a future merge re-introduces dupes),
+    every downstream count silently doubles. Catch it at the source.
+
+    String key cells are compared with surrounding whitespace stripped:
+    this check sees the sheet before ``_normalize_metadata`` strips it, so
+    keys that differ only by stray whitespace would pass here and could
+    collapse into duplicates afterwards. ``df`` itself is not modified.
+
+    Args:
+        df: Sheet contents as loaded from the xlsx. Needs the ``date``,
+            ``machine_name``, ``roi`` and ``male`` columns; ``source_date``
+            is added to the duplicate sample when present.
+
+    Raises:
+        SystemExit: If any key occurs more than once, or if ``male`` holds
+            a value other than 'trained' / 'naive' after stripping
+            (missing values are ignored).
+    """
+    key = ["date", "machine_name", "roi"]
+
+    # Whitespace-insensitive copy of the key columns; non-string cells
+    # (dates, numbers, NaN) pass through unchanged.
+    norm_key = df[key].copy()
+    for col in key:
+        norm_key[col] = norm_key[col].map(
+            lambda cell: cell.strip() if isinstance(cell, str) else cell
+        )
+
+    is_dupe = norm_key.duplicated(keep=False)
+    if is_dupe.any():
+        n_unique = len(norm_key.drop_duplicates())
+        # Report the surplus rows so the numbers in the message add up
+        # (duplicates + unique == total); counting every member of each
+        # duplicate group would overstate how many rows must be removed.
+        n_extra = len(df) - n_unique
+        # source_date is diagnostic only: show it when available, but do not
+        # let its absence replace the real error with a KeyError.
+        sample_cols = [c for c in key + ["source_date"] if c in df.columns]
+        sample = df.loc[is_dupe, sample_cols].head(4)
+        raise SystemExit(
+            f"\n  ERROR: xlsx has {n_extra} duplicate rows "
+            f"({len(df)} total, {n_unique} unique on {key}).\n"
+            f"  Sample:\n{sample.to_string(index=False)}\n"
+            f"  Run scripts/cleanup_xlsx.py to fix.\n"
+        )
+
+    # Missing values are tolerated; anything else must already be canonical.
+    male_values = set(df["male"].dropna().astype(str).str.strip())
+    bad_male = sorted(male_values - {"naive", "trained"})
+    if bad_male:
+        raise SystemExit(
+            f"\n  ERROR: xlsx `male` column has non-canonical values: {bad_male}\n"
+            f"  Expected only 'trained' and 'naive'.\n"
+            f"  Run scripts/cleanup_xlsx.py to fix.\n"
+        )
+
+
 def _normalize_metadata(df: pd.DataFrame) -> None:
    """Strip whitespace and canonicalize the ``male`` column in place."""
    for col in df.select_dtypes(include=("object", "string")).columns:
@ -152,6 +180,7 @@ def main() -> None:
    index = build_session_index(inv)

    df = pd.read_excel(VIDEO_INFO_XLSX)
+    _validate_xlsx(df)
    _normalize_metadata(df)
    date_iso = pd.to_datetime(df["date"]).dt.strftime("%Y-%m-%d")