From 53b45e373b3507d3fa1c6aa3c797693a25660f54 Mon Sep 17 00:00:00 2001
From: Giorgio Gilestro <giorgio@gilest.ro>
Date: Fri, 1 May 2026 13:39:57 +0100
Subject: [PATCH] Dedupe + canonicalise the merged xlsx, then guard the export
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

108 of 508 rows in all_video_info_merged.xlsx were duplicates left over
from merging multiple source spreadsheets — same (date, machine, ROI)
appearing under two source_date values, identical data otherwise. The
`male` column was also using a mix of variants ('naïve', 'niave',
'naive', 'trained') with the canonical 'naive' a minority of 12/200.

scripts/cleanup_xlsx.py
    Idempotent one-off: backs up the xlsx, dedupes preferring the row
    whose source_date matches the experiment date, normalises `male`
    spellings, strips whitespace from string columns. Re-running on a
    clean file leaves the data unchanged (a fresh backup is still made).

scripts/export_video_db_index.py
    New _validate_xlsx() runs first thing in main() and aborts the
    export with an actionable error if duplicates or non-canonical
    male values are present. Prevents silent regressions when the
    xlsx is edited or re-merged in the future.

Result: TSV is now 400 rows (was 508), exactly 200 trained / 200
naive, no duplicates.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 scripts/cleanup_xlsx.py          | 120 +++++++++++++++++++++++++++++++
 scripts/export_video_db_index.py |  29 ++++++++
 2 files changed, 149 insertions(+)
 create mode 100644 scripts/cleanup_xlsx.py

diff --git a/scripts/cleanup_xlsx.py b/scripts/cleanup_xlsx.py
new file mode 100644
index 0000000..589702f
--- /dev/null
+++ b/scripts/cleanup_xlsx.py
@@ -0,0 +1,120 @@
+"""One-off cleanup of all_video_info_merged.xlsx.
+
+Removes accidental duplicate rows that crept in when multiple source
+spreadsheets were merged, and canonicalises the `male` column
+(`naïve` / `niave` → `naive`, plus stripping whitespace).
+
+Idempotent: re-running on a cleaned file leaves the data unchanged (the
+xlsx is still backed up and rewritten).
+
+Dedup rule: when multiple rows share (date, machine_name, roi):
+    1. Prefer the row whose source_date matches the experiment date
+       (DDMMYYYY format). This keeps the most-recently-curated row,
+       since the user typically sanitises in the source_date file
+       matching the experiment date.
+    2. If no row matches, keep the first one in file order (e.g. when
+       the source_date covers multiple experiment dates, such as
+       "03102024-04102024").
+
+Run:
+    python cleanup_xlsx.py            # backs up + writes cleaned xlsx
+    python cleanup_xlsx.py --dry-run  # shows what would change
+"""
+
+from __future__ import annotations
+
+import argparse
+import shutil
+import sys
+from datetime import datetime
+
+import pandas as pd
+
+from config import VIDEO_INFO_XLSX
+
+_MALE_NAIVE_VARIANTS = {"naïve", "niave", "naive"}
+
+
+def normalize_male(v):  # canonicalise one `male` cell
+    if pd.isna(v):
+        return v  # missing values pass through untouched
+    s = str(v).strip()
+    if s.lower() in _MALE_NAIVE_VARIANTS:  # case-insensitive spelling match
+        return "naive"
+    if s.lower() == "trained":
+        return "trained"
+    return s   # leave anything unexpected for the analyst to inspect
+
+
+def strip_strings(df: pd.DataFrame) -> pd.DataFrame:
+    """Strip leading/trailing whitespace from every string cell."""
+    for col in df.select_dtypes(include=["object", "string"]).columns:
+        df[col] = df[col].apply(lambda v: v.strip() if isinstance(v, str) else v)
+    return df  # same object that was passed in, modified in place
+
+
+def dedup_by_canonical_source(df: pd.DataFrame, key: list[str]) -> pd.DataFrame:
+    """Keep one row per `key` group, preferring source_date == date."""
+    date_compact = pd.to_datetime(df["date"]).dt.strftime("%d%m%Y")  # DDMMYYYY
+    df = df.copy()  # work on a copy so the caller's frame is not modified
+    df["_match"] = (df["source_date"].astype(str) == date_compact).astype(int)
+    # Stable sort: matching rows lead each key group and ties keep their input
+    # order, so keep="first" falls back to the earliest row when none match.
+    df = df.sort_values(["_match"], ascending=False, kind="stable")
+    df = df.drop_duplicates(subset=key, keep="first")
+    df = df.drop(columns="_match")
+    return df.sort_values(key).reset_index(drop=True)  # ordered by key
+
+
+def main() -> None:  # CLI entry point: clean VIDEO_INFO_XLSX in place
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--dry-run", action="store_true",
+                        help="show what would change without writing")
+    args = parser.parse_args()
+
+    if not VIDEO_INFO_XLSX.exists():
+        sys.exit(f"xlsx not found at {VIDEO_INFO_XLSX}")
+
+    df = pd.read_excel(VIDEO_INFO_XLSX)
+    n_before = len(df)
+
+    df = strip_strings(df)
+
+    # Dedup: the frame is only re-sorted when duplicate keys actually exist
+    key = ["date", "machine_name", "roi"]
+    n_unique = df[key].drop_duplicates().shape[0]
+    if n_unique < n_before:
+        print(f"de-duplicating {n_before - n_unique} rows "
+              f"(currently {n_before} rows, {n_unique} unique by {key})")
+        df = dedup_by_canonical_source(df, key)
+    else:
+        print(f"no duplicate rows (all {n_before} are unique on {key})")
+
+    # Normalise male (after dedup, so the counts describe surviving rows)
+    male_before = df["male"].value_counts(dropna=False).to_dict()
+    df["male"] = df["male"].apply(normalize_male)
+    male_after = df["male"].value_counts(dropna=False).to_dict()
+    if male_before != male_after:
+        print(f"normalised `male` column: {male_before}  →  {male_after}")
+    else:  # nothing changed; unrecognised values, if any, are still present
+        print(f"`male` column already canonical: {male_after}")
+
+    n_after = len(df)
+    print(f"\nfinal: {n_after} rows  (was {n_before})")
+
+    if args.dry_run:
+        print("--dry-run: not writing")
+        return
+
+    backup = VIDEO_INFO_XLSX.with_suffix(
+        f".backup_{datetime.now():%Y%m%d_%H%M%S}.xlsx"
+    )
+    shutil.copy2(VIDEO_INFO_XLSX, backup)  # snapshot before overwriting
+    print(f"backed up xlsx → {backup}")
+
+    df.to_excel(VIDEO_INFO_XLSX, index=False)  # rewritten even if unchanged
+    print(f"wrote cleaned xlsx → {VIDEO_INFO_XLSX}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/export_video_db_index.py b/scripts/export_video_db_index.py
index 0caa0d4..70011e0 100644
--- a/scripts/export_video_db_index.py
+++ b/scripts/export_video_db_index.py
@@ -128,6 +128,34 @@ def resolve_session(
 _MALE_NAIVE_VARIANTS = {"naïve", "niave", "naive"}
 
 
+def _validate_xlsx(df: pd.DataFrame) -> None:
+    """Refuse to export if the xlsx has duplicates or non-canonical values.
+
+    The export pipeline assumes one row per (date, machine_name, roi). If
+    that ever stops being true (e.g. a future merge re-introduces dupes),
+    every downstream count silently doubles. Catch it at the source.
+    """
+    key = ["date", "machine_name", "roi"]  # compared as stored, unstripped
+    dupes = df[df.duplicated(subset=key, keep=False)]  # every copy in a group
+    if not dupes.empty:
+        n_unique = df[key].drop_duplicates().shape[0]
+        sample = dupes.head(4)[["date", "machine_name", "roi", "source_date"]]
+        raise SystemExit(
+            f"\n  ERROR: xlsx has {len(dupes)} duplicate rows "
+            f"({len(df)} total, {n_unique} unique on {key}).\n"
+            f"  Sample:\n{sample.to_string(index=False)}\n"
+            f"  Run scripts/cleanup_xlsx.py to fix.\n"
+        )
+    bad_male = sorted(set(df["male"].dropna().astype(str).str.strip().unique())
+                      - {"naive", "trained"})  # exact, case-sensitive match
+    if bad_male:
+        raise SystemExit(
+            f"\n  ERROR: xlsx `male` column has non-canonical values: {bad_male}\n"
+            f"  Expected only 'trained' and 'naive'.\n"
+            f"  Run scripts/cleanup_xlsx.py to fix.\n"
+        )
+
+
 def _normalize_metadata(df: pd.DataFrame) -> None:
     """Strip whitespace and canonicalize the ``male`` column in place."""
     for col in df.select_dtypes(include=("object", "string")).columns:
@@ -152,6 +180,7 @@ def main() -> None:
     index = build_session_index(inv)
 
     df = pd.read_excel(VIDEO_INFO_XLSX)
+    _validate_xlsx(df)
     _normalize_metadata(df)
     date_iso = pd.to_datetime(df["date"]).dt.strftime("%Y-%m-%d")