From 53b45e373b3507d3fa1c6aa3c797693a25660f54 Mon Sep 17 00:00:00 2001 From: Giorgio Gilestro Date: Fri, 1 May 2026 13:39:57 +0100 Subject: [PATCH] Dedupe + canonicalise the merged xlsx, then guard the export MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 108 of 508 rows in all_video_info_merged.xlsx were duplicates left over from merging multiple source spreadsheets — same (date, machine, ROI) appearing under two source_date values, identical data otherwise. The `male` column was also using a mix of variants ('naïve', 'niave', 'naive', 'trained') with the canonical 'naive' a minority of 12/200. scripts/cleanup_xlsx.py Idempotent one-off: backs up the xlsx, dedupes preferring the row whose source_date matches the experiment date, normalises `male` spellings, strips whitespace from string columns. Re-running on a clean file is a no-op. scripts/export_video_db_index.py New _validate_xlsx() runs first thing in main() and aborts the export with an actionable error if duplicates or non-canonical male values are present. Prevents silent regressions when the xlsx is edited or re-merged in the future. Result: TSV is now 400 rows (was 508), exactly 200 trained / 200 naive, no duplicates. Co-Authored-By: Claude Opus 4.7 --- scripts/cleanup_xlsx.py | 120 +++++++++++++++++++++++++++++++ scripts/export_video_db_index.py | 29 ++++++++ 2 files changed, 149 insertions(+) create mode 100644 scripts/cleanup_xlsx.py diff --git a/scripts/cleanup_xlsx.py b/scripts/cleanup_xlsx.py new file mode 100644 index 0000000..589702f --- /dev/null +++ b/scripts/cleanup_xlsx.py @@ -0,0 +1,120 @@ +"""One-off cleanup of all_video_info_merged.xlsx. + +Removes accidental duplicate rows that crept in when multiple source +spreadsheets were merged, and canonicalises the `male` column +(`naïve` / `niave` → `naive`, plus stripping whitespace). + +Idempotent: re-running on a cleaned file is a no-op (besides creating a +fresh backup). 
# Accepted spellings of the "naive" label, compared after NFC normalisation
# and lower-casing.  "na\u00efve" is "naïve" with the precomposed i-diaeresis.
_MALE_NAIVE_VARIANTS = {"na\u00efve", "niave", "naive"}


def normalize_male(v):
    """Return the canonical spelling of one `male` cell.

    Missing values are returned unchanged.  Any spelling listed in
    ``_MALE_NAIVE_VARIANTS`` becomes ``"naive"`` and ``"trained"`` (any
    case) becomes ``"trained"``.  Every other value is returned stripped
    but otherwise untouched, so the analyst can inspect it.
    """
    if pd.isna(v):
        return v
    import unicodedata  # local import: only this function needs it

    s = str(v).strip()
    # Compose combining marks first so a decomposed "i" + U+0308 compares
    # equal to the precomposed character stored in the variants set.
    folded = unicodedata.normalize("NFC", s).lower()
    if folded in _MALE_NAIVE_VARIANTS:
        return "naive"
    if folded == "trained":
        return "trained"
    return s  # leave anything unexpected for the analyst to inspect


def strip_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Strip leading/trailing whitespace from every string cell.

    Only object/string columns are visited and non-``str`` cells in them
    are left alone.  The frame is modified in place and also returned.
    """
    for col in df.select_dtypes(include=["object", "string"]).columns:
        df[col] = df[col].apply(lambda v: v.strip() if isinstance(v, str) else v)
    return df


def _compact_source_date(value) -> str:
    """Render one `source_date` cell as text comparable to DDMMYYYY."""
    if pd.isna(value):
        return ""
    if isinstance(value, float) and value.is_integer():
        value = int(value)  # 3102024.0 -> 3102024
    text = str(value).strip()
    # A cell stored as a number loses its leading zero (03102024 -> 3102024);
    # pad pure-digit values back to eight characters.
    if text.isascii() and text.isdigit():
        return text.zfill(8)
    return text


def dedup_by_canonical_source(df: pd.DataFrame, key: list[str]) -> pd.DataFrame:
    """Keep one row per `key` group, preferring source_date == date.

    Selection rule within each group of rows sharing `key`:

    1. A row whose ``source_date`` equals the row's ``date`` formatted as
       DDMMYYYY wins over rows that do not match.
    2. If no row matches, the last row of the group is kept.

    Ties inside either category are resolved in favour of the later row.
    The input frame is not modified; the result is sorted by `key` and
    re-indexed from zero.
    """
    date_compact = pd.to_datetime(df["date"]).dt.strftime("%d%m%Y")
    source_compact = df["source_date"].map(_compact_source_date)

    ranked = df.copy()
    ranked["_match"] = (source_compact == date_compact).astype(int)
    # Ascending stable sort puts matching rows last within each key group and
    # keeps the prior row order otherwise, so keep="last" selects a matching
    # row when one exists and the last row of the group when none does.
    ranked = ranked.sort_values("_match", kind="stable")
    ranked = ranked.drop_duplicates(subset=key, keep="last")
    ranked = ranked.drop(columns="_match")
    return ranked.sort_values(key).reset_index(drop=True)


def main() -> None:
    """Clean the xlsx at ``VIDEO_INFO_XLSX`` and write it back.

    Steps: strip whitespace, de-duplicate on (date, machine_name, roi) if
    needed, normalise the `male` column, then copy the existing file to a
    timestamped backup and overwrite it.  With ``--dry-run`` the summary is
    printed and nothing is written.  Exits with a message if the xlsx does
    not exist.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--dry-run", action="store_true",
                        help="show what would change without writing")
    args = parser.parse_args()

    if not VIDEO_INFO_XLSX.exists():
        sys.exit(f"xlsx not found at {VIDEO_INFO_XLSX}")

    df = pd.read_excel(VIDEO_INFO_XLSX)
    n_before = len(df)

    df = strip_strings(df)

    # Dedup
    key = ["date", "machine_name", "roi"]
    n_unique = df[key].drop_duplicates().shape[0]
    if n_unique < n_before:
        print(f"de-duplicating {n_before - n_unique} rows "
              f"(currently {n_before} rows, {n_unique} unique by {key})")
        df = dedup_by_canonical_source(df, key)
    else:
        print(f"no duplicate rows (all {n_before} are unique on {key})")

    # Normalise male
    male_before = df["male"].value_counts(dropna=False).to_dict()
    df["male"] = df["male"].apply(normalize_male)
    male_after = df["male"].value_counts(dropna=False).to_dict()
    if male_before != male_after:
        print(f"normalised `male` column: {male_before} → {male_after}")
    else:
        print(f"`male` column already canonical: {male_after}")

    n_after = len(df)
    print(f"\nfinal: {n_after} rows (was {n_before})")

    if args.dry_run:
        print("--dry-run: not writing")
        return

    # Back up next to the original before overwriting it.
    backup = VIDEO_INFO_XLSX.with_suffix(
        f".backup_{datetime.now():%Y%m%d_%H%M%S}.xlsx"
    )
    shutil.copy2(VIDEO_INFO_XLSX, backup)
    print(f"backed up xlsx → {backup}")

    df.to_excel(VIDEO_INFO_XLSX, index=False)
    print(f"wrote cleaned xlsx → {VIDEO_INFO_XLSX}")


if __name__ == "__main__":
    main()
_MALE_NAIVE_VARIANTS = {"naïve", "niave", "naive"} +def _validate_xlsx(df: pd.DataFrame) -> None: + """Refuse to export if the xlsx has duplicates or non-canonical values. + + The export pipeline assumes one row per (date, machine_name, roi). If + that ever stops being true (e.g. a future merge re-introduces dupes), + every downstream count silently doubles. Catch it at the source. + """ + key = ["date", "machine_name", "roi"] + dupes = df[df.duplicated(subset=key, keep=False)] + if not dupes.empty: + n_unique = df[key].drop_duplicates().shape[0] + sample = dupes.head(4)[["date", "machine_name", "roi", "source_date"]] + raise SystemExit( + f"\n ERROR: xlsx has {len(dupes)} duplicate rows " + f"({len(df)} total, {n_unique} unique on {key}).\n" + f" Sample:\n{sample.to_string(index=False)}\n" + f" Run scripts/cleanup_xlsx.py to fix.\n" + ) + bad_male = sorted(set(df["male"].dropna().astype(str).str.strip().unique()) + - {"naive", "trained"}) + if bad_male: + raise SystemExit( + f"\n ERROR: xlsx `male` column has non-canonical values: {bad_male}\n" + f" Expected only 'trained' and 'naive'.\n" + f" Run scripts/cleanup_xlsx.py to fix.\n" + ) + + def _normalize_metadata(df: pd.DataFrame) -> None: """Strip whitespace and canonicalize the ``male`` column in place.""" for col in df.select_dtypes(include=("object", "string")).columns: @@ -152,6 +180,7 @@ def main() -> None: index = build_session_index(inv) df = pd.read_excel(VIDEO_INFO_XLSX) + _validate_xlsx(df) _normalize_metadata(df) date_iso = pd.to_datetime(df["date"]).dt.strftime("%Y-%m-%d")