"""One-off cleanup of all_video_info_merged.xlsx. Removes accidental duplicate rows that crept in when multiple source spreadsheets were merged, and canonicalises the `male` column (`naïve` / `niave` → `naive`, plus stripping whitespace). Idempotent: re-running on a cleaned file is a no-op (besides creating a fresh backup). Dedup rule: when multiple rows share (date, machine_name, roi): 1. Prefer the row whose source_date matches the experiment date (DDMMYYYY format). This keeps the most-recently-curated row, since the user typically sanitises in the source_date file matching the experiment date. 2. If no row matches, keep the last one (preserve all data when the source_date covers multiple experiment dates, e.g. "03102024-04102024"). Run: python cleanup_xlsx.py # backs up + writes cleaned xlsx python cleanup_xlsx.py --dry-run # shows what would change """ from __future__ import annotations import argparse import shutil import sys from datetime import datetime import pandas as pd from config import VIDEO_INFO_XLSX _MALE_NAIVE_VARIANTS = {"naïve", "niave", "naive"} def normalize_male(v): if pd.isna(v): return v s = str(v).strip() if s.lower() in _MALE_NAIVE_VARIANTS: return "naive" if s.lower() == "trained": return "trained" return s # leave anything unexpected for the analyst to inspect def strip_strings(df: pd.DataFrame) -> pd.DataFrame: """Strip leading/trailing whitespace from every string cell.""" for col in df.select_dtypes(include=["object", "string"]).columns: df[col] = df[col].apply(lambda v: v.strip() if isinstance(v, str) else v) return df def dedup_by_canonical_source(df: pd.DataFrame, key: list[str]) -> pd.DataFrame: """Keep one row per `key` group, preferring source_date == date.""" date_compact = pd.to_datetime(df["date"]).dt.strftime("%d%m%Y") df = df.copy() df["_match"] = (df["source_date"].astype(str) == date_compact).astype(int) # Sort so matching-source rows come first within each key group; stable # sort preserves prior row order for the fallback case (no match). df = df.sort_values(["_match"], ascending=False, kind="stable") df = df.drop_duplicates(subset=key, keep="first") df = df.drop(columns="_match") return df.sort_values(key).reset_index(drop=True) def main() -> None: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--dry-run", action="store_true", help="show what would change without writing") args = parser.parse_args() if not VIDEO_INFO_XLSX.exists(): sys.exit(f"xlsx not found at {VIDEO_INFO_XLSX}") df = pd.read_excel(VIDEO_INFO_XLSX) n_before = len(df) df = strip_strings(df) # Dedup key = ["date", "machine_name", "roi"] n_unique = df[key].drop_duplicates().shape[0] if n_unique < n_before: print(f"de-duplicating {n_before - n_unique} rows " f"(currently {n_before} rows, {n_unique} unique by {key})") df = dedup_by_canonical_source(df, key) else: print(f"no duplicate rows (all {n_before} are unique on {key})") # Normalise male male_before = df["male"].value_counts(dropna=False).to_dict() df["male"] = df["male"].apply(normalize_male) male_after = df["male"].value_counts(dropna=False).to_dict() if male_before != male_after: print(f"normalised `male` column: {male_before} → {male_after}") else: print(f"`male` column already canonical: {male_after}") n_after = len(df) print(f"\nfinal: {n_after} rows (was {n_before})") if args.dry_run: print("--dry-run: not writing") return backup = VIDEO_INFO_XLSX.with_suffix( f".backup_{datetime.now():%Y%m%d_%H%M%S}.xlsx" ) shutil.copy2(VIDEO_INFO_XLSX, backup) print(f"backed up xlsx → {backup}") df.to_excel(VIDEO_INFO_XLSX, index=False) print(f"wrote cleaned xlsx → {VIDEO_INFO_XLSX}") if __name__ == "__main__": main()