# cupido/scripts/cleanup_xlsx.py

"""One-off cleanup of all_video_info_merged.xlsx.

Removes accidental duplicate rows that crept in when multiple source
spreadsheets were merged, and canonicalises the `male` column
(`naïve` / `niave` → `naive`, plus stripping whitespace).

Idempotent: re-running on a cleaned file is a no-op (besides creating a
fresh backup).

Dedup rule: when multiple rows share (date, machine_name, roi):
    1. Prefer the row whose source_date matches the experiment date
       (DDMMYYYY format). This keeps the most-recently-curated row,
       since the user typically sanitises in the source_date file
       matching the experiment date.
    2. If no row matches, keep the last one (preserve all data when
       the source_date covers multiple experiment dates, e.g.
       "03102024-04102024").

Run:
    python cleanup_xlsx.py            # backs up + writes cleaned xlsx
    python cleanup_xlsx.py --dry-run  # shows what would change
"""

from __future__ import annotations

import argparse
import shutil
import sys
import unicodedata
from datetime import datetime

import pandas as pd

from config import VIDEO_INFO_XLSX

# Lower-cased, NFC-composed spellings that all mean "naive". The accented
# form is written as an explicit escape so the literal stays precomposed
# regardless of how an editor saves this file.
_MALE_NAIVE_VARIANTS = {"na\u00efve", "niave", "naive"}


def normalize_male(v):
    """Return the canonical spelling of one `male` cell.

    Args:
        v: A cell value; anything ``pd.isna`` treats as missing is allowed.

    Returns:
        ``v`` itself when it is missing; ``"naive"`` for any known naive
        spelling; ``"trained"`` for any casing of "trained"; otherwise the
        whitespace-stripped string form of ``v``.
    """
    if pd.isna(v):
        return v
    s = str(v).strip()
    # Compose to NFC before comparing: "i" + U+0308 (combining diaeresis)
    # renders identically to the precomposed U+00EF but compares unequal.
    folded = unicodedata.normalize("NFC", s).lower()
    if folded in _MALE_NAIVE_VARIANTS:
        return "naive"
    if folded == "trained":
        return "trained"
    return s   # leave anything unexpected for the analyst to inspect


def strip_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Trim surrounding whitespace from every ``str`` cell.

    Only columns selected as ``object`` or ``string`` dtype are visited, and
    non-``str`` values inside them are passed through untouched.

    Args:
        df: Frame to clean. Its columns are reassigned in place.

    Returns:
        The same ``df`` object, for call-chaining convenience.
    """
    def _trim(cell):
        if isinstance(cell, str):
            return cell.strip()
        return cell

    text_columns = df.select_dtypes(include=["object", "string"]).columns
    for name in text_columns:
        df[name] = df[name].apply(_trim)
    return df


def dedup_by_canonical_source(df: pd.DataFrame, key: list[str]) -> pd.DataFrame:
    """Keep one row per `key` group, preferring source_date == date.

    Within each group of rows sharing the same `key` values:

    1. If any row's ``source_date`` equals its ``date`` formatted as
       DDMMYYYY, the first such row (in input order) is kept.
    2. Otherwise the last row (in input order) is kept.

    Args:
        df: Frame with at least the `key` columns plus ``date`` and
            ``source_date``. It is not modified.
        key: Column names that identify a unique record.

    Returns:
        A new frame with one row per key, sorted by `key`, with a fresh
        0..n-1 index.
    """
    date_compact = pd.to_datetime(df["date"]).dt.strftime("%d%m%Y")
    # NOTE(review): if Excel stored source_date as a number, a leading zero
    # would be lost ("03102024" -> 3102024) and the row would never match --
    # confirm the column is text.
    is_canonical = (df["source_date"].astype(str) == date_compact).to_numpy(dtype=bool)

    # Split by match status so each part can use its own tie-break: first
    # among matching rows, last among non-matching rows (the fallback rule).
    canonical = df[is_canonical].drop_duplicates(subset=key, keep="first")
    fallback = df[~is_canonical].drop_duplicates(subset=key, keep="last")

    # Matching rows are concatenated first, so keep="first" makes them win
    # over a fallback row with the same key.
    kept = pd.concat([canonical, fallback]).drop_duplicates(subset=key, keep="first")
    return kept.sort_values(key).reset_index(drop=True)


def main() -> None:
    """Command-line entry point: clean the workbook at VIDEO_INFO_XLSX.

    Steps, in order: read the workbook, strip whitespace from string cells,
    de-duplicate on (date, machine_name, roi) if duplicates exist, normalise
    the `male` column, then -- unless ``--dry-run`` was given -- copy the
    original to a timestamped backup next to it and overwrite the workbook.

    Exits via ``sys.exit`` with a message when the workbook does not exist
    or lacks a required column.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The default formatter re-wraps the description, which scrambles the
        # docstring's numbered rule list and "Run:" examples in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--dry-run", action="store_true",
                        help="show what would change without writing")
    args = parser.parse_args()

    if not VIDEO_INFO_XLSX.exists():
        sys.exit(f"xlsx not found at {VIDEO_INFO_XLSX}")

    df = pd.read_excel(VIDEO_INFO_XLSX)
    n_before = len(df)

    key = ["date", "machine_name", "roi"]
    # Fail with a readable message rather than a KeyError traceback further
    # down when the sheet does not have the expected layout.
    missing = [col for col in (*key, "male") if col not in df.columns]
    if missing:
        sys.exit(f"{VIDEO_INFO_XLSX} is missing required column(s): {missing}")

    # Strip first so values differing only by stray whitespace compare equal
    # during de-duplication.
    df = strip_strings(df)

    # Dedup
    n_unique = df[key].drop_duplicates().shape[0]
    if n_unique < n_before:
        print(f"de-duplicating {n_before - n_unique} rows "
              f"(currently {n_before} rows, {n_unique} unique by {key})")
        df = dedup_by_canonical_source(df, key)
    else:
        print(f"no duplicate rows (all {n_before} are unique on {key})")

    # Normalise male
    male_before = df["male"].value_counts(dropna=False).to_dict()
    df["male"] = df["male"].apply(normalize_male)
    male_after = df["male"].value_counts(dropna=False).to_dict()
    if male_before != male_after:
        print(f"normalised `male` column: {male_before}  →  {male_after}")
    else:
        print(f"`male` column already canonical: {male_after}")

    n_after = len(df)
    print(f"\nfinal: {n_after} rows  (was {n_before})")

    if args.dry_run:
        print("--dry-run: not writing")
        return

    # Copy the untouched original aside before overwriting it.
    backup = VIDEO_INFO_XLSX.with_suffix(
        f".backup_{datetime.now():%Y%m%d_%H%M%S}.xlsx"
    )
    shutil.copy2(VIDEO_INFO_XLSX, backup)
    print(f"backed up xlsx → {backup}")

    df.to_excel(VIDEO_INFO_XLSX, index=False)
    print(f"wrote cleaned xlsx → {VIDEO_INFO_XLSX}")


if __name__ == "__main__":
    main()