Dedupe + canonicalise the merged xlsx, then guard the export
108 of 508 rows in all_video_info_merged.xlsx were duplicates left over
from merging multiple source spreadsheets — same (date, machine, ROI)
appearing under two source_date values, identical data otherwise. The
`male` column also mixed several spellings of naive ('naïve', 'niave',
'naive') alongside 'trained', with the canonical 'naive' a minority of 12/200.
scripts/cleanup_xlsx.py
Idempotent one-off: backs up the xlsx, dedupes preferring the row
whose source_date matches the experiment date, normalises `male`
spellings, strips whitespace from string columns. Re-running on a
clean file changes no data (it still writes a fresh backup).
scripts/export_video_db_index.py
New _validate_xlsx() runs first thing in main() and aborts the
export with an actionable error if duplicates or non-canonical
male values are present. Prevents silent regressions when the
xlsx is edited or re-merged in the future.
Result: TSV is now 400 rows (was 508), exactly 200 trained / 200
naive, no duplicates.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
4ed988a617
commit
53b45e373b
2 changed files with 149 additions and 0 deletions
120
scripts/cleanup_xlsx.py
Normal file
120
scripts/cleanup_xlsx.py
Normal file
|
|
@ -0,0 +1,120 @@
|
|||
"""One-off cleanup of all_video_info_merged.xlsx.
|
||||
|
||||
Removes accidental duplicate rows that crept in when multiple source
|
||||
spreadsheets were merged, and canonicalises the `male` column
|
||||
(`naïve` / `niave` → `naive`, plus stripping whitespace).
|
||||
|
||||
Idempotent: re-running on a cleaned file is a no-op (besides creating a
|
||||
fresh backup).
|
||||
|
||||
Dedup rule: when multiple rows share (date, machine_name, roi):
|
||||
1. Prefer the row whose source_date matches the experiment date
|
||||
(DDMMYYYY format). This keeps the most-recently-curated row,
|
||||
since the user typically sanitises in the source_date file
|
||||
matching the experiment date.
|
||||
2. If no row matches, keep the last one (preserve all data when
|
||||
the source_date covers multiple experiment dates, e.g.
|
||||
"03102024-04102024").
|
||||
|
||||
Run:
|
||||
python cleanup_xlsx.py # backs up + writes cleaned xlsx
|
||||
python cleanup_xlsx.py --dry-run # shows what would change
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import shutil
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from config import VIDEO_INFO_XLSX
|
||||
|
||||
# Spellings (compared after lower-casing) that normalize_male maps to "naive".
_MALE_NAIVE_VARIANTS = {"naïve", "niave", "naive"}
|
||||
|
||||
|
||||
def normalize_male(v):
    """Return the canonical spelling of one `male` cell.

    Missing values are passed through untouched. Otherwise the value is
    converted to a string and stripped of surrounding whitespace; any
    case-insensitive spelling listed in ``_MALE_NAIVE_VARIANTS`` becomes
    "naive" and any casing of "trained" becomes "trained". Anything else
    is returned as the stripped string so an analyst can inspect it.
    """
    if pd.isna(v):
        return v

    text = str(v).strip()
    lowered = text.lower()
    if lowered in _MALE_NAIVE_VARIANTS:
        return "naive"
    # Unrecognised labels are deliberately left alone rather than guessed at.
    return "trained" if lowered == "trained" else text
|
||||
|
||||
|
||||
def strip_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Trim surrounding whitespace from every string cell, in place.

    Only columns selected as "object" or "string" dtype are visited, and
    within them only cells that are actual ``str`` instances are changed;
    other cells are left as they are. The same (mutated) frame is returned
    for convenience.
    """
    def _trim(cell):
        return cell.strip() if isinstance(cell, str) else cell

    text_columns = df.select_dtypes(include=["object", "string"]).columns
    for name in text_columns:
        df[name] = df[name].apply(_trim)
    return df
|
||||
|
||||
|
||||
def dedup_by_canonical_source(df: pd.DataFrame, key: list[str]) -> pd.DataFrame:
    """Keep one row per `key` group, preferring source_date == date.

    Selection rule within each group of rows sharing `key`:

    1. If any row's ``source_date`` equals that row's ``date`` rendered as
       DDMMYYYY, keep the first such row (in input order).
    2. Otherwise keep the LAST row of the group, which is the fallback the
       module docstring describes.  (The previous implementation kept the
       first row here, contradicting that documented rule.)

    The input frame is not modified. The result is sorted by `key` and
    carries a fresh 0..n-1 index.
    """
    date_compact = pd.to_datetime(df["date"]).dt.strftime("%d%m%Y")
    is_match = df["source_date"].astype(str) == date_compact

    # Best candidate per key among rows whose source file matches the date.
    matched = df[is_match].drop_duplicates(subset=key, keep="first")
    # Fallback candidate per key: the last of the non-matching rows.
    fallback = df[~is_match].drop_duplicates(subset=key, keep="last")

    # Matched rows are concatenated first, so keep="first" lets them win
    # over the fallback candidate for any key that has both.
    kept = pd.concat([matched, fallback]).drop_duplicates(subset=key, keep="first")
    return kept.sort_values(key).reset_index(drop=True)
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: clean the merged xlsx in place.

    Reads ``VIDEO_INFO_XLSX``, strips whitespace from string cells,
    removes rows duplicated on (date, machine_name, roi), canonicalises
    the ``male`` column, and prints a summary of each step. With
    ``--dry-run`` nothing is written; otherwise the original file is
    copied to a timestamped backup beside it before being overwritten.
    Exits with an error message if the xlsx does not exist.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="show what would change without writing",
    )
    options = cli.parse_args()

    if not VIDEO_INFO_XLSX.exists():
        sys.exit(f"xlsx not found at {VIDEO_INFO_XLSX}")

    table = pd.read_excel(VIDEO_INFO_XLSX)
    n_before = len(table)
    table = strip_strings(table)

    # Step 1: collapse rows that share the identifying key.
    key = ["date", "machine_name", "roi"]
    n_unique = len(table[key].drop_duplicates())
    if n_unique >= n_before:
        print(f"no duplicate rows (all {n_before} are unique on {key})")
    else:
        print(f"de-duplicating {n_before - n_unique} rows "
              f"(currently {n_before} rows, {n_unique} unique by {key})")
        table = dedup_by_canonical_source(table, key)

    # Step 2: canonicalise `male`, comparing value counts before and after
    # so the report shows exactly which labels were rewritten.
    def tally():
        return table["male"].value_counts(dropna=False).to_dict()

    male_before = tally()
    table["male"] = table["male"].apply(normalize_male)
    male_after = tally()
    if male_before == male_after:
        print(f"`male` column already canonical: {male_after}")
    else:
        print(f"normalised `male` column: {male_before} → {male_after}")

    n_after = len(table)
    print(f"\nfinal: {n_after} rows (was {n_before})")

    if options.dry_run:
        print("--dry-run: not writing")
        return

    # Copy the untouched original aside before overwriting it.
    backup = VIDEO_INFO_XLSX.with_suffix(
        f".backup_{datetime.now():%Y%m%d_%H%M%S}.xlsx"
    )
    shutil.copy2(VIDEO_INFO_XLSX, backup)
    print(f"backed up xlsx → {backup}")

    table.to_excel(VIDEO_INFO_XLSX, index=False)
    print(f"wrote cleaned xlsx → {VIDEO_INFO_XLSX}")
|
||||
|
||||
|
||||
# Run the cleanup only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
|
|
@ -128,6 +128,34 @@ def resolve_session(
|
|||
_MALE_NAIVE_VARIANTS = {"naïve", "niave", "naive"}
|
||||
|
||||
|
||||
def _validate_xlsx(df: pd.DataFrame) -> None:
    """Abort the export when the xlsx violates its invariants.

    Two checks run in order, each raising ``SystemExit`` with a message
    that points at scripts/cleanup_xlsx.py:

    1. Uniqueness: no two rows may share (date, machine_name, roi). The
       export pipeline assumes one row per key, so re-introduced
       duplicates would silently double every downstream count.
    2. Vocabulary: after whitespace stripping, every non-null ``male``
       value must be exactly 'naive' or 'trained'.
    """
    key = ["date", "machine_name", "roi"]

    # keep=False flags every member of a duplicated group, not just the extras.
    repeated = df.duplicated(subset=key, keep=False)
    if repeated.any():
        dupes = df[repeated]
        n_unique = len(df[key].drop_duplicates())
        sample = dupes.head(4)[["date", "machine_name", "roi", "source_date"]]
        raise SystemExit(
            f"\n ERROR: xlsx has {len(dupes)} duplicate rows "
            f"({len(df)} total, {n_unique} unique on {key}).\n"
            f" Sample:\n{sample.to_string(index=False)}\n"
            f" Run scripts/cleanup_xlsx.py to fix.\n"
        )

    observed = set(df["male"].dropna().astype(str).str.strip())
    bad_male = sorted(observed.difference({"naive", "trained"}))
    if bad_male:
        raise SystemExit(
            f"\n ERROR: xlsx `male` column has non-canonical values: {bad_male}\n"
            f" Expected only 'trained' and 'naive'.\n"
            f" Run scripts/cleanup_xlsx.py to fix.\n"
        )
|
||||
|
||||
|
||||
def _normalize_metadata(df: pd.DataFrame) -> None:
|
||||
"""Strip whitespace and canonicalize the ``male`` column in place."""
|
||||
for col in df.select_dtypes(include=("object", "string")).columns:
|
||||
|
|
@ -152,6 +180,7 @@ def main() -> None:
|
|||
index = build_session_index(inv)
|
||||
|
||||
df = pd.read_excel(VIDEO_INFO_XLSX)
|
||||
_validate_xlsx(df)
|
||||
_normalize_metadata(df)
|
||||
date_iso = pd.to_datetime(df["date"]).dt.strftime("%Y-%m-%d")
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue