From 3f1d2a10349315f144fb4d5238292832b7543556 Mon Sep 17 00:00:00 2001
From: Giorgio Gilestro
Date: Wed, 27 May 2026 11:51:01 +0200
Subject: [PATCH] csv-parser: add CsvFormatTemplate model

Co-Authored-By: Claude Opus 4.7
---
 app/models.py                | 35 +++++++++++++++++++++++++++++++++++
 tests/test_llm_csv_parser.py | 31 +++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)
 create mode 100644 tests/test_llm_csv_parser.py

diff --git a/app/models.py b/app/models.py
index c0d321d..980030d 100644
--- a/app/models.py
+++ b/app/models.py
@@ -426,3 +426,38 @@ class StripeEvent(Base):
         UniqueConstraint("event_id", name="uq_stripe_events_event_id"),
         Index("ix_stripe_events_type_received", "event_type", "received_at"),
     )
+
+
+class CsvFormatTemplate(Base):
+    """Cached column-mapping for a single broker CSV format.
+
+    Populated on the first upload of a previously-unseen format via the
+    LLM-fallback parser. Subsequent uploads of the same format
+    (identified by ``fingerprint``, a sha256 of the normalised header
+    row) replay ``mapping`` deterministically with no LLM call.
+
+    The table holds the actual ``headers`` and one anonymous ``sample_row``
+    from the originating upload — there is no ``user_id`` column, no link
+    back to the uploader. The sample exists so the operator has concrete
+    material to look at when hand-writing future native parsers; the
+    system never auto-generates or modifies parser code from this data.
+    """
+    __tablename__ = "csv_format_templates"
+
+    id: Mapped[int] = mapped_column(_PK, primary_key=True, autoincrement=True)
+    fingerprint: Mapped[str] = mapped_column(String(64), unique=True, nullable=False)
+    headers: Mapped[list] = mapped_column(JSON, nullable=False)
+    sample_row: Mapped[list] = mapped_column(JSON, nullable=False)
+    mapping: Mapped[dict] = mapped_column(JSON, nullable=False)
+    preamble_rows: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
+    delimiter: Mapped[str] = mapped_column(String(1), nullable=False, default=",")
+    broker_label: Mapped[str | None] = mapped_column(String(128))
+    first_seen_at: Mapped[datetime] = mapped_column(
+        DateTime(timezone=True), nullable=False, default=utcnow,
+    )
+    use_count: Mapped[int] = mapped_column(Integer, nullable=False, default=1)
+    last_used_at: Mapped[datetime] = mapped_column(
+        DateTime(timezone=True), nullable=False, default=utcnow,
+    )
+    llm_model: Mapped[str | None] = mapped_column(String(64))
+    llm_cost_usd: Mapped[float | None] = mapped_column(Float)
diff --git a/tests/test_llm_csv_parser.py b/tests/test_llm_csv_parser.py
new file mode 100644
index 0000000..6ccf7ff
--- /dev/null
+++ b/tests/test_llm_csv_parser.py
@@ -0,0 +1,31 @@
+"""Unit + integration tests for the LLM-fallback CSV parser."""
+from __future__ import annotations
+
+import pytest
+
+
+def test_csv_format_template_model_columns():
+    """Model exposes every column the spec requires and no user attribution."""
+    from sqlalchemy import inspect
+
+    from app.models import CsvFormatTemplate
+
+    cols = {c.name: c for c in inspect(CsvFormatTemplate).columns}
+    assert "fingerprint" in cols
+    assert "headers" in cols
+    assert "sample_row" in cols
+    assert "mapping" in cols
+    assert "preamble_rows" in cols
+    assert "delimiter" in cols
+    assert "broker_label" in cols
+    assert "first_seen_at" in cols
+    assert "use_count" in cols
+    assert "last_used_at" in cols
+    assert "llm_model" in cols
+    assert "llm_cost_usd" in cols
+    # Crucially, no user attribution.
+    assert "user_id" not in cols
+    assert "first_seen_user_id" not in cols
+    # Fingerprint is the cache key.
+    assert cols["fingerprint"].unique is True
+    assert cols["fingerprint"].nullable is False