From 3f1d2a10349315f144fb4d5238292832b7543556 Mon Sep 17 00:00:00 2001
From: Giorgio Gilestro <giorgio@gilest.ro>
Date: Wed, 27 May 2026 11:51:01 +0200
Subject: [PATCH] csv-parser: add CsvFormatTemplate model

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 app/models.py                | 35 +++++++++++++++++++++++++++++++++++
 tests/test_llm_csv_parser.py | 31 +++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)
 create mode 100644 tests/test_llm_csv_parser.py

diff --git a/app/models.py b/app/models.py
index c0d321d..980030d 100644
--- a/app/models.py
+++ b/app/models.py
@@ -426,3 +426,38 @@ class StripeEvent(Base):
         UniqueConstraint("event_id", name="uq_stripe_events_event_id"),
         Index("ix_stripe_events_type_received", "event_type", "received_at"),
     )
+
+
+class CsvFormatTemplate(Base):
+    """Cached column-mapping for a single broker CSV format.
+
+    Populated via the LLM-fallback parser on the first upload of an unseen
+    format. Later uploads with the same ``fingerprint`` (sha256 of the
+    normalised header row) replay ``mapping`` deterministically, no LLM call.
+
+    Holds the actual ``headers`` and one anonymous ``sample_row`` from the
+    originating upload, with no ``user_id`` column or link to the uploader.
+    The sample is material for an operator hand-writing native parsers; the
+    system never auto-generates or modifies parser code from this data.
+
+    ``use_count`` and both timestamps declare insert defaults only (no onupdate).
+    """
+    __tablename__ = "csv_format_templates"
+
+    id: Mapped[int] = mapped_column(_PK, primary_key=True, autoincrement=True)
+    fingerprint: Mapped[str] = mapped_column(String(64), unique=True, nullable=False)
+    headers: Mapped[list] = mapped_column(JSON, nullable=False)
+    sample_row: Mapped[list] = mapped_column(JSON, nullable=False)
+    mapping: Mapped[dict] = mapped_column(JSON, nullable=False)
+    preamble_rows: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
+    delimiter: Mapped[str] = mapped_column(String(1), nullable=False, default=",")
+    broker_label: Mapped[str | None] = mapped_column(String(128))
+    first_seen_at: Mapped[datetime] = mapped_column(
+        DateTime(timezone=True), nullable=False, default=utcnow,
+    )
+    use_count: Mapped[int] = mapped_column(Integer, nullable=False, default=1)
+    last_used_at: Mapped[datetime] = mapped_column(
+        DateTime(timezone=True), nullable=False, default=utcnow,
+    )
+    llm_model: Mapped[str | None] = mapped_column(String(64))
+    llm_cost_usd: Mapped[float | None] = mapped_column(Float)
diff --git a/tests/test_llm_csv_parser.py b/tests/test_llm_csv_parser.py
new file mode 100644
index 0000000..6ccf7ff
--- /dev/null
+++ b/tests/test_llm_csv_parser.py
@@ -0,0 +1,31 @@
+"""Unit + integration tests for the LLM-fallback CSV parser."""
+from __future__ import annotations
+
+import pytest
+
+
+def test_csv_format_template_model_columns():
+    """Model declares every spec column, no user link, keyed by fingerprint."""
+    from sqlalchemy import inspect
+
+    from app.models import CsvFormatTemplate
+
+    mapper = inspect(CsvFormatTemplate)
+    by_name = {column.name: column for column in mapper.columns}
+
+    # Every column named by the spec must be declared on the model.
+    for required in (
+        "fingerprint", "headers", "sample_row", "mapping",
+        "preamble_rows", "delimiter", "broker_label", "first_seen_at",
+        "use_count", "last_used_at", "llm_model", "llm_cost_usd",
+    ):
+        assert required in by_name
+
+    # Crucially, no user attribution.
+    for forbidden in ("user_id", "first_seen_user_id"):
+        assert forbidden not in by_name
+
+    # Fingerprint is the cache key: unique and mandatory.
+    key_column = by_name["fingerprint"]
+    assert key_column.unique is True
+    assert key_column.nullable is False