csv-parser: add _fingerprint helper

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Giorgio Gilestro 2026-05-27 12:08:34 +02:00
parent f3fd769b3b
commit f8a0ed3923
2 changed files with 67 additions and 0 deletions

View file

@ -0,0 +1,42 @@
"""LLM-fallback CSV parser.
When the deterministic Trading 212 parser (``csv_import.parse_t212_csv``)
raises ``CSVImportError`` on an unrecognised format, this service kicks
in:
1. Detect the CSV dialect (delimiter, preamble offset).
2. Compute a fingerprint of the normalised header row.
3. Look up ``CsvFormatTemplate`` by fingerprint. On hit, replay the
cached column-mapping deterministically. On miss, ask the LLM for a
mapping, validate it, persist a new template, and apply it.
The LLM sees only headers + the first 3-5 sample rows. It returns a
column-mapping JSON, never transcribed numbers. The system never
auto-promotes a learned format to a hand-written parser; the operator
does that by inspecting collected ``sample_row`` values.
"""
from __future__ import annotations
import hashlib
from app.services.csv_import import CSVImportError
class LLMParseError(CSVImportError):
    """Signal that the LLM path failed or produced an unusable mapping.

    Subclasses ``CSVImportError`` so that route-level handlers may, when
    desired, treat failures from the deterministic parser and from the
    LLM fallback in the same way.
    """
def _fingerprint(headers: list[str]) -> str:
    """Return a deterministic SHA-256 fingerprint of a CSV header row.

    Every header has its surrounding whitespace removed and is
    lowercased; the cleaned names are then joined with ``|``, encoded as
    UTF-8 and hashed. The result is the 64-character lowercase hex
    digest.

    Differences in letter case or in leading/trailing whitespace leave
    the fingerprint unchanged, while adding or removing a column
    produces a different one.

    NOTE: the ``|`` separator is not escaped, so a header that itself
    contains ``|`` hashes the same as the equivalent split headers.
    """
    cleaned = [column.strip().lower() for column in headers]
    hasher = hashlib.sha256()
    hasher.update("|".join(cleaned).encode("utf-8"))
    return hasher.hexdigest()

View file

@ -29,3 +29,28 @@ def test_csv_format_template_model_columns():
# Fingerprint is the cache key.
assert cols["fingerprint"].unique is True
assert cols["fingerprint"].nullable is False
def test_fingerprint_stable_across_case_and_whitespace():
    """Case and surrounding whitespace must not alter the fingerprint."""
    from app.services.llm_csv_parser import _fingerprint

    header_variants = [
        ["Symbol", "Quantity", "Avg Price"],
        ["symbol", "quantity", "avg price"],
        [" SYMBOL ", "Quantity", " AVG PRICE"],
    ]
    # All variants must collapse to a single distinct digest.
    digests = {_fingerprint(variant) for variant in header_variants}
    assert len(digests) == 1
def test_fingerprint_differs_for_different_columns():
    """Adding a column must yield a different fingerprint."""
    from app.services.llm_csv_parser import _fingerprint

    two_columns = ["Symbol", "Quantity"]
    three_columns = [*two_columns, "Avg Price"]
    assert _fingerprint(two_columns) != _fingerprint(three_columns)
def test_fingerprint_is_sha256_hex_64_chars():
    """The fingerprint is 64 characters drawn from lowercase hex digits."""
    from app.services.llm_csv_parser import _fingerprint

    digest = _fingerprint(["Symbol", "Quantity"])
    assert len(digest) == 64
    # Every character must be a lowercase hexadecimal digit.
    assert set(digest) <= set("0123456789abcdef")