diff --git a/app/services/llm_csv_parser.py b/app/services/llm_csv_parser.py
new file mode 100644
index 0000000..9d7a04c
--- /dev/null
+++ b/app/services/llm_csv_parser.py
@@ -0,0 +1,42 @@
+"""LLM-fallback CSV parser.
+
+When the deterministic Trading 212 parser (``csv_import.parse_t212_csv``)
+raises ``CSVImportError`` on an unrecognised format, this service kicks
+in:
+
+1. Detect the CSV dialect (delimiter, preamble offset).
+2. Compute a fingerprint of the normalised header row.
+3. Look up ``CsvFormatTemplate`` by fingerprint. On hit, replay the
+   cached column-mapping deterministically. On miss, ask the LLM for a
+   mapping, validate it, persist a new template, and apply it.
+
+The LLM sees only headers + the first 3-5 sample rows. It returns a
+column-mapping JSON, never transcribed numbers. The system never
+auto-promotes a learned format to a hand-written parser — the operator
+does that by inspecting collected ``sample_row`` values.
+"""
+from __future__ import annotations
+
+import hashlib
+
+from app.services.csv_import import CSVImportError
+
+
+class LLMParseError(CSVImportError):
+    """Raised when the LLM call fails or returns an unusable mapping.
+
+    Inherits from ``CSVImportError`` so route-level error handling can
+    treat both deterministic and LLM-path failures uniformly when
+    desired."""
+
+
+def _fingerprint(headers: list[str]) -> str:
+    """Stable hash of the header row.
+
+    Lowercases each header, strips surrounding whitespace, joins with
+    ``|`` (a character extremely unlikely to appear inside a real
+    header), and returns the sha256 hex digest. Whitespace/case drift
+    in the same broker's export does not change the fingerprint;
+    adding or removing a column does."""
+    normalised = "|".join(h.strip().lower() for h in headers)
+    return hashlib.sha256(normalised.encode("utf-8")).hexdigest()
diff --git a/tests/test_llm_csv_parser.py b/tests/test_llm_csv_parser.py
index 6ccf7ff..27223bb 100644
--- a/tests/test_llm_csv_parser.py
+++ b/tests/test_llm_csv_parser.py
@@ -29,3 +29,28 @@ def test_csv_format_template_model_columns():
     # Fingerprint is the cache key.
     assert cols["fingerprint"].unique is True
     assert cols["fingerprint"].nullable is False
+
+
+def test_fingerprint_stable_across_case_and_whitespace():
+    from app.services.llm_csv_parser import _fingerprint
+
+    a = _fingerprint(["Symbol", "Quantity", "Avg Price"])
+    b = _fingerprint(["symbol", "quantity", "avg price"])
+    c = _fingerprint([" SYMBOL ", "Quantity", " AVG PRICE"])
+    assert a == b == c
+
+
+def test_fingerprint_differs_for_different_columns():
+    from app.services.llm_csv_parser import _fingerprint
+
+    a = _fingerprint(["Symbol", "Quantity"])
+    b = _fingerprint(["Symbol", "Quantity", "Avg Price"])
+    assert a != b
+
+
+def test_fingerprint_is_sha256_hex_64_chars():
+    from app.services.llm_csv_parser import _fingerprint
+
+    f = _fingerprint(["Symbol", "Quantity"])
+    assert len(f) == 64
+    assert all(c in "0123456789abcdef" for c in f)