42 lines
1.6 KiB
Python
42 lines
1.6 KiB
Python
"""LLM-fallback CSV parser.

When the deterministic Trading 212 parser (``csv_import.parse_t212_csv``)
raises ``CSVImportError`` on an unrecognised format, this service kicks
in:

1. Detect the CSV dialect (delimiter, preamble offset).
2. Compute a fingerprint of the normalised header row.
3. Look up ``CsvFormatTemplate`` by fingerprint. On a hit, replay the
   cached column-mapping deterministically. On a miss, ask the LLM for a
   mapping, validate it, persist a new template, and apply it.

The LLM sees only the headers plus the first 3-5 sample rows. It returns
a column-mapping JSON, never transcribed numbers. The system never
auto-promotes a learned format to a hand-written parser — the operator
does that by inspecting collected ``sample_row`` values.
"""
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
|
|
from app.services.csv_import import CSVImportError
|
|
|
|
|
|
class LLMParseError(CSVImportError):
    """Signal that the LLM call failed or produced an unusable mapping.

    Subclassing ``CSVImportError`` lets route-level error handling
    catch deterministic-parser failures and LLM-path failures with a
    single ``except`` clause when it wants to treat them uniformly.
    """
|
|
|
|
|
|
def _fingerprint(headers: list[str]) -> str:
    """Return a stable sha256 hex digest identifying a header row.

    Every header is stripped of surrounding whitespace and lowercased,
    the cleaned names are joined with ``|`` (a character extremely
    unlikely to occur inside a real header), and the UTF-8 encoding of
    the joined string is hashed.

    Differences in case or surrounding whitespace between exports of
    the same format therefore yield the same fingerprint, whereas
    adding or removing a column yields a different one.
    """
    # Strip first, then lowercase: the order is part of the canonical form.
    cleaned = [column.strip().lower() for column in headers]
    payload = "|".join(cleaned).encode("utf-8")
    return hashlib.sha256(payload).hexdigest()
|