csv-parser: add _fingerprint helper
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
f3fd769b3b
commit
f8a0ed3923
2 changed files with 67 additions and 0 deletions
42
app/services/llm_csv_parser.py
Normal file
42
app/services/llm_csv_parser.py
Normal file
|
|
@@ -0,0 +1,42 @@
|
|||
"""LLM-fallback CSV parser.
|
||||
|
||||
When the deterministic Trading 212 parser (``csv_import.parse_t212_csv``)
|
||||
raises ``CSVImportError`` on an unrecognised format, this service kicks
|
||||
in:
|
||||
|
||||
1. Detect the CSV dialect (delimiter, preamble offset).
|
||||
2. Compute a fingerprint of the normalised header row.
|
||||
3. Look up ``CsvFormatTemplate`` by fingerprint. On hit, replay the
|
||||
cached column-mapping deterministically. On miss, ask the LLM for a
|
||||
mapping, validate it, persist a new template, and apply it.
|
||||
|
||||
The LLM sees only headers + the first 3-5 sample rows. It returns a
|
||||
column-mapping JSON, never transcribed numbers. The system never
|
||||
auto-promotes a learned format to a hand-written parser — the operator
|
||||
does that by inspecting collected ``sample_row`` values.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
|
||||
from app.services.csv_import import CSVImportError
|
||||
|
||||
|
||||
class LLMParseError(CSVImportError):
    """Signal that the LLM-fallback path could not produce a usable mapping.

    Raised when the LLM call itself fails, or when the mapping it returns
    cannot be used.

    Subclassing ``CSVImportError`` means route-level handlers can catch
    failures from both the deterministic parser and the LLM path with a
    single ``except CSVImportError`` clause whenever they do not need to
    tell the two apart.
    """
|
||||
|
||||
|
||||
def _fingerprint(headers: list[str]) -> str:
    """Return a stable sha256 fingerprint for a CSV header row.

    Each header has its surrounding whitespace stripped and is then
    lowercased; the cleaned names are joined, in their original order,
    with ``|`` and the UTF-8 encoding of that string is hashed.

    Consequences of that normalisation:

    * Differences in letter case or in leading/trailing whitespace
      between two exports yield the same fingerprint.
    * Adding, removing, or reordering columns yields a different one.

    NOTE: ``|`` is used as the separator on the expectation that it does
    not occur inside real header names; it is not escaped.

    Args:
        headers: Column names of the header row, in file order.

    Returns:
        The 64-character lowercase hexadecimal sha256 digest.
    """
    cleaned = [name.strip().lower() for name in headers]
    canonical = "|".join(cleaned)
    digest = hashlib.sha256(canonical.encode("utf-8"))
    return digest.hexdigest()
|
||||
Loading…
Add table
Add a link
Reference in a new issue