csv-parser: add _fingerprint helper
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
f3fd769b3b
commit
f8a0ed3923
2 changed files with 67 additions and 0 deletions
42
app/services/llm_csv_parser.py
Normal file
42
app/services/llm_csv_parser.py
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
"""LLM-fallback CSV parser.
|
||||
|
||||
When the deterministic Trading 212 parser (``csv_import.parse_t212_csv``)
|
||||
raises ``CSVImportError`` on an unrecognised format, this service kicks
|
||||
in:
|
||||
|
||||
1. Detect the CSV dialect (delimiter, preamble offset).
|
||||
2. Compute a fingerprint of the normalised header row.
|
||||
3. Look up ``CsvFormatTemplate`` by fingerprint. On hit, replay the
|
||||
cached column-mapping deterministically. On miss, ask the LLM for a
|
||||
mapping, validate it, persist a new template, and apply it.
|
||||
|
||||
The LLM sees only headers + the first 3-5 sample rows. It returns a
|
||||
column-mapping JSON, never transcribed numbers. The system never
|
||||
auto-promotes a learned format to a hand-written parser — the operator
|
||||
does that by inspecting collected ``sample_row`` values.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
|
||||
from app.services.csv_import import CSVImportError
|
||||
|
||||
|
||||
class LLMParseError(CSVImportError):
    """Signal that the LLM fallback could not produce a usable mapping.

    Covers both a failed LLM call and a response whose column mapping is
    unusable. Subclassing ``CSVImportError`` lets route-level handlers
    catch deterministic-parser and LLM-path failures with a single
    ``except`` clause when they do not need to tell them apart.
    """
|
||||
|
||||
|
||||
def _fingerprint(headers: list[str]) -> str:
    """Return a stable SHA-256 fingerprint of a CSV header row.

    Each header is stripped of surrounding whitespace and lowercased, the
    normalised headers are joined with ``|``, and the UTF-8 encoding of
    the joined string is hashed. Whitespace/case drift in the same
    broker's export does not change the fingerprint; adding, removing or
    reordering a column does.

    A literal ``|`` or backslash inside a header is backslash-escaped
    before joining, so a single header that contains the separator can
    never collide with several separate headers (``["a|b"]`` versus
    ``["a", "b"]``). Headers containing neither character produce exactly
    the digest they would without the escaping step.

    Args:
        headers: Raw header cells, in column order.

    Returns:
        The 64-character lowercase hexadecimal SHA-256 digest.
    """
    escaped = (
        # Escape the escape character first so the encoding stays unambiguous.
        h.strip().lower().replace("\\", "\\\\").replace("|", "\\|")
        for h in headers
    )
    return hashlib.sha256("|".join(escaped).encode("utf-8")).hexdigest()
|
||||
|
|
@ -29,3 +29,28 @@ def test_csv_format_template_model_columns():
|
|||
# Fingerprint is the cache key.
|
||||
assert cols["fingerprint"].unique is True
|
||||
assert cols["fingerprint"].nullable is False
|
||||
|
||||
|
||||
def test_fingerprint_stable_across_case_and_whitespace():
    """Case and surrounding whitespace in headers must not alter the digest."""
    from app.services.llm_csv_parser import _fingerprint

    header_variants = (
        ["Symbol", "Quantity", "Avg Price"],
        ["symbol", "quantity", "avg price"],
        [" SYMBOL ", "Quantity", " AVG PRICE"],
    )
    # All three spellings must collapse to one and the same fingerprint.
    digests = {_fingerprint(variant) for variant in header_variants}
    assert len(digests) == 1
|
||||
|
||||
|
||||
def test_fingerprint_differs_for_different_columns():
    """Adding a column must produce a different fingerprint."""
    from app.services.llm_csv_parser import _fingerprint

    base_headers = ["Symbol", "Quantity"]
    extended_headers = [*base_headers, "Avg Price"]
    assert _fingerprint(base_headers) != _fingerprint(extended_headers)
|
||||
|
||||
|
||||
def test_fingerprint_is_sha256_hex_64_chars():
    """The fingerprint is 64 characters drawn only from lowercase hex digits."""
    from app.services.llm_csv_parser import _fingerprint

    digest = _fingerprint(["Symbol", "Quantity"])
    assert len(digest) == 64
    # Every character must belong to the lowercase hexadecimal alphabet.
    assert set(digest) <= set("0123456789abcdef")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue