csv-parser: add _validate_mapping helper

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Giorgio Gilestro 2026-05-27 12:16:26 +02:00
parent 8dcf662945
commit f44b77df6f
2 changed files with 106 additions and 0 deletions

View file

@ -31,6 +31,10 @@ from app.services.csv_import import CSVImportError
# Real broker preambles are typically 1-10 lines. # Real broker preambles are typically 1-10 lines.
_MAX_PREAMBLE_SCAN = 30 _MAX_PREAMBLE_SCAN = 30
# Required and optional keys in the LLM-returned column mapping.
_REQUIRED_MAPPING_KEYS = ("ticker_col", "qty_col")
_OPTIONAL_MAPPING_KEYS = ("name_col", "cost_col", "currency_col")
class LLMParseError(CSVImportError): class LLMParseError(CSVImportError):
"""Raised when the LLM call fails or returns an unusable mapping. """Raised when the LLM call fails or returns an unusable mapping.
@ -124,3 +128,49 @@ def _detect_dialect(raw: bytes) -> tuple[str, int]:
# Next row is also all-alpha → keep scanning # Next row is also all-alpha → keep scanning
break break
return delimiter, 0 return delimiter, 0
def _validate_mapping(
mapping: dict, headers: list[str], first_row: list[str],
) -> None:
"""Verify the LLM-returned mapping is sane.
- ``ticker_col`` and ``qty_col`` are required (non-null).
- Every named column must exist in ``headers``.
- The value at ``qty_col`` on ``first_row`` must parse as a number.
- The value at ``cost_col`` on ``first_row`` (if present) must parse
as a number.
Raises ``LLMParseError`` on any failure, with a message that names
the specific problem (helpful for log forensics and for the
user-facing 400)."""
for key in _REQUIRED_MAPPING_KEYS:
if not mapping.get(key):
raise LLMParseError(
f"LLM mapping missing required column: {key.replace('_col', '')}"
)
headers_set = set(headers)
for key in _REQUIRED_MAPPING_KEYS + _OPTIONAL_MAPPING_KEYS:
col = mapping.get(key)
if col is not None and col not in headers_set:
raise LLMParseError(
f"LLM mapping references unknown column: {col!r}"
)
# Numeric sanity check: qty and (if present) cost must parse on row 1.
header_index = {h: i for i, h in enumerate(headers)}
qty_col = mapping["qty_col"]
qty_value = first_row[header_index[qty_col]] if header_index[qty_col] < len(first_row) else ""
if not _looks_numeric(qty_value):
raise LLMParseError(
f"LLM mapping qty_col={qty_col!r} maps to non-numeric value {qty_value!r}"
)
cost_col = mapping.get("cost_col")
if cost_col is not None:
cost_value = first_row[header_index[cost_col]] if header_index[cost_col] < len(first_row) else ""
if cost_value and not _looks_numeric(cost_value):
raise LLMParseError(
f"LLM mapping cost_col={cost_col!r} maps to non-numeric value {cost_value!r}"
)

View file

@ -96,3 +96,59 @@ def test_detect_dialect_empty_raises():
with pytest.raises(LLMParseError): with pytest.raises(LLMParseError):
_detect_dialect(b"") _detect_dialect(b"")
def test_validate_mapping_accepts_well_formed():
from app.services.llm_csv_parser import _validate_mapping
headers = ["Symbol", "Quantity", "Avg Price", "Currency"]
first_row = ["AAPL", "100", "150.25", "USD"]
mapping = {
"ticker_col": "Symbol",
"qty_col": "Quantity",
"cost_col": "Avg Price",
"currency_col": "Currency",
"name_col": None,
}
_validate_mapping(mapping, headers, first_row) # no raise
def test_validate_mapping_missing_ticker_raises():
from app.services.llm_csv_parser import LLMParseError, _validate_mapping
headers = ["Symbol", "Quantity"]
first_row = ["AAPL", "100"]
mapping = {"ticker_col": None, "qty_col": "Quantity"}
with pytest.raises(LLMParseError, match="ticker"):
_validate_mapping(mapping, headers, first_row)
def test_validate_mapping_missing_qty_raises():
from app.services.llm_csv_parser import LLMParseError, _validate_mapping
headers = ["Symbol", "Quantity"]
first_row = ["AAPL", "100"]
mapping = {"ticker_col": "Symbol", "qty_col": None}
with pytest.raises(LLMParseError, match="qty"):
_validate_mapping(mapping, headers, first_row)
def test_validate_mapping_unknown_column_raises():
from app.services.llm_csv_parser import LLMParseError, _validate_mapping
headers = ["Symbol", "Quantity"]
first_row = ["AAPL", "100"]
mapping = {"ticker_col": "Symbol", "qty_col": "NotARealColumn"}
with pytest.raises(LLMParseError, match="NotARealColumn"):
_validate_mapping(mapping, headers, first_row)
def test_validate_mapping_non_numeric_qty_raises():
from app.services.llm_csv_parser import LLMParseError, _validate_mapping
headers = ["Symbol", "Description"]
first_row = ["AAPL", "Apple Inc"]
# Mapping says qty is "Description", but "Apple Inc" can't parse as a number.
mapping = {"ticker_col": "Symbol", "qty_col": "Description"}
with pytest.raises(LLMParseError, match="numeric"):
_validate_mapping(mapping, headers, first_row)