From f44b77df6f3c352293e4ce12151d6573f1bc211e Mon Sep 17 00:00:00 2001 From: Giorgio Gilestro Date: Wed, 27 May 2026 12:16:26 +0200 Subject: [PATCH] csv-parser: add _validate_mapping helper Co-Authored-By: Claude Opus 4.7 --- app/services/llm_csv_parser.py | 50 ++++++++++++++++++++++++++++++ tests/test_llm_csv_parser.py | 56 ++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) diff --git a/app/services/llm_csv_parser.py b/app/services/llm_csv_parser.py index 153eb72..ee891fc 100644 --- a/app/services/llm_csv_parser.py +++ b/app/services/llm_csv_parser.py @@ -31,6 +31,10 @@ from app.services.csv_import import CSVImportError # Real broker preambles are typically 1-10 lines. _MAX_PREAMBLE_SCAN = 30 +# Required and optional keys in the LLM-returned column mapping. +_REQUIRED_MAPPING_KEYS = ("ticker_col", "qty_col") +_OPTIONAL_MAPPING_KEYS = ("name_col", "cost_col", "currency_col") + class LLMParseError(CSVImportError): """Raised when the LLM call fails or returns an unusable mapping. @@ -124,3 +128,49 @@ def _detect_dialect(raw: bytes) -> tuple[str, int]: # Next row is also all-alpha → keep scanning break return delimiter, 0 + + +def _validate_mapping( + mapping: dict, headers: list[str], first_row: list[str], +) -> None: + """Verify the LLM-returned mapping is sane. + + - ``ticker_col`` and ``qty_col`` are required (non-null). + - Every named column must exist in ``headers``. + - The value at ``qty_col`` on ``first_row`` must parse as a number. + - The value at ``cost_col`` on ``first_row`` (if present) must parse + as a number. + + Raises ``LLMParseError`` on any failure, with a message that names + the specific problem (helpful for log forensics and for the + user-facing 400).""" + for key in _REQUIRED_MAPPING_KEYS: + if not mapping.get(key): + raise LLMParseError( + f"LLM mapping missing required column: {key.replace('_col', '')}" + ) + + headers_set = set(headers) + for key in _REQUIRED_MAPPING_KEYS + _OPTIONAL_MAPPING_KEYS: + col = mapping.get(key) + if col is not None and col not in headers_set: + raise LLMParseError( + f"LLM mapping references unknown column: {col!r}" + ) + + # Numeric sanity check: qty and (if present) cost must parse on row 1. + header_index = {h: i for i, h in enumerate(headers)} + qty_col = mapping["qty_col"] + qty_value = first_row[header_index[qty_col]] if header_index[qty_col] < len(first_row) else "" + if not _looks_numeric(qty_value): + raise LLMParseError( + f"LLM mapping qty_col={qty_col!r} maps to non-numeric value {qty_value!r}" + ) + + cost_col = mapping.get("cost_col") + if cost_col is not None: + cost_value = first_row[header_index[cost_col]] if header_index[cost_col] < len(first_row) else "" + if cost_value and not _looks_numeric(cost_value): + raise LLMParseError( + f"LLM mapping cost_col={cost_col!r} maps to non-numeric value {cost_value!r}" + ) diff --git a/tests/test_llm_csv_parser.py b/tests/test_llm_csv_parser.py index 08160f7..db75bf4 100644 --- a/tests/test_llm_csv_parser.py +++ b/tests/test_llm_csv_parser.py @@ -96,3 +96,59 @@ def test_detect_dialect_empty_raises(): with pytest.raises(LLMParseError): _detect_dialect(b"") + + +def test_validate_mapping_accepts_well_formed(): + from app.services.llm_csv_parser import _validate_mapping + + headers = ["Symbol", "Quantity", "Avg Price", "Currency"] + first_row = ["AAPL", "100", "150.25", "USD"] + mapping = { + "ticker_col": "Symbol", + "qty_col": "Quantity", + "cost_col": "Avg Price", + "currency_col": "Currency", + "name_col": None, + } + _validate_mapping(mapping, headers, first_row) # no raise + + +def test_validate_mapping_missing_ticker_raises(): + from app.services.llm_csv_parser import LLMParseError, _validate_mapping + + headers = ["Symbol", "Quantity"] + first_row = ["AAPL", "100"] + mapping = {"ticker_col": None, "qty_col": "Quantity"} + with pytest.raises(LLMParseError, match="ticker"): + _validate_mapping(mapping, headers, first_row) + + +def test_validate_mapping_missing_qty_raises(): + from app.services.llm_csv_parser import LLMParseError, _validate_mapping + + headers = ["Symbol", "Quantity"] + first_row = ["AAPL", "100"] + mapping = {"ticker_col": "Symbol", "qty_col": None} + with pytest.raises(LLMParseError, match="qty"): + _validate_mapping(mapping, headers, first_row) + + +def test_validate_mapping_unknown_column_raises(): + from app.services.llm_csv_parser import LLMParseError, _validate_mapping + + headers = ["Symbol", "Quantity"] + first_row = ["AAPL", "100"] + mapping = {"ticker_col": "Symbol", "qty_col": "NotARealColumn"} + with pytest.raises(LLMParseError, match="NotARealColumn"): + _validate_mapping(mapping, headers, first_row) + + +def test_validate_mapping_non_numeric_qty_raises(): + from app.services.llm_csv_parser import LLMParseError, _validate_mapping + + headers = ["Symbol", "Description"] + first_row = ["AAPL", "Apple Inc"] + # Mapping says qty is "Description", but "Apple Inc" can't parse as a number. + mapping = {"ticker_col": "Symbol", "qty_col": "Description"} + with pytest.raises(LLMParseError, match="numeric"): + _validate_mapping(mapping, headers, first_row)