csv-parser: add _validate_mapping helper
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
8dcf662945
commit
f44b77df6f
2 changed files with 106 additions and 0 deletions
|
|
@ -31,6 +31,10 @@ from app.services.csv_import import CSVImportError
|
|||
# Real broker preambles are typically 1-10 lines.
|
||||
_MAX_PREAMBLE_SCAN = 30
|
||||
|
||||
# Required and optional keys in the LLM-returned column mapping.
|
||||
_REQUIRED_MAPPING_KEYS = ("ticker_col", "qty_col")
|
||||
_OPTIONAL_MAPPING_KEYS = ("name_col", "cost_col", "currency_col")
|
||||
|
||||
|
||||
class LLMParseError(CSVImportError):
|
||||
"""Raised when the LLM call fails or returns an unusable mapping.
|
||||
|
|
@ -124,3 +128,49 @@ def _detect_dialect(raw: bytes) -> tuple[str, int]:
|
|||
# Next row is also all-alpha → keep scanning
|
||||
break
|
||||
return delimiter, 0
|
||||
|
||||
|
||||
def _validate_mapping(
|
||||
mapping: dict, headers: list[str], first_row: list[str],
|
||||
) -> None:
|
||||
"""Verify the LLM-returned mapping is sane.
|
||||
|
||||
- ``ticker_col`` and ``qty_col`` are required (non-null).
|
||||
- Every named column must exist in ``headers``.
|
||||
- The value at ``qty_col`` on ``first_row`` must parse as a number.
|
||||
- The value at ``cost_col`` on ``first_row`` (if present) must parse
|
||||
as a number.
|
||||
|
||||
Raises ``LLMParseError`` on any failure, with a message that names
|
||||
the specific problem (helpful for log forensics and for the
|
||||
user-facing 400)."""
|
||||
for key in _REQUIRED_MAPPING_KEYS:
|
||||
if not mapping.get(key):
|
||||
raise LLMParseError(
|
||||
f"LLM mapping missing required column: {key.replace('_col', '')}"
|
||||
)
|
||||
|
||||
headers_set = set(headers)
|
||||
for key in _REQUIRED_MAPPING_KEYS + _OPTIONAL_MAPPING_KEYS:
|
||||
col = mapping.get(key)
|
||||
if col is not None and col not in headers_set:
|
||||
raise LLMParseError(
|
||||
f"LLM mapping references unknown column: {col!r}"
|
||||
)
|
||||
|
||||
# Numeric sanity check: qty and (if present) cost must parse on row 1.
|
||||
header_index = {h: i for i, h in enumerate(headers)}
|
||||
qty_col = mapping["qty_col"]
|
||||
qty_value = first_row[header_index[qty_col]] if header_index[qty_col] < len(first_row) else ""
|
||||
if not _looks_numeric(qty_value):
|
||||
raise LLMParseError(
|
||||
f"LLM mapping qty_col={qty_col!r} maps to non-numeric value {qty_value!r}"
|
||||
)
|
||||
|
||||
cost_col = mapping.get("cost_col")
|
||||
if cost_col is not None:
|
||||
cost_value = first_row[header_index[cost_col]] if header_index[cost_col] < len(first_row) else ""
|
||||
if cost_value and not _looks_numeric(cost_value):
|
||||
raise LLMParseError(
|
||||
f"LLM mapping cost_col={cost_col!r} maps to non-numeric value {cost_value!r}"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -96,3 +96,59 @@ def test_detect_dialect_empty_raises():
|
|||
|
||||
with pytest.raises(LLMParseError):
|
||||
_detect_dialect(b"")
|
||||
|
||||
|
||||
def test_validate_mapping_accepts_well_formed():
|
||||
from app.services.llm_csv_parser import _validate_mapping
|
||||
|
||||
headers = ["Symbol", "Quantity", "Avg Price", "Currency"]
|
||||
first_row = ["AAPL", "100", "150.25", "USD"]
|
||||
mapping = {
|
||||
"ticker_col": "Symbol",
|
||||
"qty_col": "Quantity",
|
||||
"cost_col": "Avg Price",
|
||||
"currency_col": "Currency",
|
||||
"name_col": None,
|
||||
}
|
||||
_validate_mapping(mapping, headers, first_row) # no raise
|
||||
|
||||
|
||||
def test_validate_mapping_missing_ticker_raises():
|
||||
from app.services.llm_csv_parser import LLMParseError, _validate_mapping
|
||||
|
||||
headers = ["Symbol", "Quantity"]
|
||||
first_row = ["AAPL", "100"]
|
||||
mapping = {"ticker_col": None, "qty_col": "Quantity"}
|
||||
with pytest.raises(LLMParseError, match="ticker"):
|
||||
_validate_mapping(mapping, headers, first_row)
|
||||
|
||||
|
||||
def test_validate_mapping_missing_qty_raises():
|
||||
from app.services.llm_csv_parser import LLMParseError, _validate_mapping
|
||||
|
||||
headers = ["Symbol", "Quantity"]
|
||||
first_row = ["AAPL", "100"]
|
||||
mapping = {"ticker_col": "Symbol", "qty_col": None}
|
||||
with pytest.raises(LLMParseError, match="qty"):
|
||||
_validate_mapping(mapping, headers, first_row)
|
||||
|
||||
|
||||
def test_validate_mapping_unknown_column_raises():
|
||||
from app.services.llm_csv_parser import LLMParseError, _validate_mapping
|
||||
|
||||
headers = ["Symbol", "Quantity"]
|
||||
first_row = ["AAPL", "100"]
|
||||
mapping = {"ticker_col": "Symbol", "qty_col": "NotARealColumn"}
|
||||
with pytest.raises(LLMParseError, match="NotARealColumn"):
|
||||
_validate_mapping(mapping, headers, first_row)
|
||||
|
||||
|
||||
def test_validate_mapping_non_numeric_qty_raises():
|
||||
from app.services.llm_csv_parser import LLMParseError, _validate_mapping
|
||||
|
||||
headers = ["Symbol", "Description"]
|
||||
first_row = ["AAPL", "Apple Inc"]
|
||||
# Mapping says qty is "Description", but "Apple Inc" can't parse as a number.
|
||||
mapping = {"ticker_col": "Symbol", "qty_col": "Description"}
|
||||
with pytest.raises(LLMParseError, match="numeric"):
|
||||
_validate_mapping(mapping, headers, first_row)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue