csv-parser: add _extract_mapping_via_llm with provider-failure wrapping

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Giorgio Gilestro 2026-05-27 12:21:19 +02:00
parent b99f46d2fc
commit c77b3564f3
2 changed files with 148 additions and 0 deletions

View file

@ -20,8 +20,12 @@ from __future__ import annotations
import csv
import hashlib
import io
import json
import httpx
from app.services.csv_import import CSVImportError, ParsedPie, ParsedPosition
from app.services.openrouter import LogResult, call_llm
# ---------------------------------------------------------------------------
# Module-level constants
@ -31,11 +35,41 @@ from app.services.csv_import import CSVImportError, ParsedPie, ParsedPosition
# Real broker preambles are typically 1-10 lines.
_MAX_PREAMBLE_SCAN = 30
# Number of sample rows to send to the LLM and max token budget for the reply.
_LLM_SAMPLES = 5
_LLM_MAX_TOKENS = 400
# Required and optional keys in the LLM-returned column mapping.
_REQUIRED_MAPPING_KEYS = ("ticker_col", "qty_col")
_OPTIONAL_MAPPING_KEYS = ("name_col", "cost_col", "currency_col")
_SYSTEM_PROMPT = """\
You are an expert at recognising broker portfolio CSV formats.
You will be given the header row and 3-5 sample data rows from a CSV.
Identify which column contains each field. Return ONLY a single JSON
object, no prose, no markdown fences.
Schema (use the EXACT header string from the input; use null if no
column is a good match):
{
"ticker_col": "<header name or null>",
"qty_col": "<header name or null>",
"name_col": "<header name or null>",
"cost_col": "<header name or null>", // average price per share
"currency_col": "<header name or null>",
"broker_label": "<short identifier like 'IBKR Activity Statement' or null>"
}
Rules:
- ticker_col and qty_col are required. If either is missing, return all nulls.
- Use the EXACT header string as it appears in the input do not paraphrase.
- Output JSON ONLY. No prose, no code fences.
"""
class LLMParseError(CSVImportError):
"""Raised when the LLM call fails or returns an unusable mapping.
@ -251,3 +285,47 @@ def _apply_mapping(
value=None,
result=None,
)
def _build_user_prompt(headers: list[str], samples: list[list[str]]) -> str:
lines = ["headers: " + json.dumps(headers)]
lines.append("samples:")
for s in samples[:_LLM_SAMPLES]:
lines.append(" " + ",".join(s))
return "\n".join(lines)
async def _extract_mapping_via_llm(
client: httpx.AsyncClient,
headers: list[str],
samples: list[list[str]],
) -> tuple[dict, LogResult]:
"""Single LLM call returning ``(mapping_dict, LogResult)``.
The LLM is asked for a strict JSON object (no markdown). We attempt
to parse the returned content; ``LLMParseError`` wraps any failure
in a way callers can surface to the user."""
messages = [
{"role": "system", "content": _SYSTEM_PROMPT},
{"role": "user", "content": _build_user_prompt(headers, samples)},
]
try:
result = await call_llm(client, messages, max_tokens=_LLM_MAX_TOKENS)
except Exception as e:
raise LLMParseError(f"LLM provider failed: {e}") from e
content = (result.content or "").strip()
# Strip code fences if the model added them despite instructions.
if content.startswith("```"):
content = content.strip("`")
# Drop optional 'json' language tag.
if content.lstrip().lower().startswith("json"):
content = content.lstrip()[4:]
content = content.strip()
try:
mapping = json.loads(content)
except json.JSONDecodeError as e:
raise LLMParseError(f"LLM did not return valid JSON: {e}") from e
if not isinstance(mapping, dict):
raise LLMParseError("LLM JSON was not an object")
return mapping, result