csv-parser: add public parse_with_llm with cache hit/miss orchestration
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
c77b3564f3
commit
59b28506df
2 changed files with 260 additions and 0 deletions
|
|
@ -23,7 +23,12 @@ import io
|
|||
import json
|
||||
|
||||
import httpx
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.db import utcnow
|
||||
from app.logging import get_logger
|
||||
from app.models import CsvFormatTemplate
|
||||
from app.services.csv_import import CSVImportError, ParsedPie, ParsedPosition
|
||||
from app.services.openrouter import LogResult, call_llm
|
||||
|
||||
|
|
@ -43,6 +48,11 @@ _LLM_MAX_TOKENS = 400
|
|||
# NOTE(review): presumably the keys every column mapping must provide —
# confirm against _validate_mapping, which is not visible in this hunk.
_REQUIRED_MAPPING_KEYS = ("ticker_col", "qty_col")
|
||||
# NOTE(review): presumably keys a mapping may omit — confirm against
# _validate_mapping / _apply_mapping.
_OPTIONAL_MAPPING_KEYS = ("name_col", "cost_col", "currency_col")
|
||||
|
||||
# Maximum CSV payload size accepted by parse_with_llm: 1 MiB
# (1,048,576 bytes). Larger payloads are rejected with LLMParseError
# before any decoding or parsing is attempted.
|
||||
_MAX_CSV_BYTES = 1_048_576
|
||||
|
||||
# Module logger; parse_with_llm calls it with an event name followed by
# keyword fields (e.g. fingerprint=..., use_count=...).
log = get_logger("llm_csv_parser")
|
||||
|
||||
|
||||
_SYSTEM_PROMPT = """\
|
||||
You are an expert at recognising broker portfolio CSV formats.
|
||||
|
|
@ -329,3 +339,87 @@ async def _extract_mapping_via_llm(
|
|||
if not isinstance(mapping, dict):
|
||||
raise LLMParseError("LLM JSON was not an object")
|
||||
return mapping, result
|
||||
|
||||
|
||||
async def parse_with_llm(raw: bytes, session: AsyncSession) -> ParsedPie:
    """Cache-first LLM-fallback CSV parse.

    The header row is fingerprinted and looked up in ``CsvFormatTemplate``.
    On cache hit, applies the stored mapping deterministically and
    increments ``use_count``. On cache miss, calls the LLM, validates
    the returned mapping against the first data row, and persists a
    new ``CsvFormatTemplate``.

    Args:
        raw: The CSV payload as bytes; at most ``_MAX_CSV_BYTES`` long.
        session: Async SQLAlchemy session used for the template lookup
            and write. It is committed on both successful paths.

    Returns:
        The ``ParsedPie`` obtained by applying the (cached or freshly
        extracted) mapping to the data rows.

    Raises:
        LLMParseError: If the payload is too large or empty, no header
            row or non-blank data row is found, or the mapping yields
            no positions. The caller (route layer) maps that to a 400.
    """
    if len(raw) > _MAX_CSV_BYTES:
        raise LLMParseError("CSV too large (1 MB max)")
    if not raw or not raw.strip():
        raise LLMParseError("empty CSV")

    delimiter, preamble_rows = _detect_dialect(raw)
    text = _decode_raw(raw)

    rows = list(csv.reader(io.StringIO(text), delimiter=delimiter))
    if preamble_rows >= len(rows):
        raise LLMParseError("no header row found in CSV")
    headers = [c.strip() for c in rows[preamble_rows]]
    data_rows = rows[preamble_rows + 1:]
    # A row made only of blank cells (e.g. ",,") is a non-empty list, so
    # test the cells themselves rather than the list's truthiness.
    if not any(headers):
        raise LLMParseError("empty header row")

    # Filter blank rows once: the first survivor is used for validation,
    # and the leading survivors become the LLM samples.
    non_blank_rows = [r for r in data_rows if any(c.strip() for c in r)]
    if not non_blank_rows:
        raise LLMParseError("CSV contains a header but no data rows")
    first_data_row = non_blank_rows[0]

    fp = _fingerprint(headers)
    existing = (await session.execute(
        select(CsvFormatTemplate).where(CsvFormatTemplate.fingerprint == fp)
    )).scalar_one_or_none()

    if existing is not None:
        log.info("csv.format.cache_hit", fingerprint=fp,
                 broker_label=existing.broker_label, use_count=existing.use_count)
        pie = _apply_mapping(headers, data_rows, existing.mapping)
        if not pie.positions:
            raise LLMParseError(
                "cached mapping produced no positions — the broker may have "
                "changed their CSV shape; ask the operator to evict the "
                "stale template"
            )
        # Usage stats are only bumped once the cached mapping has proven
        # it still produces positions for this file.
        existing.use_count += 1
        existing.last_used_at = utcnow()
        await session.commit()
        return pie

    log.info("csv.format.cache_miss", fingerprint=fp,
             header_count=len(headers))
    # Slice *after* dropping blank rows so that blank lines directly under
    # the header cannot shrink (or empty) the sample set sent to the LLM.
    samples = non_blank_rows[:_LLM_SAMPLES]
    async with httpx.AsyncClient(follow_redirects=True, timeout=30) as client:
        mapping, llm_log = await _extract_mapping_via_llm(client, headers, samples)
    _validate_mapping(mapping, headers, first_data_row)

    pie = _apply_mapping(headers, data_rows, mapping)
    if not pie.positions:
        raise LLMParseError(
            "LLM mapping validated but produced no positions — the file "
            "may not contain portfolio data"
        )

    # Persist the template only after the mapping has produced positions,
    # so a useless mapping is never cached.
    # NOTE(review): two concurrent misses for the same fingerprint would
    # both reach this insert — confirm how the schema handles duplicates.
    now = utcnow()
    session.add(CsvFormatTemplate(
        fingerprint=fp,
        headers=headers,
        sample_row=first_data_row,
        mapping=mapping,
        preamble_rows=preamble_rows,
        delimiter=delimiter,
        broker_label=mapping.get("broker_label"),
        first_seen_at=now,
        last_used_at=now,
        use_count=1,
        llm_model=llm_log.model,
        llm_cost_usd=llm_log.cost_usd,
    ))
    await session.commit()
    return pie
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue