csv-parser: add public parse_with_llm with cache hit/miss orchestration
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
c77b3564f3
commit
59b28506df
2 changed files with 260 additions and 0 deletions
|
|
@@ -4,6 +4,27 @@ from __future__ import annotations
|
|||
import pytest
|
||||
|
||||
|
||||
def _build_session_factory(tmp_path):
    """Create a throwaway SQLite database under ``tmp_path`` for a test.

    Follows the same approach as tests/test_referral_conversion.py.

    The new engine and session factory are also stored on ``app.db`` as
    ``_engine`` and ``_session_factory`` (presumably so application code
    resolves to the test database — confirm against ``app.db``).

    Returns:
        A 3-tuple ``(engine, factory, setup)``. ``setup`` is a coroutine
        function; awaiting ``setup()`` creates every table registered on
        ``Base.metadata``.
    """
    from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine

    from app import db as db_mod
    from app.db import Base
    import app.models  # noqa: F401 — registers models on Base.metadata

    db_url = f"sqlite+aiosqlite:///{tmp_path}/csv.db"
    engine = create_async_engine(db_url)
    factory = async_sessionmaker(engine, expire_on_commit=False)

    # Overwrite the module-level handles on app.db with the test ones.
    db_mod._engine, db_mod._session_factory = engine, factory

    async def _create_schema():
        # Schema creation is deferred so the caller decides when to await it.
        async with engine.begin() as connection:
            await connection.run_sync(Base.metadata.create_all)

    return engine, factory, _create_schema
|
||||
|
||||
|
||||
def test_csv_format_template_model_columns():
|
||||
"""Model exposes every column the spec requires, with correct types."""
|
||||
from sqlalchemy import inspect
|
||||
|
|
@@ -290,3 +311,148 @@ async def test_extract_mapping_via_llm_provider_failure_wraps():
|
|||
|
||||
with pytest.raises(LLMParseError, match="provider"):
|
||||
await _extract_mapping_via_llm(fake_client, ["Symbol"], [["AAPL"]])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_parse_with_llm_cache_miss_inserts_template(tmp_path):
    """With no stored template, parse_with_llm asks the LLM and saves one row.

    Checks the parsed positions, the persisted ``CsvFormatTemplate`` fields,
    and that the template carries no ``user_id`` attribute.
    """
    from unittest.mock import AsyncMock, patch

    from sqlalchemy import select

    from app.models import CsvFormatTemplate
    from app.services.llm_csv_parser import parse_with_llm
    from app.services.openrouter import LogResult

    import app.services.llm_csv_parser as mod

    engine, factory, setup = _build_session_factory(tmp_path)

    raw = (
        b"Symbol,Quantity,Avg Price,Currency\n"
        b"AAPL,100,150.25,USD\n"
        b"MSFT,50,310.00,USD\n"
    )

    llm_mock = AsyncMock(return_value=LogResult(
        content='{"ticker_col":"Symbol","qty_col":"Quantity",'
                '"cost_col":"Avg Price","currency_col":"Currency",'
                '"name_col":null,"broker_label":"Generic broker"}',
        model="deepseek/deepseek-v4-flash",
        prompt_tokens=120, completion_tokens=40, cost_usd=0.0002,
    ))

    try:
        await setup()

        # patch.object puts the real call_llm back on exit, so the mock
        # cannot leak into tests that run after this one.
        with patch.object(mod, "call_llm", llm_mock):
            async with factory() as session:
                pie = await parse_with_llm(raw, session)

        # A cache miss must actually have consulted the (mocked) LLM.
        llm_mock.assert_awaited()

        assert len(pie.positions) == 2
        assert pie.positions[0].slice == "AAPL"

        # Read back through a fresh session to see what was persisted.
        async with factory() as session:
            rows = (await session.execute(select(CsvFormatTemplate))).scalars().all()

        assert len(rows) == 1
        tmpl = rows[0]
        assert tmpl.headers == ["Symbol", "Quantity", "Avg Price", "Currency"]
        assert tmpl.sample_row == ["AAPL", "100", "150.25", "USD"]
        assert tmpl.mapping["ticker_col"] == "Symbol"
        assert tmpl.broker_label == "Generic broker"
        assert tmpl.use_count == 1
        assert tmpl.llm_cost_usd == pytest.approx(0.0002)
        # The crucial PII guarantee:
        assert not hasattr(tmpl, "user_id"), "sample row must not be linked to a user"
    finally:
        # Release pooled connections held by the per-test engine.
        await engine.dispose()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_parse_with_llm_cache_hit_skips_llm(tmp_path):
    """A stored template matching the upload's headers is reused without the LLM.

    Seeds one ``CsvFormatTemplate`` keyed by the headers' fingerprint, parses
    a file with the same headers, and checks that the LLM was never called
    and that the template's ``use_count`` went from 1 to 2.
    """
    from unittest.mock import AsyncMock, patch

    from sqlalchemy import select

    from app.db import utcnow
    from app.models import CsvFormatTemplate
    from app.services.llm_csv_parser import _fingerprint, parse_with_llm

    import app.services.llm_csv_parser as mod

    engine, factory, setup = _build_session_factory(tmp_path)

    headers = ["Symbol", "Quantity", "Avg Price", "Currency"]
    fp = _fingerprint(headers)

    raw = (
        b"Symbol,Quantity,Avg Price,Currency\n"
        b"NVDA,40,425.50,USD\n"
    )

    llm_mock = AsyncMock(
        side_effect=AssertionError("call_llm must NOT be called on cache hit")
    )

    try:
        await setup()

        # Pre-populate a cache hit row.
        async with factory() as session:
            session.add(CsvFormatTemplate(
                fingerprint=fp,
                headers=headers,
                sample_row=["AAPL", "100", "150.25", "USD"],
                mapping={
                    "ticker_col": "Symbol", "qty_col": "Quantity",
                    "cost_col": "Avg Price", "currency_col": "Currency",
                    "name_col": None,
                },
                preamble_rows=0,
                delimiter=",",
                broker_label="Cached broker",
                first_seen_at=utcnow(),
                last_used_at=utcnow(),
                use_count=1,
                llm_model="seed",
                llm_cost_usd=0.0,
            ))
            await session.commit()

        # patch.object puts the real call_llm back on exit, so the mock
        # cannot leak into tests that run after this one.
        with patch.object(mod, "call_llm", llm_mock):
            async with factory() as session:
                pie = await parse_with_llm(raw, session)

        # Explicit check: the side_effect alone is not enough if the code
        # under test catches and wraps exceptions raised by call_llm.
        llm_mock.assert_not_called()

        assert pie.positions[0].slice == "NVDA"

        async with factory() as session:
            rows = (await session.execute(select(CsvFormatTemplate))).scalars().all()

        assert len(rows) == 1
        assert rows[0].use_count == 2
    finally:
        # Release pooled connections held by the per-test engine.
        await engine.dispose()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_parse_with_llm_stale_mapping_raises_but_does_not_evict(tmp_path):
    """A cached mapping that cannot parse the file raises and is left in place.

    Seeds a template whose mapping points ``qty_col`` at the ticker column,
    then checks that parse_with_llm raises ``LLMParseError``, never calls the
    LLM, and does not delete the stored template.
    """
    from unittest.mock import AsyncMock, patch

    from sqlalchemy import select

    from app.db import utcnow
    from app.models import CsvFormatTemplate
    from app.services.llm_csv_parser import LLMParseError, _fingerprint, parse_with_llm

    import app.services.llm_csv_parser as mod

    engine, factory, setup = _build_session_factory(tmp_path)

    headers = ["Symbol", "Quantity"]
    fp = _fingerprint(headers)

    raw = b"Symbol,Quantity\nAAPL,100\nMSFT,50\n"

    llm_mock = AsyncMock(side_effect=AssertionError("must not be called"))

    try:
        await setup()

        # Cached mapping says qty is in column "Symbol" — clearly wrong; will
        # never produce a parseable row.
        async with factory() as session:
            session.add(CsvFormatTemplate(
                fingerprint=fp, headers=headers,
                sample_row=["AAPL", "100"],
                mapping={"ticker_col": "Symbol", "qty_col": "Symbol"},
                preamble_rows=0, delimiter=",", broker_label=None,
                first_seen_at=utcnow(), last_used_at=utcnow(), use_count=1,
                llm_model="seed", llm_cost_usd=0.0,
            ))
            await session.commit()

        # patch.object puts the real call_llm back on exit, so the mock
        # cannot leak into tests that run after this one.
        with patch.object(mod, "call_llm", llm_mock):
            async with factory() as session:
                with pytest.raises(LLMParseError):
                    await parse_with_llm(raw, session)

        # If the parser wrapped the mock's AssertionError into LLMParseError,
        # pytest.raises above would pass for the wrong reason; rule that out.
        llm_mock.assert_not_called()

        # Stale template must NOT have been auto-deleted (operator owns eviction).
        async with factory() as session:
            rows = (await session.execute(select(CsvFormatTemplate))).scalars().all()

        assert len(rows) == 1
    finally:
        # Release pooled connections held by the per-test engine.
        await engine.dispose()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue