# read.markets/tests/test_llm_csv_parser.py

"""Unit + integration tests for the LLM-fallback CSV parser."""
from __future__ import annotations

import pytest


def test_csv_format_template_model_columns():
    """CsvFormatTemplate declares every required column and no user linkage."""
    from sqlalchemy import inspect

    from app.models import CsvFormatTemplate

    by_name = {column.name: column for column in inspect(CsvFormatTemplate).columns}

    required = (
        "fingerprint",
        "headers",
        "sample_row",
        "mapping",
        "preamble_rows",
        "delimiter",
        "broker_label",
        "first_seen_at",
        "use_count",
        "last_used_at",
        "model",
        "cost_usd",
        "prompt_tokens",
        "completion_tokens",
    )
    for name in required:
        assert name in by_name

    # Crucially, nothing on the table may attribute a template to a user.
    for forbidden in ("user_id", "first_seen_user_id"):
        assert forbidden not in by_name

    # The fingerprint acts as the cache key: unique and mandatory.
    fingerprint = by_name["fingerprint"]
    assert fingerprint.unique is True
    assert fingerprint.nullable is False


def test_fingerprint_stable_across_case_and_whitespace():
    """Header case and surrounding whitespace do not affect the fingerprint."""
    from app.services.llm_csv_parser import _fingerprint

    variants = (
        ["Symbol", "Quantity", "Avg Price"],
        ["symbol", "quantity", "avg price"],
        ["  SYMBOL ", "Quantity", " AVG PRICE"],
    )
    canonical, lowered, padded = (_fingerprint(v) for v in variants)
    assert canonical == lowered == padded


def test_fingerprint_differs_for_different_columns():
    """Header lists with different columns produce different fingerprints."""
    from app.services.llm_csv_parser import _fingerprint

    two_columns = _fingerprint(["Symbol", "Quantity"])
    three_columns = _fingerprint(["Symbol", "Quantity", "Avg Price"])
    assert two_columns != three_columns


def test_fingerprint_is_sha256_hex_64_chars():
    """The fingerprint is 64 characters long, all lowercase hexadecimal."""
    from app.services.llm_csv_parser import _fingerprint

    hex_alphabet = "0123456789abcdef"
    digest = _fingerprint(["Symbol", "Quantity"])
    assert len(digest) == 64
    assert all(ch in hex_alphabet for ch in digest)


def test_detect_dialect_no_preamble_comma():
    """A plain comma-separated file reports ',' and zero preamble rows."""
    from app.services.llm_csv_parser import _detect_dialect

    payload = b"Symbol,Quantity,Avg Price\nAAPL,100,150.25\nMSFT,50,310.00\n"
    detected = _detect_dialect(payload)
    delimiter, preamble = detected
    assert delimiter == ","
    assert preamble == 0


def test_detect_dialect_with_preamble():
    """Statement rows above the real header line are counted as preamble."""
    from app.services.llm_csv_parser import _detect_dialect

    lines = [
        b"Statement,Header,Field Name,Field Value\n",
        b"Statement,Data,BrokerName,Interactive Brokers LLC\n",
        b"Statement,Data,Title,Activity Statement\n",
        b"Statement,Data,Period,\"January 1, 2026 - January 31, 2026\"\n",
        b"Symbol,Quantity,Avg Price,Currency,Description\n",
        b"AAPL,100,150.25,USD,Apple Inc\n",
    ]
    delimiter, preamble = _detect_dialect(b"".join(lines))
    assert delimiter == ","
    # The header of the data section is the fifth line (index 4), so the four
    # lines above it are preamble.
    assert preamble == 4


def test_detect_dialect_tab_delimited():
    """A tab-separated file reports a tab delimiter and zero preamble rows."""
    from app.services.llm_csv_parser import _detect_dialect

    payload = b"Symbol\tQuantity\tAvg Price\nAAPL\t100\t150.25\n"
    detected = _detect_dialect(payload)
    delimiter, preamble = detected
    assert delimiter == "\t"
    assert preamble == 0


def test_detect_dialect_empty_raises():
    """Empty input is rejected with LLMParseError."""
    from app.services.llm_csv_parser import LLMParseError, _detect_dialect

    empty_payload = b""
    with pytest.raises(LLMParseError):
        _detect_dialect(empty_payload)


def test_validate_mapping_accepts_well_formed():
    """A complete, consistent mapping validates without raising."""
    from app.services.llm_csv_parser import _validate_mapping

    header_row = ["Symbol", "Quantity", "Avg Price", "Currency"]
    sample_row = ["AAPL", "100", "150.25", "USD"]
    good_mapping = dict(
        ticker_col="Symbol",
        qty_col="Quantity",
        cost_col="Avg Price",
        currency_col="Currency",
        name_col=None,
    )
    # Passing means the call returns normally; any exception fails the test.
    _validate_mapping(good_mapping, header_row, sample_row)


def test_validate_mapping_missing_ticker_raises():
    """A mapping with no ticker column raises an error mentioning 'ticker'."""
    from app.services.llm_csv_parser import LLMParseError, _validate_mapping

    header_row = ["Symbol", "Quantity"]
    sample_row = ["AAPL", "100"]
    no_ticker = dict(ticker_col=None, qty_col="Quantity")
    with pytest.raises(LLMParseError, match="ticker"):
        _validate_mapping(no_ticker, header_row, sample_row)


def test_validate_mapping_missing_qty_raises():
    """A mapping with no quantity column raises an error mentioning 'qty'."""
    from app.services.llm_csv_parser import LLMParseError, _validate_mapping

    header_row = ["Symbol", "Quantity"]
    sample_row = ["AAPL", "100"]
    no_qty = dict(ticker_col="Symbol", qty_col=None)
    with pytest.raises(LLMParseError, match="qty"):
        _validate_mapping(no_qty, header_row, sample_row)


def test_validate_mapping_unknown_column_raises():
    """Referencing a header that is not in the file raises, naming the column."""
    from app.services.llm_csv_parser import LLMParseError, _validate_mapping

    header_row = ["Symbol", "Quantity"]
    sample_row = ["AAPL", "100"]
    bogus_qty = dict(ticker_col="Symbol", qty_col="NotARealColumn")
    with pytest.raises(LLMParseError, match="NotARealColumn"):
        _validate_mapping(bogus_qty, header_row, sample_row)


def test_validate_mapping_non_numeric_qty_raises():
    """A quantity column whose sample value is not a number is rejected."""
    from app.services.llm_csv_parser import LLMParseError, _validate_mapping

    header_row = ["Symbol", "Description"]
    sample_row = ["AAPL", "Apple Inc"]
    # The mapping points qty at "Description", whose sample value "Apple Inc"
    # cannot be parsed as a number.
    text_as_qty = dict(ticker_col="Symbol", qty_col="Description")
    with pytest.raises(LLMParseError, match="numeric"):
        _validate_mapping(text_as_qty, header_row, sample_row)


def test_apply_mapping_builds_parsed_pie():
    """A full mapping turns data rows into a ParsedPie of ParsedPosition items."""
    from app.services.csv_import import ParsedPie, ParsedPosition
    from app.services.llm_csv_parser import _apply_mapping

    header_row = ["Symbol", "Quantity", "Avg Price", "Currency", "Description"]
    body_rows = [
        ["AAPL", "100", "150.25", "USD", "Apple Inc"],
        ["MSFT", "50", "310.00", "USD", "Microsoft Corp"],
    ]
    full_mapping = dict(
        ticker_col="Symbol",
        qty_col="Quantity",
        cost_col="Avg Price",
        currency_col="Currency",
        name_col="Description",
    )

    pie = _apply_mapping(header_row, body_rows, full_mapping)

    assert isinstance(pie, ParsedPie)
    assert len(pie.positions) == 2

    first = pie.positions[0]
    assert isinstance(first, ParsedPosition)
    assert first.slice == "AAPL"
    assert first.name == "Apple Inc"
    assert first.quantity == 100.0
    # invested = qty * avg_cost = 100 * 150.25 = 15025
    assert first.invested_value == pytest.approx(15025.0)
    # The pie-level total adds the second row's 50 * 310.00 on top.
    assert pie.invested == pytest.approx(15025.0 + 50 * 310.00)


def test_apply_mapping_handles_missing_optional_columns():
    """With only ticker and quantity mapped, optional fields get fallbacks."""
    from app.services.llm_csv_parser import _apply_mapping

    header_row = ["Symbol", "Quantity"]
    body_rows = [["AAPL", "100"]]
    minimal_mapping = dict(
        ticker_col="Symbol",
        qty_col="Quantity",
        cost_col=None,
        currency_col=None,
        name_col=None,
    )

    pie = _apply_mapping(header_row, body_rows, minimal_mapping)
    only = pie.positions[0]

    assert only.slice == "AAPL"
    assert only.quantity == 100.0
    # No cost column means there is nothing to compute an invested value from.
    assert only.invested_value is None
    # Without a name column the ticker doubles as the display name.
    assert only.name == "AAPL"


def test_apply_mapping_skips_blank_and_unparseable_rows():
    """Blank rows and rows with a non-numeric quantity are dropped."""
    from app.services.llm_csv_parser import _apply_mapping

    header_row = ["Symbol", "Quantity"]
    body_rows = [
        ["AAPL", "100"],
        ["", ""],                   # blank row
        ["MSFT", "not-a-number"],   # quantity cannot be parsed
        ["NVDA", "40"],
    ]
    mapping = dict(ticker_col="Symbol", qty_col="Quantity")

    pie = _apply_mapping(header_row, body_rows, mapping)

    kept = [position.slice for position in pie.positions]
    assert kept == ["AAPL", "NVDA"]


async def test_extract_mapping_via_llm_parses_valid_json():
    """A well-formed JSON reply is returned as the mapping, with the call log.

    ``call_llm`` is replaced through ``patch.object`` so the stub is removed
    again when the ``with`` block exits; a bare ``mod.call_llm = ...``
    assignment would leave the fake installed for every later test.
    """
    from unittest.mock import AsyncMock, MagicMock, patch

    import app.services.llm_csv_parser as mod
    from app.services.llm_csv_parser import _extract_mapping_via_llm
    from app.services.openrouter import LogResult

    fake_result = LogResult(
        content='{"ticker_col": "Symbol", "qty_col": "Quantity", '
                '"cost_col": "Avg Price", "currency_col": "Currency", '
                '"name_col": null, "broker_label": "IBKR Activity Statement"}',
        model="deepseek/deepseek-v4-flash",
        prompt_tokens=100,
        completion_tokens=50,
        cost_usd=0.0001,
    )
    fake_client = MagicMock()
    fake_call_llm = AsyncMock(return_value=fake_result)

    headers = ["Symbol", "Quantity", "Avg Price", "Currency"]
    samples = [["AAPL", "100", "150.25", "USD"]]

    # Scoped patch: the original call_llm is restored on exit, pass or fail.
    with patch.object(mod, "call_llm", fake_call_llm):
        mapping, log = await _extract_mapping_via_llm(fake_client, headers, samples)

    assert mapping["ticker_col"] == "Symbol"
    assert mapping["qty_col"] == "Quantity"
    assert mapping["broker_label"] == "IBKR Activity Statement"
    assert log.model == "deepseek/deepseek-v4-flash"
    fake_call_llm.assert_awaited_once()


async def test_extract_mapping_via_llm_malformed_json_raises():
    """A reply that is not JSON raises LLMParseError mentioning 'JSON'.

    The ``call_llm`` stub is installed with ``patch.object`` so it is undone
    when the block exits instead of leaking into later tests.
    """
    from unittest.mock import AsyncMock, MagicMock, patch

    import app.services.llm_csv_parser as mod
    from app.services.llm_csv_parser import LLMParseError, _extract_mapping_via_llm
    from app.services.openrouter import LogResult

    fake_result = LogResult(
        content="Sure thing — here is the mapping!  ticker=Symbol",
        model="deepseek/deepseek-v4-flash",
        prompt_tokens=10,
        completion_tokens=20,
        cost_usd=0.00005,
    )
    fake_client = MagicMock()
    fake_call_llm = AsyncMock(return_value=fake_result)

    # Scoped patch: the original call_llm is restored on exit, pass or fail.
    with patch.object(mod, "call_llm", fake_call_llm):
        with pytest.raises(LLMParseError, match="JSON"):
            await _extract_mapping_via_llm(fake_client, ["Symbol"], [["AAPL"]])


async def test_extract_mapping_via_llm_provider_failure_wraps():
    """An exception from call_llm surfaces as LLMParseError mentioning 'provider'.

    The failing stub is installed with ``patch.object``; a bare assignment
    would leave a permanently raising ``call_llm`` on the module for every
    test that runs afterwards.
    """
    from unittest.mock import AsyncMock, MagicMock, patch

    import app.services.llm_csv_parser as mod
    from app.services.llm_csv_parser import LLMParseError, _extract_mapping_via_llm

    fake_client = MagicMock()
    fake_call_llm = AsyncMock(side_effect=RuntimeError("provider down"))

    # Scoped patch: the original call_llm is restored on exit, pass or fail.
    with patch.object(mod, "call_llm", fake_call_llm):
        with pytest.raises(LLMParseError, match="provider"):
            await _extract_mapping_via_llm(fake_client, ["Symbol"], [["AAPL"]])


async def test_parse_with_llm_cache_miss_inserts_template(db_factory):
    """On a cache miss the LLM is consulted once and one template row is stored.

    ``db_factory`` is a fixture used here as a callable that yields async
    sessions. ``call_llm`` is stubbed through ``patch.object`` so the fake is
    removed after the parse call rather than leaking into later tests.
    """
    from unittest.mock import AsyncMock, patch

    from sqlalchemy import select

    import app.services.llm_csv_parser as mod
    from app.models import CsvFormatTemplate
    from app.services.llm_csv_parser import parse_with_llm
    from app.services.openrouter import LogResult

    factory = db_factory

    raw = (
        b"Symbol,Quantity,Avg Price,Currency\n"
        b"AAPL,100,150.25,USD\n"
        b"MSFT,50,310.00,USD\n"
    )

    fake_call_llm = AsyncMock(return_value=LogResult(
        content='{"ticker_col":"Symbol","qty_col":"Quantity",'
                '"cost_col":"Avg Price","currency_col":"Currency",'
                '"name_col":null,"broker_label":"Generic broker"}',
        model="deepseek/deepseek-v4-flash",
        prompt_tokens=120, completion_tokens=40, cost_usd=0.0002,
    ))

    # Scoped patch: the original call_llm is restored on exit, pass or fail.
    with patch.object(mod, "call_llm", fake_call_llm):
        async with factory() as session:
            pie = await parse_with_llm(raw, session)

    # A cache miss must hit the LLM exactly once.
    fake_call_llm.assert_awaited_once()

    assert len(pie.positions) == 2
    assert pie.positions[0].slice == "AAPL"

    # Read back through a fresh session to check what was persisted.
    async with factory() as session:
        rows = (await session.execute(select(CsvFormatTemplate))).scalars().all()
    assert len(rows) == 1
    tmpl = rows[0]
    assert tmpl.headers == ["Symbol", "Quantity", "Avg Price", "Currency"]
    assert tmpl.sample_row == ["AAPL", "100", "150.25", "USD"]
    assert tmpl.mapping["ticker_col"] == "Symbol"
    assert tmpl.broker_label == "Generic broker"
    assert tmpl.use_count == 1
    assert tmpl.cost_usd == pytest.approx(0.0002)
    # The crucial PII guarantee:
    assert not hasattr(tmpl, "user_id"), "sample row must not be linked to a user"


async def test_parse_with_llm_cache_hit_skips_llm(db_factory):
    """A stored template for the same headers is reused without calling the LLM.

    ``db_factory`` is a fixture used here as a callable that yields async
    sessions. ``call_llm`` is stubbed through ``patch.object`` so the fake is
    removed afterwards. The stub raises if awaited, and the test also asserts
    explicitly that it was never called, so the guard holds even if the
    parser were to catch that exception.
    """
    from unittest.mock import AsyncMock, patch

    from sqlalchemy import select

    import app.services.llm_csv_parser as mod
    from app.db import utcnow
    from app.models import CsvFormatTemplate
    from app.services.llm_csv_parser import _fingerprint, parse_with_llm

    factory = db_factory

    headers = ["Symbol", "Quantity", "Avg Price", "Currency"]
    fp = _fingerprint(headers)

    # Pre-populate a cache hit row.
    async with factory() as session:
        session.add(CsvFormatTemplate(
            fingerprint=fp,
            headers=headers,
            sample_row=["AAPL", "100", "150.25", "USD"],
            mapping={
                "ticker_col": "Symbol", "qty_col": "Quantity",
                "cost_col": "Avg Price", "currency_col": "Currency",
                "name_col": None,
            },
            preamble_rows=0,
            delimiter=",",
            broker_label="Cached broker",
            first_seen_at=utcnow(),
            last_used_at=utcnow(),
            use_count=1,
            model="seed",
            cost_usd=0.0,
        ))
        await session.commit()

    raw = (
        b"Symbol,Quantity,Avg Price,Currency\n"
        b"NVDA,40,425.50,USD\n"
    )

    fake_call_llm = AsyncMock(
        side_effect=AssertionError("call_llm must NOT be called on cache hit")
    )

    # Scoped patch: the original call_llm is restored on exit, pass or fail.
    with patch.object(mod, "call_llm", fake_call_llm):
        async with factory() as session:
            pie = await parse_with_llm(raw, session)

    # Explicit check that does not depend on the side-effect propagating.
    fake_call_llm.assert_not_called()

    assert pie.positions[0].slice == "NVDA"

    # The seeded row is still the only template and its use counter moved 1 -> 2.
    async with factory() as session:
        rows = (await session.execute(select(CsvFormatTemplate))).scalars().all()
    assert len(rows) == 1
    assert rows[0].use_count == 2


async def test_parse_with_llm_stale_mapping_raises_but_does_not_evict(db_factory):
    """A cached mapping that parses nothing raises, and the template row stays.

    ``db_factory`` is a fixture used here as a callable that yields async
    sessions. ``call_llm`` is stubbed through ``patch.object`` so the fake is
    removed afterwards.

    The stub raises AssertionError if awaited, but this test expects an
    LLMParseError anyway. If the parser converted the stub's exception into
    LLMParseError, the ``pytest.raises`` block alone could not tell whether
    the LLM was contacted, so ``assert_not_called()`` checks that directly.
    """
    from unittest.mock import AsyncMock, patch

    from sqlalchemy import select

    import app.services.llm_csv_parser as mod
    from app.db import utcnow
    from app.models import CsvFormatTemplate
    from app.services.llm_csv_parser import LLMParseError, _fingerprint, parse_with_llm

    factory = db_factory

    headers = ["Symbol", "Quantity"]
    fp = _fingerprint(headers)
    # Cached mapping says qty is in column "Symbol" — clearly wrong; will
    # never produce a parseable row.
    async with factory() as session:
        session.add(CsvFormatTemplate(
            fingerprint=fp, headers=headers,
            sample_row=["AAPL", "100"],
            mapping={"ticker_col": "Symbol", "qty_col": "Symbol"},
            preamble_rows=0, delimiter=",", broker_label=None,
            first_seen_at=utcnow(), last_used_at=utcnow(), use_count=1,
            model="seed", cost_usd=0.0,
        ))
        await session.commit()

    raw = b"Symbol,Quantity\nAAPL,100\nMSFT,50\n"

    fake_call_llm = AsyncMock(side_effect=AssertionError("must not be called"))

    # Scoped patch: the original call_llm is restored on exit, pass or fail.
    with patch.object(mod, "call_llm", fake_call_llm):
        async with factory() as session:
            with pytest.raises(LLMParseError):
                await parse_with_llm(raw, session)

    # The failure must come from the stale mapping, not from an LLM retry.
    fake_call_llm.assert_not_called()

    # Stale template must NOT have been auto-deleted (operator owns eviction).
    async with factory() as session:
        rows = (await session.execute(select(CsvFormatTemplate))).scalars().all()
    assert len(rows) == 1


async def test_parse_portfolio_route_falls_through_to_llm(db_factory, monkeypatch):
    """End-to-end: T212 parser raises CSVImportError, LLM fallback runs,
    response shape matches the existing JSON contract.

    Every stub, including ``call_llm``, is installed through ``monkeypatch``
    so pytest reverts it at teardown, and the fixture file is read inside a
    ``with`` block so its handle is closed.
    """
    from io import BytesIO
    from types import SimpleNamespace
    from unittest.mock import AsyncMock

    from fastapi import UploadFile

    factory = db_factory

    import app.services.llm_csv_parser as mod
    from app.services.openrouter import LogResult
    fake_call_llm = AsyncMock(return_value=LogResult(
        content='{"ticker_col":"Symbol","qty_col":"Quantity",'
                '"cost_col":"Avg Price","currency_col":"Currency",'
                '"name_col":"Description",'
                '"broker_label":"IBKR Activity Statement"}',
        model="deepseek/deepseek-v4-flash",
        prompt_tokens=150, completion_tokens=60, cost_usd=0.0003,
    ))
    # Reverted automatically at teardown, unlike a bare attribute assignment.
    monkeypatch.setattr(mod, "call_llm", fake_call_llm)

    # The route's inline Yahoo-fetch block would otherwise hit the network.
    # Patch market.fetch to return a benign placeholder per ticker.
    from app.services import market as market_mod

    async def _fake_fetch(client, symbol, label, group, anchor):
        return SimpleNamespace(
            symbol=symbol, source="test", label=label,
            price=None, currency="USD", as_of="2026-05-27",
            changes=None, error=None,
        )
    monkeypatch.setattr(market_mod, "fetch", _fake_fetch)

    # ticker_universe.upsert_tickers uses MySQL ON DUPLICATE KEY UPDATE
    # which SQLite doesn't compile. Mock the two universe-side effects;
    # neither contributes to the JSON contract we're testing here.
    from app.services import ticker_universe as tu_mod

    async def _fake_upsert(session, tickers):
        return len(list(tickers))

    async def _fake_buffer(tickers):
        return len(list(tickers))

    monkeypatch.setattr(tu_mod, "upsert_tickers", _fake_upsert)
    monkeypatch.setattr(tu_mod, "buffer_tickers", _fake_buffer)

    # Context manager closes the handle; the path is relative to the cwd.
    with open("tests/fixtures/ibkr_sample.csv", "rb") as fixture:
        raw = fixture.read()
    upload = UploadFile(filename="ibkr.csv", file=BytesIO(raw))

    # Imported only after the stubs above are in place.
    from app.routers.universe import parse_portfolio
    async with factory() as session:
        result = await parse_portfolio(file=upload, session=session)

    assert result["base_currency"] == "GBP"
    # All 5 IBKR positions should round-trip — the LLM path trusts the
    # Yahoo-ready tickers from the file and does NOT drop on a
    # resolve_slice miss (that's the T212 path's behaviour).
    tickers = {p["yahoo_ticker"] for p in result["positions"]}
    assert tickers == {"AAPL", "MSFT", "NVDA", "VOD.L", "ASML.AS"}
    # LLM was called exactly once (cache miss).
    assert fake_call_llm.await_count == 1
    # Currency comes from the LLM-mapped currency_col, falling back to
    # USD only when neither InstrumentMap nor the file specified one.
    by_t = {p["yahoo_ticker"]: p["currency"] for p in result["positions"]}
    assert by_t["VOD.L"] == "GBP"
    assert by_t["ASML.AS"] == "EUR"


def test_parse_portfolio_route_requires_paid():
    """Static check that the /portfolio/parse route is gated by require_paid.

    A missing route is reported through a descriptive assertion rather than
    a bare ``StopIteration`` escaping from ``next()``.
    """
    from app.routers.universe import router
    from app.services.access import require_paid

    parse_route = next(
        (r for r in router.routes if getattr(r, "path", "") == "/portfolio/parse"),
        None,
    )
    assert parse_route is not None, (
        "No route with path '/portfolio/parse' is registered on the router"
    )
    # FastAPI stores each Depends(...) as a Dependant whose `.call` attribute
    # is the wrapped callable (`.dependency` is the older name, removed in
    # recent FastAPI versions).
    dep_callables = [d.call for d in parse_route.dependant.dependencies]
    assert require_paid in dep_callables, (
        "The /portfolio/parse route must have Depends(require_paid)"
    )