"""Unit + integration tests for the LLM-fallback CSV parser."""

from __future__ import annotations

import pytest


def test_csv_format_template_model_columns():
    """Model exposes every column the spec requires, with correct types."""
    from sqlalchemy import inspect

    from app.models import CsvFormatTemplate

    cols = {c.name: c for c in inspect(CsvFormatTemplate).columns}

    required = (
        "fingerprint",
        "headers",
        "sample_row",
        "mapping",
        "preamble_rows",
        "delimiter",
        "broker_label",
        "first_seen_at",
        "use_count",
        "last_used_at",
        "llm_model",
        "llm_cost_usd",
    )
    for name in required:
        assert name in cols, f"missing column: {name}"

    # Crucially, no user attribution.
    assert "user_id" not in cols
    assert "first_seen_user_id" not in cols

    # Fingerprint is the cache key.
    assert cols["fingerprint"].unique is True
    assert cols["fingerprint"].nullable is False


def test_fingerprint_stable_across_case_and_whitespace():
    """Header case and surrounding whitespace do not change the fingerprint."""
    from app.services.llm_csv_parser import _fingerprint

    a = _fingerprint(["Symbol", "Quantity", "Avg Price"])
    b = _fingerprint(["symbol", "quantity", "avg price"])
    c = _fingerprint(["  SYMBOL ", "Quantity", " AVG PRICE"])
    assert a == b == c


def test_fingerprint_differs_for_different_columns():
    """Adding a column yields a different fingerprint."""
    from app.services.llm_csv_parser import _fingerprint

    a = _fingerprint(["Symbol", "Quantity"])
    b = _fingerprint(["Symbol", "Quantity", "Avg Price"])
    assert a != b


def test_fingerprint_is_sha256_hex_64_chars():
    """Fingerprint is a 64-character lowercase hex string."""
    from app.services.llm_csv_parser import _fingerprint

    f = _fingerprint(["Symbol", "Quantity"])
    assert len(f) == 64
    assert all(c in "0123456789abcdef" for c in f)


def test_detect_dialect_no_preamble_comma():
    """A plain comma-separated file has delimiter ',' and no preamble rows."""
    from app.services.llm_csv_parser import _detect_dialect

    raw = b"Symbol,Quantity,Avg Price\nAAPL,100,150.25\nMSFT,50,310.00\n"
    delimiter, preamble = _detect_dialect(raw)
    assert delimiter == ","
    assert preamble == 0


def test_detect_dialect_with_preamble():
    """Statement-style rows ahead of the real header are counted as preamble."""
    from app.services.llm_csv_parser import _detect_dialect

    raw = (
        b"Statement,Header,Field Name,Field Value\n"
        b"Statement,Data,BrokerName,Interactive Brokers LLC\n"
        b"Statement,Data,Title,Activity Statement\n"
        b"Statement,Data,Period,\"January 1, 2026 - January 31, 2026\"\n"
        b"Symbol,Quantity,Avg Price,Currency,Description\n"
        b"AAPL,100,150.25,USD,Apple Inc\n"
    )
    delimiter, preamble = _detect_dialect(raw)
    assert delimiter == ","
    # The data-row header line is the FIFTH line (index 4); preamble = 4.
    assert preamble == 4


def test_detect_dialect_tab_delimited():
    """Tab-separated input is detected with delimiter '\\t' and no preamble."""
    from app.services.llm_csv_parser import _detect_dialect

    raw = b"Symbol\tQuantity\tAvg Price\nAAPL\t100\t150.25\n"
    delimiter, preamble = _detect_dialect(raw)
    assert delimiter == "\t"
    assert preamble == 0


def test_detect_dialect_empty_raises():
    """Empty input raises LLMParseError."""
    from app.services.llm_csv_parser import LLMParseError, _detect_dialect

    with pytest.raises(LLMParseError):
        _detect_dialect(b"")


def test_validate_mapping_accepts_well_formed():
    """A mapping whose columns all exist and whose qty is numeric passes."""
    from app.services.llm_csv_parser import _validate_mapping

    headers = ["Symbol", "Quantity", "Avg Price", "Currency"]
    first_row = ["AAPL", "100", "150.25", "USD"]
    mapping = {
        "ticker_col": "Symbol",
        "qty_col": "Quantity",
        "cost_col": "Avg Price",
        "currency_col": "Currency",
        "name_col": None,
    }
    _validate_mapping(mapping, headers, first_row)  # no raise


def test_validate_mapping_missing_ticker_raises():
    """A null ticker_col is rejected with a message mentioning 'ticker'."""
    from app.services.llm_csv_parser import LLMParseError, _validate_mapping

    headers = ["Symbol", "Quantity"]
    first_row = ["AAPL", "100"]
    mapping = {"ticker_col": None, "qty_col": "Quantity"}
    with pytest.raises(LLMParseError, match="ticker"):
        _validate_mapping(mapping, headers, first_row)


def test_validate_mapping_missing_qty_raises():
    """A null qty_col is rejected with a message mentioning 'qty'."""
    from app.services.llm_csv_parser import LLMParseError, _validate_mapping

    headers = ["Symbol", "Quantity"]
    first_row = ["AAPL", "100"]
    mapping = {"ticker_col": "Symbol", "qty_col": None}
    with pytest.raises(LLMParseError, match="qty"):
        _validate_mapping(mapping, headers, first_row)


def test_validate_mapping_unknown_column_raises():
    """A mapped column absent from the headers is named in the error."""
    from app.services.llm_csv_parser import LLMParseError, _validate_mapping

    headers = ["Symbol", "Quantity"]
    first_row = ["AAPL", "100"]
    mapping = {"ticker_col": "Symbol", "qty_col": "NotARealColumn"}
    with pytest.raises(LLMParseError, match="NotARealColumn"):
        _validate_mapping(mapping, headers, first_row)


def test_validate_mapping_non_numeric_qty_raises():
    """A qty column whose first value is not a number is rejected."""
    from app.services.llm_csv_parser import LLMParseError, _validate_mapping

    headers = ["Symbol", "Description"]
    first_row = ["AAPL", "Apple Inc"]
    # Mapping says qty is "Description", but "Apple Inc" can't parse as a number.
    mapping = {"ticker_col": "Symbol", "qty_col": "Description"}
    with pytest.raises(LLMParseError, match="numeric"):
        _validate_mapping(mapping, headers, first_row)


def test_apply_mapping_builds_parsed_pie():
    """A full mapping produces a ParsedPie with one ParsedPosition per row."""
    from app.services.csv_import import ParsedPie, ParsedPosition
    from app.services.llm_csv_parser import _apply_mapping

    headers = ["Symbol", "Quantity", "Avg Price", "Currency", "Description"]
    data_rows = [
        ["AAPL", "100", "150.25", "USD", "Apple Inc"],
        ["MSFT", "50", "310.00", "USD", "Microsoft Corp"],
    ]
    mapping = {
        "ticker_col": "Symbol",
        "qty_col": "Quantity",
        "cost_col": "Avg Price",
        "currency_col": "Currency",
        "name_col": "Description",
    }
    pie = _apply_mapping(headers, data_rows, mapping)
    assert isinstance(pie, ParsedPie)
    assert len(pie.positions) == 2

    p0 = pie.positions[0]
    assert isinstance(p0, ParsedPosition)
    assert p0.slice == "AAPL"
    assert p0.name == "Apple Inc"
    assert p0.quantity == 100.0
    # invested = qty * avg_cost = 100 * 150.25 = 15025
    assert p0.invested_value == pytest.approx(15025.0)
    assert pie.invested == pytest.approx(15025.0 + 50 * 310.00)


def test_apply_mapping_handles_missing_optional_columns():
    """With only ticker and qty mapped, optional fields fall back sensibly."""
    from app.services.llm_csv_parser import _apply_mapping

    headers = ["Symbol", "Quantity"]
    data_rows = [["AAPL", "100"]]
    mapping = {
        "ticker_col": "Symbol",
        "qty_col": "Quantity",
        "cost_col": None,
        "currency_col": None,
        "name_col": None,
    }
    pie = _apply_mapping(headers, data_rows, mapping)
    p = pie.positions[0]
    assert p.slice == "AAPL"
    assert p.quantity == 100.0
    assert p.invested_value is None
    assert p.name == "AAPL"  # falls back to ticker when name_col absent


def test_apply_mapping_skips_blank_and_unparseable_rows():
    """Blank rows and rows with a non-numeric qty are dropped, order kept."""
    from app.services.llm_csv_parser import _apply_mapping

    headers = ["Symbol", "Quantity"]
    data_rows = [
        ["AAPL", "100"],
        ["", ""],  # blank
        ["MSFT", "not-a-number"],  # bad qty
        ["NVDA", "40"],
    ]
    mapping = {"ticker_col": "Symbol", "qty_col": "Quantity"}
    pie = _apply_mapping(headers, data_rows, mapping)
    assert [p.slice for p in pie.positions] == ["AAPL", "NVDA"]


@pytest.mark.asyncio
async def test_extract_mapping_via_llm_parses_valid_json(monkeypatch):
    """A well-formed JSON reply is returned as the mapping plus the LLM log."""
    from unittest.mock import AsyncMock, MagicMock

    import app.services.llm_csv_parser as mod
    from app.services.llm_csv_parser import _extract_mapping_via_llm
    from app.services.openrouter import LogResult

    fake_result = LogResult(
        content='{"ticker_col": "Symbol", "qty_col": "Quantity", '
        '"cost_col": "Avg Price", "currency_col": "Currency", '
        '"name_col": null, "broker_label": "IBKR Activity Statement"}',
        model="deepseek/deepseek-v4-flash",
        prompt_tokens=100,
        completion_tokens=50,
        cost_usd=0.0001,
    )
    fake_client = MagicMock()
    fake_call_llm = AsyncMock(return_value=fake_result)
    # Patch through the fixture so the real call_llm is restored at teardown;
    # a bare attribute assignment would leak the mock into later tests.
    monkeypatch.setattr(mod, "call_llm", fake_call_llm)

    headers = ["Symbol", "Quantity", "Avg Price", "Currency"]
    samples = [["AAPL", "100", "150.25", "USD"]]
    mapping, log = await _extract_mapping_via_llm(fake_client, headers, samples)

    assert mapping["ticker_col"] == "Symbol"
    assert mapping["qty_col"] == "Quantity"
    assert mapping["broker_label"] == "IBKR Activity Statement"
    assert log.model == "deepseek/deepseek-v4-flash"
    fake_call_llm.assert_awaited_once()


@pytest.mark.asyncio
async def test_extract_mapping_via_llm_malformed_json_raises(monkeypatch):
    """A non-JSON reply raises LLMParseError mentioning 'JSON'."""
    from unittest.mock import AsyncMock, MagicMock

    import app.services.llm_csv_parser as mod
    from app.services.llm_csv_parser import LLMParseError, _extract_mapping_via_llm
    from app.services.openrouter import LogResult

    fake_result = LogResult(
        content="Sure thing — here is the mapping! ticker=Symbol",
        model="deepseek/deepseek-v4-flash",
        prompt_tokens=10,
        completion_tokens=20,
        cost_usd=0.00005,
    )
    fake_client = MagicMock()
    fake_call_llm = AsyncMock(return_value=fake_result)
    # Fixture-scoped patch: undone automatically when the test finishes.
    monkeypatch.setattr(mod, "call_llm", fake_call_llm)

    with pytest.raises(LLMParseError, match="JSON"):
        await _extract_mapping_via_llm(fake_client, ["Symbol"], [["AAPL"]])


@pytest.mark.asyncio
async def test_extract_mapping_via_llm_provider_failure_wraps(monkeypatch):
    """An exception from call_llm surfaces as LLMParseError mentioning 'provider'."""
    from unittest.mock import AsyncMock, MagicMock

    import app.services.llm_csv_parser as mod
    from app.services.llm_csv_parser import LLMParseError, _extract_mapping_via_llm

    fake_client = MagicMock()
    fake_call_llm = AsyncMock(side_effect=RuntimeError("provider down"))
    # Fixture-scoped patch: a failing mock must not outlive this test.
    monkeypatch.setattr(mod, "call_llm", fake_call_llm)

    with pytest.raises(LLMParseError, match="provider"):
        await _extract_mapping_via_llm(fake_client, ["Symbol"], [["AAPL"]])