"""Unit + integration tests for the LLM-fallback CSV parser."""
from __future__ import annotations

import pytest


def test_csv_format_template_model_columns():
    """CsvFormatTemplate carries every required column and no user columns."""
    from sqlalchemy import inspect

    from app.models import CsvFormatTemplate

    columns_by_name = {
        column.name: column for column in inspect(CsvFormatTemplate).columns
    }

    # Checked in a fixed order so the first missing column is the one reported.
    required_columns = (
        "fingerprint",
        "headers",
        "sample_row",
        "mapping",
        "preamble_rows",
        "delimiter",
        "broker_label",
        "first_seen_at",
        "use_count",
        "last_used_at",
        "llm_model",
        "llm_cost_usd",
    )
    for column_name in required_columns:
        assert column_name in columns_by_name

    # Crucially, no user attribution.
    for column_name in ("user_id", "first_seen_user_id"):
        assert column_name not in columns_by_name

    # Fingerprint is the cache key.
    assert columns_by_name["fingerprint"].unique is True
    assert columns_by_name["fingerprint"].nullable is False


def test_fingerprint_stable_across_case_and_whitespace():
    """Headers differing only in case or surrounding blanks hash the same."""
    from app.services.llm_csv_parser import _fingerprint

    title_case = _fingerprint(["Symbol", "Quantity", "Avg Price"])
    lower_case = _fingerprint(["symbol", "quantity", "avg price"])
    padded = _fingerprint([" SYMBOL ", "Quantity", " AVG PRICE"])

    assert title_case == lower_case and lower_case == padded


def test_fingerprint_differs_for_different_columns():
    """Adding a column to the header list changes the fingerprint."""
    from app.services.llm_csv_parser import _fingerprint

    two_columns = _fingerprint(["Symbol", "Quantity"])
    three_columns = _fingerprint(["Symbol", "Quantity", "Avg Price"])

    assert two_columns != three_columns


def test_fingerprint_is_sha256_hex_64_chars():
    """Fingerprint is 64 lowercase hex characters (SHA-256 hexdigest shape)."""
    from app.services.llm_csv_parser import _fingerprint

    digest = _fingerprint(["Symbol", "Quantity"])

    assert len(digest) == 64
    for char in digest:
        assert char in "0123456789abcdef"


def test_detect_dialect_no_preamble_comma():
    """A plain comma-separated file yields ',' and zero preamble rows."""
    from app.services.llm_csv_parser import _detect_dialect

    raw = b"Symbol,Quantity,Avg Price\nAAPL,100,150.25\nMSFT,50,310.00\n"

    delimiter, preamble = _detect_dialect(raw)

    assert (delimiter, preamble) == (",", 0)


def test_detect_dialect_with_preamble():
    """Statement metadata rows ahead of the real header count as preamble."""
    from app.services.llm_csv_parser import _detect_dialect

    raw = b"".join(
        [
            b"Statement,Header,Field Name,Field Value\n",
            b"Statement,Data,BrokerName,Interactive Brokers LLC\n",
            b"Statement,Data,Title,Activity Statement\n",
            b"Statement,Data,Period,\"January 1, 2026 - January 31, 2026\"\n",
            b"Symbol,Quantity,Avg Price,Currency,Description\n",
            b"AAPL,100,150.25,USD,Apple Inc\n",
        ]
    )

    delimiter, preamble = _detect_dialect(raw)

    # The data-row header line is the FIFTH line (index 4); preamble = 4.
    assert (delimiter, preamble) == (",", 4)


def test_detect_dialect_tab_delimited():
    """A tab-separated file yields a tab delimiter and zero preamble rows."""
    from app.services.llm_csv_parser import _detect_dialect

    raw = b"Symbol\tQuantity\tAvg Price\nAAPL\t100\t150.25\n"

    delimiter, preamble = _detect_dialect(raw)

    assert (delimiter, preamble) == ("\t", 0)


def test_detect_dialect_empty_raises():
    """Empty input is rejected with LLMParseError."""
    from app.services.llm_csv_parser import LLMParseError, _detect_dialect

    empty_payload = b""

    with pytest.raises(LLMParseError):
        _detect_dialect(empty_payload)