# read.markets/tests/test_llm_csv_parser.py

"""Unit + integration tests for the LLM-fallback CSV parser."""
from __future__ import annotations

import pytest


def test_csv_format_template_model_columns():
    """CsvFormatTemplate maps every expected column and keys on fingerprint.

    Verifies column presence, the absence of user-attribution columns, and
    that ``fingerprint`` is declared unique and non-nullable.
    """
    from sqlalchemy import inspect

    from app.models import CsvFormatTemplate

    columns_by_name = {}
    for column in inspect(CsvFormatTemplate).columns:
        columns_by_name[column.name] = column

    expected_names = (
        "fingerprint",
        "headers",
        "sample_row",
        "mapping",
        "preamble_rows",
        "delimiter",
        "broker_label",
        "first_seen_at",
        "use_count",
        "last_used_at",
        "llm_model",
        "llm_cost_usd",
    )
    for name in expected_names:
        assert name in columns_by_name

    # Crucially, no user attribution.
    for forbidden_name in ("user_id", "first_seen_user_id"):
        assert forbidden_name not in columns_by_name

    # Fingerprint is the cache key.
    fingerprint_column = columns_by_name["fingerprint"]
    assert fingerprint_column.unique is True
    assert fingerprint_column.nullable is False


def test_fingerprint_stable_across_case_and_whitespace():
    """Header lists differing only in case/outer whitespace hash identically."""
    from app.services.llm_csv_parser import _fingerprint

    header_variants = (
        ["Symbol", "Quantity", "Avg Price"],
        ["symbol", "quantity", "avg price"],
        ["  SYMBOL ", "Quantity", " AVG PRICE"],
    )
    digests = [_fingerprint(headers) for headers in header_variants]

    # Pairwise checks mirror a chained ``a == b == c`` comparison.
    assert digests[0] == digests[1]
    assert digests[1] == digests[2]


def test_fingerprint_differs_for_different_columns():
    """Adding a column to the header list must change the fingerprint."""
    from app.services.llm_csv_parser import _fingerprint

    two_column_digest = _fingerprint(["Symbol", "Quantity"])
    three_column_digest = _fingerprint(["Symbol", "Quantity", "Avg Price"])

    assert two_column_digest != three_column_digest


def test_fingerprint_is_sha256_hex_64_chars():
    """A fingerprint is 64 characters long and uses only lowercase hex digits."""
    from app.services.llm_csv_parser import _fingerprint

    digest = _fingerprint(["Symbol", "Quantity"])
    assert len(digest) == 64

    # Every character must come from the lowercase hexadecimal alphabet.
    hex_alphabet = set("0123456789abcdef")
    assert set(digest) <= hex_alphabet


def test_detect_dialect_no_preamble_comma():
    """Comma-separated bytes starting at the header give (",", 0)."""
    from app.services.llm_csv_parser import _detect_dialect

    payload = b"Symbol,Quantity,Avg Price\nAAPL,100,150.25\nMSFT,50,310.00\n"

    delimiter, preamble = _detect_dialect(payload)

    assert (delimiter, preamble) == (",", 0)


def test_detect_dialect_with_preamble():
    """Four statement lines ahead of the table header give a preamble of 4."""
    from app.services.llm_csv_parser import _detect_dialect

    statement_lines = (
        b"Statement,Header,Field Name,Field Value\n",
        b"Statement,Data,BrokerName,Interactive Brokers LLC\n",
        b"Statement,Data,Title,Activity Statement\n",
        b'Statement,Data,Period,"January 1, 2026 - January 31, 2026"\n',
    )
    table_lines = (
        b"Symbol,Quantity,Avg Price,Currency,Description\n",
        b"AAPL,100,150.25,USD,Apple Inc\n",
    )
    payload = b"".join(statement_lines + table_lines)

    delimiter, preamble = _detect_dialect(payload)

    assert delimiter == ","
    # The data-row header line is the FIFTH line (index 4); preamble = 4.
    assert preamble == 4


def test_detect_dialect_tab_delimited():
    """Tab-separated bytes starting at the header give ("\\t", 0)."""
    from app.services.llm_csv_parser import _detect_dialect

    payload = b"Symbol\tQuantity\tAvg Price\nAAPL\t100\t150.25\n"

    delimiter, preamble = _detect_dialect(payload)

    assert (delimiter, preamble) == ("\t", 0)


def test_detect_dialect_empty_raises():
    """Calling ``_detect_dialect`` with empty bytes must raise LLMParseError."""
    from app.services.llm_csv_parser import LLMParseError
    from app.services.llm_csv_parser import _detect_dialect

    empty_payload = b""

    with pytest.raises(LLMParseError):
        _detect_dialect(empty_payload)