From 8dcf662945fd558efa8a84aae24bbb65639ab5e3 Mon Sep 17 00:00:00 2001 From: Giorgio Gilestro Date: Wed, 27 May 2026 12:14:11 +0200 Subject: [PATCH] csv-parser: add _detect_dialect helper Heuristic refined from the plan draft: candidate header rows must be followed by a row containing at least one numeric token. Without this, IBKR-style multi-line preambles (all-text rows before the real header) would be mistaken for the header at preamble=0. Co-Authored-By: Claude Opus 4.7 --- app/services/llm_csv_parser.py | 84 ++++++++++++++++++++++++++++++++++ tests/test_llm_csv_parser.py | 42 +++++++++++++++++ 2 files changed, 126 insertions(+) diff --git a/app/services/llm_csv_parser.py b/app/services/llm_csv_parser.py index 9d7a04c..153eb72 100644 --- a/app/services/llm_csv_parser.py +++ b/app/services/llm_csv_parser.py @@ -17,10 +17,20 @@ does that by inspecting collected ``sample_row`` values. """ from __future__ import annotations +import csv import hashlib +import io from app.services.csv_import import CSVImportError +# --------------------------------------------------------------------------- +# Module-level constants +# --------------------------------------------------------------------------- + +# Cap for how many leading lines we'll scan looking for the header row. +# Real broker preambles are typically 1-10 lines. +_MAX_PREAMBLE_SCAN = 30 + class LLMParseError(CSVImportError): """Raised when the LLM call fails or returns an unusable mapping. 
@@ -40,3 +50,77 @@ def _fingerprint(headers: list[str]) -> str:
     adding or removing a column does."""
     normalised = "|".join(h.strip().lower() for h in headers)
     return hashlib.sha256(normalised.encode("utf-8")).hexdigest()
+
+
+def _decode_raw(raw: bytes) -> str:
+    """Decode ``raw`` as UTF-8, dropping a leading BOM; undecodable bytes
+    are replaced with U+FFFD rather than raising."""
+    return raw.decode("utf-8-sig", errors="replace")
+
+
+def _looks_numeric(value: str) -> bool:
+    """True if ``value`` parses as a finite number after stripping common
+    decoration (thousands separators, currency symbols, percent signs,
+    leading signs)."""
+    s = value.strip().replace(",", "").replace("$", "").replace("€", "")
+    s = s.replace("£", "").replace("%", "").lstrip("-+")
+    # float() also accepts "nan"/"inf"/"infinity" and PEP 515 underscores
+    # ("1_000"); in a CSV export those are text, not numbers.
+    if not s or "_" in s or s.lower() in ("nan", "inf", "infinity"):
+        return False
+    try:
+        float(s)
+    except ValueError:
+        return False
+    return True
+
+
+def _detect_dialect(raw: bytes) -> tuple[str, int]:
+    """Detect ``(delimiter, preamble_rows)`` for a raw CSV export.
+
+    The header is the first row, among the first ``_MAX_PREAMBLE_SCAN``
+    rows, that has at least two non-empty tokens, none of them numeric,
+    and whose next non-blank row contains a numeric token (so all-text
+    preamble rows are not mistaken for the header). ``preamble_rows``
+    counts the parsed CSV rows before that header and is 0 when no row
+    qualifies, i.e. the first row is then assumed to be the header.
+    The delimiter is sniffed from comma, semicolon, tab and pipe, and
+    defaults to a comma when sniffing fails.
+
+    Raises ``LLMParseError`` on empty or whitespace-only input, or when
+    the ``csv`` module cannot parse the scanned rows."""
+    text = _decode_raw(raw or b"")
+    # Check the decoded text so that a BOM-only payload counts as empty.
+    if not text.strip():
+        raise LLMParseError("empty CSV")
+
+    # csv.Sniffer is happy with ~4KB. Anything more and it gets slow.
+    sample = text[:4096]
+    try:
+        dialect = csv.Sniffer().sniff(sample, delimiters=",;\t|")
+        delimiter = dialect.delimiter
+    except csv.Error:
+        # Most broker exports are comma-delimited; default rather than
+        # error out — the caller will still validate column shapes.
+        delimiter = ","
+
+    # Stream only the scan window; the rest of the file is never parsed.
+    reader = csv.reader(io.StringIO(text), delimiter=delimiter)
+    try:
+        window = [
+            [cell.strip() for cell in row if cell.strip()]
+            for _, row in zip(range(_MAX_PREAMBLE_SCAN), reader)
+        ]
+    except csv.Error as err:
+        raise LLMParseError(f"malformed CSV: {err}") from err
+
+    for idx, tokens in enumerate(window):
+        if len(tokens) < 2 or any(_looks_numeric(t) for t in tokens):
+            continue
+        # Only the first non-blank row after the candidate decides: a
+        # numeric token there marks real data; otherwise the candidate
+        # was preamble text and the scan moves on.
+        follower = next((r for r in window[idx + 1:] if r), None)
+        if follower and any(_looks_numeric(t) for t in follower):
+            return delimiter, idx
+    return delimiter, 0
diff --git a/tests/test_llm_csv_parser.py b/tests/test_llm_csv_parser.py
index 27223bb..08160f7 100644
--- a/tests/test_llm_csv_parser.py
+++ b/tests/test_llm_csv_parser.py
@@ -54,3 +54,45 @@ def test_fingerprint_is_sha256_hex_64_chars():
     f = _fingerprint(["Symbol", "Quantity"])
     assert len(f) == 64
     assert all(c in "0123456789abcdef" for c in f)
+
+
+def test_detect_dialect_no_preamble_comma():
+    from app.services.llm_csv_parser import _detect_dialect
+
+    raw = b"Symbol,Quantity,Avg Price\nAAPL,100,150.25\nMSFT,50,310.00\n"
+    delimiter, preamble = _detect_dialect(raw)
+    assert delimiter == ","
+    assert preamble == 0
+
+
+def test_detect_dialect_with_preamble():
+    from app.services.llm_csv_parser import _detect_dialect
+
+    raw = (
+        b"Statement,Header,Field Name,Field Value\n"
+        b"Statement,Data,BrokerName,Interactive Brokers LLC\n"
+        b"Statement,Data,Title,Activity Statement\n"
+        b"Statement,Data,Period,\"January 1, 2026 - January 31, 2026\"\n"
+        b"Symbol,Quantity,Avg Price,Currency,Description\n"
+        b"AAPL,100,150.25,USD,Apple Inc\n"
+    )
+    delimiter, preamble = _detect_dialect(raw)
+    assert delimiter == ","
+    # The data-row header line is the FIFTH line (index 4); preamble = 4.
+    assert preamble == 4
+
+
+def test_detect_dialect_tab_delimited():
+    from app.services.llm_csv_parser import _detect_dialect
+
+    raw = b"Symbol\tQuantity\tAvg Price\nAAPL\t100\t150.25\n"
+    delimiter, preamble = _detect_dialect(raw)
+    assert delimiter == "\t"
+    assert preamble == 0
+
+
+def test_detect_dialect_empty_raises():
+    from app.services.llm_csv_parser import LLMParseError, _detect_dialect
+
+    with pytest.raises(LLMParseError):
+        _detect_dialect(b"")