csv-parser: add _detect_dialect helper

Heuristic refined from the plan draft: candidate header rows must be
followed by a row containing at least one numeric token. Without this,
IBKR-style multi-line preambles (all-text rows before the real header)
would be mistaken for the header at preamble=0.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Giorgio Gilestro 2026-05-27 12:14:11 +02:00
parent f8a0ed3923
commit 8dcf662945
2 changed files with 126 additions and 0 deletions

View file

@ -17,10 +17,20 @@ does that by inspecting collected ``sample_row`` values.
"""
from __future__ import annotations
import csv
import hashlib
import io
from app.services.csv_import import CSVImportError
# ---------------------------------------------------------------------------
# Module-level constants
# ---------------------------------------------------------------------------
# Upper bound on how many leading rows are examined when looking for the
# header row; rows past this cap are never considered as header candidates.
# Real broker preambles are typically 1-10 lines.
_MAX_PREAMBLE_SCAN = 30
class LLMParseError(CSVImportError):
"""Raised when the LLM call fails or returns an unusable mapping.
@ -40,3 +50,77 @@ def _fingerprint(headers: list[str]) -> str:
adding or removing a column does."""
normalised = "|".join(h.strip().lower() for h in headers)
return hashlib.sha256(normalised.encode("utf-8")).hexdigest()
def _decode_raw(raw: bytes) -> str:
"""Best-effort UTF-8 decode with BOM strip and lossy fallback."""
return raw.decode("utf-8-sig", errors="replace")
def _looks_numeric(value: str) -> bool:
"""True if ``value`` parses as a number after stripping common
decoration (thousands separators, currency symbols, percent signs)."""
s = value.strip().replace(",", "").replace("$", "").replace("", "")
s = s.replace("£", "").replace("%", "").lstrip("-+")
if not s:
return False
try:
float(s)
return True
except ValueError:
return False
def _detect_dialect(raw: bytes) -> tuple[str, int]:
    """Detect ``(delimiter, preamble_rows)`` for a raw CSV payload.

    The delimiter is sniffed from the first 4096 characters, choosing
    among ``,``, ``;``, tab and ``|``; it defaults to ``,`` when
    ``csv.Sniffer`` cannot decide.

    ``preamble_rows`` is the number of parsed rows BEFORE the row we
    identify as the actual table header. The header is the first row,
    within the first ``_MAX_PREAMBLE_SCAN`` rows, that

    * has at least two non-empty cells,
    * has no numeric cell (so "Symbol,Quantity" qualifies but
      "AAPL,100" does not), and
    * is followed, skipping blank rows, by a row that contains at
      least one numeric cell.

    The last condition keeps all-text preamble rows (e.g. IBKR-style
    multi-line preambles) from being mistaken for the header. Falls
    back to ``preamble_rows == 0`` when no row qualifies.

    Raises ``LLMParseError`` on empty input."""
    if not raw or not raw.strip():
        raise LLMParseError("empty CSV")

    text = _decode_raw(raw)
    # A payload that is only a BOM and/or whitespace is empty once decoded.
    if not text.strip():
        raise LLMParseError("empty CSV")

    # csv.Sniffer is happy with ~4KB. Anything more and it gets slow.
    try:
        delimiter = csv.Sniffer().sniff(text[:4096], delimiters=",;\t|").delimiter
    except csv.Error:
        # Most broker exports are comma-delimited; default rather than
        # error out — the caller will still validate column shapes.
        delimiter = ","

    # newline="" hands line splitting to the csv module, as its
    # documentation asks; without it CR-only line endings reach the
    # reader as a single line and raise csv.Error.
    reader = csv.reader(io.StringIO(text, newline=""), delimiter=delimiter)

    # Parse only the scan window: rows past it are never header
    # candidates, so there is no reason to read the rest of the file.
    # zip() exhausts the range first and so never pulls an extra row.
    window = [
        [cell.strip() for cell in row if cell.strip()]
        for _, row in zip(range(_MAX_PREAMBLE_SCAN), reader)
    ]

    for index, cells in enumerate(window):
        # A header candidate needs 2+ cells and no numeric cell.
        if len(cells) < 2 or any(_looks_numeric(c) for c in cells):
            continue
        # Only the first non-blank row after the candidate is consulted;
        # if it is all-text too, the candidate was preamble — keep scanning.
        follower = next((later for later in window[index + 1:] if later), None)
        if follower is not None and any(_looks_numeric(c) for c in follower):
            return delimiter, index

    return delimiter, 0