csv-parser: add _detect_dialect helper
Heuristic refined from the plan draft: a candidate header row must be followed by a row containing at least one numeric token. Without this check, IBKR-style multi-line preambles (all-text rows before the real header) would be mistaken for the header, yielding preamble=0. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
f8a0ed3923
commit
8dcf662945
2 changed files with 126 additions and 0 deletions
|
|
@ -17,10 +17,20 @@ does that by inspecting collected ``sample_row`` values.
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import csv
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import io
|
||||||
|
|
||||||
from app.services.csv_import import CSVImportError
|
from app.services.csv_import import CSVImportError
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Module-level constants
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Cap for how many leading lines we'll scan looking for the header row.
|
||||||
|
# Real broker preambles are typically 1-10 lines.
|
||||||
|
_MAX_PREAMBLE_SCAN = 30
|
||||||
|
|
||||||
|
|
||||||
class LLMParseError(CSVImportError):
|
class LLMParseError(CSVImportError):
|
||||||
"""Raised when the LLM call fails or returns an unusable mapping.
|
"""Raised when the LLM call fails or returns an unusable mapping.
|
||||||
|
|
@ -40,3 +50,77 @@ def _fingerprint(headers: list[str]) -> str:
|
||||||
adding or removing a column does."""
|
adding or removing a column does."""
|
||||||
normalised = "|".join(h.strip().lower() for h in headers)
|
normalised = "|".join(h.strip().lower() for h in headers)
|
||||||
return hashlib.sha256(normalised.encode("utf-8")).hexdigest()
|
return hashlib.sha256(normalised.encode("utf-8")).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def _decode_raw(raw: bytes) -> str:
|
||||||
|
"""Best-effort UTF-8 decode with BOM strip and lossy fallback."""
|
||||||
|
return raw.decode("utf-8-sig", errors="replace")
|
||||||
|
|
||||||
|
|
||||||
|
def _looks_numeric(value: str) -> bool:
|
||||||
|
"""True if ``value`` parses as a number after stripping common
|
||||||
|
decoration (thousands separators, currency symbols, percent signs)."""
|
||||||
|
s = value.strip().replace(",", "").replace("$", "").replace("€", "")
|
||||||
|
s = s.replace("£", "").replace("%", "").lstrip("-+")
|
||||||
|
if not s:
|
||||||
|
return False
|
||||||
|
try:
|
||||||
|
float(s)
|
||||||
|
return True
|
||||||
|
except ValueError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_dialect(raw: bytes) -> tuple[str, int]:
    """Detect ``(delimiter, preamble_rows)`` for a raw CSV upload.

    The delimiter is sniffed from the first 4096 characters with
    ``csv.Sniffer`` (candidates: comma, semicolon, tab, pipe); a comma is
    assumed when sniffing fails.

    ``preamble_rows`` is the index, among the rows yielded by
    ``csv.reader``, of the row identified as the table header, i.e. the
    number of rows that precede it. A row is accepted as the header when

    * it has at least two non-empty cells,
    * none of those cells is numeric (so "Symbol,Quantity" qualifies but
      "AAPL,100" does not), and
    * the next non-blank row contains at least one numeric cell.

    The last condition keeps all-text preamble/metadata rows from being
    mistaken for the header. Only the first ``_MAX_PREAMBLE_SCAN`` rows
    are examined; if none qualifies, the first row is assumed to be the
    header and ``0`` is returned.

    Raises ``LLMParseError`` when the input is empty, whitespace-only, or
    contains nothing but a byte-order mark.
    """
    # Check emptiness on the decoded text so that a BOM-only payload is
    # rejected too; the byte-level check alone would let it through.
    text = _decode_raw(raw) if raw else ""
    if not text.strip():
        raise LLMParseError("empty CSV")

    # csv.Sniffer is happy with ~4KB. Anything more and it gets slow.
    sample = text[:4096]
    try:
        delimiter = csv.Sniffer().sniff(sample, delimiters=",;\t|").delimiter
    except csv.Error:
        # Most broker exports are comma-delimited; default rather than
        # error out — the caller will still validate column shapes.
        delimiter = ","

    # Stream rows and stop at the scan limit instead of parsing the whole
    # file; only the leading rows matter for header detection. zip() with
    # the range first stops without pulling an extra row from the reader.
    reader = csv.reader(io.StringIO(text), delimiter=delimiter)
    scanned = [
        [cell.strip() for cell in row if cell.strip()]
        for _, row in zip(range(_MAX_PREAMBLE_SCAN), reader)
    ]

    for index, cells in enumerate(scanned):
        # A header needs at least two labels, none of which is numeric.
        if len(cells) < 2 or any(_looks_numeric(c) for c in cells):
            continue
        # Only the first non-blank row after the candidate is consulted:
        # if it is all text as well, the candidate was preamble.
        following = next(
            (later for later in scanned[index + 1:] if later), None
        )
        if following is not None and any(_looks_numeric(c) for c in following):
            return delimiter, index

    return delimiter, 0
|
||||||
|
|
|
||||||
|
|
@ -54,3 +54,45 @@ def test_fingerprint_is_sha256_hex_64_chars():
|
||||||
f = _fingerprint(["Symbol", "Quantity"])
|
f = _fingerprint(["Symbol", "Quantity"])
|
||||||
assert len(f) == 64
|
assert len(f) == 64
|
||||||
assert all(c in "0123456789abcdef" for c in f)
|
assert all(c in "0123456789abcdef" for c in f)
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_dialect_no_preamble_comma():
    """A comma-separated export whose first row is the header yields a
    comma delimiter and zero preamble rows."""
    from app.services.llm_csv_parser import _detect_dialect

    payload = b"Symbol,Quantity,Avg Price\nAAPL,100,150.25\nMSFT,50,310.00\n"
    detected = _detect_dialect(payload)
    assert detected[0] == ","
    assert detected[1] == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_dialect_with_preamble():
    """All-text statement metadata ahead of the table counts as preamble."""
    from app.services.llm_csv_parser import _detect_dialect

    lines = [
        b"Statement,Header,Field Name,Field Value",
        b"Statement,Data,BrokerName,Interactive Brokers LLC",
        b"Statement,Data,Title,Activity Statement",
        b"Statement,Data,Period,\"January 1, 2026 - January 31, 2026\"",
        b"Symbol,Quantity,Avg Price,Currency,Description",
        b"AAPL,100,150.25,USD,Apple Inc",
    ]
    raw = b"".join(line + b"\n" for line in lines)

    delimiter, preamble = _detect_dialect(raw)

    assert delimiter == ","
    # The table header is the FIFTH line (index 4), so four rows precede it.
    assert preamble == 4
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_dialect_tab_delimited():
    """Tab-separated input is recognised and reports no preamble."""
    from app.services.llm_csv_parser import _detect_dialect

    payload = b"Symbol\tQuantity\tAvg Price\nAAPL\t100\t150.25\n"
    detected = _detect_dialect(payload)
    assert detected[0] == "\t"
    assert detected[1] == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_dialect_empty_raises():
    """Zero-length input is rejected with ``LLMParseError``."""
    from app.services.llm_csv_parser import LLMParseError
    from app.services.llm_csv_parser import _detect_dialect

    empty_payload = b""
    with pytest.raises(LLMParseError):
        _detect_dialect(empty_payload)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue