csv-parser: add _detect_dialect helper
Heuristic refined from the plan draft: a candidate header row must be followed by a row containing at least one numeric token. Without this check, the first row of an IBKR-style multi-line preamble (all-text rows before the real header) would be mistaken for the header, giving preamble=0. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
f8a0ed3923
commit
8dcf662945
2 changed files with 126 additions and 0 deletions
|
|
@ -54,3 +54,45 @@ def test_fingerprint_is_sha256_hex_64_chars():
|
|||
f = _fingerprint(["Symbol", "Quantity"])
|
||||
assert len(f) == 64
|
||||
assert all(c in "0123456789abcdef" for c in f)
|
||||
|
||||
|
||||
def test_detect_dialect_no_preamble_comma():
    """Comma-separated input whose first line is the header has a zero preamble."""
    from app.services.llm_csv_parser import _detect_dialect

    payload = b"Symbol,Quantity,Avg Price\nAAPL,100,150.25\nMSFT,50,310.00\n"

    sep, skipped_rows = _detect_dialect(payload)

    # Header is the very first line, so nothing has to be skipped.
    assert (sep, skipped_rows) == (",", 0)
|
||||
|
||||
|
||||
def test_detect_dialect_with_preamble():
    """All-text statement rows ahead of the real header are counted as preamble."""
    from app.services.llm_csv_parser import _detect_dialect

    # Four text-only rows that come before the actual table.
    preamble_rows = [
        b"Statement,Header,Field Name,Field Value\n",
        b"Statement,Data,BrokerName,Interactive Brokers LLC\n",
        b"Statement,Data,Title,Activity Statement\n",
        b"Statement,Data,Period,\"January 1, 2026 - January 31, 2026\"\n",
    ]
    # The real header followed by one data row containing numeric tokens.
    table_rows = [
        b"Symbol,Quantity,Avg Price,Currency,Description\n",
        b"AAPL,100,150.25,USD,Apple Inc\n",
    ]
    payload = b"".join(preamble_rows + table_rows)

    sep, skipped_rows = _detect_dialect(payload)

    assert sep == ","
    # The real header is the fifth line (index 4), so the preamble is 4 rows.
    assert skipped_rows == 4
|
||||
|
||||
|
||||
def test_detect_dialect_tab_delimited():
    """Tab-separated input is reported with a tab delimiter and no preamble."""
    from app.services.llm_csv_parser import _detect_dialect

    payload = b"Symbol\tQuantity\tAvg Price\nAAPL\t100\t150.25\n"

    sep, skipped_rows = _detect_dialect(payload)

    assert (sep, skipped_rows) == ("\t", 0)
|
||||
|
||||
|
||||
def test_detect_dialect_empty_raises():
    """An empty byte payload must be rejected with LLMParseError."""
    from app.services.llm_csv_parser import LLMParseError, _detect_dialect

    empty_payload = b""

    with pytest.raises(LLMParseError):
        _detect_dialect(empty_payload)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue