The DeepSeek-V4-flash reviewer was unreliable in production: it pads its JSON verdicts with internal chain-of-thought even when the prompt forbids it, so the verdict gets truncated at any reasonable max_tokens cap and the parser drops it as malformed (a false-negative verdict that would purge clean rows). A live run on 50 rows reproduced the failure on 8 of 12 rejections, even at 800 tokens. Fix: pin the reviewer call to OpenRouter with anthropic/claude-haiku-4.5. Haiku answers structured-output classification tersely (no scratchpad preamble), which means a 300-token cap is comfortably above the ~30-token JSON verdict. Cost is roughly the same (~$0.0001-$0.0003 per review) and the latency tax is smaller. To enable the pinned-provider call without disrupting other callers, call_llm grows an optional `provider` parameter: when set, only that provider is used (no fallback chain). All existing call sites default to provider=None and keep the chain behaviour. REVIEWER_MODEL is read from settings via getattr-with-fallback so an env override can swap models without code changes — useful if we want to A/B test against e.g. gemini-2.5-flash later. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
153 lines
5.8 KiB
Python
153 lines
5.8 KiB
Python
"""Tests for the JSON-envelope extractor and the reviewer agent.
|
|
|
|
The two together replaced the regex `clean_summary` + `looks_like_leakage`
|
|
scaffolding that used to live in indicator_summary_job. The extractor is
|
|
pure-function so it's covered exhaustively; the reviewer makes an LLM
|
|
call and is exercised via the httpx MockTransport that the other
|
|
openrouter tests use."""
|
|
from __future__ import annotations
|
|
|
|
import httpx
|
|
import pytest
|
|
|
|
from app.jobs.indicator_summary_job import _extract_read
|
|
from app.services import openrouter as ot
|
|
from app.services.output_review import review_read
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _extract_read — JSON envelope handling
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_extract_read_returns_trimmed_field():
    """A well-formed envelope yields the `read` text without outer whitespace."""
    envelope = '{"read": " The market is pricing growth. "}'
    extracted = _extract_read(envelope)
    assert extracted == "The market is pricing growth."
|
|
|
|
|
|
def test_extract_read_returns_none_on_invalid_json():
    """Input that is not parseable JSON (including the empty string) gives None."""
    for unparseable in ("not json", "{bad}", ""):
        assert _extract_read(unparseable) is None
|
|
|
|
|
|
def test_extract_read_returns_none_when_field_missing():
    """A JSON object that lacks the `read` key gives None."""
    without_read = '{"other": "x"}'
    assert _extract_read(without_read) is None
|
|
|
|
|
|
def test_extract_read_returns_none_when_field_not_string():
    """A `read` value that is a number, null, or a list gives None."""
    non_string_envelopes = (
        '{"read": 42}',
        '{"read": null}',
        '{"read": ["a","b"]}',
    )
    for envelope in non_string_envelopes:
        assert _extract_read(envelope) is None
|
|
|
|
|
|
def test_extract_read_returns_none_when_field_empty():
    """An empty or whitespace-only `read` string gives None."""
    for envelope in ('{"read": ""}', '{"read": " "}'):
        assert _extract_read(envelope) is None
|
|
|
|
|
|
def test_extract_read_returns_none_when_envelope_not_object():
    """Valid JSON whose top level is not an object gives None."""
    # Both a bare string and an array parse as JSON, but neither has the
    # expected object shape.
    wrong_shapes = ('"just a string"', '["a", "b"]')
    for document in wrong_shapes:
        assert _extract_read(document) is None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# review_read — judges candidate read via a second LLM call
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _mock_post(handler):
    """Wrap `handler` in an httpx.MockTransport and return the transport."""
    transport = httpx.MockTransport(handler)
    return transport
|
|
|
|
|
|
def _configure(monkeypatch):
    """Install stub settings so call_llm believes a provider is configured.

    review_read (which pins its call to OpenRouter for a non-thinking
    model) and the openrouter module each read get_settings through their
    own module-level reference, so the stub is patched into both modules.
    """
    import app.services.output_review as orr

    stub_fields = {
        "LLM_PROVIDER": "deepseek",
        "LLM_FALLBACK": "",
        "DEEPSEEK_API_KEY": "sk-d",
        "OPENROUTER_API_KEY": "sk-or",
        "DEEPSEEK_URL": "https://x/deepseek",
        "DEEPSEEK_MODEL": "deepseek-v4-flash",
        "OPENROUTER_URL": "https://x/or",
        "OPENROUTER_MODEL": "deepseek/deepseek-v4-flash",
        "REVIEWER_MODEL": "anthropic/claude-haiku-4.5",
    }
    # Throwaway class whose attributes are the settings; one shared instance.
    stub = type("S", (), stub_fields)()

    def fake_get_settings():
        return stub

    for module in (ot, orr):
        monkeypatch.setattr(module, "get_settings", fake_get_settings)
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_review_clean_verdict(monkeypatch):
    """A `clean: true` verdict is returned together with the reported cost."""
    _configure(monkeypatch)

    verdict_json = '{"clean": true, "reason": "ok"}'
    body = {
        "choices": [
            {"message": {"content": verdict_json}, "finish_reason": "stop"},
        ],
        "usage": {"prompt_tokens": 50, "completion_tokens": 12, "cost": 0.00007},
    }

    def respond(_request):
        return httpx.Response(200, json=body)

    transport = _mock_post(respond)
    async with httpx.AsyncClient(transport=transport) as client:
        verdict = await review_read(client, "Markets are pricing tighter policy.")

    assert verdict.clean is True
    assert verdict.cost_usd == 0.00007
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_review_unclean_verdict(monkeypatch):
    """A `clean: false` verdict is returned with the reviewer's reason text."""
    _configure(monkeypatch)

    verdict_json = '{"clean": false, "reason": "chain of thought"}'
    body = {
        "choices": [
            {"message": {"content": verdict_json}, "finish_reason": "stop"},
        ],
        "usage": {"prompt_tokens": 50, "completion_tokens": 14, "cost": 0.00009},
    }

    def respond(_request):
        return httpx.Response(200, json=body)

    transport = _mock_post(respond)
    async with httpx.AsyncClient(transport=transport) as client:
        verdict = await review_read(client, "Let's see, is it X? Actually Y?")

    assert verdict.clean is False
    assert "chain of thought" in verdict.reason
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_review_failsafe_on_malformed_json(monkeypatch):
    """Prose in place of a JSON verdict results in a conservative reject."""
    _configure(monkeypatch)

    body = {
        "choices": [
            {"message": {"content": "yes it looks clean"}, "finish_reason": "stop"},
        ],
        "usage": {"prompt_tokens": 50, "completion_tokens": 6},
    }

    def respond(_request):
        return httpx.Response(200, json=body)

    transport = _mock_post(respond)
    async with httpx.AsyncClient(transport=transport) as client:
        verdict = await review_read(client, "Some candidate.")

    assert verdict.clean is False
    assert "non-JSON" in verdict.reason
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_review_failsafe_on_missing_clean_field(monkeypatch):
    """A JSON verdict with no `clean` key is treated as a reject."""
    _configure(monkeypatch)

    verdict_json = '{"reason": "no field"}'
    body = {
        "choices": [
            {"message": {"content": verdict_json}, "finish_reason": "stop"},
        ],
        "usage": {"prompt_tokens": 50, "completion_tokens": 6},
    }

    def respond(_request):
        return httpx.Response(200, json=body)

    transport = _mock_post(respond)
    async with httpx.AsyncClient(transport=transport) as client:
        verdict = await review_read(client, "Some candidate.")

    assert verdict.clean is False
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_review_failsafe_on_empty_candidate(monkeypatch):
    """A blank candidate is rejected and no request reaches the transport."""
    _configure(monkeypatch)

    # Every request that reaches the mock transport leaves a mark here.
    hits = []

    def respond(_request):
        hits.append(1)
        return httpx.Response(500, json={"error": "should not be called"})

    transport = _mock_post(respond)
    async with httpx.AsyncClient(transport=transport) as client:
        verdict = await review_read(client, " ")

    assert verdict.clean is False
    assert hits == []
|