From 788563a81fcca79b266118d3a1ca7e1c8e19b3fa Mon Sep 17 00:00:00 2001 From: Giorgio Gilestro Date: Fri, 29 May 2026 13:21:26 +0200 Subject: [PATCH] ai: route reviewer through OpenRouter + Claude Haiku 4.5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The DeepSeek-V4-flash reviewer was unreliable in production: it pads its JSON verdicts with internal chain-of-thought even when the prompt forbids it, so the verdict gets truncated at any reasonable max_tokens cap and the parser drops it as malformed (a false-negative verdict that would purge clean rows). A live run on 50 rows reproduced the failure on 8 of 12 rejections, even at 800 tokens. Fix: pin the reviewer call to OpenRouter with anthropic/claude-haiku-4.5. Haiku answers structured-output classification tersely (no scratchpad preamble), which means a 300-token cap is comfortably above the ~30-token JSON verdict. Cost is roughly the same (~$0.0001-$0.0003 per review) and the latency tax is smaller. To enable the pinned-provider call without disrupting other callers, call_llm grows an optional `provider` parameter: when set, only that provider is used (no fallback chain). All existing call sites default to provider=None and keep the chain behaviour. REVIEWER_MODEL is read from settings via getattr-with-fallback so an env override can swap models without code changes — useful if we want to A/B test against e.g. gemini-2.5-flash later. Co-Authored-By: Claude Opus 4.7 --- app/services/openrouter.py | 13 +++++++++++-- app/services/output_review.py | 31 +++++++++++++++++++++++-------- tests/test_output_review.py | 15 +++++++++++---- 3 files changed, 45 insertions(+), 14 deletions(-) diff --git a/app/services/openrouter.py b/app/services/openrouter.py index 598150c..50e7f7e 100644 --- a/app/services/openrouter.py +++ b/app/services/openrouter.py @@ -199,6 +199,7 @@ async def call_llm( model: str | None = None, max_tokens: int = 4000, response_format: dict | None = None, + provider: str | None = None, ) -> LogResult: """Provider-aware chat completion with fallback. Tries primary (LLM_PROVIDER) first; if it raises after retries, falls through to @@ -211,8 +212,16 @@ async def call_llm( Pass response_format={"type": "json_object"} to force JSON-mode output (the model still needs to be instructed in the system prompt - to emit valid JSON — this flag enforces, not asks).""" - chain = _provider_chain() + to emit valid JSON — this flag enforces, not asks). + + Pass `provider` (e.g. "openrouter") to skip the configured chain + and pin the call to a specific provider. Used by the reviewer agent + to force routing through OpenRouter so it can address a non-DeepSeek + model that doesn't pre-think before emitting JSON.""" + if provider is not None: + chain = [provider] + else: + chain = _provider_chain() if not chain: raise RuntimeError("No LLM provider configured (no API key set)") diff --git a/app/services/output_review.py b/app/services/output_review.py index f228a74..fe22e6d 100644 --- a/app/services/output_review.py +++ b/app/services/output_review.py @@ -20,12 +20,23 @@ from dataclasses import dataclass import httpx +from app.config import get_settings from app.logging import get_logger from app.services.openrouter import call_llm log = get_logger("output_review") +# The reviewer runs through OpenRouter against a small, non-thinking +# model. DeepSeek-V4-flash (our generator default) emits internal +# chain-of-thought before its JSON output even when the prompt forbids +# it, which truncates the JSON at any reasonable max_tokens cap and +# breaks the parser. Anthropic's Haiku family answers structured-output +# tasks tersely and deterministically — no chain-of-thought tax. Cost +# is ~$0.0001-$0.0003 per review depending on candidate length. +DEFAULT_REVIEWER_MODEL = "anthropic/claude-haiku-4.5" + + _SYSTEM_PROMPT = """\ You are a strict editor for a financial-markets dashboard. The author was asked to produce a short interpretive read for human readers. @@ -81,17 +92,21 @@ async def review_read(client: httpx.AsyncClient, candidate: str) -> Verdict: # contain prompt-like prose. {"role": "user", "content": f"Candidate read:\n```\n{candidate}\n```"}, ] + settings = get_settings() + reviewer_model = getattr(settings, "REVIEWER_MODEL", None) or DEFAULT_REVIEWER_MODEL try: result = await call_llm( client, messages, - # 800 tokens is well above the ~30-token JSON verdict the - # prompt asks for. The reviewer model (DeepSeek-V4-flash) - # occasionally pads with its own thinking before the JSON - # even though response_format is enforced; smaller caps - # (120, 300) produced finish_reason=length cutoffs that - # left the JSON half-written and broke the parser. 800 - # removes the artefact entirely at ~$0.00022 per call. - max_tokens=800, + # Pin to OpenRouter so a non-DeepSeek model like Haiku is + # actually reachable; the default provider chain would try + # DeepSeek native first and 404 on the Anthropic model name. + provider="openrouter", + model=reviewer_model, + # 300 tokens is well above the ~30-token JSON verdict. + # Haiku doesn't pad with hidden reasoning the way DeepSeek + # does, so we don't need the 800-token headroom required to + # absorb the generator's chain-of-thought. + max_tokens=300, response_format={"type": "json_object"}, ) except Exception as e: diff --git a/tests/test_output_review.py b/tests/test_output_review.py index 53f0b34..4e6fa4b 100644 --- a/tests/test_output_review.py +++ b/tests/test_output_review.py @@ -62,13 +62,20 @@ def _mock_post(handler): def _configure(monkeypatch): - """Minimal env so call_llm believes a provider is configured.""" - monkeypatch.setattr(ot, "get_settings", lambda: type("S", (), { + """Minimal env so call_llm believes a provider is configured. + Both review_read (which pins to OpenRouter for a non-thinking model) + and the openrouter module itself read get_settings, so we patch + both module-level references.""" + import app.services.output_review as orr + settings = type("S", (), { "LLM_PROVIDER": "deepseek", "LLM_FALLBACK": "", - "DEEPSEEK_API_KEY": "sk-d", "OPENROUTER_API_KEY": "", + "DEEPSEEK_API_KEY": "sk-d", "OPENROUTER_API_KEY": "sk-or", "DEEPSEEK_URL": "https://x/deepseek", "DEEPSEEK_MODEL": "deepseek-v4-flash", "OPENROUTER_URL": "https://x/or", "OPENROUTER_MODEL": "deepseek/deepseek-v4-flash", - })()) + "REVIEWER_MODEL": "anthropic/claude-haiku-4.5", + })() + monkeypatch.setattr(ot, "get_settings", lambda: settings) + monkeypatch.setattr(orr, "get_settings", lambda: settings) @pytest.mark.asyncio