From 788563a81fcca79b266118d3a1ca7e1c8e19b3fa Mon Sep 17 00:00:00 2001
From: Giorgio Gilestro <giorgio@gilest.ro>
Date: Fri, 29 May 2026 13:21:26 +0200
Subject: [PATCH] ai: route reviewer through OpenRouter + Claude Haiku 4.5
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The DeepSeek-V4-flash reviewer was unreliable in production: it pads
its JSON verdicts with internal chain-of-thought even when the prompt
forbids it, so the verdict gets truncated at any reasonable max_tokens
cap and the parser drops it as malformed (a false-negative verdict
that would purge clean rows). In a live run on 50 rows, 8 of the 12
rejections were caused by this truncation, even with an 800-token cap.

Fix: pin the reviewer call to OpenRouter with anthropic/claude-haiku-4.5.
Haiku answers structured-output classification tersely (no scratchpad
preamble), which means a 300-token cap is comfortably above the
~30-token JSON verdict. Cost is roughly the same (~$0.0001-$0.0003 per
review) and the latency tax is smaller.

To enable the pinned-provider call without disrupting other callers,
call_llm grows an optional `provider` parameter: when set, only that
provider is used (no fallback chain). All existing call sites
default to provider=None and keep the chain behaviour.

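REVIEWER_MODEL is read from settings via getattr-with-fallback: when
the attribute is missing or empty the reviewer uses
anthropic/claude-haiku-4.5, and an env override can swap models without
code changes — useful if we want to A/B test against e.g.
gemini-2.5-flash later. Note that the override only takes effect if the
settings object actually exposes REVIEWER_MODEL; this patch does not
touch app/config.py.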

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 app/services/openrouter.py    | 13 +++++++++++--
 app/services/output_review.py | 31 +++++++++++++++++++++++--------
 tests/test_output_review.py   | 15 +++++++++++----
 3 files changed, 45 insertions(+), 14 deletions(-)

diff --git a/app/services/openrouter.py b/app/services/openrouter.py
index 598150c..50e7f7e 100644
--- a/app/services/openrouter.py
+++ b/app/services/openrouter.py
@@ -199,6 +199,7 @@ async def call_llm(
     model: str | None = None,
     max_tokens: int = 4000,
     response_format: dict | None = None,
+    provider: str | None = None,
 ) -> LogResult:
     """Provider-aware chat completion with fallback. Tries primary
     (LLM_PROVIDER) first; if it raises after retries, falls through to
@@ -211,8 +212,16 @@ async def call_llm(
 
     Pass response_format={"type": "json_object"} to force JSON-mode
     output (the model still needs to be instructed in the system prompt
-    to emit valid JSON — this flag enforces, not asks)."""
-    chain = _provider_chain()
+    to emit valid JSON — this flag enforces, not asks).
+
+    Pass `provider` (e.g. "openrouter") to skip the configured chain
+    and pin the call to that one provider, with no fallback. Used by
+    the reviewer agent to route through OpenRouter so it can address a
+    non-DeepSeek model that doesn't pre-think before emitting JSON."""
+    if provider is not None:
+        chain = [provider]
+    else:
+        chain = _provider_chain()
     if not chain:
         raise RuntimeError("No LLM provider configured (no API key set)")
 
diff --git a/app/services/output_review.py b/app/services/output_review.py
index f228a74..fe22e6d 100644
--- a/app/services/output_review.py
+++ b/app/services/output_review.py
@@ -20,12 +20,23 @@ from dataclasses import dataclass
 
 import httpx
 
+from app.config import get_settings
 from app.logging import get_logger
 from app.services.openrouter import call_llm
 
 log = get_logger("output_review")
 
 
+# The reviewer runs through OpenRouter against a small, non-thinking
+# model. DeepSeek-V4-flash (our generator default) emits internal
+# chain-of-thought before its JSON output even when the prompt forbids
+# it, which truncates the JSON at any reasonable max_tokens cap and
+# breaks the parser. Anthropic's Haiku family answers structured-output
+# tasks tersely and deterministically — no chain-of-thought tax. Cost
+# is ~$0.0001-$0.0003 per review depending on candidate length.
+DEFAULT_REVIEWER_MODEL = "anthropic/claude-haiku-4.5"
+
+
 _SYSTEM_PROMPT = """\
 You are a strict editor for a financial-markets dashboard. The author
 was asked to produce a short interpretive read for human readers.
@@ -81,17 +92,21 @@ async def review_read(client: httpx.AsyncClient, candidate: str) -> Verdict:
         # contain prompt-like prose.
         {"role": "user", "content": f"Candidate read:\n```\n{candidate}\n```"},
     ]
+    settings = get_settings()
+    reviewer_model = getattr(settings, "REVIEWER_MODEL", None) or DEFAULT_REVIEWER_MODEL
     try:
         result = await call_llm(
             client, messages,
-            # 800 tokens is well above the ~30-token JSON verdict the
-            # prompt asks for. The reviewer model (DeepSeek-V4-flash)
-            # occasionally pads with its own thinking before the JSON
-            # even though response_format is enforced; smaller caps
-            # (120, 300) produced finish_reason=length cutoffs that
-            # left the JSON half-written and broke the parser. 800
-            # removes the artefact entirely at ~$0.00022 per call.
-            max_tokens=800,
+            # Pin to OpenRouter so a non-DeepSeek model like Haiku is
+            # actually reachable; the default provider chain would try
+            # DeepSeek native first and 404 on the Anthropic model name.
+            provider="openrouter",
+            model=reviewer_model,
+            # 300 tokens is well above the ~30-token JSON verdict.
+            # Haiku doesn't pad with hidden reasoning the way DeepSeek
+            # does, so the 800-token headroom we previously used to
+            # absorb DeepSeek's chain-of-thought is no longer needed.
+            max_tokens=300,
             response_format={"type": "json_object"},
         )
     except Exception as e:
diff --git a/tests/test_output_review.py b/tests/test_output_review.py
index 53f0b34..4e6fa4b 100644
--- a/tests/test_output_review.py
+++ b/tests/test_output_review.py
@@ -62,13 +62,20 @@ def _mock_post(handler):
 
 
 def _configure(monkeypatch):
-    """Minimal env so call_llm believes a provider is configured."""
-    monkeypatch.setattr(ot, "get_settings", lambda: type("S", (), {
+    """Minimal env so call_llm believes a provider is configured.
+    Both review_read (which pins to OpenRouter for a non-thinking model)
+    and the openrouter module itself read get_settings, so we patch
+    both module-level references."""
+    import app.services.output_review as orr
+    settings = type("S", (), {
         "LLM_PROVIDER": "deepseek", "LLM_FALLBACK": "",
-        "DEEPSEEK_API_KEY": "sk-d", "OPENROUTER_API_KEY": "",
+        "DEEPSEEK_API_KEY": "sk-d", "OPENROUTER_API_KEY": "sk-or",
         "DEEPSEEK_URL": "https://x/deepseek", "DEEPSEEK_MODEL": "deepseek-v4-flash",
         "OPENROUTER_URL": "https://x/or",      "OPENROUTER_MODEL": "deepseek/deepseek-v4-flash",
-    })())
+        "REVIEWER_MODEL": "anthropic/claude-haiku-4.5",
+    })()
+    monkeypatch.setattr(ot, "get_settings", lambda: settings)
+    monkeypatch.setattr(orr, "get_settings", lambda: settings)
 
 
 @pytest.mark.asyncio