diff --git a/app/services/openrouter.py b/app/services/openrouter.py index 598150c..50e7f7e 100644 --- a/app/services/openrouter.py +++ b/app/services/openrouter.py @@ -199,6 +199,7 @@ async def call_llm( model: str | None = None, max_tokens: int = 4000, response_format: dict | None = None, + provider: str | None = None, ) -> LogResult: """Provider-aware chat completion with fallback. Tries primary (LLM_PROVIDER) first; if it raises after retries, falls through to @@ -211,8 +212,16 @@ async def call_llm( Pass response_format={"type": "json_object"} to force JSON-mode output (the model still needs to be instructed in the system prompt - to emit valid JSON — this flag enforces, not asks).""" - chain = _provider_chain() + to emit valid JSON — this flag enforces, not asks). + + Pass `provider` (e.g. "openrouter") to skip the configured chain + and pin the call to a specific provider. Used by the reviewer agent + to force routing through OpenRouter so it can address a non-DeepSeek + model that doesn't pre-think before emitting JSON.""" + if provider is not None: + chain = [provider] + else: + chain = _provider_chain() if not chain: raise RuntimeError("No LLM provider configured (no API key set)") diff --git a/app/services/output_review.py b/app/services/output_review.py index f228a74..fe22e6d 100644 --- a/app/services/output_review.py +++ b/app/services/output_review.py @@ -20,12 +20,23 @@ from dataclasses import dataclass import httpx +from app.config import get_settings from app.logging import get_logger from app.services.openrouter import call_llm log = get_logger("output_review") +# The reviewer runs through OpenRouter against a small, non-thinking +# model. DeepSeek-V4-flash (our generator default) emits internal +# chain-of-thought before its JSON output even when the prompt forbids +# it, which truncates the JSON at any reasonable max_tokens cap and +# breaks the parser. Anthropic's Haiku family answers structured-output +# tasks tersely and deterministically — no chain-of-thought tax. Cost +# is ~$0.0001-$0.0003 per review depending on candidate length. +DEFAULT_REVIEWER_MODEL = "anthropic/claude-haiku-4.5" + + _SYSTEM_PROMPT = """\ You are a strict editor for a financial-markets dashboard. The author was asked to produce a short interpretive read for human readers. @@ -81,17 +92,21 @@ async def review_read(client: httpx.AsyncClient, candidate: str) -> Verdict: # contain prompt-like prose. {"role": "user", "content": f"Candidate read:\n```\n{candidate}\n```"}, ] + settings = get_settings() + reviewer_model = getattr(settings, "REVIEWER_MODEL", None) or DEFAULT_REVIEWER_MODEL try: result = await call_llm( client, messages, - # 800 tokens is well above the ~30-token JSON verdict the - # prompt asks for. The reviewer model (DeepSeek-V4-flash) - # occasionally pads with its own thinking before the JSON - # even though response_format is enforced; smaller caps - # (120, 300) produced finish_reason=length cutoffs that - # left the JSON half-written and broke the parser. 800 - # removes the artefact entirely at ~$0.00022 per call. - max_tokens=800, + # Pin to OpenRouter so a non-DeepSeek model like Haiku is + # actually reachable; the default provider chain would try + # DeepSeek native first and 404 on the Anthropic model name. + provider="openrouter", + model=reviewer_model, + # 300 tokens is well above the ~30-token JSON verdict. + # Haiku doesn't pad with hidden reasoning the way DeepSeek + # does, so we don't need the 800-token headroom required to + # absorb the generator's chain-of-thought. + max_tokens=300, response_format={"type": "json_object"}, ) except Exception as e: diff --git a/tests/test_output_review.py b/tests/test_output_review.py index 53f0b34..4e6fa4b 100644 --- a/tests/test_output_review.py +++ b/tests/test_output_review.py @@ -62,13 +62,20 @@ def _mock_post(handler): def _configure(monkeypatch): - """Minimal env so call_llm believes a provider is configured.""" - monkeypatch.setattr(ot, "get_settings", lambda: type("S", (), { + """Minimal env so call_llm believes a provider is configured. + Both review_read (which pins to OpenRouter for a non-thinking model) + and the openrouter module itself read get_settings, so we patch + both module-level references.""" + import app.services.output_review as orr + settings = type("S", (), { "LLM_PROVIDER": "deepseek", "LLM_FALLBACK": "", - "DEEPSEEK_API_KEY": "sk-d", "OPENROUTER_API_KEY": "", + "DEEPSEEK_API_KEY": "sk-d", "OPENROUTER_API_KEY": "sk-or", "DEEPSEEK_URL": "https://x/deepseek", "DEEPSEEK_MODEL": "deepseek-v4-flash", "OPENROUTER_URL": "https://x/or", "OPENROUTER_MODEL": "deepseek/deepseek-v4-flash", - })()) + "REVIEWER_MODEL": "anthropic/claude-haiku-4.5", + })() + monkeypatch.setattr(ot, "get_settings", lambda: settings) + monkeypatch.setattr(orr, "get_settings", lambda: settings) @pytest.mark.asyncio