ai: route reviewer through OpenRouter + Claude Haiku 4.5
The DeepSeek-V4-flash reviewer was unreliable in production: it pads its JSON verdicts with internal chain-of-thought even when the prompt forbids it, so the verdict gets truncated at any reasonable max_tokens cap and the parser drops it as malformed (a false-negative verdict that would purge clean rows). A live run on 50 rows reproduced the failure on 8 of 12 rejections, even at 800 tokens.

Fix: pin the reviewer call to OpenRouter with anthropic/claude-haiku-4.5. Haiku answers structured-output classification tersely (no scratchpad preamble), so a 300-token cap is comfortably above the ~30-token JSON verdict. Cost is roughly the same (~$0.0001-$0.0003 per review) and the latency tax is smaller.

To enable the pinned-provider call without disrupting other callers, call_llm grows an optional `provider` parameter: when set, only that provider is used (no fallback chain). All existing call sites default to provider=None and keep the chain behaviour.

REVIEWER_MODEL is read from settings via getattr-with-fallback so an env override can swap models without code changes — useful if we want to A/B test against e.g. gemini-2.5-flash later.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
8b9d3c9c3e
commit
788563a81f
3 changed files with 45 additions and 14 deletions
|
|
@ -199,6 +199,7 @@ async def call_llm(
|
||||||
model: str | None = None,
|
model: str | None = None,
|
||||||
max_tokens: int = 4000,
|
max_tokens: int = 4000,
|
||||||
response_format: dict | None = None,
|
response_format: dict | None = None,
|
||||||
|
provider: str | None = None,
|
||||||
) -> LogResult:
|
) -> LogResult:
|
||||||
"""Provider-aware chat completion with fallback. Tries primary
|
"""Provider-aware chat completion with fallback. Tries primary
|
||||||
(LLM_PROVIDER) first; if it raises after retries, falls through to
|
(LLM_PROVIDER) first; if it raises after retries, falls through to
|
||||||
|
|
@ -211,7 +212,15 @@ async def call_llm(
|
||||||
|
|
||||||
Pass response_format={"type": "json_object"} to force JSON-mode
|
Pass response_format={"type": "json_object"} to force JSON-mode
|
||||||
output (the model still needs to be instructed in the system prompt
|
output (the model still needs to be instructed in the system prompt
|
||||||
to emit valid JSON — this flag enforces, not asks)."""
|
to emit valid JSON — this flag enforces, not asks).
|
||||||
|
|
||||||
|
Pass `provider` (e.g. "openrouter") to skip the configured chain
|
||||||
|
and pin the call to a specific provider. Used by the reviewer agent
|
||||||
|
to force routing through OpenRouter so it can address a non-DeepSeek
|
||||||
|
model that doesn't pre-think before emitting JSON."""
|
||||||
|
if provider is not None:
|
||||||
|
chain = [provider]
|
||||||
|
else:
|
||||||
chain = _provider_chain()
|
chain = _provider_chain()
|
||||||
if not chain:
|
if not chain:
|
||||||
raise RuntimeError("No LLM provider configured (no API key set)")
|
raise RuntimeError("No LLM provider configured (no API key set)")
|
||||||
|
|
|
||||||
|
|
@ -20,12 +20,23 @@ from dataclasses import dataclass
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
|
from app.config import get_settings
|
||||||
from app.logging import get_logger
|
from app.logging import get_logger
|
||||||
from app.services.openrouter import call_llm
|
from app.services.openrouter import call_llm
|
||||||
|
|
||||||
log = get_logger("output_review")
|
log = get_logger("output_review")
|
||||||
|
|
||||||
|
|
||||||
|
# The reviewer runs through OpenRouter against a small, non-thinking
|
||||||
|
# model. DeepSeek-V4-flash (our generator default) emits internal
|
||||||
|
# chain-of-thought before its JSON output even when the prompt forbids
|
||||||
|
# it, which truncates the JSON at any reasonable max_tokens cap and
|
||||||
|
# breaks the parser. Anthropic's Haiku family answers structured-output
|
||||||
|
# tasks tersely and deterministically — no chain-of-thought tax. Cost
|
||||||
|
# is ~$0.0001-$0.0003 per review depending on candidate length.
|
||||||
|
DEFAULT_REVIEWER_MODEL = "anthropic/claude-haiku-4.5"
|
||||||
|
|
||||||
|
|
||||||
_SYSTEM_PROMPT = """\
|
_SYSTEM_PROMPT = """\
|
||||||
You are a strict editor for a financial-markets dashboard. The author
|
You are a strict editor for a financial-markets dashboard. The author
|
||||||
was asked to produce a short interpretive read for human readers.
|
was asked to produce a short interpretive read for human readers.
|
||||||
|
|
@ -81,17 +92,21 @@ async def review_read(client: httpx.AsyncClient, candidate: str) -> Verdict:
|
||||||
# contain prompt-like prose.
|
# contain prompt-like prose.
|
||||||
{"role": "user", "content": f"Candidate read:\n```\n{candidate}\n```"},
|
{"role": "user", "content": f"Candidate read:\n```\n{candidate}\n```"},
|
||||||
]
|
]
|
||||||
|
settings = get_settings()
|
||||||
|
reviewer_model = getattr(settings, "REVIEWER_MODEL", None) or DEFAULT_REVIEWER_MODEL
|
||||||
try:
|
try:
|
||||||
result = await call_llm(
|
result = await call_llm(
|
||||||
client, messages,
|
client, messages,
|
||||||
# 800 tokens is well above the ~30-token JSON verdict the
|
# Pin to OpenRouter so a non-DeepSeek model like Haiku is
|
||||||
# prompt asks for. The reviewer model (DeepSeek-V4-flash)
|
# actually reachable; the default provider chain would try
|
||||||
# occasionally pads with its own thinking before the JSON
|
# DeepSeek native first and 404 on the Anthropic model name.
|
||||||
# even though response_format is enforced; smaller caps
|
provider="openrouter",
|
||||||
# (120, 300) produced finish_reason=length cutoffs that
|
model=reviewer_model,
|
||||||
# left the JSON half-written and broke the parser. 800
|
# 300 tokens is well above the ~30-token JSON verdict.
|
||||||
# removes the artefact entirely at ~$0.00022 per call.
|
# Haiku doesn't pad with hidden reasoning the way DeepSeek
|
||||||
max_tokens=800,
|
# does, so we don't need the 800-token headroom required to
|
||||||
|
# absorb the generator's chain-of-thought.
|
||||||
|
max_tokens=300,
|
||||||
response_format={"type": "json_object"},
|
response_format={"type": "json_object"},
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
||||||
|
|
@ -62,13 +62,20 @@ def _mock_post(handler):
|
||||||
|
|
||||||
|
|
||||||
def _configure(monkeypatch):
|
def _configure(monkeypatch):
|
||||||
"""Minimal env so call_llm believes a provider is configured."""
|
"""Minimal env so call_llm believes a provider is configured.
|
||||||
monkeypatch.setattr(ot, "get_settings", lambda: type("S", (), {
|
Both review_read (which pins to OpenRouter for a non-thinking model)
|
||||||
|
and the openrouter module itself read get_settings, so we patch
|
||||||
|
both module-level references."""
|
||||||
|
import app.services.output_review as orr
|
||||||
|
settings = type("S", (), {
|
||||||
"LLM_PROVIDER": "deepseek", "LLM_FALLBACK": "",
|
"LLM_PROVIDER": "deepseek", "LLM_FALLBACK": "",
|
||||||
"DEEPSEEK_API_KEY": "sk-d", "OPENROUTER_API_KEY": "",
|
"DEEPSEEK_API_KEY": "sk-d", "OPENROUTER_API_KEY": "sk-or",
|
||||||
"DEEPSEEK_URL": "https://x/deepseek", "DEEPSEEK_MODEL": "deepseek-v4-flash",
|
"DEEPSEEK_URL": "https://x/deepseek", "DEEPSEEK_MODEL": "deepseek-v4-flash",
|
||||||
"OPENROUTER_URL": "https://x/or", "OPENROUTER_MODEL": "deepseek/deepseek-v4-flash",
|
"OPENROUTER_URL": "https://x/or", "OPENROUTER_MODEL": "deepseek/deepseek-v4-flash",
|
||||||
})())
|
"REVIEWER_MODEL": "anthropic/claude-haiku-4.5",
|
||||||
|
})()
|
||||||
|
monkeypatch.setattr(ot, "get_settings", lambda: settings)
|
||||||
|
monkeypatch.setattr(orr, "get_settings", lambda: settings)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue