diff --git a/app/services/openrouter.py b/app/services/openrouter.py index ca31f2f..598150c 100644 --- a/app/services/openrouter.py +++ b/app/services/openrouter.py @@ -136,10 +136,15 @@ async def _call_provider( messages: list[dict], model: str | None, max_tokens: int, + response_format: dict | None = None, ) -> LogResult: """One provider call with tenacity retries on transport/HTTP errors. Lives inside the retry decorator so retries happen within a provider, - not across the fallback chain.""" + not across the fallback chain. + + `response_format` is forwarded to the provider verbatim — DeepSeek and + OpenRouter both accept the OpenAI-shaped {"type": "json_object"} for + JSON-mode generation. None means free-form text.""" url, api_key, default_model, extra_headers = _endpoint_for(provider) used_model = model or default_model headers = { @@ -147,18 +152,22 @@ async def _call_provider( "Content-Type": "application/json", **extra_headers, } - r = await client.post( - url, - headers=headers, - json={"model": used_model, "messages": messages, "max_tokens": max_tokens}, - timeout=180, - ) + body: dict = {"model": used_model, "messages": messages, "max_tokens": max_tokens} + if response_format is not None: + body["response_format"] = response_format + r = await client.post(url, headers=headers, json=body, timeout=180) r.raise_for_status() data = r.json() msg = data["choices"][0]["message"] - # Some providers return null content + populated `reasoning` for thinking - # models, or null content when finish_reason=length cut off the response. - content = msg.get("content") or msg.get("reasoning") + # The `content` field is the model's user-facing answer. The optional + # `reasoning` field is the model's internal chain-of-thought — never + # safe to publish; it contains raw scratchpad ("Let's see…", + # mid-sentence question marks, planning notes). If `content` is empty + # (provider issue, finish_reason=length cutoff, or the model spent + # its budget on thinking), treat that as a generation failure and + # raise so the caller can retry or skip the row. Do NOT fall back to + # reasoning — see the 2026-05-29 valuation-read leak. + content = msg.get("content") if not content: finish = data["choices"][0].get("finish_reason") raise RuntimeError( @@ -189,6 +198,7 @@ async def call_llm( messages: list[dict], model: str | None = None, max_tokens: int = 4000, + response_format: dict | None = None, ) -> LogResult: """Provider-aware chat completion with fallback. Tries primary (LLM_PROVIDER) first; if it raises after retries, falls through to @@ -197,7 +207,11 @@ async def call_llm( The returned LogResult.model is prefixed with the provider that actually answered (e.g. ``deepseek/deepseek-v4-flash`` or ``openrouter/deepseek/deepseek-v4-flash``) — useful admin metadata - even though we hide it from the user-facing UI.""" + even though we hide it from the user-facing UI. + + Pass response_format={"type": "json_object"} to force JSON-mode + output (the model still needs to be instructed in the system prompt + to emit valid JSON — this flag enforces, not asks).""" chain = _provider_chain() if not chain: raise RuntimeError("No LLM provider configured (no API key set)") @@ -207,6 +221,7 @@ async def call_llm( try: result = await _call_provider( client, provider, messages, model, max_tokens, + response_format=response_format, ) if i > 0: from app.logging import get_logger diff --git a/tests/test_openrouter_transport.py b/tests/test_openrouter_transport.py index dfc14b0..e836044 100644 --- a/tests/test_openrouter_transport.py +++ b/tests/test_openrouter_transport.py @@ -183,10 +183,12 @@ async def test_call_llm_uses_upstream_cost_when_provided(monkeypatch): @pytest.mark.asyncio -async def test_call_llm_falls_back_to_reasoning_field_when_content_null(monkeypatch): - """Thinking models sometimes return null `content` plus a populated - `reasoning` block — we surface the reasoning so the caller still gets - something usable rather than treating the row as empty.""" +async def test_call_llm_does_not_publish_reasoning_when_content_null(monkeypatch): + """The `reasoning` field is the model's internal chain-of-thought + (scratchpad: "Let's see…", planning notes, half-formed math). It is + never safe to surface as the user-facing answer — see the + 2026-05-29 valuation-read leak. If `content` is null we treat the + row as a generation failure and raise; the caller can retry or skip.""" _configure(monkeypatch, DEEPSEEK_API_KEY="sk-d", LLM_FALLBACK="") def handler(request: httpx.Request) -> httpx.Response: @@ -199,8 +201,8 @@ async def test_call_llm_falls_back_to_reasoning_field_when_content_null(monkeypa }) async with httpx.AsyncClient(transport=_mock_post(handler)) as client: - result = await ot.call_llm(client, [{"role": "user", "content": "hi"}]) - assert result.content == "deep thought" + with pytest.raises(RuntimeError, match="LLM returned empty content"): + await ot.call_llm(client, [{"role": "user", "content": "hi"}]) @pytest.mark.asyncio @@ -228,7 +230,7 @@ async def test_call_llm_falls_back_to_secondary_when_primary_raises(monkeypatch) prompt_tokens=1, completion_tokens=2, cost_usd=0.0, ) - async def fake(_client, provider, _messages, _model, _max_tokens): + async def fake(_client, provider, _messages, _model, _max_tokens, response_format=None): calls.append(provider) if provider == "deepseek": raise RuntimeError("primary down") @@ -247,7 +249,7 @@ async def test_call_llm_raises_last_exception_when_chain_exhausted(monkeypatch): _configure(monkeypatch, DEEPSEEK_API_KEY="sk-d", OPENROUTER_API_KEY="sk-or") - async def fake(_client, provider, _messages, _model, _max_tokens): + async def fake(_client, provider, _messages, _model, _max_tokens, response_format=None): raise RuntimeError(f"{provider} broken") with patch.object(ot, "_call_provider", fake):