llm: support JSON-mode + stop publishing the reasoning field
Two changes to the LLM call path that together close the
chain-of-thought leakage surface:
1. _call_provider accepts an optional `response_format` (forwarded to
the OpenAI-shaped API — DeepSeek and OpenRouter both honour
{"type": "json_object"}). Threaded through call_llm so callers can
force structured output without monkey-patching the body. The
indicator-summary job will use this next: it'll require the model
to emit {"read": "..."} and parse the field, making prose outside
the JSON object physically impossible to publish.
2. Empty `content` no longer falls back to the `reasoning` field.
`reasoning` is the model's internal scratchpad — "Let's see...",
half-formed math, planning notes. We had a fallback that surfaced
it when content was null, but the field is intended for debugging
the model, not for publication. After the 2026-05-29 valuation
read leaked into production, the fallback is gone: an empty
content row now raises so the caller retries or skips, and the
previous good row remains visible. Test updated to assert this
safer behaviour.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
8347c90235
commit
19d4854f50
2 changed files with 36 additions and 19 deletions
|
|
@ -136,10 +136,15 @@ async def _call_provider(
|
|||
messages: list[dict],
|
||||
model: str | None,
|
||||
max_tokens: int,
|
||||
response_format: dict | None = None,
|
||||
) -> LogResult:
|
||||
"""One provider call with tenacity retries on transport/HTTP errors.
|
||||
Lives inside the retry decorator so retries happen within a provider,
|
||||
not across the fallback chain."""
|
||||
not across the fallback chain.
|
||||
|
||||
`response_format` is forwarded to the provider verbatim — DeepSeek and
|
||||
OpenRouter both accept the OpenAI-shaped {"type": "json_object"} for
|
||||
JSON-mode generation. None means free-form text."""
|
||||
url, api_key, default_model, extra_headers = _endpoint_for(provider)
|
||||
used_model = model or default_model
|
||||
headers = {
|
||||
|
|
@ -147,18 +152,22 @@ async def _call_provider(
|
|||
"Content-Type": "application/json",
|
||||
**extra_headers,
|
||||
}
|
||||
r = await client.post(
|
||||
url,
|
||||
headers=headers,
|
||||
json={"model": used_model, "messages": messages, "max_tokens": max_tokens},
|
||||
timeout=180,
|
||||
)
|
||||
body: dict = {"model": used_model, "messages": messages, "max_tokens": max_tokens}
|
||||
if response_format is not None:
|
||||
body["response_format"] = response_format
|
||||
r = await client.post(url, headers=headers, json=body, timeout=180)
|
||||
r.raise_for_status()
|
||||
data = r.json()
|
||||
msg = data["choices"][0]["message"]
|
||||
# Some providers return null content + populated `reasoning` for thinking
|
||||
# models, or null content when finish_reason=length cut off the response.
|
||||
content = msg.get("content") or msg.get("reasoning")
|
||||
# The `content` field is the model's user-facing answer. The optional
|
||||
# `reasoning` field is the model's internal chain-of-thought — never
|
||||
# safe to publish; it contains raw scratchpad ("Let's see…",
|
||||
# mid-sentence question marks, planning notes). If `content` is empty
|
||||
# (provider issue, finish_reason=length cutoff, or the model spent
|
||||
# its budget on thinking), treat that as a generation failure and
|
||||
# raise so the caller can retry or skip the row. Do NOT fall back to
|
||||
# reasoning — see the 2026-05-29 valuation-read leak.
|
||||
content = msg.get("content")
|
||||
if not content:
|
||||
finish = data["choices"][0].get("finish_reason")
|
||||
raise RuntimeError(
|
||||
|
|
@ -189,6 +198,7 @@ async def call_llm(
|
|||
messages: list[dict],
|
||||
model: str | None = None,
|
||||
max_tokens: int = 4000,
|
||||
response_format: dict | None = None,
|
||||
) -> LogResult:
|
||||
"""Provider-aware chat completion with fallback. Tries primary
|
||||
(LLM_PROVIDER) first; if it raises after retries, falls through to
|
||||
|
|
@ -197,7 +207,11 @@ async def call_llm(
|
|||
The returned LogResult.model is prefixed with the provider that
|
||||
actually answered (e.g. ``deepseek/deepseek-v4-flash`` or
|
||||
``openrouter/deepseek/deepseek-v4-flash``) — useful admin metadata
|
||||
even though we hide it from the user-facing UI."""
|
||||
even though we hide it from the user-facing UI.
|
||||
|
||||
Pass response_format={"type": "json_object"} to force JSON-mode
|
||||
output (the model still needs to be instructed in the system prompt
|
||||
to emit valid JSON — this flag enforces, not asks)."""
|
||||
chain = _provider_chain()
|
||||
if not chain:
|
||||
raise RuntimeError("No LLM provider configured (no API key set)")
|
||||
|
|
@ -207,6 +221,7 @@ async def call_llm(
|
|||
try:
|
||||
result = await _call_provider(
|
||||
client, provider, messages, model, max_tokens,
|
||||
response_format=response_format,
|
||||
)
|
||||
if i > 0:
|
||||
from app.logging import get_logger
|
||||
|
|
|
|||
|
|
@ -183,10 +183,12 @@ async def test_call_llm_uses_upstream_cost_when_provided(monkeypatch):
|
|||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_call_llm_falls_back_to_reasoning_field_when_content_null(monkeypatch):
|
||||
"""Thinking models sometimes return null `content` plus a populated
|
||||
`reasoning` block — we surface the reasoning so the caller still gets
|
||||
something usable rather than treating the row as empty."""
|
||||
async def test_call_llm_does_not_publish_reasoning_when_content_null(monkeypatch):
|
||||
"""The `reasoning` field is the model's internal chain-of-thought
|
||||
(scratchpad: "Let's see…", planning notes, half-formed math). It is
|
||||
never safe to surface as the user-facing answer — see the
|
||||
2026-05-29 valuation-read leak. If `content` is null we treat the
|
||||
row as a generation failure and raise; the caller can retry or skip."""
|
||||
_configure(monkeypatch, DEEPSEEK_API_KEY="sk-d", LLM_FALLBACK="")
|
||||
|
||||
def handler(request: httpx.Request) -> httpx.Response:
|
||||
|
|
@ -199,8 +201,8 @@ async def test_call_llm_falls_back_to_reasoning_field_when_content_null(monkeypa
|
|||
})
|
||||
|
||||
async with httpx.AsyncClient(transport=_mock_post(handler)) as client:
|
||||
result = await ot.call_llm(client, [{"role": "user", "content": "hi"}])
|
||||
assert result.content == "deep thought"
|
||||
with pytest.raises(RuntimeError, match="LLM returned empty content"):
|
||||
await ot.call_llm(client, [{"role": "user", "content": "hi"}])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
|
@ -228,7 +230,7 @@ async def test_call_llm_falls_back_to_secondary_when_primary_raises(monkeypatch)
|
|||
prompt_tokens=1, completion_tokens=2, cost_usd=0.0,
|
||||
)
|
||||
|
||||
async def fake(_client, provider, _messages, _model, _max_tokens):
|
||||
async def fake(_client, provider, _messages, _model, _max_tokens, response_format=None):
|
||||
calls.append(provider)
|
||||
if provider == "deepseek":
|
||||
raise RuntimeError("primary down")
|
||||
|
|
@ -247,7 +249,7 @@ async def test_call_llm_raises_last_exception_when_chain_exhausted(monkeypatch):
|
|||
_configure(monkeypatch,
|
||||
DEEPSEEK_API_KEY="sk-d", OPENROUTER_API_KEY="sk-or")
|
||||
|
||||
async def fake(_client, provider, _messages, _model, _max_tokens):
|
||||
async def fake(_client, provider, _messages, _model, _max_tokens, response_format=None):
|
||||
raise RuntimeError(f"{provider} broken")
|
||||
|
||||
with patch.object(ot, "_call_provider", fake):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue