diff --git a/app/services/output_review.py b/app/services/output_review.py index cdf545d..f228a74 100644 --- a/app/services/output_review.py +++ b/app/services/output_review.py @@ -84,15 +84,14 @@ async def review_read(client: httpx.AsyncClient, candidate: str) -> Verdict: try: result = await call_llm( client, messages, - # 300 tokens is comfortably above the 30-token JSON verdict - # the prompt asks for. An earlier 120-token cap was producing - # frequent finish_reason=length cutoffs that left the JSON - # half-written ('{"clean": false, "reason": "Text…'), which - # the parser then rejected as malformed — a false-negative - # in the verdict. The extra headroom costs ~$0.00015 per - # call (DeepSeek output rates) and removes that whole class - # of artefact. - max_tokens=300, + # 800 tokens is well above the ~30-token JSON verdict the + # prompt asks for. The reviewer model (DeepSeek-V4-flash) + # occasionally pads with its own thinking before the JSON + # even though response_format is enforced; smaller caps + # (120, 300) produced finish_reason=length cutoffs that + # left the JSON half-written and broke the parser. 800 + # removes the artefact entirely at ~$0.00022 per call. + max_tokens=800, response_format={"type": "json_object"}, ) except Exception as e: