A live re-check on 50 recent IndicatorSummary rows, run after the previous 120 → 300 bump, still produced 4 'reviewer returned non-JSON' verdicts out of 12 rejections. DeepSeek-V4-flash sometimes prefixes its JSON output with a short stretch of thinking even though response_format is enforced; that preamble eats into the 300-token cap, so the JSON that follows is cut off before it is complete. 800 tokens is comfortably above any realistic verdict + preamble at ~$0.00022/call (DeepSeek output rates), a negligible cost given the hourly call volume. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
114 lines
4.9 KiB
Python
114 lines
4.9 KiB
Python
"""Second-pass reviewer agent for AI-generated reads.
|
|
|
|
The per-group and aggregate indicator summaries are generated in JSON
|
|
mode and the publishable text comes out of a single "read" field, but a
|
|
misbehaving model can still slip chain-of-thought INSIDE the field
|
|
("Let's see…", "X? Actually Y?", multi-question parentheticals). This
|
|
module makes a small second LLM call that judges the candidate read as
|
|
clean / unclean. Cost is ~$0.0001 per check; latency ~1-2 s in the
|
|
hourly job. No user-facing latency.
|
|
|
|
The reviewer is deliberately a tiny, JSON-shaped classifier — same
|
|
JSON-mode mechanism as the generator, so the verdict can't be lost in
|
|
prose. If parsing fails or the call errors, the row is rejected
|
|
(fail-safe: the previously cached good summary stays visible).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from dataclasses import dataclass
|
|
|
|
import httpx
|
|
|
|
from app.logging import get_logger
|
|
from app.services.openrouter import call_llm
|
|
|
|
# Logger for this module, obtained from get_logger under the name
# "output_review"; review_read emits its warning events through it.
log = get_logger("output_review")
|
|
|
|
|
|
# System prompt for the reviewer call. It casts the model as a strict
# editor, lists the patterns that make a candidate read UNCLEAN, and pins
# the reply to a {"clean": ..., "reason": ...} JSON object. review_read
# sends it as the "system" message.
# NOTE(review): the bare "|" lines inside this literal look like copy/paste
# artefacts rather than intended prompt text — confirm against the
# canonical file. They are left untouched here because they are runtime
# text that reaches the model.
_SYSTEM_PROMPT = """\
|
|
You are a strict editor for a financial-markets dashboard. The author
|
|
was asked to produce a short interpretive read for human readers.
|
|
You receive their proposed read and decide if it is publishable as-is.
|
|
|
|
Mark CLEAN only if the text reads like a finished interpretation a
|
|
reader could see on a public dashboard without confusion.
|
|
|
|
Mark UNCLEAN if the text contains ANY of:
|
|
- Chain-of-thought / scratchpad markers used as thinking — phrases like
|
|
"Let me", "Let's see", "we need to", "actually" (correcting itself),
|
|
"wait", "hmm", "or rather", "I should".
|
|
- Self-questioning parentheticals: "Q1 2026? Actually Q4 2025?",
|
|
"is it X or Y?", any place where the author appears to be working
|
|
out the answer in front of the reader.
|
|
- Multiple rhetorical questions or any question that interrupts the
|
|
declarative voice. A clean interpretive read is assertive.
|
|
- Meta-commentary about the task, output format, word limits, or
|
|
instructions — e.g. "as required by the constraints", "the prompt
|
|
asks", "let me address each".
|
|
- Partial / truncated content. Starts mid-word, mid-number, mid-clause.
|
|
- Visible internal numbers without clear meaning ("change 1y +5.9%?"),
|
|
raw column names ("as_of 2026-01-01"), or any debug-like fragments.
|
|
- Anything other than the finished, publishable interpretation.
|
|
|
|
Return ONLY a JSON object with this exact shape:
|
|
{"clean": true | false, "reason": "<≤20 words, plain text>"}
|
|
No preamble, no markdown fences, no other fields.
|
|
"""
|
|
|
|
|
|
@dataclass(frozen=True)
class Verdict:
    """Immutable outcome of one reviewer pass over a candidate read."""

    # True only when the reviewer judged the candidate publishable as-is.
    clean: bool
    # Short explanation of the decision, or of why the review itself failed.
    reason: str
    # Cost of the review call itself, for the ledger; None when unknown.
    cost_usd: float | None
|
|
|
|
|
|
async def review_read(client: httpx.AsyncClient, candidate: str) -> Verdict:
    """Ask the LLM whether `candidate` is a publishable read.

    `client` is forwarded unchanged to `call_llm`; `candidate` is the
    proposed read text to judge.

    Returns Verdict(clean, reason, cost). Any error — provider failure,
    missing or non-text content, JSON parse failure, a JSON payload that
    is not an object, missing field, wrong type — yields a CONSERVATIVE
    verdict (clean=False) so the caller drops the candidate. The
    previously cached good summary stays visible on the dashboard.

    `cost_usd` is 0.0 when no call was made (empty candidate), None when
    the call itself failed, and otherwise whatever the call reported.
    """
    if not candidate or not candidate.strip():
        return Verdict(clean=False, reason="empty candidate", cost_usd=0.0)

    messages = [
        {"role": "system", "content": _SYSTEM_PROMPT},
        # Sent as a fenced user turn so the model can't confuse the
        # candidate with instructions, even if the candidate happens to
        # contain prompt-like prose.
        {"role": "user", "content": f"Candidate read:\n```\n{candidate}\n```"},
    ]

    # Deliberately broad catch: this is the fail-safe boundary, and any
    # provider-side failure must turn into a rejection, not a crash.
    try:
        result = await call_llm(
            client,
            messages,
            # 800 tokens is well above the ~30-token JSON verdict the
            # prompt asks for. The reviewer model (DeepSeek-V4-flash)
            # occasionally pads with its own thinking before the JSON
            # even though response_format is enforced; smaller caps
            # (120, 300) produced finish_reason=length cutoffs that
            # left the JSON half-written and broke the parser. 800
            # removes the artefact entirely at ~$0.00022 per call.
            max_tokens=800,
            response_format={"type": "json_object"},
        )
    except Exception as e:
        log.warning("review.call_failed", error=str(e)[:200])
        return Verdict(
            clean=False,
            reason=f"reviewer error: {str(e)[:80]}",
            cost_usd=None,
        )

    try:
        parsed = json.loads(result.content)
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError; TypeError covers content
        # that is not text at all (e.g. None). str() keeps the preview
        # safe in that second case.
        log.warning("review.parse_failed", preview=str(result.content)[:200])
        return Verdict(
            clean=False,
            reason="reviewer returned non-JSON",
            cost_usd=result.cost_usd,
        )

    # Valid JSON that is not an object (null, list, string, number) has no
    # fields to read; reject it instead of failing on `.get`.
    if not isinstance(parsed, dict):
        log.warning("review.parse_failed", preview=str(result.content)[:200])
        return Verdict(
            clean=False,
            reason="reviewer returned non-object JSON",
            cost_usd=result.cost_usd,
        )

    clean = parsed.get("clean")
    reason = parsed.get("reason") or ""
    # Only a real JSON boolean counts; "true", 1, or a missing key do not.
    if not isinstance(clean, bool):
        return Verdict(
            clean=False,
            reason="reviewer omitted bool 'clean'",
            cost_usd=result.cost_usd,
        )
    return Verdict(clean=clean, reason=str(reason)[:200], cost_usd=result.cost_usd)
|