"""Second-pass reviewer agent for AI-generated reads. The per-group and aggregate indicator summaries are generated in JSON mode and the publishable text comes out of a single "read" field, but a misbehaving model can still slip chain-of-thought INSIDE the field ("Let's see…", "X? Actually Y?", multi-question parentheticals). This module makes a small second LLM call that judges the candidate read as clean / unclean. Cost is ~$0.0001 per check; latency ~1-2 s in the hourly job. No user-facing latency. The reviewer is deliberately a tiny, JSON-shaped classifier — same JSON-mode mechanism as the generator, so the verdict can't be lost in prose. If parsing fails or the call errors, the row is rejected (fail-safe: the previously cached good summary stays visible). """ from __future__ import annotations import json from dataclasses import dataclass import httpx from app.config import get_settings from app.logging import get_logger from app.services.openrouter import call_llm log = get_logger("output_review") # The reviewer runs through OpenRouter against a small, non-thinking # model. DeepSeek-V4-flash (our generator default) emits internal # chain-of-thought before its JSON output even when the prompt forbids # it, which truncates the JSON at any reasonable max_tokens cap and # breaks the parser. Anthropic's Haiku family answers structured-output # tasks tersely and deterministically — no chain-of-thought tax. Cost # is ~$0.0001-$0.0003 per review depending on candidate length. DEFAULT_REVIEWER_MODEL = "anthropic/claude-haiku-4.5" _SYSTEM_PROMPT = """\ You are a strict editor for a financial-markets dashboard. The author was asked to produce a short interpretive read for human readers. You receive their proposed read and decide if it is publishable as-is. Mark CLEAN only if the text reads like a finished interpretation a reader could see on a public dashboard without confusion. Mark UNCLEAN if the text contains ANY of: - Chain-of-thought / scratchpad markers used as thinking — phrases like "Let me", "Let's see", "we need to", "actually" (correcting itself), "wait", "hmm", "or rather", "I should". - Self-questioning parentheticals: "Q1 2026? Actually Q4 2025?", "is it X or Y?", any place where the author appears to be working out the answer in front of the reader. - Multiple rhetorical questions or any question that interrupts the declarative voice. A clean interpretive read is assertive. - Meta-commentary about the task, output format, word limits, or instructions — e.g. "as required by the constraints", "the prompt asks", "let me address each". - Partial / truncated content. Starts mid-word, mid-number, mid-clause. - Visible internal numbers without clear meaning ("change 1y +5.9%?"), raw column names ("as_of 2026-01-01"), or any debug-like fragments. - Anything other than the finished, publishable interpretation. Return ONLY a JSON object with this exact shape: {"clean": true | false, "reason": "<≤20 words, plain text>"} No preamble, no markdown fences, no other fields. """ @dataclass(frozen=True) class Verdict: clean: bool reason: str cost_usd: float | None # cost of the review call itself, for the ledger async def review_read(client: httpx.AsyncClient, candidate: str) -> Verdict: """Ask the LLM whether `candidate` is a publishable read. Returns Verdict(clean, reason, cost). Any error — provider failure, JSON parse failure, missing field, wrong type — yields a CONSERVATIVE verdict (clean=False) so the caller drops the candidate. The previously cached good summary stays visible on the dashboard.""" if not candidate or not candidate.strip(): return Verdict(clean=False, reason="empty candidate", cost_usd=0.0) messages = [ {"role": "system", "content": _SYSTEM_PROMPT}, # Sent as a fenced user turn so the model can't confuse the # candidate with instructions, even if the candidate happens to # contain prompt-like prose. {"role": "user", "content": f"Candidate read:\n```\n{candidate}\n```"}, ] settings = get_settings() reviewer_model = getattr(settings, "REVIEWER_MODEL", None) or DEFAULT_REVIEWER_MODEL try: result = await call_llm( client, messages, # Pin to OpenRouter so a non-DeepSeek model like Haiku is # actually reachable; the default provider chain would try # DeepSeek native first and 404 on the Anthropic model name. provider="openrouter", model=reviewer_model, # 300 tokens is well above the ~30-token JSON verdict. # Haiku doesn't pad with hidden reasoning the way DeepSeek # does, so we don't need the 800-token headroom required to # absorb the generator's chain-of-thought. max_tokens=300, response_format={"type": "json_object"}, ) except Exception as e: log.warning("review.call_failed", error=str(e)[:200]) return Verdict(clean=False, reason=f"reviewer error: {str(e)[:80]}", cost_usd=None) # Haiku (and several other models) occasionally wrap their JSON # output in a markdown code fence even with response_format set — # ```json\n{...}\n``` — so strip a single leading/trailing fence # before parsing. We do this defensively for any model; it's a # no-op for callers that already emit bare JSON. raw = result.content.strip() if raw.startswith("```"): first_nl = raw.find("\n") if first_nl != -1: raw = raw[first_nl + 1:] if raw.rstrip().endswith("```"): raw = raw.rstrip()[:-3].rstrip() raw = raw.strip() try: parsed = json.loads(raw) except json.JSONDecodeError: log.warning("review.parse_failed", preview=result.content[:200]) return Verdict(clean=False, reason="reviewer returned non-JSON", cost_usd=result.cost_usd) clean = parsed.get("clean") reason = parsed.get("reason") or "" if not isinstance(clean, bool): return Verdict(clean=False, reason="reviewer omitted bool 'clean'", cost_usd=result.cost_usd) return Verdict(clean=clean, reason=str(reason)[:200], cost_usd=result.cost_usd)