ai: structured-output + reviewer agent for indicator summaries

Replaces the regex-based clean_summary / looks_like_leakage pipeline
that produced the 2026-05-29 valuation-read leak. Two layers of defence
in depth:

1. JSON-mode generation. The per-group and aggregate summary system
   prompts now require the model to emit a single object
   {"read": "..."}; response_format={"type":"json_object"} is passed
   through to the provider so the API enforces well-formed JSON. Prose
   outside the JSON object can no longer reach the parser. The "read"
   field is the only slot the prompt defines and the only key the
   parser extracts, so scratchpad spilled elsewhere in the envelope is
   never published.

2. Reviewer agent. services/output_review.review_read() makes a second
   small LLM call that judges whether the candidate "read" string is
   publishable. It catches the residual failure mode — scratchpad
   INSIDE the field ("Let's see…", multi-question parentheticals,
   meta-commentary) — and returns a JSON verdict {"clean": bool,
   "reason": str}. Any failure (provider error, parse error, missing
   field) returns clean=false (fail-safe). Cost ~$0.0001/check; latency
   ~1-2 s in the hourly job, no user-facing latency.

The old regex scaffolding (_LEAK_PATTERNS, clean_summary,
looks_like_leakage, _TRAILING_QUOTE) is deleted entirely. It produced
false positives (chopped legitimate "The indicators are…" leaders) and
false negatives (never matched the chain-of-thought patterns the model
actually emits). The reviewer agent is strictly better on both.

On reviewer/parse rejection: don't persist a new IndicatorSummary; the
API's existing fallback to the previous good row continues to serve
the panel. Failures are logged as ind_summary.json_invalid /
ind_summary.reviewer_rejected so we can measure the rejection rate.

Reviewer cost is added to the row's recorded cost_usd so the monthly
budget cap covers the full pipeline.

Adds tests/test_output_review.py: 11 cases covering _extract_read
(JSON envelope handling — invalid JSON, missing field, wrong types,
empty values) and review_read (clean / unclean verdicts plus three
fail-safe paths for malformed reviewer responses).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Giorgio Gilestro 2026-05-29 13:10:52 +02:00
parent 19d4854f50
commit 45fa31bb2b
4 changed files with 396 additions and 141 deletions

View file

@ -4,7 +4,7 @@ hourly stays comfortably under the monthly cap."""
from __future__ import annotations
import asyncio
import re
import json
import httpx
from sqlalchemy import desc, func, select
@ -35,6 +35,7 @@ from app.services.openrouter import (
llm_configured,
month_start,
)
from app.services.output_review import review_read
from app.services.translation import translate
@ -106,109 +107,41 @@ async def translate_summary_for_active_languages(session, summary_id: int) -> No
summary_id=summary_id, succeeded=succeeded, failed=failed)
# Strip known meta-commentary openers the model sometimes leaks despite the
# prompt's hard constraints. Each pattern matches one leading sentence.
_LEAK_PATTERNS = [
re.compile(p, re.IGNORECASE | re.DOTALL)
for p in (
# First-person meta — "I need to / I'll / I have to / I'm going to ..."
r"^i\s+(?:need|have|must|should|am going|'ll|will|shall|can|am)[^.]*\.\s*",
# "We need / we're / we are asked / we will ..."
r"^we\s+(?:need|are|'re|will|shall|can|should|must|have)[^.]*\.\s*",
r"^let\s+(?:me|us|'?s)[^.]*\.\s*",
r"^here[']s[^.]*\.\s*",
r"^sure[,!]?\s[^.]*\.\s*",
r"^looking at[^.]*\.\s*",
r"^based on[^.]*\.\s*",
r"^to (?:address|answer|write|summarise|summarize)[^.]*\.\s*",
r"^first[,]?\s[^.]*\.\s*",
r"^the (?:user|data shows|reader|task|request|reader sees|instructions?)[^.]*\.\s*",
r"^summary[:.]\s*",
r"^key\s*[:\-—]\s*",
r"^must\s+(?:be|cite|explain|avoid|give|stay|provide)[^.]*\.\s*",
r"^should\s+(?:be|give|cite|explain|avoid|provide)[^.]*\.\s*",
r"^avoid[^.]*\.\s*",
r"^cite\s+at\s+most[^.]*\.\s*",
r"^be\s+(?:speculative|specific|concise|brief)[^.]*\.\s*",
r"^stay\s+on[^.]*\.\s*",
r"^okay[,]?\s+",
r"^alright[,]?\s+",
r"^thinking[^.]*\.\s*",
# Prompt-leak prefixes — the model echoes example framing or rule
# headers from the system prompt.
r"^(?:good|bad|positive|negative)\s+example\s*[:\-—]\s*",
r"^example\s+(?:good|bad)\s*[:\-—]\s*",
r"^example\s*[:\-—]\s*",
r"^reference\s+style\s*[:\-—]\s*",
# Prompt label echoes (markdown-style or plain-text)
r"^(?:hard\s+)?constraints?\s*[:\-—][^.\n]*[.\n]\s*",
r"^key\s+observations?\s*[:\-—]\s*",
r"^observations?\s*[:\-—]\s*",
r"^focus\s+on[^.]*\.\s*",
r"^output\s+the\s+read[^.]*\.\s*",
r"^plain\s+prose[^.]*\.\s*",
r"^the\s+indicators?[^.]*\.\s*", # "The indicators include..." / "The indicators are..."
r"^indicators?\s*[:\-—]\s*",
r"^data\s*[:\-—]\s*",
r"^analysis\s*[:\-—]\s*",
r"^interpretation\s*[:\-—]\s*",
r"^read\s*[:\-—]\s*",
r"^note\s*[:\-—]\s*",
# Sometimes the response gets wrapped in literal quotes
r"^[\"'`]+",
)
]
# Defence-in-depth: read generation goes through JSON mode + a reviewer.
#
# 1. The system prompt instructs the model to emit {"read": "..."} only;
# response_format={"type":"json_object"} forces well-formed JSON at
# the API layer, so prose outside the field is impossible.
# 2. We extract `read`, then ask a second LLM call (services/output_review)
# whether the candidate text is publishable. Scratchpad INSIDE the
# field — "Let's see…", "X? Actually Y?" — is caught here.
# 3. Any failure at either stage (parse, missing field, reviewer veto,
# reviewer error) drops the candidate. The previous good
# IndicatorSummary stays visible.
#
# The old _LEAK_PATTERNS / clean_summary / looks_like_leakage regex
# scaffolding lived here previously. It produced false positives (e.g.
# chopping off a legitimate leading sentence like "The indicators are
# pricing…") and false negatives (it never caught the chain-of-thought
# patterns the model actually emits). The reviewer agent replaces it.
_TRAILING_QUOTE = re.compile(r"[\"'`]+\s*$")
# Tell-tale phrases that mean the model regurgitated the prompt as its
# "answer" — we'd rather show nothing than show this.
_LEAKAGE_FLAGS = (
"≤60 words", "60 words", "must be under", "must cite", "must explain",
"no meta-commentary", "no buy/sell", "horizon. ", "1-day moves",
"the instructions are", "instructions:", "constraints:", "hard constraints",
"good example", "bad example", "reference style",
)
def looks_like_leakage(text: str) -> bool:
    """Tell whether ``text`` still contains prompt-regurgitation markers.

    The text is lower-cased and scanned for every entry of
    ``_LEAKAGE_FLAGS`` as a plain substring. A single hit means the
    output is contaminated and should not be shown.

    Args:
        text: Summary text, normally already passed through cleaning.

    Returns:
        ``True`` if any flag occurs in the lower-cased text, else ``False``.
    """
    lowered = text.lower()
    for marker in _LEAKAGE_FLAGS:
        if marker in lowered:
            return True
    return False
def clean_summary(text: str) -> str:
    """Remove leading meta-commentary from a model response.

    Leaders matching ``_LEAK_PATTERNS`` are stripped in repeated sweeps.
    When that leaves fewer than 60 characters of a response that was
    longer than 120, the model most likely emitted reasoning first, so
    the last non-empty paragraph of the raw text is used instead and
    stripped again.

    Args:
        text: Raw model output.

    Returns:
        The cleaned text with any trailing quote/backtick run removed.
    """

    def strip_leaders(value: str, max_passes: int) -> str:
        # One pass applies every pattern once; stop early as soon as a
        # full pass leaves the text unchanged.
        for _ in range(max_passes):
            previous = value
            for pattern in _LEAK_PATTERNS:
                value = pattern.sub("", value, count=1).lstrip()
            if value == previous:
                break
        return value

    raw = text.strip()
    # Up to 6 passes: handles compound leakage like
    # "Constraints: <...>. The indicators are: <...>. <actual answer>"
    result = strip_leaders(raw, 6)

    if len(raw) > 120 and len(result) < 60:
        # Cleaning ate too much; recover the last blank-line-separated
        # paragraph of the raw text and strip its leaders as well.
        chunks = [c.strip() for c in re.split(r"\n\s*\n", raw) if c.strip()]
        if chunks:
            result = strip_leaders(chunks[-1], 2)

    # Drop an orphan closing quote/backtick left behind by the
    # leading-quote pattern.
    return _TRAILING_QUOTE.sub("", result).rstrip()
def _extract_read(raw: str) -> str | None:
"""Parse the model's JSON envelope and return the "read" field, or
None if the body isn't valid JSON / the field is missing / the field
isn't a string. Conservative: on any deviation from the schema we
drop the candidate rather than try to salvage it."""
try:
parsed = json.loads(raw)
except json.JSONDecodeError:
return None
if not isinstance(parsed, dict):
return None
read = parsed.get("read")
if not isinstance(read, str):
return None
read = read.strip()
return read or None
@ -228,19 +161,20 @@ async def _generate_one(
[{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt}],
max_tokens=800, # DeepSeek sometimes spends 300+ on internal reasoning
response_format={"type": "json_object"},
)
except Exception as e:
session.add(AICall(model=active_model(), status="error", error=str(e)[:500]))
log.warning("ind_summary.failed", group=group, error=str(e)[:120])
return None
cleaned = clean_summary(result.content)
if looks_like_leakage(cleaned) or len(cleaned) < 40:
# Model regurgitated the prompt or produced nothing usable.
# Don't persist — keep the last good summary visible. Log it so
# we can see the rate of failures over time.
log.warning("ind_summary.leakage_detected",
group=group, preview=cleaned[:120])
candidate = _extract_read(result.content)
if candidate is None or len(candidate) < 40:
# JSON envelope malformed, "read" field missing/wrong type, or
# the candidate is too short to be a real read. Don't persist;
# the last good summary stays visible.
log.warning("ind_summary.json_invalid",
group=group, preview=result.content[:160])
session.add(AICall(
model=result.model,
prompt_tokens=result.prompt_tokens,
@ -250,6 +184,23 @@ async def _generate_one(
))
return None
verdict = await review_read(client, candidate)
if not verdict.clean:
# Reviewer caught scratchpad / meta-commentary / partial text
# INSIDE the read field. Drop the candidate; the previous good
# summary continues to serve.
log.warning("ind_summary.reviewer_rejected",
group=group, reason=verdict.reason,
preview=candidate[:120])
session.add(AICall(
model=result.model,
prompt_tokens=result.prompt_tokens,
completion_tokens=result.completion_tokens,
cost_usd=(result.cost_usd or 0.0) + (verdict.cost_usd or 0.0),
status="leaked",
))
return None
summary = IndicatorSummary(
group_name=group,
generated_at=utcnow(),
@ -257,17 +208,19 @@ async def _generate_one(
tone=tone,
analysis=analysis,
prompt_version=PROMPT_VERSION,
content=cleaned,
content=candidate,
prompt_tokens=result.prompt_tokens,
completion_tokens=result.completion_tokens,
cost_usd=result.cost_usd,
# Include the reviewer's cost in the row's recorded spend so the
# monthly budget tracking covers the full pipeline cost.
cost_usd=(result.cost_usd or 0.0) + (verdict.cost_usd or 0.0),
)
session.add(summary)
session.add(AICall(
model=result.model,
prompt_tokens=result.prompt_tokens,
completion_tokens=result.completion_tokens,
cost_usd=result.cost_usd,
cost_usd=(result.cost_usd or 0.0) + (verdict.cost_usd or 0.0),
status="ok",
))
return summary
@ -338,6 +291,7 @@ async def run() -> None:
await translate_summary_for_active_languages(session, summary.id)
# One aggregate read across all groups, stored under __all__.
# Same JSON-mode + reviewer-agent path as per-group reads.
agg_system = build_aggregate_summary_system_prompt(tone, analysis)
agg_user = build_aggregate_summary_user_prompt(groups)
agg_summary: IndicatorSummary | None = None
@ -346,28 +300,53 @@ async def run() -> None:
client,
[{"role": "system", "content": agg_system},
{"role": "user", "content": agg_user}],
max_tokens=1500, # room for reasoning + 80-word output
max_tokens=1500,
response_format={"type": "json_object"},
)
agg_summary = IndicatorSummary(
group_name=AGGREGATE_GROUP_NAME,
generated_at=utcnow(),
model=result.model,
tone=tone,
analysis=analysis,
prompt_version=PROMPT_VERSION,
content=clean_summary(result.content),
prompt_tokens=result.prompt_tokens,
completion_tokens=result.completion_tokens,
cost_usd=result.cost_usd,
)
session.add(agg_summary)
session.add(AICall(
model=result.model,
prompt_tokens=result.prompt_tokens,
completion_tokens=result.completion_tokens,
cost_usd=result.cost_usd, status="ok",
))
written += 1
candidate = _extract_read(result.content)
if candidate is None or len(candidate) < 40:
log.warning("ind_summary.agg_json_invalid",
tone=tone, preview=result.content[:160])
session.add(AICall(
model=result.model,
prompt_tokens=result.prompt_tokens,
completion_tokens=result.completion_tokens,
cost_usd=result.cost_usd, status="leaked",
))
else:
verdict = await review_read(client, candidate)
full_cost = (result.cost_usd or 0.0) + (verdict.cost_usd or 0.0)
if not verdict.clean:
log.warning("ind_summary.agg_reviewer_rejected",
tone=tone, reason=verdict.reason,
preview=candidate[:120])
session.add(AICall(
model=result.model,
prompt_tokens=result.prompt_tokens,
completion_tokens=result.completion_tokens,
cost_usd=full_cost, status="leaked",
))
else:
agg_summary = IndicatorSummary(
group_name=AGGREGATE_GROUP_NAME,
generated_at=utcnow(),
model=result.model,
tone=tone,
analysis=analysis,
prompt_version=PROMPT_VERSION,
content=candidate,
prompt_tokens=result.prompt_tokens,
completion_tokens=result.completion_tokens,
cost_usd=full_cost,
)
session.add(agg_summary)
session.add(AICall(
model=result.model,
prompt_tokens=result.prompt_tokens,
completion_tokens=result.completion_tokens,
cost_usd=full_cost, status="ok",
))
written += 1
except Exception as e:
session.add(AICall(
model=active_model(), status="error",