ai: structured-output + reviewer agent for indicator summaries

Replaces the regex-based clean_summary / looks_like_leakage pipeline that produced the 2026-05-29 valuation-read leak. Two layers of defence in depth: 1. JSON-mode generation. The per-group and aggregate summary system prompts now require the model to emit a single object {"read": "..."}; response_format={"type":"json_object"} is passed through to the provider so the API enforces well-formed JSON. Prose outside the JSON object is therefore impossible. JSON mode does not enforce the schema itself, but _extract_read() keeps only the "read" string and ignores any other key, so scratchpad spilled into the envelope never reaches the reader. 2. Reviewer agent. services/output_review.review_read() makes a second small LLM call that judges whether the candidate "read" string is publishable. It catches the residual failure mode — scratchpad INSIDE the field ("Let's see…", multi-question parentheticals, meta-commentary) — and returns a JSON verdict {"clean": bool, "reason": str}. Any failure (provider error, parse error, missing field) returns clean=false (fail-safe). Cost ~$0.0001/check; latency ~1-2 s in the hourly job, no user-facing latency. The old regex scaffolding (_LEAK_PATTERNS, _LEAKAGE_FLAGS, clean_summary, looks_like_leakage, _TRAILING_QUOTE) is deleted entirely. It produced false positives (chopped legitimate "The indicators are…" leaders) and false negatives (never matched the chain-of-thought patterns the model actually emits). The reviewer agent is strictly better on both. On reviewer/parse rejection: don't persist a new IndicatorSummary; the API's existing fallback to the previous good row continues to serve the panel. Failures are logged as ind_summary.json_invalid / ind_summary.reviewer_rejected (ind_summary.agg_json_invalid / ind_summary.agg_reviewer_rejected for the aggregate read) so we can measure the rejection rate. Reviewer cost is added to the row's recorded cost_usd so the monthly budget cap covers the full pipeline.
Adds tests/test_output_review.py: 11 cases covering _extract_read (JSON envelope handling — invalid JSON, missing field, wrong types, empty values, non-object envelope) and review_read (clean / unclean verdicts plus three fail-safe paths: non-JSON reviewer output, a missing "clean" field, and an empty candidate that is rejected before any LLM call). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-29 13:10:52 +02:00 · 2026-05-29 13:10:52 +02:00 · 45fa31bb2b
commit 45fa31bb2b
parent 19d4854f50
4 changed files with 396 additions and 141 deletions
--- a/app/jobs/indicator_summary_job.py
+++ b/app/jobs/indicator_summary_job.py
@ -4,7 +4,7 @@ hourly stays comfortably under the monthly cap."""
 from __future__ import annotations

 import asyncio
-import re
+import json

 import httpx
 from sqlalchemy import desc, func, select
@ -35,6 +35,7 @@ from app.services.openrouter import (
    llm_configured,
    month_start,
 )
+from app.services.output_review import review_read
 from app.services.translation import translate


@ -106,109 +107,41 @@ async def translate_summary_for_active_languages(session, summary_id: int) -> No
                 summary_id=summary_id, succeeded=succeeded, failed=failed)


-# Strip known meta-commentary openers the model sometimes leaks despite the
-# prompt's hard constraints. Each pattern matches one leading sentence.
-_LEAK_PATTERNS = [
-    re.compile(p, re.IGNORECASE | re.DOTALL)
-    for p in (
-        # First-person meta — "I need to / I'll / I have to / I'm going to ..."
-        r"^i\s+(?:need|have|must|should|am going|'ll|will|shall|can|am)[^.]*\.\s*",
-        # "We need / we're / we are asked / we will ..."
-        r"^we\s+(?:need|are|'re|will|shall|can|should|must|have)[^.]*\.\s*",
-        r"^let\s+(?:me|us|'?s)[^.]*\.\s*",
-        r"^here['’]s[^.]*\.\s*",
-        r"^sure[,!]?\s[^.]*\.\s*",
-        r"^looking at[^.]*\.\s*",
-        r"^based on[^.]*\.\s*",
-        r"^to (?:address|answer|write|summarise|summarize)[^.]*\.\s*",
-        r"^first[,]?\s[^.]*\.\s*",
-        r"^the (?:user|data shows|reader|task|request|reader sees|instructions?)[^.]*\.\s*",
-        r"^summary[:.]\s*",
-        r"^key\s*[:\-—]\s*",
-        r"^must\s+(?:be|cite|explain|avoid|give|stay|provide)[^.]*\.\s*",
-        r"^should\s+(?:be|give|cite|explain|avoid|provide)[^.]*\.\s*",
-        r"^avoid[^.]*\.\s*",
-        r"^cite\s+at\s+most[^.]*\.\s*",
-        r"^be\s+(?:speculative|specific|concise|brief)[^.]*\.\s*",
-        r"^stay\s+on[^.]*\.\s*",
-        r"^okay[,]?\s+",
-        r"^alright[,]?\s+",
-        r"^thinking[^.]*\.\s*",
-        # Prompt-leak prefixes — the model echoes example framing or rule
-        # headers from the system prompt.
-        r"^(?:good|bad|positive|negative)\s+example\s*[:\-—]\s*",
-        r"^example\s+(?:good|bad)\s*[:\-—]\s*",
-        r"^example\s*[:\-—]\s*",
-        r"^reference\s+style\s*[:\-—]\s*",
-        # Prompt label echoes (markdown-style or plain-text)
-        r"^(?:hard\s+)?constraints?\s*[:\-—][^.\n]*[.\n]\s*",
-        r"^key\s+observations?\s*[:\-—]\s*",
-        r"^observations?\s*[:\-—]\s*",
-        r"^focus\s+on[^.]*\.\s*",
-        r"^output\s+the\s+read[^.]*\.\s*",
-        r"^plain\s+prose[^.]*\.\s*",
-        r"^the\s+indicators?[^.]*\.\s*",   # "The indicators include..." / "The indicators are..."
-        r"^indicators?\s*[:\-—]\s*",
-        r"^data\s*[:\-—]\s*",
-        r"^analysis\s*[:\-—]\s*",
-        r"^interpretation\s*[:\-—]\s*",
-        r"^read\s*[:\-—]\s*",
-        r"^note\s*[:\-—]\s*",
-        # Sometimes the response gets wrapped in literal quotes
-        r"^[\"“'`]+",
-    )
-]
+# Defence-in-depth: read generation goes through JSON mode + a reviewer.
+#
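+# 1. The system prompt instructs the model to emit {"read": "..."} only;
+#    response_format={"type":"json_object"} forces well-formed JSON at
+#    the API layer, so prose outside the JSON object is impossible.
+#    JSON mode does not enforce the schema itself; _extract_read() keeps
+#    only the "read" string and ignores any other key.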
+# 2. We extract `read`, then ask a second LLM call (services/output_review)
+#    whether the candidate text is publishable. Scratchpad INSIDE the
+#    field — "Let's see…", "X? Actually Y?" — is caught here.
+# 3. Any failure at either stage (parse, missing field, reviewer veto,
+#    reviewer error) drops the candidate. The previous good
+#    IndicatorSummary stays visible.
+#
+# The old _LEAK_PATTERNS / clean_summary / looks_like_leakage regex
+# scaffolding lived here previously. It produced false positives (e.g.
+# chopping off a legitimate leading sentence like "The indicators are
+# pricing…") and false negatives (it never caught the chain-of-thought
+# patterns the model actually emits). The reviewer agent replaces it.


-_TRAILING_QUOTE = re.compile(r"[\"”'`]+\s*$")
-
-# Tell-tale phrases that mean the model regurgitated the prompt as its
-# "answer" — we'd rather show nothing than show this.
-_LEAKAGE_FLAGS = (
-    "≤60 words", "60 words", "must be under", "must cite", "must explain",
-    "no meta-commentary", "no buy/sell", "horizon. ", "1-day moves",
-    "the instructions are", "instructions:", "constraints:", "hard constraints",
-    "good example", "bad example", "reference style",
-)
-
-
-def looks_like_leakage(text: str) -> bool:
-    """Heuristic: after cleaning, if these phrases still appear, the output
-    is contaminated prompt-regurgitation and shouldn't be shown."""
-    low = text.lower()
-    return any(flag in low for flag in _LEAKAGE_FLAGS)
-
-
-def clean_summary(text: str) -> str:
-    """Strip leading meta-commentary. If cleaning removes nearly everything
-    (suggesting the model emitted reasoning then ran out of tokens), fall
-    back to the last non-empty paragraph of the raw output — that's usually
-    where the actual answer ended up."""
-    raw = text.strip()
-    out = raw
-    # Up to 6 passes: handles compound leakage like
-    # "Constraints: <...>. The indicators are: <...>. <actual answer>"
-    for _ in range(6):
-        before = out
-        for pat in _LEAK_PATTERNS:
-            out = pat.sub("", out, count=1).lstrip()
-        if out == before:
-            break
-    if len(out) < 60 and len(raw) > 120:
-        # Cleaning ate too much; take the last non-empty paragraph of raw.
-        paragraphs = [p.strip() for p in re.split(r"\n\s*\n", raw) if p.strip()]
-        if paragraphs:
-            out = paragraphs[-1]
-            # Re-strip leaders from the recovered paragraph too.
-            for _ in range(2):
-                before = out
-                for pat in _LEAK_PATTERNS:
-                    out = pat.sub("", out, count=1).lstrip()
-                if out == before:
-                    break
-    # Trim any orphan closing quote/backtick from the wrap-strip above.
-    out = _TRAILING_QUOTE.sub("", out).rstrip()
-    return out
+def _extract_read(raw: str) -> str | None:
+    """Return the trimmed "read" string from the model's JSON envelope.
+
+    Conservative by design: on any deviation from the expected
+    {"read": "<non-empty string>"} shape the candidate is dropped rather
+    than salvaged.
+
+    Args:
+        raw: Response body from the model, expected to hold a JSON object.
+
+    Returns:
+        The "read" value with surrounding whitespace stripped, or None
+        when the body cannot be parsed as JSON (including a missing body,
+        i.e. None), is not a JSON object, has no string "read" field, or
+        the field is empty after stripping.
+    """
+    try:
+        parsed = json.loads(raw)
+    except (TypeError, ValueError):
+        # ValueError covers json.JSONDecodeError; TypeError covers a body
+        # that is not str/bytes at all (e.g. None), which json.loads
+        # rejects before it even starts parsing.
+        return None
+    if not isinstance(parsed, dict):
+        return None
+    read = parsed.get("read")
+    if not isinstance(read, str):
+        return None
+    return read.strip() or None



@ -228,19 +161,20 @@ async def _generate_one(
            [{"role": "system", "content": system_prompt},
             {"role": "user",   "content": user_prompt}],
            max_tokens=800,  # DeepSeek sometimes spends 300+ on internal reasoning
+            response_format={"type": "json_object"},
        )
    except Exception as e:
        session.add(AICall(model=active_model(), status="error", error=str(e)[:500]))
        log.warning("ind_summary.failed", group=group, error=str(e)[:120])
        return None

-    cleaned = clean_summary(result.content)
-    if looks_like_leakage(cleaned) or len(cleaned) < 40:
-        # Model regurgitated the prompt or produced nothing usable.
-        # Don't persist — keep the last good summary visible. Log it so
-        # we can see the rate of failures over time.
-        log.warning("ind_summary.leakage_detected",
-                    group=group, preview=cleaned[:120])
+    candidate = _extract_read(result.content)
+    if candidate is None or len(candidate) < 40:
+        # JSON envelope malformed, "read" field missing/wrong type, or
+        # the candidate is too short to be a real read. Don't persist;
+        # the last good summary stays visible.
+        log.warning("ind_summary.json_invalid",
+                    group=group, preview=result.content[:160])
        session.add(AICall(
            model=result.model,
            prompt_tokens=result.prompt_tokens,
@ -250,6 +184,23 @@ async def _generate_one(
        ))
        return None

+    verdict = await review_read(client, candidate)
+    if not verdict.clean:
+        # Reviewer caught scratchpad / meta-commentary / partial text
+        # INSIDE the read field. Drop the candidate; the previous good
+        # summary continues to serve.
+        log.warning("ind_summary.reviewer_rejected",
+                    group=group, reason=verdict.reason,
+                    preview=candidate[:120])
+        session.add(AICall(
+            model=result.model,
+            prompt_tokens=result.prompt_tokens,
+            completion_tokens=result.completion_tokens,
+            cost_usd=(result.cost_usd or 0.0) + (verdict.cost_usd or 0.0),
+            status="leaked",
+        ))
+        return None
+
    summary = IndicatorSummary(
        group_name=group,
        generated_at=utcnow(),
@ -257,17 +208,19 @@ async def _generate_one(
        tone=tone,
        analysis=analysis,
        prompt_version=PROMPT_VERSION,
-        content=cleaned,
+        content=candidate,
        prompt_tokens=result.prompt_tokens,
        completion_tokens=result.completion_tokens,
-        cost_usd=result.cost_usd,
+        # Include the reviewer's cost in the row's recorded spend so the
+        # monthly budget tracking covers the full pipeline cost.
+        cost_usd=(result.cost_usd or 0.0) + (verdict.cost_usd or 0.0),
    )
    session.add(summary)
    session.add(AICall(
        model=result.model,
        prompt_tokens=result.prompt_tokens,
        completion_tokens=result.completion_tokens,
-        cost_usd=result.cost_usd,
+        cost_usd=(result.cost_usd or 0.0) + (verdict.cost_usd or 0.0),
        status="ok",
    ))
    return summary
@ -338,6 +291,7 @@ async def run() -> None:
                        await translate_summary_for_active_languages(session, summary.id)

                # One aggregate read across all groups, stored under __all__.
+                # Same JSON-mode + reviewer-agent path as per-group reads.
                agg_system = build_aggregate_summary_system_prompt(tone, analysis)
                agg_user = build_aggregate_summary_user_prompt(groups)
                agg_summary: IndicatorSummary | None = None
@ -346,8 +300,33 @@ async def run() -> None:
                        client,
                        [{"role": "system", "content": agg_system},
                         {"role": "user", "content": agg_user}],
-                        max_tokens=1500,  # room for reasoning + 80-word output
+                        max_tokens=1500,
+                        response_format={"type": "json_object"},
                    )
+                    candidate = _extract_read(result.content)
+                    if candidate is None or len(candidate) < 40:
+                        log.warning("ind_summary.agg_json_invalid",
+                                    tone=tone, preview=result.content[:160])
+                        session.add(AICall(
+                            model=result.model,
+                            prompt_tokens=result.prompt_tokens,
+                            completion_tokens=result.completion_tokens,
+                            cost_usd=result.cost_usd, status="leaked",
+                        ))
+                    else:
+                        verdict = await review_read(client, candidate)
+                        full_cost = (result.cost_usd or 0.0) + (verdict.cost_usd or 0.0)
+                        if not verdict.clean:
+                            log.warning("ind_summary.agg_reviewer_rejected",
+                                        tone=tone, reason=verdict.reason,
+                                        preview=candidate[:120])
+                            session.add(AICall(
+                                model=result.model,
+                                prompt_tokens=result.prompt_tokens,
+                                completion_tokens=result.completion_tokens,
+                                cost_usd=full_cost, status="leaked",
+                            ))
+                        else:
                            agg_summary = IndicatorSummary(
                                group_name=AGGREGATE_GROUP_NAME,
                                generated_at=utcnow(),
@ -355,17 +334,17 @@ async def run() -> None:
                                tone=tone,
                                analysis=analysis,
                                prompt_version=PROMPT_VERSION,
-                        content=clean_summary(result.content),
+                                content=candidate,
                                prompt_tokens=result.prompt_tokens,
                                completion_tokens=result.completion_tokens,
-                        cost_usd=result.cost_usd,
+                                cost_usd=full_cost,
                            )
                            session.add(agg_summary)
                            session.add(AICall(
                                model=result.model,
                                prompt_tokens=result.prompt_tokens,
                                completion_tokens=result.completion_tokens,
-                        cost_usd=result.cost_usd, status="ok",
+                                cost_usd=full_cost, status="ok",
                            ))
                            written += 1
                except Exception as e:
--- a/app/services/llm_prompts.py
+++ b/app/services/llm_prompts.py
@ -296,12 +296,25 @@ question via the chat sidebar.
 def build_summary_system_prompt(tone: str, analysis: str) -> str:
    """A lean, focused system prompt for the per-indicator-group hourly
    summary. INTERPRETATION not description — the reader has the table
-    next to this paragraph; they don't need numbers recited at them."""
+    next to this paragraph; they don't need numbers recited at them.
+
+    Output is JSON-mode: the model must emit a single object
+    {"read": "..."}. The wrapper makes scratchpad outside the field
+    physically impossible — the API enforces well-formed JSON, and the
+    only schema slot is the publishable read. Scratchpad inside the
+    field is caught by the reviewer agent (services/output_review)."""
    tone_block = _TONE[_resolve_tone(tone)]
    analysis_block = _ANALYSIS.get(analysis.upper(), _ANALYSIS["SPECULATIVE"])
    return f"""You write a TINY interpretation (≤60 words, 2-3 sentences) \
 of ONE indicator group for a strategic markets dashboard.

+# Output format (strict)
+Return ONLY a single JSON object with exactly one field:
+{{"read": "<your 2-3 sentence interpretation>"}}
+Nothing outside that JSON object. No preamble. No markdown fences. \
+No additional fields. The "read" string is what the user sees verbatim, \
+so it must already be the finished, publishable text — never your thinking.
+
 # What this is for
 The reader is looking at the table of numbers right next to your text. \
 They can see the values. They CANNOT see the meaning. Your job is to \
@ -316,19 +329,20 @@ Even at 2-3 sentences, contrast what the underlying factors justify \
 they don't diverge, say so in one clause. Never just describe the move \
 without placing it on this axis.

-# Hard constraints
+# Hard constraints on the "read" string
 - Plain prose, ONE paragraph. No markdown, no headers, no lists, no labels.
 - Open IMMEDIATELY with substance. NEVER start with: "I need to", "I'll", \
 "We need to", "We are asked", "Here's", "Let me", "Let's", "Sure", "Looking \
 at", "Based on", "Summary:", "The data shows", "First", "To address". No \
 meta-commentary at all.
+- No rhetorical questions, no "X? Actually Y?" self-corrections, no \
+parenthetical asides that question your own numbers. The text is the \
+finished read, not the thinking.
 - Cite at most 2-3 specific numbers and ONLY when they anchor an \
 interpretation. Don't list moves; explain them.
 - Multi-week / multi-month horizon. 1-day moves under 2% are noise — skip.
 - No buy/sell language. No predictions. No watch list. No TL;DR. No date \
 header. No "system temperature" line — that belongs to the full daily log.
- Output the read directly. Do NOT include phrases like "Example", "Good \
-example", "Bad example", "Reference", or any meta-framing of your output.

 {tone_block}

@ -350,13 +364,22 @@ def build_summary_user_prompt(group_name: str, quotes: list[dict]) -> str:

 def build_aggregate_summary_system_prompt(tone: str, analysis: str) -> str:
    """System prompt for the cross-group aggregate read shown on the dashboard.
-    Wider lens than a per-group summary — synthesise across all groups."""
+    Wider lens than a per-group summary — synthesise across all groups.
+
+    Same JSON-mode contract as build_summary_system_prompt: output is
+    {"read": "..."} only; the field is the publishable text verbatim."""
    tone_block = _TONE[_resolve_tone(tone)]
    analysis_block = _ANALYSIS.get(analysis.upper(), _ANALYSIS["SPECULATIVE"])
    return f"""You write a single SHORT cross-asset INTERPRETATION (≤80 \
 words, 2-4 sentences) for the dashboard header. The reader is glancing — \
 give them the meaning of the whole tape, not a recap.

+# Output format (strict)
+Return ONLY a single JSON object with exactly one field:
+{{"read": "<your 2-4 sentence cross-asset interpretation>"}}
+Nothing outside that JSON object. No preamble. No markdown fences. \
+No additional fields. The "read" string is what the user sees verbatim.
+
 # What this is for
 The reader can see every indicator on the dashboard below this paragraph. \
 Your job is NOT to summarise the moves. It is to explain what the moves, \
@ -371,19 +394,19 @@ crowd is actually doing (irrational: positioning, narrative momentum, \
 flows). At least one of the 2-4 sentences must name this gap or, if the \
 two cohere, explicitly say so.

-# Hard constraints
+# Hard constraints on the "read" string
 - Plain prose, ONE paragraph. No markdown, headers, lists, or labels.
 - Open IMMEDIATELY with substance. NEVER start with: "I need to", "I'll", \
 "We need to", "Here's", "Let me", "Looking at", "Based on", "Sure", "Summary:", \
 "The data shows", "Across the board". No meta-commentary.
+- No rhetorical questions, no "X? Actually Y?" self-corrections, no \
+parenthetical asides that question your own numbers.
 - Identify the single most important **cross-asset implication**: e.g. \
 "rates and credit disagree", "equities outrun fundamentals", "geopolitical \
 risk premium is in commodities but not vol". Cite no more than 3 specific \
 numbers, and only as anchors for the interpretation.
 - Multi-week / multi-month horizon. 1-day moves under 2% are noise.
 - No buy/sell language. No predictions of specific levels.
- Output the read directly. Do NOT include phrases like "Example", "Good \
-example", "Bad example", "Reference", or any meta-framing of your output.

 {tone_block}

--- a/app/services/output_review.py
+++ b/app/services/output_review.py
@ -0,0 +1,107 @@
+"""Second-pass reviewer agent for AI-generated reads.
+
+The per-group and aggregate indicator summaries are generated in JSON
+mode and the publishable text comes out of a single "read" field, but a
+misbehaving model can still slip chain-of-thought INSIDE the field
+("Let's see…", "X? Actually Y?", multi-question parentheticals). This
+module makes a small second LLM call that judges the candidate read as
+clean / unclean. Cost is ~$0.0001 per check; latency ~1-2 s in the
+hourly job. No user-facing latency.
+
+The reviewer is deliberately a tiny, JSON-shaped classifier — same
+JSON-mode mechanism as the generator, so the verdict can't be lost in
+prose. If parsing fails or the call errors, the row is rejected
+(fail-safe: the previously cached good summary stays visible).
+"""
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+
+import httpx
+
+from app.logging import get_logger
+from app.services.openrouter import call_llm
+
+# Logger used for the review.call_failed / review.parse_failed warnings
+# emitted by review_read().
+log = get_logger("output_review")
+
+
+# System prompt for the reviewer call. It lists the rejection criteria and
+# pins the answer to a {"clean": bool, "reason": str} JSON object, which is
+# the shape review_read() parses. This text is sent to the model verbatim.
+_SYSTEM_PROMPT = """\
+You are a strict editor for a financial-markets dashboard. The author
+was asked to produce a short interpretive read for human readers.
+You receive their proposed read and decide if it is publishable as-is.
+
+Mark CLEAN only if the text reads like a finished interpretation a
+reader could see on a public dashboard without confusion.
+
+Mark UNCLEAN if the text contains ANY of:
+- Chain-of-thought / scratchpad markers used as thinking — phrases like
+  "Let me", "Let's see", "we need to", "actually" (correcting itself),
+  "wait", "hmm", "or rather", "I should".
+- Self-questioning parentheticals: "Q1 2026? Actually Q4 2025?",
+  "is it X or Y?", any place where the author appears to be working
+  out the answer in front of the reader.
+- Multiple rhetorical questions or any question that interrupts the
+  declarative voice. A clean interpretive read is assertive.
+- Meta-commentary about the task, output format, word limits, or
+  instructions — e.g. "as required by the constraints", "the prompt
+  asks", "let me address each".
+- Partial / truncated content. Starts mid-word, mid-number, mid-clause.
+- Visible internal numbers without clear meaning ("change 1y +5.9%?"),
+  raw column names ("as_of 2026-01-01"), or any debug-like fragments.
+- Anything other than the finished, publishable interpretation.
+
+Return ONLY a JSON object with this exact shape:
+{"clean": true | false, "reason": "<≤20 words, plain text>"}
+No preamble, no markdown fences, no other fields.
+"""
+
+
+@dataclass(frozen=True)
+class Verdict:
+    """Outcome of one review_read() pass over a candidate read."""
+
+    # True only when the reviewer returned an explicit boolean true;
+    # every error path in review_read() sets this to False.
+    clean: bool
+    # The reviewer's stated reason, or a locally generated message
+    # describing which fail-safe path rejected the candidate.
+    reason: str
+    # None when the reviewer call itself raised; 0.0 when no call was made.
+    cost_usd: float | None  # cost of the review call itself, for the ledger
+
+
+async def review_read(client: httpx.AsyncClient, candidate: str) -> Verdict:
+    """Ask the LLM whether `candidate` is a publishable read.
+
+    Fail-safe by contract: this function never raises on a bad reviewer
+    reply. Any problem — provider failure, missing or unparseable body,
+    JSON that is not an object, missing or non-boolean "clean" — yields a
+    CONSERVATIVE verdict (clean=False) so the caller drops the candidate.
+
+    Args:
+        client: HTTP client handed through to call_llm.
+        candidate: The text extracted from the generator's "read" field.
+
+    Returns:
+        Verdict(clean, reason, cost_usd). cost_usd is 0.0 when the
+        candidate is empty and no call is made, None when the call itself
+        failed, and otherwise the cost reported for the review call.
+    """
+    if not candidate or not candidate.strip():
+        return Verdict(clean=False, reason="empty candidate", cost_usd=0.0)
+
+    messages = [
+        {"role": "system", "content": _SYSTEM_PROMPT},
+        # Sent as a fenced user turn so the model can't confuse the
+        # candidate with instructions, even if the candidate happens to
+        # contain prompt-like prose.
+        {"role": "user", "content": f"Candidate read:\n```\n{candidate}\n```"},
+    ]
+    try:
+        result = await call_llm(
+            client, messages,
+            max_tokens=120,
+            response_format={"type": "json_object"},
+        )
+    except Exception as e:
+        # Deliberately broad: whatever the provider layer raises, the
+        # candidate must be rejected rather than crash the job.
+        log.warning("review.call_failed", error=str(e)[:200])
+        return Verdict(clean=False, reason=f"reviewer error: {str(e)[:80]}",
+                       cost_usd=None)
+
+    try:
+        parsed = json.loads(result.content)
+    except (TypeError, ValueError):
+        # ValueError covers json.JSONDecodeError; TypeError covers a
+        # missing body (None). str() keeps the preview safe in that case.
+        log.warning("review.parse_failed", preview=str(result.content)[:200])
+        return Verdict(clean=False, reason="reviewer returned non-JSON",
+                       cost_usd=result.cost_usd)
+
+    if not isinstance(parsed, dict):
+        # Valid JSON but not an object (e.g. [], "x", null): .get() below
+        # would raise AttributeError and escape the fail-safe contract.
+        log.warning("review.parse_failed", preview=str(result.content)[:200])
+        return Verdict(clean=False, reason="reviewer returned non-object JSON",
+                       cost_usd=result.cost_usd)
+
+    clean = parsed.get("clean")
+    if not isinstance(clean, bool):
+        return Verdict(clean=False, reason="reviewer omitted bool 'clean'",
+                       cost_usd=result.cost_usd)
+    reason = parsed.get("reason") or ""
+    return Verdict(clean=clean, reason=str(reason)[:200], cost_usd=result.cost_usd)
--- a/tests/test_output_review.py
+++ b/tests/test_output_review.py
@ -0,0 +1,146 @@
+"""Tests for the JSON-envelope extractor and the reviewer agent.
+
+The two together replaced the regex `clean_summary` + `looks_like_leakage`
+scaffolding that used to live in indicator_summary_job. The extractor is
+pure-function so it's covered exhaustively; the reviewer makes an LLM
+call and is exercised via the httpx MockTransport that the other
+openrouter tests use."""
+from __future__ import annotations
+
+import httpx
+import pytest
+
+from app.jobs.indicator_summary_job import _extract_read
+from app.services import openrouter as ot
+from app.services.output_review import review_read
+
+
+# ---------------------------------------------------------------------------
+# _extract_read — JSON envelope handling
+# ---------------------------------------------------------------------------
+
+
+def test_extract_read_returns_trimmed_field():
+    """Whitespace around the "read" value is stripped from the result."""
+    envelope = '{"read": "  The market is pricing growth.  "}'
+    extracted = _extract_read(envelope)
+    assert extracted == "The market is pricing growth."
+
+
+def test_extract_read_returns_none_on_invalid_json():
+    """Bodies that do not parse as JSON yield None."""
+    for body in ("not json", "{bad}", ""):
+        assert _extract_read(body) is None
+
+
+def test_extract_read_returns_none_when_field_missing():
+    """An object without a "read" key yields None."""
+    envelope = '{"other": "x"}'
+    assert _extract_read(envelope) is None
+
+
+def test_extract_read_returns_none_when_field_not_string():
+    """A "read" value that is a number, null or array yields None."""
+    for envelope in ('{"read": 42}', '{"read": null}', '{"read": ["a","b"]}'):
+        assert _extract_read(envelope) is None
+
+
+def test_extract_read_returns_none_when_field_empty():
+    """An empty or whitespace-only "read" value yields None."""
+    for envelope in ('{"read": ""}', '{"read": "   "}'):
+        assert _extract_read(envelope) is None
+
+
+def test_extract_read_returns_none_when_envelope_not_object():
+    """Valid JSON whose top level is not an object yields None."""
+    # Both a bare string and an array parse fine but have the wrong shape.
+    for body in ('"just a string"', '["a", "b"]'):
+        assert _extract_read(body) is None
+
+
+# ---------------------------------------------------------------------------
+# review_read — judges candidate read via a second LLM call
+# ---------------------------------------------------------------------------
+
+
+def _mock_post(handler):
+    """Wrap `handler` in an httpx.MockTransport for use as a client transport."""
+    transport = httpx.MockTransport(handler)
+    return transport
+
+
+def _configure(monkeypatch):
+    """Patch openrouter.get_settings with a stub carrying a minimal env,
+    so call_llm believes a provider is configured."""
+    fake_env = {
+        "LLM_PROVIDER": "deepseek",
+        "LLM_FALLBACK": "",
+        "DEEPSEEK_API_KEY": "sk-d",
+        "OPENROUTER_API_KEY": "",
+        "DEEPSEEK_URL": "https://x/deepseek",
+        "DEEPSEEK_MODEL": "deepseek-v4-flash",
+        "OPENROUTER_URL": "https://x/or",
+        "OPENROUTER_MODEL": "deepseek/deepseek-v4-flash",
+    }
+
+    def _stub_settings():
+        # Build a throwaway class per call and return an instance of it,
+        # exposing the values above as attributes.
+        return type("S", (), fake_env)()
+
+    monkeypatch.setattr(ot, "get_settings", _stub_settings)
+
+
+@pytest.mark.asyncio
+async def test_review_clean_verdict(monkeypatch):
+    """A clean reviewer reply surfaces as clean=True with the reported cost."""
+    _configure(monkeypatch)
+    payload = {
+        "choices": [{"message": {"content": '{"clean": true, "reason": "ok"}'},
+                     "finish_reason": "stop"}],
+        "usage": {"prompt_tokens": 50, "completion_tokens": 12, "cost": 0.00007},
+    }
+
+    def respond(_request):
+        return httpx.Response(200, json=payload)
+
+    transport = _mock_post(respond)
+    async with httpx.AsyncClient(transport=transport) as client:
+        verdict = await review_read(client, "Markets are pricing tighter policy.")
+    assert verdict.clean is True
+    assert verdict.cost_usd == 0.00007
+
+
+@pytest.mark.asyncio
+async def test_review_unclean_verdict(monkeypatch):
+    """An unclean reviewer reply surfaces as clean=False with its reason."""
+    _configure(monkeypatch)
+    reviewer_reply = '{"clean": false, "reason": "chain of thought"}'
+    payload = {
+        "choices": [{"message": {"content": reviewer_reply},
+                     "finish_reason": "stop"}],
+        "usage": {"prompt_tokens": 50, "completion_tokens": 14, "cost": 0.00009},
+    }
+
+    def respond(_request):
+        return httpx.Response(200, json=payload)
+
+    transport = _mock_post(respond)
+    async with httpx.AsyncClient(transport=transport) as client:
+        verdict = await review_read(client, "Let's see, is it X? Actually Y?")
+    assert verdict.clean is False
+    assert "chain of thought" in verdict.reason
+
+
+@pytest.mark.asyncio
+async def test_review_failsafe_on_malformed_json(monkeypatch):
+    """Prose in place of a JSON verdict is treated as a rejection."""
+    _configure(monkeypatch)
+    payload = {
+        "choices": [{"message": {"content": "yes it looks clean"},
+                     "finish_reason": "stop"}],
+        "usage": {"prompt_tokens": 50, "completion_tokens": 6},
+    }
+
+    def respond(_request):
+        return httpx.Response(200, json=payload)
+
+    transport = _mock_post(respond)
+    async with httpx.AsyncClient(transport=transport) as client:
+        verdict = await review_read(client, "Some candidate.")
+    assert verdict.clean is False
+    assert "non-JSON" in verdict.reason
+
+
+@pytest.mark.asyncio
+async def test_review_failsafe_on_missing_clean_field(monkeypatch):
+    """A JSON verdict without a "clean" key is treated as a rejection."""
+    _configure(monkeypatch)
+    payload = {
+        "choices": [{"message": {"content": '{"reason": "no field"}'},
+                     "finish_reason": "stop"}],
+        "usage": {"prompt_tokens": 50, "completion_tokens": 6},
+    }
+
+    def respond(_request):
+        return httpx.Response(200, json=payload)
+
+    transport = _mock_post(respond)
+    async with httpx.AsyncClient(transport=transport) as client:
+        verdict = await review_read(client, "Some candidate.")
+    assert verdict.clean is False
+
+
+@pytest.mark.asyncio
+async def test_review_failsafe_on_empty_candidate(monkeypatch):
+    """A whitespace-only candidate is rejected without any HTTP request."""
+    _configure(monkeypatch)
+    seen_requests = []
+
+    def respond(_request):
+        # Record the hit; the 500 makes an unexpected call obvious.
+        seen_requests.append(1)
+        return httpx.Response(500, json={"error": "should not be called"})
+
+    transport = _mock_post(respond)
+    async with httpx.AsyncClient(transport=transport) as client:
+        verdict = await review_read(client, "   ")
+    assert verdict.clean is False
+    assert seen_requests == []