diff --git a/app/jobs/indicator_summary_job.py b/app/jobs/indicator_summary_job.py index 97c5f80..422c49c 100644 --- a/app/jobs/indicator_summary_job.py +++ b/app/jobs/indicator_summary_job.py @@ -4,7 +4,7 @@ hourly stays comfortably under the monthly cap.""" from __future__ import annotations import asyncio -import re +import json import httpx from sqlalchemy import desc, func, select @@ -35,6 +35,7 @@ from app.services.openrouter import ( llm_configured, month_start, ) +from app.services.output_review import review_read from app.services.translation import translate @@ -106,109 +107,41 @@ async def translate_summary_for_active_languages(session, summary_id: int) -> No summary_id=summary_id, succeeded=succeeded, failed=failed) -# Strip known meta-commentary openers the model sometimes leaks despite the -# prompt's hard constraints. Each pattern matches one leading sentence. -_LEAK_PATTERNS = [ - re.compile(p, re.IGNORECASE | re.DOTALL) - for p in ( - # First-person meta — "I need to / I'll / I have to / I'm going to ..." - r"^i\s+(?:need|have|must|should|am going|'ll|will|shall|can|am)[^.]*\.\s*", - # "We need / we're / we are asked / we will ..." 
- r"^we\s+(?:need|are|'re|will|shall|can|should|must|have)[^.]*\.\s*", - r"^let\s+(?:me|us|'?s)[^.]*\.\s*", - r"^here['’]s[^.]*\.\s*", - r"^sure[,!]?\s[^.]*\.\s*", - r"^looking at[^.]*\.\s*", - r"^based on[^.]*\.\s*", - r"^to (?:address|answer|write|summarise|summarize)[^.]*\.\s*", - r"^first[,]?\s[^.]*\.\s*", - r"^the (?:user|data shows|reader|task|request|reader sees|instructions?)[^.]*\.\s*", - r"^summary[:.]\s*", - r"^key\s*[:\-—]\s*", - r"^must\s+(?:be|cite|explain|avoid|give|stay|provide)[^.]*\.\s*", - r"^should\s+(?:be|give|cite|explain|avoid|provide)[^.]*\.\s*", - r"^avoid[^.]*\.\s*", - r"^cite\s+at\s+most[^.]*\.\s*", - r"^be\s+(?:speculative|specific|concise|brief)[^.]*\.\s*", - r"^stay\s+on[^.]*\.\s*", - r"^okay[,]?\s+", - r"^alright[,]?\s+", - r"^thinking[^.]*\.\s*", - # Prompt-leak prefixes — the model echoes example framing or rule - # headers from the system prompt. - r"^(?:good|bad|positive|negative)\s+example\s*[:\-—]\s*", - r"^example\s+(?:good|bad)\s*[:\-—]\s*", - r"^example\s*[:\-—]\s*", - r"^reference\s+style\s*[:\-—]\s*", - # Prompt label echoes (markdown-style or plain-text) - r"^(?:hard\s+)?constraints?\s*[:\-—][^.\n]*[.\n]\s*", - r"^key\s+observations?\s*[:\-—]\s*", - r"^observations?\s*[:\-—]\s*", - r"^focus\s+on[^.]*\.\s*", - r"^output\s+the\s+read[^.]*\.\s*", - r"^plain\s+prose[^.]*\.\s*", - r"^the\s+indicators?[^.]*\.\s*", # "The indicators include..." / "The indicators are..." - r"^indicators?\s*[:\-—]\s*", - r"^data\s*[:\-—]\s*", - r"^analysis\s*[:\-—]\s*", - r"^interpretation\s*[:\-—]\s*", - r"^read\s*[:\-—]\s*", - r"^note\s*[:\-—]\s*", - # Sometimes the response gets wrapped in literal quotes - r"^[\"“'`]+", - ) -] +# Defence-in-depth: read generation goes through JSON mode + a reviewer. +# +# 1. The system prompt instructs the model to emit {"read": "..."} only; +# response_format={"type":"json_object"} forces well-formed JSON at +# the API layer, so prose outside the field is impossible. +# 2. 
#    We extract `read`, then ask a second LLM call (services/output_review)
#    whether the candidate text is publishable. Scratchpad INSIDE the
#    field — "Let's see…", "X? Actually Y?" — is caught here.
# 3. Any failure at either stage (parse, missing field, reviewer veto,
#    reviewer error) drops the candidate. The previous good
#    IndicatorSummary stays visible.
#
# The old _LEAK_PATTERNS / clean_summary / looks_like_leakage regex
# scaffolding lived here previously. It produced false positives (e.g.
# chopping off a legitimate leading sentence like "The indicators are
# pricing…") and false negatives (it never caught the chain-of-thought
# patterns the model actually emits). The reviewer agent replaces it.


def _extract_read(raw: str) -> str | None:
    """Return the publishable text from the model's ``{"read": "..."}`` envelope.

    Conservative by design: on any deviation from the schema the candidate
    is dropped rather than salvaged. Callers apply their own minimum-length
    check on the returned text.

    Args:
        raw: The raw completion body, expected to be a JSON document. A
            non-string value (for example ``None`` when the provider
            returned no content) is tolerated and treated as invalid.

    Returns:
        The ``read`` field with surrounding whitespace stripped, or ``None``
        when the body is not decodable JSON, the top-level value is not an
        object, the field is missing, the field is not a string, or the
        field is empty after stripping.
    """
    try:
        parsed = json.loads(raw)
    except (TypeError, ValueError, RecursionError):
        # TypeError: raw is None / not str-bytes-bytearray.
        # ValueError: json.JSONDecodeError and UnicodeDecodeError both
        # derive from it. RecursionError: pathologically nested input.
        # None of these may escape: the caller treats None as "drop".
        return None
    if not isinstance(parsed, dict):
        # Valid JSON, wrong shape (bare string, array, number, ...).
        return None
    read = parsed.get("read")
    if not isinstance(read, str):
        return None
    # Whitespace-only collapses to "" and is reported as missing.
    return read.strip() or None
- log.warning("ind_summary.leakage_detected", - group=group, preview=cleaned[:120]) + candidate = _extract_read(result.content) + if candidate is None or len(candidate) < 40: + # JSON envelope malformed, "read" field missing/wrong type, or + # the candidate is too short to be a real read. Don't persist; + # the last good summary stays visible. + log.warning("ind_summary.json_invalid", + group=group, preview=result.content[:160]) session.add(AICall( model=result.model, prompt_tokens=result.prompt_tokens, @@ -250,6 +184,23 @@ async def _generate_one( )) return None + verdict = await review_read(client, candidate) + if not verdict.clean: + # Reviewer caught scratchpad / meta-commentary / partial text + # INSIDE the read field. Drop the candidate; the previous good + # summary continues to serve. + log.warning("ind_summary.reviewer_rejected", + group=group, reason=verdict.reason, + preview=candidate[:120]) + session.add(AICall( + model=result.model, + prompt_tokens=result.prompt_tokens, + completion_tokens=result.completion_tokens, + cost_usd=(result.cost_usd or 0.0) + (verdict.cost_usd or 0.0), + status="leaked", + )) + return None + summary = IndicatorSummary( group_name=group, generated_at=utcnow(), @@ -257,17 +208,19 @@ async def _generate_one( tone=tone, analysis=analysis, prompt_version=PROMPT_VERSION, - content=cleaned, + content=candidate, prompt_tokens=result.prompt_tokens, completion_tokens=result.completion_tokens, - cost_usd=result.cost_usd, + # Include the reviewer's cost in the row's recorded spend so the + # monthly budget tracking covers the full pipeline cost. 
+ cost_usd=(result.cost_usd or 0.0) + (verdict.cost_usd or 0.0), ) session.add(summary) session.add(AICall( model=result.model, prompt_tokens=result.prompt_tokens, completion_tokens=result.completion_tokens, - cost_usd=result.cost_usd, + cost_usd=(result.cost_usd or 0.0) + (verdict.cost_usd or 0.0), status="ok", )) return summary @@ -338,6 +291,7 @@ async def run() -> None: await translate_summary_for_active_languages(session, summary.id) # One aggregate read across all groups, stored under __all__. + # Same JSON-mode + reviewer-agent path as per-group reads. agg_system = build_aggregate_summary_system_prompt(tone, analysis) agg_user = build_aggregate_summary_user_prompt(groups) agg_summary: IndicatorSummary | None = None @@ -346,28 +300,53 @@ async def run() -> None: client, [{"role": "system", "content": agg_system}, {"role": "user", "content": agg_user}], - max_tokens=1500, # room for reasoning + 80-word output + max_tokens=1500, + response_format={"type": "json_object"}, ) - agg_summary = IndicatorSummary( - group_name=AGGREGATE_GROUP_NAME, - generated_at=utcnow(), - model=result.model, - tone=tone, - analysis=analysis, - prompt_version=PROMPT_VERSION, - content=clean_summary(result.content), - prompt_tokens=result.prompt_tokens, - completion_tokens=result.completion_tokens, - cost_usd=result.cost_usd, - ) - session.add(agg_summary) - session.add(AICall( - model=result.model, - prompt_tokens=result.prompt_tokens, - completion_tokens=result.completion_tokens, - cost_usd=result.cost_usd, status="ok", - )) - written += 1 + candidate = _extract_read(result.content) + if candidate is None or len(candidate) < 40: + log.warning("ind_summary.agg_json_invalid", + tone=tone, preview=result.content[:160]) + session.add(AICall( + model=result.model, + prompt_tokens=result.prompt_tokens, + completion_tokens=result.completion_tokens, + cost_usd=result.cost_usd, status="leaked", + )) + else: + verdict = await review_read(client, candidate) + full_cost = (result.cost_usd or 
0.0) + (verdict.cost_usd or 0.0) + if not verdict.clean: + log.warning("ind_summary.agg_reviewer_rejected", + tone=tone, reason=verdict.reason, + preview=candidate[:120]) + session.add(AICall( + model=result.model, + prompt_tokens=result.prompt_tokens, + completion_tokens=result.completion_tokens, + cost_usd=full_cost, status="leaked", + )) + else: + agg_summary = IndicatorSummary( + group_name=AGGREGATE_GROUP_NAME, + generated_at=utcnow(), + model=result.model, + tone=tone, + analysis=analysis, + prompt_version=PROMPT_VERSION, + content=candidate, + prompt_tokens=result.prompt_tokens, + completion_tokens=result.completion_tokens, + cost_usd=full_cost, + ) + session.add(agg_summary) + session.add(AICall( + model=result.model, + prompt_tokens=result.prompt_tokens, + completion_tokens=result.completion_tokens, + cost_usd=full_cost, status="ok", + )) + written += 1 except Exception as e: session.add(AICall( model=active_model(), status="error", diff --git a/app/services/llm_prompts.py b/app/services/llm_prompts.py index 9840ec2..726b60a 100644 --- a/app/services/llm_prompts.py +++ b/app/services/llm_prompts.py @@ -296,12 +296,25 @@ question via the chat sidebar. def build_summary_system_prompt(tone: str, analysis: str) -> str: """A lean, focused system prompt for the per-indicator-group hourly summary. INTERPRETATION not description — the reader has the table - next to this paragraph; they don't need numbers recited at them.""" + next to this paragraph; they don't need numbers recited at them. + + Output is JSON-mode: the model must emit a single object + {"read": "..."}. The wrapper makes scratchpad outside the field + physically impossible — the API enforces well-formed JSON, and the + only schema slot is the publishable read. 
Scratchpad inside the + field is caught by the reviewer agent (services/output_review).""" tone_block = _TONE[_resolve_tone(tone)] analysis_block = _ANALYSIS.get(analysis.upper(), _ANALYSIS["SPECULATIVE"]) return f"""You write a TINY interpretation (≤60 words, 2-3 sentences) \ of ONE indicator group for a strategic markets dashboard. +# Output format (strict) +Return ONLY a single JSON object with exactly one field: +{{"read": ""}} +Nothing outside that JSON object. No preamble. No markdown fences. \ +No additional fields. The "read" string is what the user sees verbatim, \ +so it must already be the finished, publishable text — never your thinking. + # What this is for The reader is looking at the table of numbers right next to your text. \ They can see the values. They CANNOT see the meaning. Your job is to \ @@ -316,19 +329,20 @@ Even at 2-3 sentences, contrast what the underlying factors justify \ they don't diverge, say so in one clause. Never just describe the move \ without placing it on this axis. -# Hard constraints +# Hard constraints on the "read" string - Plain prose, ONE paragraph. No markdown, no headers, no lists, no labels. - Open IMMEDIATELY with substance. NEVER start with: "I need to", "I'll", \ "We need to", "We are asked", "Here's", "Let me", "Let's", "Sure", "Looking \ at", "Based on", "Summary:", "The data shows", "First", "To address". No \ meta-commentary at all. +- No rhetorical questions, no "X? Actually Y?" self-corrections, no \ +parenthetical asides that question your own numbers. The text is the \ +finished read, not the thinking. - Cite at most 2-3 specific numbers and ONLY when they anchor an \ interpretation. Don't list moves; explain them. - Multi-week / multi-month horizon. 1-day moves under 2% are noise — skip. - No buy/sell language. No predictions. No watch list. No TL;DR. No date \ header. No "system temperature" line — that belongs to the full daily log. -- Output the read directly. 
Do NOT include phrases like "Example", "Good \ -example", "Bad example", "Reference", or any meta-framing of your output. {tone_block} @@ -350,13 +364,22 @@ def build_summary_user_prompt(group_name: str, quotes: list[dict]) -> str: def build_aggregate_summary_system_prompt(tone: str, analysis: str) -> str: """System prompt for the cross-group aggregate read shown on the dashboard. - Wider lens than a per-group summary — synthesise across all groups.""" + Wider lens than a per-group summary — synthesise across all groups. + + Same JSON-mode contract as build_summary_system_prompt: output is + {"read": "..."} only; the field is the publishable text verbatim.""" tone_block = _TONE[_resolve_tone(tone)] analysis_block = _ANALYSIS.get(analysis.upper(), _ANALYSIS["SPECULATIVE"]) return f"""You write a single SHORT cross-asset INTERPRETATION (≤80 \ words, 2-4 sentences) for the dashboard header. The reader is glancing — \ give them the meaning of the whole tape, not a recap. +# Output format (strict) +Return ONLY a single JSON object with exactly one field: +{{"read": ""}} +Nothing outside that JSON object. No preamble. No markdown fences. \ +No additional fields. The "read" string is what the user sees verbatim. + # What this is for The reader can see every indicator on the dashboard below this paragraph. \ Your job is NOT to summarise the moves. It is to explain what the moves, \ @@ -371,19 +394,19 @@ crowd is actually doing (irrational: positioning, narrative momentum, \ flows). At least one of the 2-4 sentences must name this gap or, if the \ two cohere, explicitly say so. -# Hard constraints +# Hard constraints on the "read" string - Plain prose, ONE paragraph. No markdown, headers, lists, or labels. - Open IMMEDIATELY with substance. NEVER start with: "I need to", "I'll", \ "We need to", "Here's", "Let me", "Looking at", "Based on", "Sure", "Summary:", \ "The data shows", "Across the board". No meta-commentary. +- No rhetorical questions, no "X? Actually Y?" 
self-corrections, no \ +parenthetical asides that question your own numbers. - Identify the single most important **cross-asset implication**: e.g. \ "rates and credit disagree", "equities outrun fundamentals", "geopolitical \ risk premium is in commodities but not vol". Cite no more than 3 specific \ numbers, and only as anchors for the interpretation. - Multi-week / multi-month horizon. 1-day moves under 2% are noise. - No buy/sell language. No predictions of specific levels. -- Output the read directly. Do NOT include phrases like "Example", "Good \ -example", "Bad example", "Reference", or any meta-framing of your output. {tone_block} diff --git a/app/services/output_review.py b/app/services/output_review.py new file mode 100644 index 0000000..3af2a7a --- /dev/null +++ b/app/services/output_review.py @@ -0,0 +1,107 @@ +"""Second-pass reviewer agent for AI-generated reads. + +The per-group and aggregate indicator summaries are generated in JSON +mode and the publishable text comes out of a single "read" field, but a +misbehaving model can still slip chain-of-thought INSIDE the field +("Let's see…", "X? Actually Y?", multi-question parentheticals). This +module makes a small second LLM call that judges the candidate read as +clean / unclean. Cost is ~$0.0001 per check; latency ~1-2 s in the +hourly job. No user-facing latency. + +The reviewer is deliberately a tiny, JSON-shaped classifier — same +JSON-mode mechanism as the generator, so the verdict can't be lost in +prose. If parsing fails or the call errors, the row is rejected +(fail-safe: the previously cached good summary stays visible). +""" +from __future__ import annotations + +import json +from dataclasses import dataclass + +import httpx + +from app.logging import get_logger +from app.services.openrouter import call_llm + +log = get_logger("output_review") + + +_SYSTEM_PROMPT = """\ +You are a strict editor for a financial-markets dashboard. 
@dataclass(frozen=True)
class Verdict:
    """Outcome of one reviewer pass over a candidate read."""

    # True only when the reviewer's reply carried a JSON boolean ``true``;
    # every failure path in review_read sets this to False.
    clean: bool
    # The reviewer's own explanation (truncated to 200 chars), or a
    # fail-safe reason generated by review_read.
    reason: str
    cost_usd: float | None  # cost of the review call itself, for the ledger


# Reasons attached to fail-safe verdicts when the reply cannot be decoded
# into a JSON object. Existing tests match on the "non-JSON" substring.
_REASON_NON_JSON = "reviewer returned non-JSON"
_REASON_NON_OBJECT = "reviewer returned non-object JSON"


def _decode_json_object(content: object) -> tuple[dict | None, str]:
    """Decode the reviewer's raw reply into a JSON object.

    Pure helper (no logging, no I/O) so the fail-safe rules can be tested
    in isolation.

    Args:
        content: The reply body as returned by the provider. May be
            ``None`` or otherwise not a string.

    Returns:
        ``(payload, "")`` when ``content`` decodes to a JSON object, else
        ``(None, reason)`` where ``reason`` says why the reply was unusable.
    """
    try:
        payload = json.loads(content)  # type: ignore[arg-type]
    except (TypeError, ValueError, RecursionError):
        # TypeError: content is None / not text. ValueError covers
        # json.JSONDecodeError. RecursionError: absurdly nested reply.
        return None, _REASON_NON_JSON
    if not isinstance(payload, dict):
        # JSON mode should yield an object, but a bare array/string/number
        # is still valid JSON; calling .get() on it would raise.
        return None, _REASON_NON_OBJECT
    return payload, ""


async def review_read(client: httpx.AsyncClient, candidate: str) -> Verdict:
    """Ask the LLM whether `candidate` is a publishable read.

    Makes one `call_llm` request (JSON mode, max_tokens=120) with the
    reviewer system prompt and the candidate in a fenced user turn.

    Args:
        client: HTTP client passed through to `call_llm`.
        candidate: The text extracted from the generator's "read" field.

    Returns:
        Verdict(clean, reason, cost_usd). Any error — empty candidate,
        provider failure, undecodable reply, non-object JSON, missing or
        non-boolean "clean" — yields a CONSERVATIVE verdict (clean=False)
        so the caller drops the candidate. `cost_usd` is 0.0 when no call
        was made, None when the call itself failed, and the call's
        reported cost otherwise.
    """
    if not candidate or not candidate.strip():
        # Nothing to judge; skip the LLM call entirely.
        return Verdict(clean=False, reason="empty candidate", cost_usd=0.0)

    messages = [
        {"role": "system", "content": _SYSTEM_PROMPT},
        # Sent as a fenced user turn so the model can't confuse the
        # candidate with instructions, even if the candidate happens to
        # contain prompt-like prose.
        {"role": "user", "content": f"Candidate read:\n```\n{candidate}\n```"},
    ]
    try:
        result = await call_llm(
            client, messages,
            max_tokens=120,
            response_format={"type": "json_object"},
        )
    except Exception as e:
        # Deliberately broad: this is the fail-safe boundary, and a
        # reviewer outage must reject the candidate, not crash the job.
        log.warning("review.call_failed", error=str(e)[:200])
        return Verdict(clean=False, reason=f"reviewer error: {str(e)[:80]}",
                       cost_usd=None)

    payload, failure = _decode_json_object(result.content)
    if payload is None:
        # str() keeps the preview safe when content is None.
        log.warning("review.parse_failed", preview=str(result.content)[:200])
        return Verdict(clean=False, reason=failure, cost_usd=result.cost_usd)

    clean = payload.get("clean")
    if not isinstance(clean, bool):
        # Strings like "true" or a missing field are not an approval.
        return Verdict(clean=False, reason="reviewer omitted bool 'clean'",
                       cost_usd=result.cost_usd)
    reason = payload.get("reason") or ""
    return Verdict(clean=clean, reason=str(reason)[:200], cost_usd=result.cost_usd)
# Module notes: the extractor is a pure function, so it is covered
# exhaustively; the reviewer makes an LLM call and is exercised via the
# httpx MockTransport that the other openrouter tests use.
from __future__ import annotations

import httpx
import pytest

from app.jobs.indicator_summary_job import _extract_read
from app.services import openrouter as ot
from app.services.output_review import review_read


# ---------------------------------------------------------------------------
# _extract_read — JSON envelope handling
# ---------------------------------------------------------------------------


def test_extract_read_returns_trimmed_field():
    """Surrounding whitespace inside the field is stripped."""
    envelope = '{"read": " The market is pricing growth. "}'
    extracted = _extract_read(envelope)
    assert extracted == "The market is pricing growth."


def test_extract_read_returns_none_on_invalid_json():
    """Bodies that are not JSON at all are dropped."""
    for body in ("not json", "{bad}", ""):
        assert _extract_read(body) is None


def test_extract_read_returns_none_when_field_missing():
    """An object without the "read" key is dropped."""
    extracted = _extract_read('{"other": "x"}')
    assert extracted is None


def test_extract_read_returns_none_when_field_not_string():
    """Numbers, null and arrays in the "read" slot are dropped."""
    for body in ('{"read": 42}', '{"read": null}', '{"read": ["a","b"]}'):
        assert _extract_read(body) is None


def test_extract_read_returns_none_when_field_empty():
    """Empty and whitespace-only reads count as missing."""
    for body in ('{"read": ""}', '{"read": " "}'):
        assert _extract_read(body) is None


def test_extract_read_returns_none_when_envelope_not_object():
    """Valid JSON of the wrong top-level shape (bare string, array) is dropped."""
    for body in ('"just a string"', '["a", "b"]'):
        assert _extract_read(body) is None


# ---------------------------------------------------------------------------
# review_read — judges candidate read via a second LLM call
# ---------------------------------------------------------------------------


def _mock_post(handler):
    """Wrap a request handler in an httpx mock transport."""
    transport = httpx.MockTransport(handler)
    return transport


def _configure(monkeypatch):
    """Minimal env so call_llm believes a provider is configured."""
    fake_env = {
        "LLM_PROVIDER": "deepseek", "LLM_FALLBACK": "",
        "DEEPSEEK_API_KEY": "sk-d", "OPENROUTER_API_KEY": "",
        "DEEPSEEK_URL": "https://x/deepseek", "DEEPSEEK_MODEL": "deepseek-v4-flash",
        "OPENROUTER_URL": "https://x/or", "OPENROUTER_MODEL": "deepseek/deepseek-v4-flash",
    }
    monkeypatch.setattr(ot, "get_settings", lambda: type("S", (), fake_env)())


def _chat_response(content, usage):
    """Build the 200 reply: one choice carrying `content`, plus `usage`."""
    payload = {
        "choices": [{"message": {"content": content},
                     "finish_reason": "stop"}],
        "usage": usage,
    }
    return httpx.Response(200, json=payload)


async def _review_with(handler, candidate):
    """Run review_read against a client whose transport is `handler`."""
    async with httpx.AsyncClient(transport=_mock_post(handler)) as client:
        return await review_read(client, candidate)


@pytest.mark.asyncio
async def test_review_clean_verdict(monkeypatch):
    """A well-formed approving reply yields clean=True and the call's cost."""
    _configure(monkeypatch)

    def handler(_req):
        return _chat_response(
            '{"clean": true, "reason": "ok"}',
            {"prompt_tokens": 50, "completion_tokens": 12, "cost": 0.00007},
        )

    verdict = await _review_with(handler, "Markets are pricing tighter policy.")
    assert verdict.clean is True
    assert verdict.cost_usd == 0.00007


@pytest.mark.asyncio
async def test_review_unclean_verdict(monkeypatch):
    """A well-formed rejecting reply is passed through with its reason."""
    _configure(monkeypatch)

    def handler(_req):
        return _chat_response(
            '{"clean": false, "reason": "chain of thought"}',
            {"prompt_tokens": 50, "completion_tokens": 14, "cost": 0.00009},
        )

    verdict = await _review_with(handler, "Let's see, is it X? Actually Y?")
    assert verdict.clean is False
    assert "chain of thought" in verdict.reason


@pytest.mark.asyncio
async def test_review_failsafe_on_malformed_json(monkeypatch):
    """Reviewer returned prose instead of JSON → conservative reject."""
    _configure(monkeypatch)

    def handler(_req):
        return _chat_response(
            "yes it looks clean",
            {"prompt_tokens": 50, "completion_tokens": 6},
        )

    verdict = await _review_with(handler, "Some candidate.")
    assert verdict.clean is False
    assert "non-JSON" in verdict.reason


@pytest.mark.asyncio
async def test_review_failsafe_on_missing_clean_field(monkeypatch):
    """A JSON object without "clean" is treated as a rejection."""
    _configure(monkeypatch)

    def handler(_req):
        return _chat_response(
            '{"reason": "no field"}',
            {"prompt_tokens": 50, "completion_tokens": 6},
        )

    verdict = await _review_with(handler, "Some candidate.")
    assert verdict.clean is False


@pytest.mark.asyncio
async def test_review_failsafe_on_empty_candidate(monkeypatch):
    """No LLM call should fire if the candidate is empty."""
    _configure(monkeypatch)
    calls = []

    def handler(_req):
        # Reaching this handler at all is the failure being guarded against.
        calls.append(1)
        return httpx.Response(500, json={"error": "should not be called"})

    verdict = await _review_with(handler, " ")
    assert verdict.clean is False
    assert calls == []