read.markets/tests/test_output_review.py
Giorgio Gilestro 45fa31bb2b ai: structured-output + reviewer agent for indicator summaries
Replaces the regex-based clean_summary / looks_like_leakage pipeline
that produced the 2026-05-29 valuation-read leak. Two layers of defence
in depth:

1. JSON-mode generation. The per-group and aggregate summary system
   prompts now require the model to emit a single object
   {"read": "..."}; response_format={"type":"json_object"} is passed
   through to the provider so the API enforces well-formed JSON. Prose
   outside the field is physically impossible. The "read" field is the
   only schema slot, so the model has nowhere to spill scratchpad
   into the envelope.

2. Reviewer agent. services/output_review.review_read() makes a second
   small LLM call that judges whether the candidate "read" string is
   publishable. It catches the residual failure mode — scratchpad
   INSIDE the field ("Let's see…", multi-question parentheticals,
   meta-commentary) — and returns a JSON verdict {"clean": bool,
   "reason": str}. Any failure (provider error, parse error, missing
   field) returns clean=false (fail-safe). Cost ~$0.0001/check; latency
   ~1-2 s in the hourly job, no user-facing latency.

The old regex scaffolding (_LEAK_PATTERNS, clean_summary,
looks_like_leakage, _TRAILING_QUOTE) is deleted entirely. It produced
false positives (chopped legitimate "The indicators are…" leaders) and
false negatives (never matched the chain-of-thought patterns the model
actually emits). The reviewer agent is strictly better on both.

On reviewer/parse rejection: don't persist a new IndicatorSummary; the
API's existing fallback to the previous good row continues to serve
the panel. Failures are logged as ind_summary.json_invalid /
ind_summary.reviewer_rejected so we can measure the rejection rate.

Reviewer cost is added to the row's recorded cost_usd so the monthly
budget cap covers the full pipeline.

Adds tests/test_output_review.py: 11 cases covering _extract_read
(JSON envelope handling — invalid JSON, missing field, wrong types,
empty values, non-object envelope) and review_read (clean / unclean
verdicts plus three fail-safe paths: non-JSON reviewer output, a verdict
missing the "clean" field, and an empty candidate that skips the LLM call).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-29 13:10:52 +02:00

146 lines
5.5 KiB
Python

"""Tests for the JSON-envelope extractor and the reviewer agent.
The two together replaced the regex `clean_summary` + `looks_like_leakage`
scaffolding that used to live in indicator_summary_job. The extractor is
pure-function so it's covered exhaustively; the reviewer makes an LLM
call and is exercised via the httpx MockTransport that the other
openrouter tests use."""
from __future__ import annotations
import httpx
import pytest
from app.jobs.indicator_summary_job import _extract_read
from app.services import openrouter as ot
from app.services.output_review import review_read
# ---------------------------------------------------------------------------
# _extract_read — JSON envelope handling
# ---------------------------------------------------------------------------
def test_extract_read_returns_trimmed_field():
    """A well-formed envelope yields the ``read`` value with outer whitespace removed."""
    envelope = '{"read": " The market is pricing growth. "}'
    expected = "The market is pricing growth."
    assert _extract_read(envelope) == expected
def test_extract_read_returns_none_on_invalid_json():
    """Text that is not parseable JSON (including the empty string) gives ``None``."""
    for unparseable in ("not json", "{bad}", ""):
        assert _extract_read(unparseable) is None
def test_extract_read_returns_none_when_field_missing():
    """A JSON object without a ``read`` key gives ``None``."""
    envelope_without_read = '{"other": "x"}'
    assert _extract_read(envelope_without_read) is None
def test_extract_read_returns_none_when_field_not_string():
    """A ``read`` value that is a number, null, or a list gives ``None``."""
    wrong_typed_envelopes = (
        '{"read": 42}',
        '{"read": null}',
        '{"read": ["a","b"]}',
    )
    for envelope in wrong_typed_envelopes:
        assert _extract_read(envelope) is None
def test_extract_read_returns_none_when_field_empty():
    """An empty or whitespace-only ``read`` string gives ``None``."""
    for blank_envelope in ('{"read": ""}', '{"read": " "}'):
        assert _extract_read(blank_envelope) is None
def test_extract_read_returns_none_when_envelope_not_object():
    """Valid JSON whose top level is not an object gives ``None``."""
    # Both inputs parse as JSON; they are rejected only because the
    # top-level value is a string / array rather than an object.
    bare_string = '"just a string"'
    bare_array = '["a", "b"]'
    assert _extract_read(bare_string) is None
    assert _extract_read(bare_array) is None
# ---------------------------------------------------------------------------
# review_read — judges candidate read via a second LLM call
# ---------------------------------------------------------------------------
def _mock_post(handler):
    """Return an ``httpx.MockTransport`` that routes every request to *handler*."""
    transport = httpx.MockTransport(handler)
    return transport
def _configure(monkeypatch):
    """Minimal env so call_llm believes a provider is configured.

    Replaces ``ot.get_settings`` with a factory returning a throwaway
    settings object that carries only the attributes listed below.
    """
    fake_values = {
        "LLM_PROVIDER": "deepseek",
        "LLM_FALLBACK": "",
        "DEEPSEEK_API_KEY": "sk-d",
        "OPENROUTER_API_KEY": "",
        "DEEPSEEK_URL": "https://x/deepseek",
        "DEEPSEEK_MODEL": "deepseek-v4-flash",
        "OPENROUTER_URL": "https://x/or",
        "OPENROUTER_MODEL": "deepseek/deepseek-v4-flash",
    }

    def _fake_get_settings():
        # A fresh anonymous class + instance per call, exposing the values
        # as class attributes.
        return type("S", (), dict(fake_values))()

    monkeypatch.setattr(ot, "get_settings", _fake_get_settings)
@pytest.mark.asyncio
async def test_review_clean_verdict(monkeypatch):
    """A ``clean: true`` reviewer reply is surfaced together with the reported cost."""
    _configure(monkeypatch)

    reviewer_payload = {
        "choices": [
            {
                "message": {"content": '{"clean": true, "reason": "ok"}'},
                "finish_reason": "stop",
            }
        ],
        "usage": {"prompt_tokens": 50, "completion_tokens": 12, "cost": 0.00007},
    }

    def respond(_request):
        return httpx.Response(200, json=reviewer_payload)

    async with httpx.AsyncClient(transport=_mock_post(respond)) as client:
        verdict = await review_read(client, "Markets are pricing tighter policy.")

    assert verdict.clean is True
    assert verdict.cost_usd == 0.00007
@pytest.mark.asyncio
async def test_review_unclean_verdict(monkeypatch):
    """A ``clean: false`` reviewer reply is surfaced with the reviewer's reason."""
    _configure(monkeypatch)

    reviewer_payload = {
        "choices": [
            {
                "message": {
                    "content": '{"clean": false, "reason": "chain of thought"}',
                },
                "finish_reason": "stop",
            }
        ],
        "usage": {"prompt_tokens": 50, "completion_tokens": 14, "cost": 0.00009},
    }

    def respond(_request):
        return httpx.Response(200, json=reviewer_payload)

    async with httpx.AsyncClient(transport=_mock_post(respond)) as client:
        verdict = await review_read(client, "Let's see, is it X? Actually Y?")

    assert verdict.clean is False
    assert "chain of thought" in verdict.reason
@pytest.mark.asyncio
async def test_review_failsafe_on_malformed_json(monkeypatch):
    """Reviewer returned prose instead of JSON → conservative reject."""
    _configure(monkeypatch)

    reviewer_payload = {
        "choices": [
            {
                "message": {"content": "yes it looks clean"},
                "finish_reason": "stop",
            }
        ],
        "usage": {"prompt_tokens": 50, "completion_tokens": 6},
    }

    def respond(_request):
        return httpx.Response(200, json=reviewer_payload)

    async with httpx.AsyncClient(transport=_mock_post(respond)) as client:
        verdict = await review_read(client, "Some candidate.")

    assert verdict.clean is False
    assert "non-JSON" in verdict.reason
@pytest.mark.asyncio
async def test_review_failsafe_on_missing_clean_field(monkeypatch):
    """A JSON verdict lacking the ``clean`` key is treated as a rejection."""
    _configure(monkeypatch)

    reviewer_payload = {
        "choices": [
            {
                "message": {"content": '{"reason": "no field"}'},
                "finish_reason": "stop",
            }
        ],
        "usage": {"prompt_tokens": 50, "completion_tokens": 6},
    }

    def respond(_request):
        return httpx.Response(200, json=reviewer_payload)

    async with httpx.AsyncClient(transport=_mock_post(respond)) as client:
        verdict = await review_read(client, "Some candidate.")

    assert verdict.clean is False
@pytest.mark.asyncio
async def test_review_failsafe_on_empty_candidate(monkeypatch):
    """No LLM call should fire if the candidate is empty."""
    _configure(monkeypatch)
    requests_seen = []

    def respond(_request):
        # Record the hit; the test asserts this list stays empty.
        requests_seen.append(1)
        return httpx.Response(500, json={"error": "should not be called"})

    async with httpx.AsyncClient(transport=_mock_post(respond)) as client:
        verdict = await review_read(client, " ")

    assert verdict.clean is False
    assert requests_seen == []