diff --git a/app/jobs/indicator_summary_job.py b/app/jobs/indicator_summary_job.py index 147aa27..efefd20 100644 --- a/app/jobs/indicator_summary_job.py +++ b/app/jobs/indicator_summary_job.py @@ -44,15 +44,63 @@ _LEAK_PATTERNS = [ r"^based on[^.]*\.\s*", r"^to (?:address|answer|write|summarise|summarize)[^.]*\.\s*", r"^first[,]?\s[^.]*\.\s*", - r"^the (?:user|data shows|reader|task|request)[^.]*\.\s*", + r"^the (?:user|data shows|reader|task|request|reader sees|instructions?)[^.]*\.\s*", r"^summary[:.]\s*", + r"^key\s*[:\-—]\s*", + r"^must\s+(?:be|cite|explain|avoid|give|stay|provide)[^.]*\.\s*", + r"^should\s+(?:be|give|cite|explain|avoid|provide)[^.]*\.\s*", + r"^avoid[^.]*\.\s*", + r"^cite\s+at\s+most[^.]*\.\s*", + r"^be\s+(?:speculative|specific|concise|brief)[^.]*\.\s*", + r"^stay\s+on[^.]*\.\s*", r"^okay[,]?\s+", r"^alright[,]?\s+", r"^thinking[^.]*\.\s*", + # Prompt-leak prefixes — the model echoes example framing or rule + # headers from the system prompt. + r"^(?:good|bad|positive|negative)\s+example\s*[:\-—]\s*", + r"^example\s+(?:good|bad)\s*[:\-—]\s*", + r"^example\s*[:\-—]\s*", + r"^reference\s+style\s*[:\-—]\s*", + # Prompt label echoes (markdown-style or plain-text) + r"^(?:hard\s+)?constraints?\s*[:\-—][^.\n]*[.\n]\s*", + r"^key\s+observations?\s*[:\-—]\s*", + r"^observations?\s*[:\-—]\s*", + r"^focus\s+on[^.]*\.\s*", + r"^output\s+the\s+read[^.]*\.\s*", + r"^plain\s+prose[^.]*\.\s*", + r"^the\s+indicators?[^.]*\.\s*", # "The indicators include..." / "The indicators are..." + r"^indicators?\s*[:\-—]\s*", + r"^data\s*[:\-—]\s*", + r"^analysis\s*[:\-—]\s*", + r"^interpretation\s*[:\-—]\s*", + r"^read\s*[:\-—]\s*", + r"^note\s*[:\-—]\s*", + # Sometimes the response gets wrapped in literal quotes + r"^[\"“'`]+", ) ] +_TRAILING_QUOTE = re.compile(r"[\"”'`]+\s*$") + +# Tell-tale phrases that mean the model regurgitated the prompt as its +# "answer" — we'd rather show nothing than show this. +_LEAKAGE_FLAGS = ( + "≤60 words", "60 words", "must be under", "must cite", "must explain", + "no meta-commentary", "no buy/sell", "horizon. ", "1-day moves", + "the instructions are", "instructions:", "constraints:", "hard constraints", + "good example", "bad example", "reference style", +) + + +def looks_like_leakage(text: str) -> bool: + """Heuristic: after cleaning, if these phrases still appear, the output + is contaminated prompt-regurgitation and shouldn't be shown.""" + low = text.lower() + return any(flag in low for flag in _LEAKAGE_FLAGS) + + def clean_summary(text: str) -> str: """Strip leading meta-commentary. If cleaning removes nearly everything (suggesting the model emitted reasoning then ran out of tokens), fall @@ -60,7 +108,9 @@ def clean_summary(text: str) -> str: where the actual answer ended up.""" raw = text.strip() out = raw - for _ in range(2): + # Up to 6 passes: handles compound leakage like + # "Constraints: <...>. The indicators are: <...>. " + for _ in range(6): before = out for pat in _LEAK_PATTERNS: out = pat.sub("", out, count=1).lstrip() @@ -71,6 +121,15 @@ def clean_summary(text: str) -> str: paragraphs = [p.strip() for p in re.split(r"\n\s*\n", raw) if p.strip()] if paragraphs: out = paragraphs[-1] + # Re-strip leaders from the recovered paragraph too. + for _ in range(2): + before = out + for pat in _LEAK_PATTERNS: + out = pat.sub("", out, count=1).lstrip() + if out == before: + break + # Trim any orphan closing quote/backtick from the wrap-strip above. + out = _TRAILING_QUOTE.sub("", out).rstrip() return out @@ -128,6 +187,22 @@ async def _generate_one( log.warning("ind_summary.failed", group=group, error=str(e)[:120]) return False + cleaned = clean_summary(result.content) + if looks_like_leakage(cleaned) or len(cleaned) < 40: + # Model regurgitated the prompt or produced nothing usable. + # Don't persist — keep the last good summary visible. Log it so + # we can see the rate of failures over time. + log.warning("ind_summary.leakage_detected", + group=group, preview=cleaned[:120]) + session.add(AICall( + model=result.model, + prompt_tokens=result.prompt_tokens, + completion_tokens=result.completion_tokens, + cost_usd=result.cost_usd, + status="leaked", + )) + return False + session.add(IndicatorSummary( group_name=group, generated_at=utcnow(), @@ -135,7 +210,7 @@ async def _generate_one( tone=tone, analysis=analysis, prompt_version=PROMPT_VERSION, - content=clean_summary(result.content), + content=cleaned, prompt_tokens=result.prompt_tokens, completion_tokens=result.completion_tokens, cost_usd=result.cost_usd, diff --git a/app/services/openrouter.py b/app/services/openrouter.py index cff414a..7ddf2fd 100644 --- a/app/services/openrouter.py +++ b/app/services/openrouter.py @@ -199,20 +199,12 @@ interpretation. Don't list moves; explain them. - Multi-week / multi-month horizon. 1-day moves under 2% are noise — skip. - No buy/sell language. No predictions. No watch list. No TL;DR. No date \ header. No "system temperature" line — that belongs to the full daily log. +- Output the read directly. Do NOT include phrases like "Example", "Good \ +example", "Bad example", "Reference", or any meta-framing of your output. {tone_block} {analysis_block} - -# Bad example — describes what happened -"S&P +5.2% 1m and Nasdaq +8.8% 1m diverge from FTSE -3.4% and Euro Stoxx \ --2.6%. The US-vs-rest gap is widening." - -# Good example — interprets what it means -"The US-vs-rest equity gap is funded by AI-capex concentration in 7 names; \ -the breadth-weighted RSP barely keeps pace with SPY, which is the classic \ -late-cycle marker — narrow leadership, not broad recovery. The 5% 1m gap \ -between Nasdaq and FTSE is a narrative trade, not a fundamental one." """ @@ -255,21 +247,12 @@ risk premium is in commodities but not vol". Cite no more than 3 specific \ numbers, and only as anchors for the interpretation. - Multi-week / multi-month horizon. 1-day moves under 2% are noise. - No buy/sell language. No predictions of specific levels. +- Output the read directly. Do NOT include phrases like "Example", "Good \ +example", "Bad example", "Reference", or any meta-framing of your output. {tone_block} {analysis_block} - -# Bad example — describes -"Equities are up, real yields are higher, HY OAS is tight, breadth is \ -narrowing." - -# Good example — interprets -"The tape is paying a rising real discount rate (US 10y real +15bp 1m) with \ -conviction for AI growth, but credit refuses to confirm and breadth is \ -narrowing — that combination is what late-cycle looks like, not pre-crash. \ -The risk is not the level but the convergence: if any one of credit, \ -breadth, or vol turns, the others will follow fast." """