news: auto-tag headlines + market-aware cadence + filter UI
- Move news_job from hourly to 3x/hour (cron 10,30,50), with a CadencePolicy gate that throttles to active hours (07-21 UTC weekdays at 20 min), off-hours (3 h), weekends (6 h). Keeps the daytime feed fresh without spamming RSS sources overnight. - Tag each headline on ingestion via DeepSeek (BATCH_SIZE=25, max_tokens=4000, json.JSONDecoder().raw_decode + per-row regex recovery for resilient parsing). Vocabulary: 16 tags including new EU / USA / AI / Conflict. NULL tags are picked up automatically on the next news_job run, so back-tagging is implicit rather than a separate migration step. - Tag UI: pill bar above the feed with off → include → exclude cycle on click; shift-click jumps straight to exclude. State persists in localStorage and is injected into /api/news requests via htmx:configRequest. Per-row chips sit to the right of the headline (new 5-column grid: age | source | title | tags | UTC) so vertical density stays high. - Strategic log header bug: model was hallucinating "(Updated 21:30 UTC)" in future tense. Bumped PROMPT_VERSION 6→7, added explicit ban on time-of-day clauses, and supply the actual current UTC time in the user prompt so the model has no need to invent one. Migration 0012 adds headlines.tags (JSON, nullable). Tests cover vocabulary integrity, validation/normalisation, and the JSON-recovery parser (17 tests).
This commit is contained in:
parent
6e7f57c6b2
commit
2013bfa8cc
15 changed files with 745 additions and 25 deletions
|
|
@ -29,6 +29,11 @@ class CadencePolicy:
|
|||
(7, 21), # EU/US (LSE open through NYSE close)
|
||||
# (0, 8), # Asia (Tokyo + HK/Shanghai) — uncomment to add
|
||||
)
|
||||
# Minimum gap between successful runs DURING the active window. The
|
||||
# cron may fire more frequently than this — we just skip until enough
|
||||
# time has passed since the last success. Default 0 means "run on
|
||||
# every cron fire" (the original AI-job behaviour).
|
||||
active_gap_h: float = 0.0
|
||||
# Minimum gap between successful runs outside the active window.
|
||||
off_hours_gap_h: float = 4.0
|
||||
weekend_gap_h: float = 12.0
|
||||
|
|
@ -44,7 +49,7 @@ class CadencePolicy:
|
|||
if now.weekday() >= 5:
|
||||
return self.weekend_gap_h
|
||||
if self.is_active_window(now):
|
||||
return 0.0 # always run during the active window
|
||||
return self.active_gap_h
|
||||
return self.off_hours_gap_h
|
||||
|
||||
def should_run(
|
||||
|
|
@ -55,8 +60,6 @@ class CadencePolicy:
|
|||
"""Returns (should_run, reason). The reason is human-readable for logs
|
||||
and the job_runs.error column when a run is skipped."""
|
||||
now = now or datetime.now(timezone.utc)
|
||||
if self.is_active_window(now):
|
||||
return True, "active window"
|
||||
min_gap = self.min_gap_hours(now)
|
||||
if last_success_at is None:
|
||||
return True, "no prior successful run"
|
||||
|
|
@ -64,9 +67,27 @@ class CadencePolicy:
|
|||
if last_success_at.tzinfo is None:
|
||||
last_success_at = last_success_at.replace(tzinfo=timezone.utc)
|
||||
age_h = (now - last_success_at).total_seconds() / 3600.0
|
||||
if min_gap <= 0 and self.is_active_window(now):
|
||||
return True, "active window"
|
||||
if age_h >= min_gap:
|
||||
return True, f"off-hours but last run {age_h:.1f}h ago (≥ {min_gap}h)"
|
||||
return False, f"off-hours throttled — last run {age_h:.1f}h ago (< {min_gap}h)"
|
||||
band = "active" if self.is_active_window(now) else (
|
||||
"weekend" if now.weekday() >= 5 else "off-hours"
|
||||
)
|
||||
return True, f"{band}: last run {age_h:.2f}h ago (≥ {min_gap:.2f}h)"
|
||||
band = "active" if self.is_active_window(now) else (
|
||||
"weekend" if now.weekday() >= 5 else "off-hours"
|
||||
)
|
||||
return False, f"{band} throttled — last run {age_h:.2f}h ago (< {min_gap:.2f}h)"
|
||||
|
||||
|
||||
# AI jobs: run hot during the active window, throttle off-hours.
|
||||
DEFAULT_POLICY = CadencePolicy()
|
||||
|
||||
# News + tagging: target 3 runs/hour during the active window, every 3h
# off-hours, every 6h on weekends. Cron fires every 20 min; the policy gates
# whether each fire actually does work.
#
# Each gap is set a little BELOW its nominal cadence. should_run() compares
# the age of the last success against the gap with `>=`; if that timestamp is
# stamped even slightly after the cron fire that produced it (scheduler
# latency, or run duration when completion time is stored), the next
# on-schedule fire measures just under a nominal gap and is throttled — a
# 20-minute gap behind a 20-minute cron would then run only every 40 minutes.
# NOTE(review): confirm how last_success_at is recorded; the slack is harmless
# either way because cron fires are 20 minutes apart.
_NEWS_CRON_SLACK_H = 10.0 / 60.0  # half of the 20-minute cron interval

NEWS_POLICY = CadencePolicy(
    active_gap_h=1.0 / 3.0 - _NEWS_CRON_SLACK_H,  # nominal 20 minutes
    off_hours_gap_h=3.0 - _NEWS_CRON_SLACK_H,  # nominal 3 hours
    weekend_gap_h=6.0 - _NEWS_CRON_SLACK_H,  # nominal 6 hours
)
|
||||
|
|
|
|||
290
app/services/news_tagging.py
Normal file
290
app/services/news_tagging.py
Normal file
|
|
@ -0,0 +1,290 @@
|
|||
"""AI-driven content tagging for headlines.
|
||||
|
||||
Each headline gets 1-3 tags from a fixed vocabulary (markets, geopolitics,
|
||||
tech, etc.). Tagging happens at ingest time inside `news_job` — only
|
||||
rows whose `tags` column is still NULL are processed, so re-runs are
|
||||
idempotent and recover from prior failures naturally.
|
||||
|
||||
Implementation notes:
|
||||
|
||||
- Titles only (not body) — they're informative enough and keep the
|
||||
prompt + cost small.
|
||||
- Batched: up to 25 titles (BATCH_SIZE) per LLM call. Returns JSON with one entry per
|
||||
input id. Unknown / hallucinated tags are dropped against the
|
||||
vocabulary; an empty tag list falls back to ["other"] so we can tell
|
||||
"tagged but bland" from "not yet tagged" (NULL).
|
||||
- Uses the existing call_llm dispatcher → DeepSeek-direct primary,
|
||||
OpenRouter fallback, per Phase G provider config.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
|
||||
import httpx
|
||||
|
||||
from app.logging import get_logger
|
||||
from app.services.openrouter import call_llm
|
||||
|
||||
|
||||
log = get_logger("news_tagging")
|
||||
|
||||
|
||||
# Frozen vocabulary. Keep ASCII-lowercase, hyphenated. If you add or
|
||||
# remove a tag, also update the system prompt below and the test fixture.
|
||||
TAG_VOCABULARY: tuple[str, ...] = (
|
||||
"markets",
|
||||
"monetary-policy",
|
||||
"economy",
|
||||
"geopolitics",
|
||||
"conflict", # wars, military actions, armed escalation
|
||||
"energy",
|
||||
"commodities",
|
||||
"tech",
|
||||
"ai", # AI-specific: model releases, capex, regulation
|
||||
"crypto",
|
||||
"corporate",
|
||||
"regulation",
|
||||
# Geographic emphasis tags — overlap freely with thematic ones.
|
||||
"usa",
|
||||
"eu",
|
||||
"china",
|
||||
"other",
|
||||
)
|
||||
|
||||
# Display labels for the toggle UI (Title Case + readable). Keys must
|
||||
# match TAG_VOCABULARY exactly.
|
||||
TAG_LABELS: dict[str, str] = {
|
||||
"markets": "Markets",
|
||||
"monetary-policy": "Monetary policy",
|
||||
"economy": "Economy",
|
||||
"geopolitics": "Geopolitics",
|
||||
"conflict": "Conflict",
|
||||
"energy": "Energy",
|
||||
"commodities": "Commodities",
|
||||
"tech": "Tech",
|
||||
"ai": "AI",
|
||||
"crypto": "Crypto",
|
||||
"corporate": "Corporate",
|
||||
"regulation": "Regulation",
|
||||
"usa": "USA",
|
||||
"eu": "EU",
|
||||
"china": "China",
|
||||
"other": "Other",
|
||||
}
|
||||
|
||||
_VOCAB_SET = frozenset(TAG_VOCABULARY)
|
||||
|
||||
# Batch size for one LLM call. Small enough that one batch of output
|
||||
# (25 items × ~30 tokens each = ~750 tokens) fits well under any
|
||||
# reasonable max_tokens, AND so a single batch failure only loses a
|
||||
# small number of rows to next-cycle retry.
|
||||
BATCH_SIZE = 25
|
||||
|
||||
# Max tags per headline. Stories often touch multiple themes; we cap at
|
||||
# three so the UI chips don't blow up.
|
||||
MAX_TAGS_PER_HEADLINE = 3
|
||||
|
||||
|
||||
_SYSTEM_PROMPT = """\
|
||||
You tag financial / business news headlines with ONE to THREE content tags \
|
||||
from a fixed vocabulary. You receive a JSON array of headlines, each with \
|
||||
an `id` and a `title`. Return a JSON array of objects: `{"id": ..., \
|
||||
"tags": ["...", "..."]}`. Output nothing else — no prose, no markdown, no \
|
||||
preamble. The first character of your response must be `[`.
|
||||
|
||||
# Vocabulary (use ONLY these values, lowercase, hyphens not spaces)
|
||||
## Thematic tags
|
||||
- markets — direct market moves: stocks, bonds, FX, indices
|
||||
- monetary-policy — central banks, rate decisions, QE/QT, Fed/ECB/BOJ
|
||||
- economy — macro data: CPI, GDP, jobs, PMI, retail sales
|
||||
- geopolitics — sanctions, diplomacy, chokepoints, elections, trade
|
||||
- conflict — active wars, military strikes, armed escalation
|
||||
(use ALONGSIDE geopolitics, not instead of)
|
||||
- energy — oil, gas, OPEC, energy transition, utilities
|
||||
- commodities — gold, copper, agri, industrial metals (non-energy)
|
||||
- tech — Big Tech, chips, semiconductors, software, social media
|
||||
- ai — AI-specific: model releases, AI capex, AI regulation
|
||||
(overlap with tech freely)
|
||||
- crypto — bitcoin, ethereum, stablecoins, crypto regulation
|
||||
- corporate — earnings, M&A, layoffs, single-company news without
|
||||
a clear sector fit above
|
||||
- regulation — antitrust, securities regs, EU/SEC rulings, trade rules
|
||||
## Geographic tags (overlap freely with thematic ones)
|
||||
- usa — US-specific news, US policy, US-driven stories
|
||||
- eu — EU / Eurozone / individual EU member states
|
||||
- china — China-specific news
|
||||
## Fallback
|
||||
- other — last resort: entertainment, sport, weather, off-topic
|
||||
|
||||
# Tagging discipline
|
||||
- 1 to 3 tags per headline. Prefer 1-2; use 3 only when the story \
|
||||
genuinely spans multiple themes.
|
||||
- Tags can OVERLAP. "China bans US chips" → ["china", "tech", "geopolitics"].
|
||||
- For armed conflict, combine: "Israel strikes Lebanon" → ["conflict", "geopolitics"].
|
||||
- For AI stories, prefer "ai" over generic "tech" if the headline is AI-centric.
|
||||
- Geographic tags are additive: a US-focused tech story → ["tech", "usa"].
|
||||
- "other" is a last resort. If a headline is entertainment, sport, weather, \
|
||||
or otherwise off-topic for a macro dashboard, tag it "other".
|
||||
- Order tags by relevance: most specific first.
|
||||
"""
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class _ToTag:
|
||||
id: int
|
||||
title: str
|
||||
|
||||
|
||||
def _validate_tags(raw: list) -> list[str]:
|
||||
"""Filter a model-returned tag list down to known vocabulary + cap."""
|
||||
if not isinstance(raw, list):
|
||||
return []
|
||||
cleaned: list[str] = []
|
||||
seen: set[str] = set()
|
||||
for t in raw:
|
||||
if not isinstance(t, str):
|
||||
continue
|
||||
# Normalise: lowercase, replace spaces with hyphens (common drift).
|
||||
norm = t.strip().lower().replace(" ", "-")
|
||||
if norm in _VOCAB_SET and norm not in seen:
|
||||
cleaned.append(norm)
|
||||
seen.add(norm)
|
||||
if len(cleaned) >= MAX_TAGS_PER_HEADLINE:
|
||||
break
|
||||
return cleaned
|
||||
|
||||
|
||||
def _parse_batch_response(content: str, expected_ids: set[int]) -> dict[int, list[str]]:
|
||||
"""Parse the model's JSON output into {id: tags}.
|
||||
|
||||
Robust to leading prose / code fences / trailing notes — uses
|
||||
``json.JSONDecoder.raw_decode`` to parse the first complete JSON
|
||||
value starting at the first ``[``. Anything after that array is
|
||||
ignored. If the first parse fails, we fall back to extracting
|
||||
well-formed ``{"id": ..., "tags": [...]}`` objects via regex so a
|
||||
single corrupt item doesn't lose the whole batch.
|
||||
"""
|
||||
out: dict[int, list[str]] = {}
|
||||
if not content:
|
||||
return out
|
||||
|
||||
# Trim common preambles + code fences.
|
||||
stripped = content.strip()
|
||||
# First-`[` to last-position parse via raw_decode.
|
||||
start = stripped.find("[")
|
||||
if start == -1:
|
||||
log.warning("news_tagging.unparseable", preview=content[:120])
|
||||
return out
|
||||
try:
|
||||
data, _end = json.JSONDecoder().raw_decode(stripped[start:])
|
||||
if isinstance(data, list):
|
||||
for item in data:
|
||||
_absorb(item, expected_ids, out)
|
||||
return out
|
||||
except json.JSONDecodeError:
|
||||
pass # fall through to per-item recovery
|
||||
|
||||
# Recovery path: scrape individual objects. Looks for shapes like
|
||||
# `{"id": 123, "tags": ["a", "b"]}` and tolerates any garbage between.
|
||||
matched = 0
|
||||
for m in re.finditer(
|
||||
r'\{\s*"id"\s*:\s*"?(\d+)"?\s*,\s*"tags"\s*:\s*(\[[^\]]*\])\s*\}',
|
||||
stripped,
|
||||
):
|
||||
try:
|
||||
item = {"id": int(m.group(1)), "tags": json.loads(m.group(2))}
|
||||
except (ValueError, json.JSONDecodeError):
|
||||
continue
|
||||
if _absorb(item, expected_ids, out):
|
||||
matched += 1
|
||||
if not out:
|
||||
log.warning(
|
||||
"news_tagging.json_error_unrecoverable",
|
||||
preview=content[:200],
|
||||
)
|
||||
elif matched < len(expected_ids):
|
||||
log.info(
|
||||
"news_tagging.json_partial_recovery",
|
||||
recovered=matched, expected=len(expected_ids),
|
||||
)
|
||||
return out
|
||||
|
||||
|
||||
def _absorb(item, expected_ids: set[int], out: dict[int, list[str]]) -> bool:
|
||||
"""Place one well-formed item into the output dict if it matches an
|
||||
expected id. Returns True if it landed."""
|
||||
if not isinstance(item, dict):
|
||||
return False
|
||||
try:
|
||||
iid = int(item.get("id"))
|
||||
except (TypeError, ValueError):
|
||||
return False
|
||||
if iid not in expected_ids or iid in out:
|
||||
return False
|
||||
tags = _validate_tags(item.get("tags"))
|
||||
# Empty post-validation = model picked nothing in vocabulary. Fall
|
||||
# back to "other" so the row is marked tagged (distinguishes
|
||||
# "tagged poorly" from "not yet tagged").
|
||||
out[iid] = tags or ["other"]
|
||||
return True
|
||||
|
||||
|
||||
async def tag_batch(
    client: httpx.AsyncClient,
    items: list[_ToTag],
) -> dict[int, list[str]]:
    """Tag a single batch of headlines with one LLM call.

    The batch is serialised as a JSON array of ``{"id", "title"}`` objects
    inside a fenced block and sent with the tagging system prompt. Any
    exception raised by the call is logged and turned into an empty result.

    Args:
        client: HTTP client handed through to ``call_llm``.
        items: Headlines to tag; an empty list short-circuits to ``{}``.

    Returns:
        ``{id: tags}`` for the items that could be parsed out of the reply.
        Items absent from the result stay untagged (NULL in the DB) and are
        retried on the next news_job run.
    """
    if not items:
        return {}

    payload = json.dumps(
        [{"id": entry.id, "title": entry.title} for entry in items],
        ensure_ascii=False,
    )
    user_msg = "# Headlines to tag\n```json\n" + payload + "\n```"
    conversation = [
        {"role": "system", "content": _SYSTEM_PROMPT},
        {"role": "user", "content": user_msg},
    ]
    try:
        # Generous ceiling: ~30 tokens/item × 25 items + reasoning overhead
        # for thinking models. An empty reply is handled by the parser,
        # which returns {} so the rows are retried later.
        result = await call_llm(
            client,
            messages=conversation,
            max_tokens=4000,
        )
    except Exception as exc:
        log.warning("news_tagging.llm_failed", n=len(items), error=str(exc)[:200])
        return {}

    wanted_ids = {entry.id for entry in items}
    return _parse_batch_response(result.content, wanted_ids)
|
||||
|
||||
|
||||
async def tag_titles(items: list[_ToTag]) -> dict[int, list[str]]:
    """Tag every headline in ``items``, ``BATCH_SIZE`` at a time.

    Batches are sent one after another over a single shared HTTP client.
    A batch that fails contributes nothing, so its items stay untagged for
    the next run.

    Args:
        items: Headlines to tag; an empty list returns ``{}`` without
            opening a client.

    Returns:
        ``{id: tags}`` merged across all batches.
    """
    tagged: dict[int, list[str]] = {}
    if not items:
        return tagged

    batches = [
        items[lo:lo + BATCH_SIZE]
        for lo in range(0, len(items), BATCH_SIZE)
    ]
    async with httpx.AsyncClient(follow_redirects=True, timeout=60) as client:
        for batch in batches:
            tagged.update(await tag_batch(client, batch))

    log.info(
        "news_tagging.batch_complete",
        requested=len(items), tagged=len(tagged),
    )
    return tagged
|
||||
|
||||
|
||||
# Public re-export for the news_job hook + callers that want to assemble
|
||||
# their own (id, title) tuples without importing the private dataclass.
|
||||
ToTag = _ToTag
|
||||
|
|
@ -26,7 +26,10 @@ OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
|
|||
# framing aimed at young investors entering the trading world. NOVICE retuned
|
||||
# to be pedagogical (defining terms, anti-pattern teach-backs); INTERMEDIATE
|
||||
# kept terse but with light-touch educational nudges. See tasks/todo.md.
|
||||
PROMPT_VERSION = 6
|
||||
# v7 (2026-05-18): Forbid "(Updated HH:MM UTC)" clauses in the date header —
|
||||
# the model was hallucinating future times. The user prompt now carries the
|
||||
# actual current UTC time so the model has accurate temporal context.
|
||||
PROMPT_VERSION = 7
|
||||
|
||||
|
||||
# --- Core: invariant across tone/analysis settings ----------------------------
|
||||
|
|
@ -49,7 +52,11 @@ cover the same event, read the gap in framing — that's the data.
|
|||
implications is filler.
|
||||
|
||||
# Structure
|
||||
- One-line date header + any anchor framing (e.g. "Week 11 since Hormuz").
|
||||
- One-line date header containing ONLY the date (e.g. `2026-05-18`) and \
|
||||
optional anchor framing on the same line (e.g. "Week 11 since Hormuz"). \
|
||||
**Never include a time-of-day clause like "(Updated 21:30 UTC)"** — \
|
||||
generation time is recorded as metadata elsewhere. Inventing a future or \
|
||||
arbitrary time in the header confuses readers.
|
||||
- Immediately after the date header — with **nothing** in between — write a \
|
||||
TL;DR. Format it as:
|
||||
|
||||
|
|
@ -423,7 +430,12 @@ def build_user_prompt(
|
|||
"""Assemble the user message from already-fetched-and-persisted data.
|
||||
If `previous_log` is a StrategicLog from earlier today, it's included
|
||||
as 'Update mode' context — the model will revise rather than restart."""
|
||||
parts = [f"# Strategic log request — {today.strftime('%Y-%m-%d')}"]
|
||||
parts = [
|
||||
f"# Strategic log request — {today.strftime('%Y-%m-%d')}",
|
||||
# Explicit current time so the model doesn't hallucinate one. The
|
||||
# date header it writes MUST stay date-only (per system prompt).
|
||||
f"Current time: {today.strftime('%Y-%m-%d %H:%M UTC')}",
|
||||
]
|
||||
if anchor:
|
||||
parts.append(f"Anchor reference date: {anchor}")
|
||||
if reference_line:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue