read.markets/app/services/news.py
Giorgio Gilestro a10409c02b initial commit — cassandra v0.1
Containerised macro-strategy dashboard: 4-panel web UI (indicators,
portfolio, flash news, AI strategic log), MariaDB store, hourly
ingestion jobs, OpenRouter-backed AI analysis.

Ports the four prototype scripts in the parent dir (market_pulse,
flash_news, trading212, strategic_log) into async services backed by a
persistent DB and served via FastAPI + Jinja2 + HTMX. APScheduler runs
as a separate compose service for crash-safety and easier restarts.

Portfolio composition + position names come live from Trading 212;
news per-ticker headlines reuse those names. Tone (NOVICE/INTERMEDIATE/
PRO) and analysis style (DRY/SPECULATIVE) are env-configurable and
stored on each log row so historical entries show what produced them.

Default model is deepseek/deepseek-v4-flash (overridable via env).
Light/dark theme toggle, sans-serif for prose surfaces, monospace for
data. Bearer-token auth, OpenRouter monthly cost cap, RSS feeds auto-
disabled on consecutive failures.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-15 21:56:10 +01:00

167 lines
5.7 KiB
Python

"""RSS feed aggregator + Yahoo per-ticker news.
Ported from /home/gg/ownCloud/Family/Finances/Wealth/flash_news.py — same
parsing, dedupe, and ticker-name resolution logic, async HTTP via httpx.
"""
from __future__ import annotations
import hashlib
import re
from dataclasses import dataclass
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from xml.etree import ElementTree as ET
import httpx
# Header dict passed as ``headers=`` on the HTTP requests made in this module.
UA = {
    "User-Agent": "Mozilla/5.0 (cassandra) Python/httpx",
}

# ElementTree spells qualified tags as "{namespace-uri}localname"; these
# prefixes are glued onto tag names when searching Atom / Dublin Core nodes.
ATOM_NS = "{http://www.w3.org/2005/Atom}"
DC_NS = "{http://purl.org/dc/elements/1.1/}"

# Yahoo Finance endpoints: headline search, and the chart endpoint whose
# ``meta`` object is read for the instrument's display name.
YAHOO_NEWS = "https://query1.finance.yahoo.com/v1/finance/search"
YAHOO_CHART = "https://query1.finance.yahoo.com/v8/finance/chart/{symbol}"

# Words dropped from an instrument name before its remaining words are used
# to filter headlines (compared against lower-cased name words).
_NAME_STOPWORDS = {
    "plc",
    "corp",
    "inc",
    "ltd",
    "fund",
    "etf",
    "ucits",
    "class",
    "shares",
    "trust",
    "the",
    "and",
    "of",
}
@dataclass
class Headline:
    """One news item, as produced by the feed and Yahoo parsers."""

    # Publication time; tz-aware UTC.
    when: datetime
    # Feed name the item came from.
    source: str
    category: str
    title: str
    url: str

    @property
    def fingerprint(self) -> str:
        """Hex sha1 of the normalised title — used as DB UNIQUE.

        Normalising means lower-casing and collapsing every whitespace run
        to a single space, so titles differing only in case or spacing
        share a fingerprint.
        """
        words = self.title.lower().split()
        digest = hashlib.sha1()
        digest.update(" ".join(words).encode("utf-8"))
        return digest.hexdigest()
def _parse_date(s: str | None) -> datetime | None:
    """Parse a feed timestamp into a tz-aware UTC datetime.

    Two formats are tried in order: RFC 2822 (the RSS ``pubDate`` style) and
    ISO 8601 with an optional trailing ``Z``.

    A timestamp that carries no usable offset (no zone at all, or the
    RFC 2822 ``-0000`` marker) is taken to be UTC. Calling ``astimezone`` on
    the naive value directly would interpret it in the host's local zone and
    make the result depend on where the process runs.

    Returns ``None`` when *s* is empty, matches neither format, or cannot be
    represented in UTC (e.g. year 1 with a positive offset).
    """
    if not s:
        return None
    try:
        parsed = parsedate_to_datetime(s)
    except (TypeError, ValueError):
        # Not RFC 2822 — fall back to ISO 8601.
        try:
            parsed = datetime.fromisoformat(s.replace("Z", "+00:00"))
        except ValueError:
            return None
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    try:
        return parsed.astimezone(timezone.utc)
    except (OverflowError, ValueError):
        # Shifting a boundary date to UTC can leave the supported range.
        return None
def _atom_entry_link(entry: ET.Element) -> str:
    """Return the article URL of an Atom entry, or "" when it has none."""
    fallback = ""
    for link_el in entry.findall(f"{ATOM_NS}link"):
        href = (link_el.get("href") or "").strip()
        if not href:
            continue
        # RFC 4287: a link without ``rel`` is treated as rel="alternate",
        # i.e. the entry's own page. Other rels (self, replies, enclosure,
        # edit, ...) point elsewhere and may be listed first.
        if link_el.get("rel", "alternate") == "alternate":
            return href
        # No alternate link seen yet: remember the first usable href.
        fallback = fallback or href
    return fallback


def parse_feed(name: str, category: str, xml_bytes: bytes) -> list[Headline]:
    """Parse an RSS or Atom document into headlines.

    Args:
        name: Feed name, stored as each headline's ``source``.
        category: Stored as each headline's ``category``.
        xml_bytes: Raw feed body.

    Returns:
        One ``Headline`` per item that has both a title and a link. The list
        is empty when the body is not well-formed XML or holds no items.
        The document is read as RSS if it contains any un-namespaced
        ``<item>``; otherwise Atom ``<entry>`` elements are read. Items
        without a parseable date are stamped with the current UTC time.
    """
    # NOTE(review): feeds are remote input and the stdlib ElementTree parser
    # is not hardened against maliciously constructed XML.
    try:
        root = ET.fromstring(xml_bytes)
    except ET.ParseError:
        return []

    out: list[Headline] = []
    rss_items = root.findall(".//item")
    if rss_items:
        for it in rss_items:
            title = (it.findtext("title") or "").strip()
            link = (it.findtext("link") or "").strip()
            if not (title and link):
                continue
            # Some RSS feeds carry the date in Dublin Core <dc:date>.
            pub = it.findtext("pubDate") or it.findtext(f"{DC_NS}date")
            when = _parse_date(pub) or datetime.now(timezone.utc)
            out.append(Headline(when, name, category, title, link))
        return out

    for entry in root.findall(f".//{ATOM_NS}entry"):
        title = (entry.findtext(f"{ATOM_NS}title") or "").strip()
        link = _atom_entry_link(entry)
        if not (title and link):
            continue
        pub = (
            entry.findtext(f"{ATOM_NS}published")
            or entry.findtext(f"{ATOM_NS}updated")
        )
        when = _parse_date(pub) or datetime.now(timezone.utc)
        out.append(Headline(when, name, category, title, link))
    return out
async def fetch_feed(
    client: httpx.AsyncClient, name: str, category: str, url: str
) -> list[Headline]:
    """Download one feed and return its parsed headlines.

    Failures are NOT swallowed here. Whatever ``client.get`` raises, and the
    ``httpx.HTTPStatusError`` raised by ``raise_for_status`` for an error
    status, propagates to the caller, which is expected to log it.

    Returns:
        The headlines parsed from the response body. A body that is not
        well-formed XML yields an empty list rather than an exception,
        because ``parse_feed`` returns ``[]`` in that case.
    """
    r = await client.get(url, headers=UA, timeout=12)
    r.raise_for_status()
    return parse_feed(name, category, r.content)
async def _resolve_ticker_name(client: httpx.AsyncClient, ticker: str) -> str:
    """Best-effort lookup of a ticker's company name via the Yahoo chart API.

    The name is used as the news search query so results mention the company
    itself instead of merely matching the literal ticker string.

    Returns ``longName`` if present, else ``shortName``, else *ticker*. Any
    failure (network, HTTP status, unexpected payload shape) also returns
    *ticker* unchanged; nothing is raised.
    """
    try:
        chart_url = YAHOO_CHART.format(symbol=ticker)
        resp = await client.get(
            chart_url,
            params={"interval": "1d", "range": "5d"},
            headers=UA,
            timeout=8,
        )
        resp.raise_for_status()
        payload = resp.json()
        meta = payload["chart"]["result"][0]["meta"]
        for key in ("longName", "shortName"):
            label = meta.get(key)
            if label:
                return label
        return ticker
    except Exception:
        return ticker
def _name_tokens(query: str) -> list[str]:
    """Split an instrument name into the lower-cased words used for filtering."""
    trim = ",;:!?()[]{}\"'"
    tokens: list[str] = []
    for raw in re.split(r"[\s.]+", query):
        # Trim adjacent punctuation: "Tesla, Inc." must give "tesla", not
        # "tesla," (which no headline contains), and "(The)" must still be
        # recognised as a stopword.
        word = raw.strip(trim).lower()
        if len(word) >= 3 and word not in _NAME_STOPWORDS:
            tokens.append(word)
    return tokens


async def fetch_yahoo_news(
    client: httpx.AsyncClient,
    ticker: str,
    count: int = 10,
    query_override: str | None = None,
) -> list[Headline]:
    """Filtered Yahoo per-ticker headlines.

    The search query is the instrument's name, and a headline is kept only if
    its title contains at least one significant word of that name. Niche
    UCITS ETFs therefore return empty rather than the generic firehose. If
    the name yields no significant words at all, the guard is skipped.

    Args:
        client: Shared async HTTP client.
        ticker: Symbol; used in the headline source label ``Yahoo:<ticker>``
            and, when no override is given, to look the name up.
        count: Sent to Yahoo as ``newsCount``.
        query_override: A name already known (e.g. fetched from Trading 212
            instruments); when provided, the Yahoo chart-meta round-trip is
            skipped.

    Returns:
        Matching headlines with category ``"ticker"``. Any request or payload
        failure yields an empty list; nothing is raised from the search.
    """
    query = query_override or await _resolve_ticker_name(client, ticker)
    tokens = _name_tokens(query)
    try:
        r = await client.get(
            YAHOO_NEWS,
            params={"q": query, "newsCount": count, "quotesCount": 0},
            headers=UA, timeout=10,
        )
        r.raise_for_status()
        items = r.json().get("news") or []
        out: list[Headline] = []
        for it in items:
            title = (it.get("title") or "").strip()
            link = (it.get("link") or "").strip()
            if not (title and link):
                continue
            lowered = title.lower()
            if tokens and not any(t in lowered for t in tokens):
                continue
            ts = it.get("providerPublishTime")
            try:
                when = (
                    datetime.fromtimestamp(ts, timezone.utc) if ts
                    else datetime.now(timezone.utc)
                )
            except (TypeError, ValueError, OverflowError, OSError):
                # One malformed timestamp should not discard the whole
                # batch; stamp the item with the fetch time instead.
                when = datetime.now(timezone.utc)
            out.append(Headline(when, f"Yahoo:{ticker}", "ticker", title, link))
        return out
    except Exception:
        # Deliberate best-effort: per-ticker news is optional, so any
        # request or payload problem degrades to "no headlines".
        return []
def dedupe(headlines: list[Headline]) -> list[Headline]:
    """Drop repeated headlines, keeping first occurrences in input order.

    A headline is a repeat when its URL, or failing that its normalised-title
    fingerprint, matches one already kept — same logic as the prototype.
    Only kept headlines are recorded, so a dropped item's other key does not
    block later items.
    """
    kept: list[Headline] = []
    urls: set[str] = set()
    prints: set[str] = set()
    for item in headlines:
        if item.url in urls:
            continue
        fp = item.fingerprint
        if fp in prints:
            continue
        urls.add(item.url)
        prints.add(fp)
        kept.append(item)
    return kept
def matches_any(text: str, keywords: list[str]) -> bool:
    """Return True if any keyword occurs as a substring of lower-cased *text*.

    Keywords are compared as given (not lower-cased here), so callers should
    pass them already in lower case. An empty keyword list gives False.
    """
    haystack = text.lower()
    for needle in keywords:
        if needle in haystack:
            return True
    return False