"""RSS feed aggregator + Yahoo per-ticker news. Ported from /home/gg/ownCloud/Family/Finances/Wealth/flash_news.py — same parsing, dedupe, and ticker-name resolution logic, async HTTP via httpx. """ from __future__ import annotations import hashlib import re from dataclasses import dataclass from datetime import datetime, timezone from email.utils import parsedate_to_datetime from xml.etree import ElementTree as ET import httpx UA = {"User-Agent": "Mozilla/5.0 (cassandra) Python/httpx"} ATOM_NS = "{http://www.w3.org/2005/Atom}" DC_NS = "{http://purl.org/dc/elements/1.1/}" YAHOO_NEWS = "https://query1.finance.yahoo.com/v1/finance/search" YAHOO_CHART = "https://query1.finance.yahoo.com/v8/finance/chart/{symbol}" _NAME_STOPWORDS = {"plc", "corp", "inc", "ltd", "fund", "etf", "ucits", "class", "shares", "trust", "the", "and", "of"} @dataclass class Headline: when: datetime # tz-aware UTC source: str category: str title: str url: str @property def fingerprint(self) -> str: """sha1 of normalised title — used as DB UNIQUE.""" norm = " ".join(self.title.lower().split()) return hashlib.sha1(norm.encode("utf-8")).hexdigest() def _parse_date(s: str | None) -> datetime | None: if not s: return None try: return parsedate_to_datetime(s).astimezone(timezone.utc) except (TypeError, ValueError): pass try: return datetime.fromisoformat(s.replace("Z", "+00:00")).astimezone(timezone.utc) except ValueError: return None def parse_feed(name: str, category: str, xml_bytes: bytes) -> list[Headline]: try: root = ET.fromstring(xml_bytes) except ET.ParseError: return [] out: list[Headline] = [] rss_items = root.findall(".//item") if rss_items: for it in rss_items: title = (it.findtext("title") or "").strip() link = (it.findtext("link") or "").strip() pub = it.findtext("pubDate") or it.findtext(f"{DC_NS}date") when = _parse_date(pub) or datetime.now(timezone.utc) if title and link: out.append(Headline(when, name, category, title, link)) else: for entry in root.findall(f".//{ATOM_NS}entry"): title = (entry.findtext(f"{ATOM_NS}title") or "").strip() link_el = entry.find(f"{ATOM_NS}link") link = (link_el.get("href") if link_el is not None else "") or "" pub = entry.findtext(f"{ATOM_NS}published") or entry.findtext(f"{ATOM_NS}updated") when = _parse_date(pub) or datetime.now(timezone.utc) if title and link: out.append(Headline(when, name, category, title, link.strip())) return out async def fetch_feed( client: httpx.AsyncClient, name: str, category: str, url: str ) -> list[Headline]: """Returns headlines on success, empty list on any failure (caller logs).""" r = await client.get(url, headers=UA, timeout=12) r.raise_for_status() return parse_feed(name, category, r.content) async def _resolve_ticker_name(client: httpx.AsyncClient, ticker: str) -> str: """Look up the company longName so news search hits headlines that actually mention the company rather than matching the literal ticker string.""" try: r = await client.get( YAHOO_CHART.format(symbol=ticker), params={"interval": "1d", "range": "5d"}, headers=UA, timeout=8, ) r.raise_for_status() meta = r.json()["chart"]["result"][0]["meta"] return meta.get("longName") or meta.get("shortName") or ticker except Exception: return ticker async def fetch_yahoo_news( client: httpx.AsyncClient, ticker: str, count: int = 10, query_override: str | None = None, ) -> list[Headline]: """Filtered Yahoo per-ticker headlines. Niche UCITS ETFs return empty rather than the generic firehose because of the token-overlap guard. If `query_override` is provided (e.g. a name already fetched from Trading 212 instruments), it skips the Yahoo chart-meta round-trip.""" query = query_override or await _resolve_ticker_name(client, ticker) tokens = [ t.lower() for t in re.split(r"[\s.]+", query) if len(t) >= 3 and t.lower() not in _NAME_STOPWORDS ] try: r = await client.get( YAHOO_NEWS, params={"q": query, "newsCount": count, "quotesCount": 0}, headers=UA, timeout=10, ) r.raise_for_status() items = r.json().get("news", []) out: list[Headline] = [] for it in items: title = (it.get("title") or "").strip() link = (it.get("link") or "").strip() if not (title and link): continue if tokens and not any(t in title.lower() for t in tokens): continue ts = it.get("providerPublishTime") when = ( datetime.fromtimestamp(ts, timezone.utc) if ts else datetime.now(timezone.utc) ) out.append(Headline(when, f"Yahoo:{ticker}", "ticker", title, link)) return out except Exception: return [] def dedupe(headlines: list[Headline]) -> list[Headline]: """URL first, then normalised title — same logic as the prototype.""" seen_url: set[str] = set() seen_fp: set[str] = set() out: list[Headline] = [] for h in headlines: if h.url in seen_url or h.fingerprint in seen_fp: continue seen_url.add(h.url) seen_fp.add(h.fingerprint) out.append(h) return out def matches_any(text: str, keywords: list[str]) -> bool: t = text.lower() return any(kw in t for kw in keywords)