# read.markets/app/services/news.py

"""RSS feed aggregator + Yahoo per-ticker news.

Ported from /home/gg/ownCloud/Family/Finances/Wealth/flash_news.py — same
parsing, dedupe, and ticker-name resolution logic, async HTTP via httpx.
"""
from __future__ import annotations

import hashlib
import re
from dataclasses import dataclass
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from xml.etree import ElementTree as ET

import httpx


# User-Agent header passed on every HTTP request made by this module.
UA = {"User-Agent": "Mozilla/5.0 (cassandra) Python/httpx"}
# ElementTree "{namespace-uri}" tag prefixes: Atom elements, and the Dublin
# Core namespace used for the <dc:date> fallback when an RSS item has no
# <pubDate>.
ATOM_NS = "{http://www.w3.org/2005/Atom}"
DC_NS = "{http://purl.org/dc/elements/1.1/}"
# Yahoo Finance endpoints: the search endpoint is queried for news items, the
# chart endpoint only for its "meta" block (longName / shortName lookup).
YAHOO_NEWS = "https://query1.finance.yahoo.com/v1/finance/search"
YAHOO_CHART = "https://query1.finance.yahoo.com/v8/finance/chart/{symbol}"

# Generic words dropped from an instrument name before its remaining words
# are used as the title-overlap filter in fetch_yahoo_news.
_NAME_STOPWORDS = {"plc", "corp", "inc", "ltd", "fund", "etf", "ucits",
                   "class", "shares", "trust", "the", "and", "of"}


@dataclass
class Headline:
    """A single news item as produced by the feed and Yahoo fetchers."""

    when: datetime  # tz-aware UTC
    source: str
    category: str
    title: str
    url: str

    @property
    def fingerprint(self) -> str:
        """Hex SHA-1 of the case-folded, whitespace-collapsed title.

        Two headlines whose titles differ only in letter case or spacing
        share a fingerprint; it is used as the DB UNIQUE key.
        """
        words = self.title.lower().split()
        canonical = " ".join(words)
        digest = hashlib.sha1()
        digest.update(canonical.encode("utf-8"))
        return digest.hexdigest()


def _parse_date(s: str | None) -> datetime | None:
    if not s:
        return None
    try:
        return parsedate_to_datetime(s).astimezone(timezone.utc)
    except (TypeError, ValueError):
        pass
    try:
        return datetime.fromisoformat(s.replace("Z", "+00:00")).astimezone(timezone.utc)
    except ValueError:
        return None


def _atom_entry_link(entry: ET.Element) -> str:
    """Return the article URL of an Atom ``<entry>``, or "" if it has none."""
    fallback = ""
    for link_el in entry.findall(f"{ATOM_NS}link"):
        href = (link_el.get("href") or "").strip()
        if not href:
            continue
        # Entries may carry several links (self, enclosure, replies, ...).
        # Per RFC 4287 a link without rel is an "alternate" link, i.e. the
        # human-readable article, which is the one we want.
        if link_el.get("rel", "alternate") == "alternate":
            return href
        if not fallback:
            fallback = href
    return fallback


def parse_feed(name: str, category: str, xml_bytes: bytes) -> list[Headline]:
    """Parse an RSS 2.0 or Atom document into headlines.

    Args:
        name: Feed name, stored as each headline's ``source``.
        category: Category label stored on each headline.
        xml_bytes: Raw feed document.

    Returns:
        One ``Headline`` per item/entry that has both a title and a link.
        Items without a parseable date are stamped with the time of parsing.
        Malformed XML yields an empty list.
    """
    # NOTE(review): ElementTree is not hardened against maliciously crafted
    # XML; this presumably only sees feeds from a configured list - confirm.
    try:
        root = ET.fromstring(xml_bytes)
    except ET.ParseError:
        return []

    parsed_at = datetime.now(timezone.utc)
    out: list[Headline] = []

    rss_items = root.findall(".//item")
    if rss_items:
        for it in rss_items:
            title = (it.findtext("title") or "").strip()
            link = (it.findtext("link") or "").strip()
            if not (title and link):
                continue
            # <dc:date> is the fallback when <pubDate> is absent or empty.
            pub = it.findtext("pubDate") or it.findtext(f"{DC_NS}date")
            when = _parse_date(pub) or parsed_at
            out.append(Headline(when, name, category, title, link))
        return out

    # No RSS <item> elements anywhere: treat the document as Atom.
    for entry in root.findall(f".//{ATOM_NS}entry"):
        title = (entry.findtext(f"{ATOM_NS}title") or "").strip()
        link = _atom_entry_link(entry)
        if not (title and link):
            continue
        pub = (
            entry.findtext(f"{ATOM_NS}published")
            or entry.findtext(f"{ATOM_NS}updated")
        )
        when = _parse_date(pub) or parsed_at
        out.append(Headline(when, name, category, title, link))
    return out


async def fetch_feed(
    client: httpx.AsyncClient, name: str, category: str, url: str
) -> list[Headline]:
    """Download one feed and parse it into headlines.

    The request uses the module User-Agent and a 12 second timeout. Nothing
    is caught here: transport errors from ``client.get`` and the error
    raised by ``raise_for_status()`` on an error status propagate, so the
    caller is the one that handles and logs failures. A response body that
    is not well-formed XML yields an empty list (see ``parse_feed``).
    """
    response = await client.get(url, headers=UA, timeout=12)
    response.raise_for_status()
    body = response.content
    return parse_feed(name, category, body)


async def _resolve_ticker_name(client: httpx.AsyncClient, ticker: str) -> str:
    """Best-effort lookup of a display name for *ticker* via the Yahoo chart API.

    Returns the chart metadata's ``longName``, else its ``shortName``; the
    ticker itself is returned when neither is set or when anything at all
    goes wrong (network error, error status, unexpected payload shape).
    Searching by name rather than by the literal ticker string lets the news
    search hit headlines that actually mention the company.
    """
    try:
        endpoint = YAHOO_CHART.format(symbol=ticker)
        response = await client.get(
            endpoint,
            params={"interval": "1d", "range": "5d"},
            headers=UA,
            timeout=8,
        )
        response.raise_for_status()
        payload = response.json()
        meta = payload["chart"]["result"][0]["meta"]
        # First non-empty name wins, longName taking precedence.
        for field in ("longName", "shortName"):
            candidate = meta.get(field)
            if candidate:
                return candidate
        return ticker
    except Exception:
        # Deliberately broad: name resolution must never break a news fetch.
        return ticker


def _name_tokens(query: str) -> list[str]:
    """Return the lower-cased significant words of an instrument name."""
    tokens: list[str] = []
    for raw in re.split(r"[\s.]+", query):
        # Trim punctuation stuck to the word ("Tesla," / "(Acc)") so the
        # token can actually occur in a headline and stopwords such as
        # "Inc," are recognised. Inner punctuation ("at&t") is kept.
        word = re.sub(r"^\W+|\W+$", "", raw).lower()
        if len(word) >= 3 and word not in _NAME_STOPWORDS:
            tokens.append(word)
    return tokens


async def fetch_yahoo_news(
    client: httpx.AsyncClient,
    ticker: str,
    count: int = 10,
    query_override: str | None = None,
) -> list[Headline]:
    """Fetch Yahoo search news for one ticker, filtered by name overlap.

    The search query is the instrument name. A headline is kept only if its
    title contains at least one significant word of that name (see
    ``_name_tokens``); when the name yields no significant words, no
    filtering is applied. Niche UCITS ETFs therefore return empty rather
    than the generic firehose.

    Args:
        client: Shared async HTTP client.
        ticker: Symbol, used for name resolution and in the ``source`` label
            (``"Yahoo:<ticker>"``).
        count: Value sent as ``newsCount``.
        query_override: Name to search for; when provided (e.g. a name
            already fetched from Trading 212 instruments) the Yahoo
            chart-meta round-trip is skipped.

    Returns:
        Matching headlines with category ``"ticker"``. Any request or
        payload failure yields an empty list.
    """
    query = query_override or await _resolve_ticker_name(client, ticker)
    tokens = _name_tokens(query)
    try:
        r = await client.get(
            YAHOO_NEWS,
            params={"q": query, "newsCount": count, "quotesCount": 0},
            headers=UA, timeout=10,
        )
        r.raise_for_status()
        items = r.json().get("news", [])
        fetched_at = datetime.now(timezone.utc)
        out: list[Headline] = []
        for it in items:
            title = (it.get("title") or "").strip()
            link = (it.get("link") or "").strip()
            if not (title and link):
                continue
            lowered = title.lower()
            if tokens and not any(t in lowered for t in tokens):
                continue
            ts = it.get("providerPublishTime")
            try:
                when = (
                    datetime.fromtimestamp(ts, timezone.utc) if ts
                    else fetched_at
                )
            except (TypeError, ValueError, OverflowError, OSError):
                # One malformed timestamp should not discard the whole batch.
                when = fetched_at
            out.append(Headline(when, f"Yahoo:{ticker}", "ticker", title, link))
        return out
    except Exception:
        # Deliberate best-effort: per-ticker news is optional enrichment.
        return []


def dedupe(headlines: list[Headline]) -> list[Headline]:
    """Drop repeated headlines, keeping the first occurrence in input order.

    A headline is a repeat when an already-kept headline has the same URL
    or, failing that, the same title fingerprint. Rejected headlines do not
    contribute their URL or fingerprint to the seen sets.
    """
    known_urls: set[str] = set()
    known_prints: set[str] = set()
    kept: list[Headline] = []
    for item in headlines:
        # URL is checked first; the fingerprint is only needed if it is new.
        if item.url in known_urls:
            continue
        if item.fingerprint in known_prints:
            continue
        known_urls.add(item.url)
        known_prints.add(item.fingerprint)
        kept.append(item)
    return kept


def matches_any(text: str, keywords: list[str]) -> bool:
    """Return True if *text* contains any of *keywords*, ignoring case.

    Matching is plain substring containment. Both sides are lower-cased, so
    a keyword written with capitals ("Fed") still matches. Empty keywords
    are skipped, because "" is a substring of every text and would make
    everything match. Whitespace inside a keyword is kept as written, so
    " fed " can be used to require surrounding spaces.
    """
    haystack = text.lower()
    return any(kw and kw.lower() in haystack for kw in keywords)