def _strip_wrapping_fence(content: str, source: str) -> str:
    """Remove a code fence the model wrapped around its entire reply.

    Args:
        content: Raw model output.
        source: The markdown that was sent for translation.

    Returns:
        ``content`` stripped of surrounding whitespace and, when the model
        wrapped its reply, of the opening fence line (including any
        language tag) and the trailing fence.
    """
    content = content.strip()
    if not content.startswith("```"):
        return content
    # A source that itself opens with a fenced block legitimately
    # translates to output that opens with one. Dropping that first line
    # would leave an orphan closing fence and corrupt the document.
    if source.lstrip().startswith("```"):
        return content
    _opening, newline, remainder = content.partition("\n")
    if not newline:
        # Single-line reply: there is no fence *line* to drop.
        return content
    remainder = remainder.rstrip()
    if remainder.endswith("```"):
        # Strip the whole trailing backtick run so a four-backtick
        # wrapper does not leave a stray "`" behind.
        remainder = remainder.rstrip("`").rstrip()
    return remainder.strip()


async def translate(
    client: httpx.AsyncClient,
    text: str,
    target_lang: str,
) -> tuple[str, LogResult]:
    """Translate markdown ``text`` to ``target_lang``.

    Returns ``(translated_markdown, LogResult)``. Caller persists the
    cost/model provenance from LogResult next to the cached row. The
    LogResult is returned exactly as produced by ``call_llm``; only the
    first tuple element has wrapper code fences removed.

    Short-circuits without calling the LLM when ``target_lang`` is
    ``'en'``, unknown, or empty — returns the source unchanged with a
    zero-cost stub LogResult. This lets fan-out callers iterate over
    all languages without per-call gating.

    A code fence is stripped only when the model wrapped its whole reply
    in one. If ``text`` itself begins with a fenced block, the reply is
    left intact so the document's own fences survive.

    Raises on provider failure (HTTP error, all chain providers down).
    Callers in fan-out paths should catch and log per-language.
    """
    if not target_lang or target_lang == "en" or target_lang not in LANGUAGES:
        # No-op fast path. Returning a fake LogResult keeps the call
        # signature stable for callers who unpack the tuple.
        return text, LogResult(
            content=text, model="noop",
            prompt_tokens=0, completion_tokens=0, cost_usd=0.0,
        )

    system_prompt = _SYSTEM_PROMPT_TMPL.format(language=LANGUAGES[target_lang])
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    result = await call_llm(client, messages)

    # The system prompt forbids wrapping, but models sometimes do it anyway.
    content = _strip_wrapping_fence(result.content or "", text)
    return content, result
@pytest.mark.asyncio
async def test_translate_happy_path(monkeypatch):
    """A supported language returns the provider's text and its LogResult."""
    from unittest.mock import AsyncMock, MagicMock

    from app.services import translation as mod
    from app.services.openrouter import LogResult

    monkeypatch.setattr(mod, "call_llm", AsyncMock(return_value=LogResult(
        content="# Apertura\n\nIl mercato è in calo dello 0,4%.",
        model="deepseek/deepseek-v4-flash",
        prompt_tokens=300, completion_tokens=80, cost_usd=0.00002,
    )))

    client = MagicMock()
    translated, llm_log = await mod.translate(
        client, "# Open\n\nThe market is down 0.4%.", "it",
    )
    assert "Apertura" in translated
    assert llm_log.model == "deepseek/deepseek-v4-flash"
    assert llm_log.cost_usd == pytest.approx(0.00002)


@pytest.mark.asyncio
async def test_translate_strips_code_fences(monkeypatch):
    """If the LLM wraps the output in ```markdown ... ```, strip it."""
    from unittest.mock import AsyncMock, MagicMock

    from app.services import translation as mod
    from app.services.openrouter import LogResult

    fenced = "```markdown\n# Titolo\n\nCorpo.\n```"
    monkeypatch.setattr(mod, "call_llm", AsyncMock(return_value=LogResult(
        content=fenced, model="m", prompt_tokens=10, completion_tokens=20, cost_usd=0.0,
    )))

    client = MagicMock()
    translated, _ = await mod.translate(client, "# Title\n\nBody.", "it")
    assert "```" not in translated
    assert translated.startswith("# Titolo")


@pytest.mark.asyncio
async def test_translate_provider_failure_propagates(monkeypatch):
    """Errors raised by the provider chain reach the caller untouched."""
    from unittest.mock import AsyncMock, MagicMock

    from app.services import translation as mod

    monkeypatch.setattr(mod, "call_llm", AsyncMock(side_effect=RuntimeError("upstream down")))

    client = MagicMock()
    with pytest.raises(RuntimeError, match="upstream down"):
        await mod.translate(client, "# Title\n\nBody.", "it")


@pytest.mark.asyncio
@pytest.mark.parametrize("lang", ["en", "xx", ""])
async def test_translate_unknown_lang_returns_source_unchanged(monkeypatch, lang):
    """Defensive: 'en', an unknown lang code, or an empty code short-circuits
    without calling the LLM. Callers shouldn't have to gate the call themselves.

    NOTE(review): assumes "xx" is not a key of ``LANGUAGES`` — confirm
    against app.services.i18n.
    """
    from unittest.mock import AsyncMock, MagicMock

    from app.services import translation as mod
    from app.services.openrouter import LogResult

    call_mock = AsyncMock(return_value=LogResult(
        content="should not be returned",
        model="m", prompt_tokens=0, completion_tokens=0, cost_usd=0.0,
    ))
    monkeypatch.setattr(mod, "call_llm", call_mock)

    client = MagicMock()
    out, llm_log = await mod.translate(client, "Hello world.", lang)
    assert out == "Hello world."
    # The fast path reports a zero-cost stub rather than a provider result.
    assert llm_log.model == "noop"
    assert llm_log.cost_usd == 0.0
    call_mock.assert_not_awaited()