i18n: add translate() helper backed by call_llm

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Giorgio Gilestro 2026-05-27 16:48:32 +02:00
parent 5730aad73c
commit 7683f82820
2 changed files with 157 additions and 0 deletions

View file

@ -0,0 +1,82 @@
"""Markdown translation via the existing LLM provider chain.
DeepSeek-4-flash at ~$0.28/M output tokens is cheap enough that we
don't bother with a separate translation-only model. ``call_llm``'s
provider chain (DeepSeek primary, OpenRouter fallback) handles this
path identically to any other LLM call.
The translator is content-aware in one important way: it instructs the
model to preserve markdown structure, ticker symbols, numbers, dates,
and percentages verbatim. This keeps generated artefacts (tables of
quotes, embedded percentages, dated references) intact across the
translation boundary.
"""
from __future__ import annotations
import httpx
from app.services.i18n import LANGUAGES
from app.services.openrouter import LogResult, call_llm
# System-prompt template used by translate(). The single ``{language}``
# placeholder is filled with ``str.format`` from the LANGUAGES entry for the
# target code, so any literal braces added to this text must be doubled
# (``{{`` / ``}}``). The leading backslash after the opening quotes suppresses
# the initial newline so the prompt starts directly with the first sentence.
_SYSTEM_PROMPT_TMPL = """\
You are an expert translator working on financial-markets commentary.
Translate the following markdown text to {language}.
Strict rules:
- Preserve ALL markdown formatting (headings, lists, emphasis, links,
tables, code spans).
- Do NOT translate ticker symbols (AAPL, MSFT, VOD.L, ASML.AS, etc.),
company legal names, percentages, dates, ISO currency codes, or any
numbers.
- Do NOT add commentary, preambles, or apologies. Output ONLY the
translated markdown.
"""
def _strip_wrapping_fence(content: str) -> str:
    """Return ``content`` stripped, minus a code fence wrapping all of it.

    The fence is removed only when the stripped text both starts with an
    opening fence line (with optional language tag) and ends with a closing
    fence. Text that merely *begins* with a fenced code block, with ordinary
    markdown after it, is returned intact so a legitimate leading code block
    is not corrupted.
    """
    body = content.strip()
    if not (body.startswith("```") and body.endswith("```")):
        return body
    _opening, newline, rest = body.partition("\n")
    if not newline:
        # Single-line reply such as "```x```": there is no separate opening
        # fence line, so this is not a wrapper. Leave it alone.
        return body
    # ``body`` is stripped and ends with a fence, so ``rest`` is non-empty and
    # ends with the three closing backticks.
    return rest[:-3].strip()


async def translate(
    client: httpx.AsyncClient,
    text: str,
    target_lang: str,
) -> tuple[str, LogResult]:
    """Translate markdown ``text`` to ``target_lang``.

    Returns ``(translated_markdown, LogResult)``. Caller persists the
    cost/model provenance from LogResult next to the cached row.

    Short-circuits without calling the LLM when ``target_lang`` is
    ``'en'``, unknown, or empty, or when ``text`` is empty or
    whitespace-only: the source is returned unchanged with a zero-cost
    stub LogResult (``model="noop"``). This lets fan-out callers iterate
    over all languages without per-call gating.

    If the model wraps its whole reply in a code fence despite the system
    prompt, the wrapper is removed; the returned LogResult still carries
    the raw reply.

    Any exception raised by ``call_llm`` propagates unchanged. Callers in
    fan-out paths should catch and log per-language.
    """
    nothing_to_translate = not text or not text.strip()
    unsupported_target = (
        not target_lang
        or target_lang == "en"
        or target_lang not in LANGUAGES
    )
    if unsupported_target or nothing_to_translate:
        # No-op fast path. Returning a stub LogResult keeps the call
        # signature stable for callers who unpack the tuple.
        return text, LogResult(
            content=text,
            model="noop",
            prompt_tokens=0,
            completion_tokens=0,
            cost_usd=0.0,
        )

    system_prompt = _SYSTEM_PROMPT_TMPL.format(language=LANGUAGES[target_lang])
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    result = await call_llm(client, messages)
    return _strip_wrapping_fence(result.content or ""), result

View file

@ -42,3 +42,78 @@ def test_respond_in_clause_unknown_lang_falls_back_to_english():
prompt assembly. Unknown codes map to no-suffix (English default).""" prompt assembly. Unknown codes map to no-suffix (English default)."""
from app.services.i18n import respond_in_clause from app.services.i18n import respond_in_clause
assert respond_in_clause("xx") == "" assert respond_in_clause("xx") == ""
@pytest.mark.asyncio
async def test_translate_happy_path(monkeypatch):
    """A supported target language yields the LLM text plus its LogResult."""
    from unittest.mock import AsyncMock, MagicMock

    from app.services import translation as mod
    from app.services.openrouter import LogResult

    # Canned provider reply; ``call_llm`` is replaced so no network is touched.
    canned_reply = LogResult(
        content="# Apertura\n\nIl mercato è in calo dello 0,4%.",
        model="deepseek/deepseek-v4-flash",
        prompt_tokens=300,
        completion_tokens=80,
        cost_usd=0.00002,
    )
    monkeypatch.setattr(mod, "call_llm", AsyncMock(return_value=canned_reply))

    source_md = "# Open\n\nThe market is down 0.4%."
    translated, llm_log = await mod.translate(MagicMock(), source_md, "it")

    assert "Apertura" in translated
    # Provenance from the provider must be handed back untouched.
    assert llm_log.model == "deepseek/deepseek-v4-flash"
    assert llm_log.cost_usd == pytest.approx(0.00002)
@pytest.mark.asyncio
async def test_translate_strips_code_fences(monkeypatch):
    """A ```markdown ... ``` wrapper around the LLM reply is removed."""
    from unittest.mock import AsyncMock, MagicMock

    from app.services import translation as mod
    from app.services.openrouter import LogResult

    wrapped_reply = LogResult(
        content="```markdown\n# Titolo\n\nCorpo.\n```",
        model="m",
        prompt_tokens=10,
        completion_tokens=20,
        cost_usd=0.0,
    )
    monkeypatch.setattr(mod, "call_llm", AsyncMock(return_value=wrapped_reply))

    result_text, _log = await mod.translate(
        MagicMock(), "# Title\n\nBody.", "it",
    )

    assert "```" not in result_text
    assert result_text.startswith("# Titolo")
@pytest.mark.asyncio
async def test_translate_provider_failure_propagates(monkeypatch):
    """An exception raised by ``call_llm`` surfaces from ``translate``."""
    from unittest.mock import AsyncMock, MagicMock

    from app.services import translation as mod

    failing_llm = AsyncMock(side_effect=RuntimeError("upstream down"))
    monkeypatch.setattr(mod, "call_llm", failing_llm)

    http_client = MagicMock()
    with pytest.raises(RuntimeError, match="upstream down"):
        await mod.translate(http_client, "# Title\n\nBody.", "it")
@pytest.mark.asyncio
async def test_translate_unknown_lang_returns_source_unchanged(monkeypatch):
    """Defensive: 'en', an unknown code, or an empty code short-circuits
    without calling the LLM. Callers shouldn't have to gate the call
    themselves.

    For each such code the source text must come back unchanged together
    with the zero-cost ``model="noop"`` stub LogResult, and ``call_llm``
    must never be awaited.
    """
    from unittest.mock import AsyncMock, MagicMock

    from app.services import translation as mod
    from app.services.openrouter import LogResult

    call_mock = AsyncMock(return_value=LogResult(
        content="should not be returned",
        model="m", prompt_tokens=0, completion_tokens=0, cost_usd=0.0,
    ))
    monkeypatch.setattr(mod, "call_llm", call_mock)
    client = MagicMock()

    # "xx" is the unknown code already used by the respond_in_clause test in
    # this file; it is assumed to be absent from LANGUAGES.
    for lang in ("en", "xx", ""):
        out, log = await mod.translate(client, "Hello world.", lang)
        assert out == "Hello world.", lang
        assert log.model == "noop", lang
        assert log.cost_usd == 0.0, lang

    call_mock.assert_not_awaited()