i18n: add translate() helper backed by call_llm
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
5730aad73c
commit
7683f82820
2 changed files with 157 additions and 0 deletions
82
app/services/translation.py
Normal file
82
app/services/translation.py
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
"""Markdown translation via the existing LLM provider chain.
|
||||
|
||||
DeepSeek-4-flash at ~$0.28/M output tokens is cheap enough that we
|
||||
don't bother with a separate translation-only model. ``call_llm``'s
|
||||
provider chain (DeepSeek primary, OpenRouter fallback) handles this
|
||||
path identically to any other LLM call.
|
||||
|
||||
The translator is content-aware in one important way: it instructs the
|
||||
model to preserve markdown structure, ticker symbols, numbers, dates,
|
||||
and percentages verbatim. This keeps generated artefacts (tables of
|
||||
quotes, embedded percentages, dated references) intact across the
|
||||
translation boundary.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import httpx
|
||||
|
||||
from app.services.i18n import LANGUAGES
|
||||
from app.services.openrouter import LogResult, call_llm
|
||||
|
||||
|
||||
# System-message template for translation requests. ``{language}`` is the
# only placeholder; ``translate`` fills it via ``str.format`` with the
# ``LANGUAGES`` entry for the requested target language code.
_SYSTEM_PROMPT_TMPL = """\
|
||||
You are an expert translator working on financial-markets commentary.
|
||||
Translate the following markdown text to {language}.
|
||||
|
||||
Strict rules:
|
||||
- Preserve ALL markdown formatting (headings, lists, emphasis, links,
|
||||
tables, code spans).
|
||||
- Do NOT translate ticker symbols (AAPL, MSFT, VOD.L, ASML.AS, etc.),
|
||||
company legal names, percentages, dates, ISO currency codes, or any
|
||||
numbers.
|
||||
- Do NOT add commentary, preambles, or apologies. Output ONLY the
|
||||
translated markdown.
|
||||
"""
|
||||
|
||||
|
||||
def _strip_wrapping_fence(content: str) -> str:
    """Remove a code fence the model wrapped around its whole reply.

    Handles three shapes of reply that start with a fence:

    * Opener and closer both present (the reply starts with a fence line
      and ends with three backticks): both are removed. A document that
      genuinely starts with one fenced block and ends with another is
      indistinguishable from this case and is unwrapped as well.
    * Opener only, with no other fence anywhere in the reply (for example
      a wrapped reply that was cut short): the dangling opener line is
      dropped.
    * Opener followed by a closed block and then more text: the reply
      legitimately begins with a fenced code block, so it is returned
      untouched. Dropping the first line here would orphan the closing
      fence and corrupt the markdown.

    Replies that do not start with a fence are only whitespace-trimmed.
    """
    content = content.strip()
    if not content.startswith("```"):
        return content

    _opener, newline, body = content.partition("\n")
    if not newline:
        # Single-line reply: a bare fence marker carries no content.
        return "" if content == "```" else content

    body = body.rstrip()
    if body.endswith("```"):
        return body[:-3].strip()
    if "```" not in body:
        return body.strip()
    return content


async def translate(
    client: httpx.AsyncClient,
    text: str,
    target_lang: str,
) -> tuple[str, LogResult]:
    """Translate markdown ``text`` to ``target_lang``.

    Returns ``(translated_markdown, LogResult)``. Caller persists the
    cost/model provenance from LogResult next to the cached row.

    Short-circuits without calling the LLM when ``target_lang`` is
    ``'en'``, unknown, or empty, and also when ``text`` is empty or
    whitespace-only (there is nothing to translate). In those cases the
    source is returned unchanged with a zero-cost stub LogResult whose
    ``model`` is ``"noop"``. This lets fan-out callers iterate over all
    languages without per-call gating.

    The returned markdown is whitespace-trimmed and has a wrapping code
    fence removed if the model added one despite the system prompt; the
    returned LogResult still carries the provider's raw ``content``.

    Raises on provider failure (HTTP error, all chain providers down):
    exceptions from ``call_llm`` are not caught here. Callers in fan-out
    paths should catch and log per-language.
    """
    skip_llm = (
        not target_lang
        or target_lang == "en"
        or target_lang not in LANGUAGES
        or not text
        or not text.strip()
    )
    if skip_llm:
        # No-op fast path. Returning a stub LogResult keeps the call
        # signature stable for callers who unpack the tuple.
        return text, LogResult(
            content=text, model="noop",
            prompt_tokens=0, completion_tokens=0, cost_usd=0.0,
        )

    system_prompt = _SYSTEM_PROMPT_TMPL.format(language=LANGUAGES[target_lang])
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    result = await call_llm(client, messages)

    # ``result.content`` may be None/empty; normalise before cleaning.
    return _strip_wrapping_fence(result.content or ""), result
|
||||
|
|
@ -42,3 +42,78 @@ def test_respond_in_clause_unknown_lang_falls_back_to_english():
|
|||
prompt assembly. Unknown codes map to no-suffix (English default)."""
|
||||
from app.services.i18n import respond_in_clause
|
||||
assert respond_in_clause("xx") == ""
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_translate_happy_path(monkeypatch):
    """With ``call_llm`` stubbed, ``translate`` returns the stubbed text
    and hands back the same LogResult (model and cost intact).

    Relies on ``"it"`` being a configured language code.
    """
    from unittest.mock import AsyncMock, MagicMock

    from app.services import translation as mod
    from app.services.openrouter import LogResult

    provider_reply = LogResult(
        content="# Apertura\n\nIl mercato è in calo dello 0,4%.",
        model="deepseek/deepseek-v4-flash",
        prompt_tokens=300,
        completion_tokens=80,
        cost_usd=0.00002,
    )
    fake_call_llm = AsyncMock(return_value=provider_reply)
    monkeypatch.setattr(mod, "call_llm", fake_call_llm)

    source_md = "# Open\n\nThe market is down 0.4%."
    http_client = MagicMock()
    translated, llm_log = await mod.translate(http_client, source_md, "it")

    assert "Apertura" in translated
    assert llm_log.model == "deepseek/deepseek-v4-flash"
    assert llm_log.cost_usd == pytest.approx(0.00002)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_translate_strips_code_fences(monkeypatch):
    """A reply wrapped in ```markdown ... ``` comes back without the fence."""
    from unittest.mock import AsyncMock, MagicMock

    from app.services import translation as mod
    from app.services.openrouter import LogResult

    wrapped_reply = LogResult(
        content="```markdown\n# Titolo\n\nCorpo.\n```",
        model="m",
        prompt_tokens=10,
        completion_tokens=20,
        cost_usd=0.0,
    )
    monkeypatch.setattr(mod, "call_llm", AsyncMock(return_value=wrapped_reply))

    http_client = MagicMock()
    translated, _ = await mod.translate(http_client, "# Title\n\nBody.", "it")

    # No fence marker may survive, and the payload itself must lead the output.
    assert "```" not in translated
    assert translated.startswith("# Titolo")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_translate_provider_failure_propagates(monkeypatch):
    """An exception raised by ``call_llm`` reaches the caller of
    ``translate`` instead of being swallowed."""
    from unittest.mock import AsyncMock, MagicMock

    from app.services import translation as mod

    failing_call = AsyncMock(side_effect=RuntimeError("upstream down"))
    monkeypatch.setattr(mod, "call_llm", failing_call)

    http_client = MagicMock()
    with pytest.raises(RuntimeError, match="upstream down"):
        await mod.translate(http_client, "# Title\n\nBody.", "it")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_translate_unknown_lang_returns_source_unchanged(monkeypatch):
    """Defensive: 'en', an unknown lang code, or an empty code all
    short-circuit without calling the LLM. Callers shouldn't have to gate
    the call themselves.

    ``"xx"`` stands in for an unknown code; this assumes it is absent from
    ``LANGUAGES``.
    """
    from unittest.mock import AsyncMock, MagicMock

    from app.services import translation as mod
    from app.services.openrouter import LogResult

    call_mock = AsyncMock(return_value=LogResult(
        content="should not be returned",
        model="m", prompt_tokens=0, completion_tokens=0, cost_usd=0.0,
    ))
    monkeypatch.setattr(mod, "call_llm", call_mock)

    client = MagicMock()
    # Cover every documented short-circuit trigger, not just English.
    for lang_code in ("en", "xx", ""):
        out, stub_log = await mod.translate(client, "Hello world.", lang_code)
        assert out == "Hello world.", lang_code
        assert stub_log.cost_usd == 0.0, lang_code

    call_mock.assert_not_awaited()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue