i18n: add translate() helper backed by call_llm
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
5730aad73c
commit
7683f82820
2 changed files with 157 additions and 0 deletions
82
app/services/translation.py
Normal file
82
app/services/translation.py
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
"""Markdown translation via the existing LLM provider chain.
|
||||
|
||||
DeepSeek-4-flash at ~$0.28/M output tokens is cheap enough that we
|
||||
don't bother with a separate translation-only model. ``call_llm``'s
|
||||
provider chain (DeepSeek primary, OpenRouter fallback) handles this
|
||||
path identically to any other LLM call.
|
||||
|
||||
The translator is content-aware in one important way: it instructs the
|
||||
model to preserve markdown structure, ticker symbols, numbers, dates,
|
||||
and percentages verbatim. This keeps generated artefacts (tables of
|
||||
quotes, embedded percentages, dated references) intact across the
|
||||
translation boundary.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import httpx
|
||||
|
||||
from app.services.i18n import LANGUAGES
|
||||
from app.services.openrouter import LogResult, call_llm
|
||||
|
||||
|
||||
# System prompt for the translator. ``{language}`` is the only placeholder and
# is filled in with ``str.format``; any literal brace added to this text must
# be doubled (``{{`` / ``}}``) or formatting will raise.
_SYSTEM_PROMPT_TMPL = """\
You are an expert translator working on financial-markets commentary.
Translate the following markdown text to {language}.

Strict rules:
- Preserve ALL markdown formatting (headings, lists, emphasis, links,
  tables, code spans).
- Do NOT translate ticker symbols (AAPL, MSFT, VOD.L, ASML.AS, etc.),
  company legal names, percentages, dates, ISO currency codes, or any
  numbers.
- Do NOT add commentary, preambles, or apologies. Output ONLY the
  translated markdown.
- Do NOT wrap the output in a code fence (```); return the markdown
  itself.
"""
|
||||
|
||||
|
||||
def _strip_wrapping_fence(content: str, source: str) -> str:
    """Remove a code fence the model wrapped around its entire reply.

    ``content`` is the (already stripped) model output and ``source`` is the
    text that was sent for translation. The fence is only treated as a
    wrapper when ``source`` does not itself begin with a fence: a document
    that legitimately opens with a fenced code block must keep that fence,
    otherwise the block would lose its opener and the markdown would break.
    """
    if not content.startswith("```"):
        return content
    if source.lstrip().startswith("```"):
        # The leading fence belongs to the document, not to a wrapper.
        return content

    # Drop the opening fence line (with its optional language tag).
    first_nl = content.find("\n")
    if first_nl != -1:
        content = content[first_nl + 1:]

    # Drop the closing fence. ``rstrip("`")`` rather than ``[:-3]`` so a
    # wrapper made of more than three backticks leaves nothing behind.
    trimmed = content.rstrip()
    if trimmed.endswith("```"):
        content = trimmed.rstrip("`")
    return content.strip()


async def translate(
    client: httpx.AsyncClient,
    text: str,
    target_lang: str,
) -> tuple[str, LogResult]:
    """Translate markdown ``text`` to ``target_lang``.

    Returns ``(translated_markdown, LogResult)``. Caller persists the
    cost/model provenance from LogResult next to the cached row.

    Short-circuits without calling the LLM when ``target_lang`` is
    ``'en'``, unknown, or empty, or when ``text`` is empty or only
    whitespace — returns the source unchanged with a zero-cost stub
    LogResult. This lets fan-out callers iterate over all languages
    without per-call gating.

    Raises on provider failure (HTTP error, all chain providers down).
    Callers in fan-out paths should catch and log per-language.
    """
    nothing_to_translate = not text or not text.strip()
    if (
        nothing_to_translate
        or not target_lang
        or target_lang == "en"
        or target_lang not in LANGUAGES
    ):
        # No-op fast path. Returning a fake LogResult keeps the call
        # signature stable for callers who unpack the tuple.
        return text, LogResult(
            content=text,
            model="noop",
            prompt_tokens=0,
            completion_tokens=0,
            cost_usd=0.0,
        )

    system_prompt = _SYSTEM_PROMPT_TMPL.format(language=LANGUAGES[target_lang])
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    result = await call_llm(client, messages)

    # Models sometimes wrap the whole reply in a code fence even though the
    # prompt asks for bare markdown; undo that without touching a fence that
    # was part of the source document. ``result`` is returned untouched so
    # the caller still sees the raw provider output.
    content = _strip_wrapping_fence((result.content or "").strip(), text)

    return content, result
|
||||
Loading…
Add table
Add a link
Reference in a new issue