82 lines
3 KiB
Python
82 lines
3 KiB
Python
"""Markdown translation via the existing LLM provider chain.

DeepSeek-4-flash at ~$0.28/M output tokens is cheap enough that we
don't bother with a separate translation-only model. ``call_llm``'s
provider chain (DeepSeek primary, OpenRouter fallback) handles this
path identically to any other LLM call.

The translator is content-aware in one important way: it instructs the
model to preserve markdown structure, ticker symbols, numbers, dates,
and percentages verbatim. This keeps generated artefacts (tables of
quotes, embedded percentages, dated references) intact across the
translation boundary.
"""
|
|
from __future__ import annotations
|
|
|
|
import httpx
|
|
|
|
from app.services.i18n import LANGUAGES
|
|
from app.services.openrouter import LogResult, call_llm
|
|
|
|
|
|
# System prompt sent with every translation request. ``{language}`` is the
# only placeholder and is filled in with ``str.format``; the text must not
# contain any other braces. The backslash right after the opening quotes
# keeps the prompt from starting with an empty line.
_SYSTEM_PROMPT_TMPL = """\
You are an expert translator working on financial-markets commentary.
Translate the following markdown text to {language}.

Strict rules:
- Preserve ALL markdown formatting (headings, lists, emphasis, links,
tables, code spans).
- Do NOT translate ticker symbols (AAPL, MSFT, VOD.L, ASML.AS, etc.),
company legal names, percentages, dates, ISO currency codes, or any
numbers.
- Do NOT add commentary, preambles, or apologies. Output ONLY the
translated markdown.
"""
|
|
|
|
|
|
_FENCE = "```"
# Info strings a model typically puts on a fence that wraps its whole answer.
_WRAPPER_TAGS = frozenset({"markdown", "md"})


def _closes_fence(line: str, min_ticks: int) -> bool:
    """Return True if ``line`` is a bare backtick fence of >= ``min_ticks``."""
    stripped = line.strip()
    return len(stripped) >= min_ticks and not stripped.strip("`")


def _unwrap_code_fence(content: str) -> str:
    """Remove a code fence that wraps the *entire* model output.

    ``content`` must already be stripped. The fence is removed only when it
    is wrapper residue, never when it belongs to the translated document:

    * opening fence with no closing fence anywhere (e.g. truncated output):
      the opening line is dropped;
    * opening fence whose first possible closer is the last line: both
      fence lines are dropped;
    * opening fence that is closed earlier: the document starts with a real
      code block, so the text is returned untouched -- unless the output
      also ends with a closing fence and either the opener is tagged
      ``markdown``/``md`` or another fence opener appears before that first
      closer, which indicates a wrapper around content that itself contains
      code blocks.

    Closing fences must be at least as long as the opening one, so a
    four-backtick wrapper around three-backtick blocks is handled too.
    """
    lines = content.split("\n")
    opener = lines[0]
    if len(lines) < 2 or not opener.startswith(_FENCE):
        return content

    ticks = len(opener) - len(opener.lstrip("`"))
    tag = opener[ticks:].strip().lower()
    inner = lines[1:]

    first_close = next(
        (i for i, line in enumerate(inner) if _closes_fence(line, ticks)),
        None,
    )
    if first_close is None:
        # Nothing can close the opener, so it is not a real code block.
        return "\n".join(inner).strip()

    last = len(inner) - 1
    if first_close != last:
        nested_opener = any(
            line.lstrip().startswith(_FENCE) for line in inner[:first_close]
        )
        is_wrapper = _closes_fence(inner[last], ticks) and (
            tag in _WRAPPER_TAGS or nested_opener
        )
        if not is_wrapper:
            return content

    return "\n".join(inner[:last]).strip()


async def translate(
    client: httpx.AsyncClient,
    text: str,
    target_lang: str,
) -> tuple[str, LogResult]:
    """Translate markdown ``text`` to ``target_lang``.

    Returns ``(translated_markdown, LogResult)``. Caller persists the
    cost/model provenance from LogResult next to the cached row.

    Short-circuits without calling the LLM when ``target_lang`` is
    ``'en'``, not a key of ``LANGUAGES``, or empty, and also when ``text``
    is empty or whitespace-only (there is nothing to translate, and an
    empty user message invites the model to answer with filler). In those
    cases the source is returned unchanged with a zero-cost stub LogResult
    whose ``model`` is ``"noop"``. This lets fan-out callers iterate over
    all languages without per-call gating.

    Otherwise the system prompt is built from ``_SYSTEM_PROMPT_TMPL`` and
    ``LANGUAGES[target_lang]``, ``text`` is sent as the user message via
    ``call_llm``, and the reply is stripped of surrounding whitespace and
    of a code fence wrapping the whole reply. The returned ``LogResult``
    is the object produced by ``call_llm``, unmodified.

    Exceptions raised by ``call_llm`` are not caught here and propagate
    to the caller. Callers in fan-out paths should catch and log
    per-language.
    """
    nothing_to_do = (
        not target_lang
        or target_lang == "en"
        or target_lang not in LANGUAGES
        or not text
        or not text.strip()
    )
    if nothing_to_do:
        # No-op fast path. Returning a stub LogResult keeps the call
        # signature stable for callers who unpack the tuple.
        return text, LogResult(
            content=text,
            model="noop",
            prompt_tokens=0,
            completion_tokens=0,
            cost_usd=0.0,
        )

    system_prompt = _SYSTEM_PROMPT_TMPL.format(language=LANGUAGES[target_lang])
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    result = await call_llm(client, messages)

    content = (result.content or "").strip()
    # Strip a wrapping code fence if the model added one despite the system
    # rule, without damaging code blocks that are part of the document.
    return _unwrap_code_fence(content), result
|