# read.markets/scripts/backfill_truncated_translations.py

"""One-off backfill: re-translate StrategicLog rows whose Italian (or
other-language) translation was truncated by the old 4000-token cap in
services/translation.py.

Selection criteria for a "truncated" row:
- the translated content has fewer characters than SHORTNESS_RATIO (70 %)
  of the English source
- completion_tokens is printed for every hit, but it is not currently used
  to select rows

Usage inside the app container:
    docker compose exec app python -m scripts.backfill_truncated_translations \
        --date 2026-05-28               # restrict to one day, repeatable
    docker compose exec app python -m scripts.backfill_truncated_translations \
        --since 2026-04-01              # everything from a date onward
    docker compose exec app python -m scripts.backfill_truncated_translations \
        --all                            # entire history (slow / costs $$)
    docker compose exec app python -m scripts.backfill_truncated_translations \
        --date 2026-05-28 --dry-run     # just print what would be touched

Idempotent: each affected row is deleted then re-inserted in its own
transaction, so a re-run only re-translates rows that are STILL flagged
truncated after the previous pass.
"""
from __future__ import annotations

import argparse
import asyncio
import sys
from datetime import date, datetime

import httpx
from sqlalchemy import and_, delete, func, or_, select

from app.db import get_session_factory
from app.logging import get_logger
from app.models import StrategicLog, StrategicLogTranslation
from app.services.translation import translate

# Module-level logger for this script; warnings are emitted as event names
# (e.g. "backfill.missing_source") with keyword fields.
log = get_logger("backfill.translations")

# Italian (and the other expansive Romance / Germanic targets we support)
# typically produce 15-25 % MORE characters than the English source, so
# a translation shorter than the source — let alone much shorter — is a
# truncation signal even if completion_tokens didn't land exactly at the
# old 4000-token cap. We tolerate down to 70 % of source length to avoid
# touching the occasional legitimately-compressed translation.
SHORTNESS_RATIO = 0.7


def _is_truncated(en_chars: int, tr_chars: int, tr_completion: int | None) -> bool:
    if en_chars <= 0:
        return False
    return tr_chars < en_chars * SHORTNESS_RATIO


async def _find_targets(session, day: date | None, since: date | None, all_: bool):
    """Return the (log, translation) rows whose translation looks truncated.

    Each StrategicLog is joined to its StrategicLogTranslation rows, with
    character lengths computed in the database. The result is narrowed in
    Python with ``_is_truncated``.

    Args:
        session: Async DB session used to run the query.
        day: If given, only logs generated on this calendar day.
        since: If given (and ``day`` is not), only logs generated on or
            after this date.
        all_: Not read here; when neither ``day`` nor ``since`` is set the
            query simply has no date filter.

    Returns:
        A list of result rows exposing ``log_id``, ``generated_at``,
        ``en_chars``, ``tr_id``, ``lang``, ``tr_tok`` and ``tr_chars``,
        ordered by generation time and then language.
    """
    columns = (
        StrategicLog.id.label("log_id"),
        StrategicLog.generated_at,
        func.char_length(StrategicLog.content).label("en_chars"),
        StrategicLogTranslation.id.label("tr_id"),
        StrategicLogTranslation.lang,
        StrategicLogTranslation.completion_tokens.label("tr_tok"),
        func.char_length(StrategicLogTranslation.content).label("tr_chars"),
    )
    stmt = select(*columns).join(
        StrategicLogTranslation,
        StrategicLogTranslation.log_id == StrategicLog.id,
    )

    # ``day`` takes precedence over ``since``; with neither, scan everything.
    if day is not None:
        stmt = stmt.where(func.date(StrategicLog.generated_at) == day)
    elif since is not None:
        stmt = stmt.where(StrategicLog.generated_at >= since)

    stmt = stmt.order_by(StrategicLog.generated_at, StrategicLogTranslation.lang)
    result = await session.execute(stmt)
    return [
        row
        for row in result.all()
        if _is_truncated(row.en_chars, row.tr_chars, row.tr_tok)
    ]


async def _retranslate_one(session, client: httpx.AsyncClient, log_id: int, lang: str):
    """Replace the (log_id, lang) translation row with a freshly generated one.

    The new translation is requested *before* the existing row is touched,
    and the delete + insert are committed together in one transaction. A
    failed translation call therefore leaves the old row in place, so the
    row is still selectable on a later run instead of being lost.

    Args:
        session: Async DB session; this function commits (or rolls back) it.
        client: HTTP client handed to the translation service.
        log_id: Primary key of the StrategicLog to re-translate.
        lang: Target language code of the translation row to replace.

    Returns:
        True when a new translation row was committed. False when the
        source log does not exist or the translation call raised (both
        are logged as warnings).

    Raises:
        Exception: Any error raised while writing the replacement row is
            re-raised after the session has been rolled back.
    """
    src_row = (await session.execute(
        select(StrategicLog).where(StrategicLog.id == log_id)
    )).scalar_one_or_none()
    if src_row is None:
        log.warning("backfill.missing_source", log_id=log_id)
        return False

    # Read the source text up front, before any commit on this session.
    # NOTE(review): if the session factory expires objects on commit, reading
    # src_row.content after a commit would need a reload — confirm settings.
    source_md = src_row.content

    try:
        translated_md, llm_result = await translate(client, source_md, lang)
    except Exception as exc:
        # Nothing has been deleted yet, so the old translation survives.
        log.warning("backfill.translate_failed",
                    log_id=log_id, lang=lang, error=str(exc)[:200])
        return False

    try:
        # Delete and insert in the same transaction: either the row is
        # swapped or the old one is kept.
        await session.execute(
            delete(StrategicLogTranslation)
            .where(StrategicLogTranslation.log_id == log_id)
            .where(StrategicLogTranslation.lang == lang)
        )
        session.add(StrategicLogTranslation(
            log_id=log_id,
            lang=lang,
            content=translated_md,
            model=llm_result.model,
            prompt_tokens=llm_result.prompt_tokens,
            completion_tokens=llm_result.completion_tokens,
            cost_usd=llm_result.cost_usd,
        ))
        await session.commit()
    except Exception:
        # Undo the pending delete and leave the session clean, then let the
        # database error propagate as it did before.
        await session.rollback()
        raise
    return True


async def main(args):
    """Run the backfill for the scope selected on the command line.

    Parses ``args.date`` / ``args.since`` as ``YYYY-MM-DD``, lists every
    translation row flagged as truncated, and, unless ``args.dry_run`` is
    set or nothing was found, re-translates each one and prints a summary.

    Args:
        args: Parsed CLI namespace with ``date``, ``since``, ``all`` and
            ``dry_run`` attributes.

    Raises:
        SystemExit: With status 2 when none of --date, --since or --all
            was supplied.
        ValueError: If a supplied date does not match ``%Y-%m-%d``.
    """
    def _parse_day(raw):
        # Missing/empty option -> None; otherwise a date object.
        return datetime.strptime(raw, "%Y-%m-%d").date() if raw else None

    day = _parse_day(args.date)
    since = _parse_day(args.since)
    if day is None and since is None and not args.all:
        print("Specify --date, --since, or --all", file=sys.stderr)
        sys.exit(2)

    make_session = get_session_factory()
    async with make_session() as session:
        targets = await _find_targets(session, day, since, args.all)
        print(f"Found {len(targets)} truncated translation row(s):")
        for row in targets:
            print(f"  log_id={row.log_id} lang={row.lang} "
                  f"en={row.en_chars}c tr={row.tr_chars}c "
                  f"tok={row.tr_tok} at {row.generated_at}")
        if args.dry_run or not targets:
            return

        succeeded = 0
        # One HTTP client is shared across all translation calls.
        async with httpx.AsyncClient(follow_redirects=True) as client:
            for row in targets:
                print(f"  re-translating log_id={row.log_id} lang={row.lang}…", end=" ")
                if await _retranslate_one(session, client, row.log_id, row.lang):
                    print("OK")
                    succeeded += 1
                else:
                    print("FAILED")
        print(f"\nRe-translated {succeeded}/{len(targets)} row(s).")


if __name__ == "__main__":
    # CLI wiring: the three scope flags are mutually exclusive; main()
    # rejects the case where none of them is given.
    parser = argparse.ArgumentParser()
    scope = parser.add_mutually_exclusive_group()
    scope.add_argument("--date", help="single day YYYY-MM-DD")
    scope.add_argument("--since", help="from YYYY-MM-DD onward")
    scope.add_argument("--all", action="store_true", help="entire history")
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="list affected rows without rewriting",
    )
    cli_args = parser.parse_args()
    asyncio.run(main(cli_args))