feat(translate): 增加译文清洗 — pipeline 接入源头防御 + 批量清洗历史脚本

- 新增 app/services/translation/text_clean.py
  clean_markdown_asterisks / clean_html_inner_text / wrap_html 共用工具,
  清洗 LLM 输出残留的 ** / * / *** markdown 标记
- 改 pipeline.translate_article: 写库前清洗 tr_title/tr_body,
  新翻译不再带 **;同时把私有 _wrap_html 替换为公开 wrap_html
- 新增 app/scripts/clean_translations.py
  批量清洗历史脏数据 — 5 字段(title_zh/body_zh_text/body_zh_html/
  body_zh_formatted/summary_zh),支持 dry-run/limit/source-slug/field
This commit is contained in:
xiaji
2026-06-16 22:12:45 +08:00
parent b5dfedc862
commit 8dccf08126
3 changed files with 468 additions and 11 deletions

View File

@@ -19,6 +19,7 @@ from app.models.source import Source, SourceKind
from app.services.fetchers import get_fetcher
from app.services.fetchers.base import FetchedItem, url_hash
from app.services.translation.service import service as translation_service
from app.services.translation.text_clean import clean_markdown_asterisks, wrap_html
logger = logging.getLogger("news.pipeline")
@@ -232,15 +233,20 @@ async def translate_article(article_id: int) -> None:
await session.commit()
return
# 写库前清洗:LLM 偶尔会把 markdown 加粗标记 ** / * 带进译文。
# 源头控制比事后批量洗更稳 — 历史脏数据由 scripts/clean_translations.py 处理。
tr_title_clean = clean_markdown_asterisks(tr_title.text)
tr_body_clean = clean_markdown_asterisks(tr_body)
# 写回
async with AsyncSessionLocal() as session:
art = (
await session.execute(select(Article).where(Article.id == article_id_ref))
).scalar_one_or_none()
if art:
art.title_zh = tr_title.text if tr_title.text else None
art.body_zh_text = tr_body or None
art.body_zh_html = _wrap_html(tr_body) if tr_body else None
art.title_zh = tr_title_clean or None
art.body_zh_text = tr_body_clean or None
art.body_zh_html = wrap_html(tr_body_clean) if tr_body_clean else None
art.translation_status = status
art.translation_engine = engine_label
art.translation_chars = total_chars
@@ -293,14 +299,6 @@ def _split_long_para(para: str, max_chars: int) -> list[str]:
return parts
def _wrap_html(text: str) -> str:
    """Wrap translated plain text into HTML paragraphs.

    The text is split on blank lines (``"\\n\\n"``); each non-empty chunk is
    stripped and emitted as one ``<p>...</p>`` element, and the elements are
    joined with a single newline.

    Args:
        text: Plain-text translation, paragraphs separated by blank lines.

    Returns:
        The HTML fragment, or ``""`` when ``text`` has no non-blank paragraph.
    """
    # Local import keeps this block self-contained; the previously imported
    # (and never used) BeautifulSoup dependency is dropped.
    from html import escape

    paragraphs = (chunk.strip() for chunk in text.split("\n\n"))
    # The input is plain text produced by a translation engine, so &, < and >
    # must be escaped or they would corrupt (or inject into) the markup.
    return "\n".join(
        f"<p>{escape(para, quote=False)}</p>" for para in paragraphs if para
    )
# === 全量跑(供测试 / 手动触发) ===
async def run_once() -> None:
async with AsyncSessionLocal() as session: