feat(translate): 增加译文清洗 — pipeline 接入源头防御 + 批量清洗历史脚本

- 新增 app/services/translation/text_clean.py:clean_markdown_asterisks / clean_html_inner_text / wrap_html 共用工具, 清洗 LLM 输出残留的 ** / * / *** markdown 标记
- 改 pipeline.translate_article: 写库前清洗 tr_title/tr_body, 新翻译不再带 **;同时把私有 _wrap_html 替换为公开 wrap_html
- 新增 app/scripts/clean_translations.py 批量清洗历史脏数据 — 5 字段(title_zh/body_zh_text/body_zh_html/body_zh_formatted/summary_zh),支持 dry-run/limit/source-slug/field
2026-06-16 22:12:45 +08:00
parent b5dfedc862
commit 8dccf08126
3 changed files with 468 additions and 11 deletions
--- a/backend/app/workers/pipeline.py
+++ b/backend/app/workers/pipeline.py
@@ -19,6 +19,7 @@ from app.models.source import Source, SourceKind
 from app.services.fetchers import get_fetcher
 from app.services.fetchers.base import FetchedItem, url_hash
 from app.services.translation.service import service as translation_service
+from app.services.translation.text_clean import clean_markdown_asterisks, wrap_html

 logger = logging.getLogger("news.pipeline")

@@ -232,15 +233,20 @@ async def translate_article(article_id: int) -> None:
                await session.commit()
        return

+    # 写库前清洗:LLM 偶尔会把 markdown 加粗标记 ** / * 带进译文。
+    # 源头控制比事后批量洗更稳 — 历史脏数据由 scripts/clean_translations.py 处理。
+    tr_title_clean = clean_markdown_asterisks(tr_title.text)
+    tr_body_clean = clean_markdown_asterisks(tr_body)
+
    # 写回
    async with AsyncSessionLocal() as session:
        art = (
            await session.execute(select(Article).where(Article.id == article_id_ref))
        ).scalar_one_or_none()
        if art:
-            art.title_zh = tr_title.text if tr_title.text else None
-            art.body_zh_text = tr_body or None
-            art.body_zh_html = _wrap_html(tr_body) if tr_body else None
+            art.title_zh = tr_title_clean or None
+            art.body_zh_text = tr_body_clean or None
+            art.body_zh_html = wrap_html(tr_body_clean) if tr_body_clean else None
            art.translation_status = status
            art.translation_engine = engine_label
            art.translation_chars = total_chars
@@ -293,14 +299,6 @@ def _split_long_para(para: str, max_chars: int) -> list[str]:
    return parts


-def _wrap_html(text: str) -> str:
-    """把译文包成 HTML 段落。"""
-    from bs4 import BeautifulSoup
-
-    parts = [f"<p>{p.strip()}</p>" for p in text.split("\n\n") if p.strip()]
-    return "\n".join(parts) if parts else ""
-
-
 # === 全量跑(供测试 / 手动触发) ===
 async def run_once() -> None:
    async with AsyncSessionLocal() as session: