feat(translate): 增加译文清洗 — pipeline 接入源头防御 + 批量清洗历史脚本
- 新增 app/services/translation/text_clean.py:提供 clean_markdown_asterisks / clean_html_inner_text / wrap_html 共用工具,用于清洗 LLM 输出中残留的 ** / * / *** markdown 标记
- 修改 pipeline.translate_article:写库前清洗 tr_title / tr_body,新翻译不再带 **;同时把私有 _wrap_html 替换为公开 wrap_html
- 新增 app/scripts/clean_translations.py:批量清洗历史脏数据,覆盖 5 个字段(title_zh / body_zh_text / body_zh_html / body_zh_formatted / summary_zh),支持 dry-run / limit / source-slug / field
This commit is contained in:
@@ -19,6 +19,7 @@ from app.models.source import Source, SourceKind
|
||||
from app.services.fetchers import get_fetcher
|
||||
from app.services.fetchers.base import FetchedItem, url_hash
|
||||
from app.services.translation.service import service as translation_service
|
||||
from app.services.translation.text_clean import clean_markdown_asterisks, wrap_html
|
||||
|
||||
logger = logging.getLogger("news.pipeline")
|
||||
|
||||
@@ -232,15 +233,20 @@ async def translate_article(article_id: int) -> None:
|
||||
await session.commit()
|
||||
return
|
||||
|
||||
# 写库前清洗:LLM 偶尔会把 markdown 加粗标记 ** / * 带进译文。
|
||||
# 源头控制比事后批量洗更稳 — 历史脏数据由 scripts/clean_translations.py 处理。
|
||||
tr_title_clean = clean_markdown_asterisks(tr_title.text)
|
||||
tr_body_clean = clean_markdown_asterisks(tr_body)
|
||||
|
||||
# 写回
|
||||
async with AsyncSessionLocal() as session:
|
||||
art = (
|
||||
await session.execute(select(Article).where(Article.id == article_id_ref))
|
||||
).scalar_one_or_none()
|
||||
if art:
|
||||
art.title_zh = tr_title.text if tr_title.text else None
|
||||
art.body_zh_text = tr_body or None
|
||||
art.body_zh_html = _wrap_html(tr_body) if tr_body else None
|
||||
art.title_zh = tr_title_clean or None
|
||||
art.body_zh_text = tr_body_clean or None
|
||||
art.body_zh_html = wrap_html(tr_body_clean) if tr_body_clean else None
|
||||
art.translation_status = status
|
||||
art.translation_engine = engine_label
|
||||
art.translation_chars = total_chars
|
||||
@@ -293,14 +299,6 @@ def _split_long_para(para: str, max_chars: int) -> list[str]:
|
||||
return parts
|
||||
|
||||
|
||||
def _wrap_html(text: str) -> str:
|
||||
"""把译文包成 HTML 段落。"""
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
parts = [f"<p>{p.strip()}</p>" for p in text.split("\n\n") if p.strip()]
|
||||
return "\n".join(parts) if parts else ""
|
||||
|
||||
|
||||
# === 全量跑(供测试 / 手动触发) ===
|
||||
async def run_once() -> None:
|
||||
async with AsyncSessionLocal() as session:
|
||||
|
||||
Reference in New Issue
Block a user