perf: 翻译独立后台循环(1 篇/秒)+ Semaphore 1

之前 fetch_one_source 入库后立即调翻译(可能并发触发腾讯 TMT 限速)
改为独立 translation_loop 后台循环:
- 翻译不再由 RSS 抓取流程触发(与抓取解耦,作为独立的 asyncio 后台任务运行)
- 1 篇/秒节拍(Semaphore 1 + sleep 1.0)
- 没活时空闲 5 秒再轮询
- pending/failed 都重试
This commit is contained in:
Mavis
2026-06-08 00:27:09 +08:00
parent e79cfaa5f7
commit 9862a92423
6 changed files with 203 additions and 39 deletions

View File

@@ -1,14 +1,27 @@
"""RSS / Atom fetcher(基于 feedparser)。""" """RSS / Atom fetcher(基于 feedparser)。
增强:对 content 太短(< BODY_MIN_LEN)的 item,自动去 article URL 抓全文
用 trafilatura 抽取(从 RSS 摘要升级到全文)。
"""
from __future__ import annotations from __future__ import annotations
import logging
from datetime import datetime, timezone from datetime import datetime, timezone
from email.utils import parsedate_to_datetime from email.utils import parsedate_to_datetime
import feedparser import feedparser
import httpx
import trafilatura
from bs4 import BeautifulSoup
from dateutil import parser as dtp from dateutil import parser as dtp
from app.services.fetchers.base import BaseFetcher, FetchedItem from app.services.fetchers.base import BaseFetcher, FetchedItem
logger = logging.getLogger("news.fetcher.rss")
# When the body taken from the RSS entry has fewer characters than this,
# the fetcher downloads the article URL and extracts the full text instead.
BODY_MIN_LEN = 500
class RSSFetcher(BaseFetcher): class RSSFetcher(BaseFetcher):
async def fetch(self) -> list[FetchedItem]: async def fetch(self) -> list[FetchedItem]:
@@ -22,6 +35,9 @@ class RSSFetcher(BaseFetcher):
if feed.bozo and not feed.entries: if feed.bozo and not feed.entries:
# 整篇解析失败 # 整篇解析失败
raise RuntimeError(f"RSS parse failed: {feed.bozo_exception}") raise RuntimeError(f"RSS parse failed: {feed.bozo_exception}")
# 拿到 fetch 上下文
self._http_client: httpx.AsyncClient | None = None
items: list[FetchedItem] = [] items: list[FetchedItem] = []
for e in feed.entries: for e in feed.entries:
url = e.get("link") or e.get("id") url = e.get("link") or e.get("id")
@@ -31,41 +47,18 @@ class RSSFetcher(BaseFetcher):
if not title: if not title:
continue continue
body_html = None body_html, body_text = self._extract_from_entry(e)
body_text = ""
if e.get("content"):
# 选最长 content
contents = sorted(e["content"], key=lambda c: -len(c.get("value", "")))
body_html = contents[0].get("value")
if not body_html:
body_html = e.get("summary")
if body_html:
from bs4 import BeautifulSoup
soup = BeautifulSoup(body_html, "lxml") # body 太短:去 article URL 抓全文(trafilatura)
# 去 script/style if len(body_text) < BODY_MIN_LEN and url:
for tag in soup(["script", "style", "noscript"]): full_html, full_text = await self._fetch_fulltext(url)
tag.decompose() if full_text and len(full_text) > len(body_text):
body_text = soup.get_text(separator="\n", strip=True) body_text = full_text
body_html = full_html or body_html
published_at = _parse_dt(e.get("published") or e.get("updated") or e.get("created")) published_at = _parse_dt(e.get("published") or e.get("updated") or e.get("created"))
author = e.get("author") author = e.get("author")
image_url = None image_url = self._extract_image(e)
if e.get("media_content"):
try:
image_url = e["media_content"][0].get("url")
except (IndexError, KeyError, TypeError):
pass
if not image_url and e.get("media_thumbnail"):
try:
image_url = e["media_thumbnail"][0].get("url")
except (IndexError, KeyError, TypeError):
pass
if not image_url and e.get("enclosures"):
for enc in e["enclosures"]:
if enc.get("type", "").startswith("image/"):
image_url = enc.get("href") or enc.get("url")
break
items.append( items.append(
FetchedItem( FetchedItem(
@@ -80,8 +73,67 @@ class RSSFetcher(BaseFetcher):
guid=e.get("id") or e.get("guid"), guid=e.get("id") or e.get("guid"),
) )
) )
if self._http_client is not None:
await self._http_client.aclose()
return items return items
@staticmethod
def _extract_from_entry(e) -> tuple[str | None, str]:
body_html = None
if e.get("content"):
contents = sorted(e["content"], key=lambda c: -len(c.get("value", "")))
body_html = contents[0].get("value")
if not body_html:
body_html = e.get("summary")
if not body_html:
return None, ""
soup = BeautifulSoup(body_html, "lxml")
for tag in soup(["script", "style", "noscript"]):
tag.decompose()
text = soup.get_text(separator="\n", strip=True)
return body_html, text
@staticmethod
def _extract_image(e) -> str | None:
if e.get("media_content"):
try:
return e["media_content"][0].get("url")
except (IndexError, KeyError, TypeError):
pass
if e.get("media_thumbnail"):
try:
return e["media_thumbnail"][0].get("url")
except (IndexError, KeyError, TypeError):
pass
if e.get("enclosures"):
for enc in e["enclosures"]:
if enc.get("type", "").startswith("image/"):
return enc.get("href") or enc.get("url")
return None
async def _fetch_fulltext(self, url: str) -> tuple[str | None, str]:
    """Download the article at *url* and extract its main content with trafilatura.

    Returns ``(html, text)``. Returns ``(None, "")`` when the download fails,
    when extraction raises, or when no text could be extracted. Failures are
    logged as warnings and never propagate to the caller.
    """
    try:
        # Lazily create a single client and reuse it for later calls; fetch()
        # closes it once all entries are processed. getattr() keeps this safe
        # even if the attribute was never initialised by fetch().
        if getattr(self, "_http_client", None) is None:
            self._http_client = httpx.AsyncClient(
                follow_redirects=True,
                timeout=20,
                headers={"User-Agent": "Mozilla/5.0 (compatible; DiaryNews/0.1)"},
            )
        r = await self._http_client.get(url)
        r.raise_for_status()
    except Exception as e:
        logger.warning("fulltext fetch failed for %s: %s", url, e)
        return None, ""

    extract_opts = dict(include_comments=False, include_tables=False, favor_recall=True)
    try:
        # Extract the text first: the caller only uses the result when text
        # was found, so the second (HTML) pass is skipped when it is empty.
        text = trafilatura.extract(r.text, output_format="txt", **extract_opts) or ""
        if not text:
            return None, ""
        html = trafilatura.extract(r.text, output_format="html", **extract_opts) or ""
    except Exception as e:
        logger.warning("trafilatura extract failed for %s: %s", url, e)
        return None, ""
    return html, text
def _parse_dt(s: str | None) -> datetime | None: def _parse_dt(s: str | None) -> datetime | None:
if not s: if not s:

View File

@@ -31,7 +31,8 @@ class TranslationService:
def __init__(self): def __init__(self):
self._tencent: BaseTranslator | None = None self._tencent: BaseTranslator | None = None
self._local: BaseTranslator | None = None self._local: BaseTranslator | None = None
self._sem = asyncio.Semaphore(3) # 并发限流 # 串行:1 个并发;避免触发腾讯 TMT 限速
self._sem = asyncio.Semaphore(1)
def _primary(self) -> BaseTranslator: def _primary(self) -> BaseTranslator:
if self._tencent is None: if self._tencent is None:

View File

@@ -17,7 +17,7 @@ from sqlalchemy import select
from app.config import settings from app.config import settings
from app.database import AsyncSessionLocal from app.database import AsyncSessionLocal
from app.models.source import Source from app.models.source import Source
from app.workers.pipeline import fetch_one_source, run_once from app.workers.pipeline import fetch_one_source, run_once, translation_loop
logger = logging.getLogger("news.worker") logger = logging.getLogger("news.worker")
logging.basicConfig( logging.basicConfig(
@@ -89,6 +89,10 @@ async def main() -> None:
scheduler.start() scheduler.start()
logger.info("scheduler started with %d jobs", len(scheduler.get_jobs())) logger.info("scheduler started with %d jobs", len(scheduler.get_jobs()))
# 独立的翻译后台循环(不和 RSS 抓取并行;1 篇/秒)
translation_task = asyncio.create_task(translation_loop(), name="translation_loop")
logger.info("translation_loop task scheduled (1 article/sec)")
stop = asyncio.Event() stop = asyncio.Event()
def _signal_handler(): def _signal_handler():
@@ -104,7 +108,12 @@ async def main() -> None:
pass pass
await stop.wait() await stop.wait()
logger.info("stopping scheduler") logger.info("stopping scheduler and translation loop")
translation_task.cancel()
try:
await translation_task
except asyncio.CancelledError:
pass
scheduler.shutdown(wait=False) scheduler.shutdown(wait=False)

View File

@@ -51,10 +51,7 @@ async def fetch_one_source(source_id: int) -> None:
n_new = await _bulk_insert(src, items) n_new = await _bulk_insert(src, items)
await _mark_success(source_id, n_new=n_new) await _mark_success(source_id, n_new=n_new)
logger.info("source %s: %d new articles", src.slug, n_new) logger.info("source %s: %d new articles (translation deferred to background loop)", src.slug, n_new)
# 入库后,挑高优先级 / 没翻译的开始翻译
await _translate_recent_for_source(source_id, max_n=20)
async def _mark_failure(source_id: int, status: str) -> None: async def _mark_failure(source_id: int, status: str) -> None:
@@ -271,3 +268,47 @@ async def run_once() -> None:
logger.info("run_once: %d enabled sources", len(sources)) logger.info("run_once: %d enabled sources", len(sources))
tasks = [fetch_one_source(s.id) for s in sources] tasks = [fetch_one_source(s.id) for s in sources]
await asyncio.gather(*tasks, return_exceptions=True) await asyncio.gather(*tasks, return_exceptions=True)
# === Translation background loop ===
# One article per second (the Semaphore(1) already lives inside the service;
# these values only set the pacing).
TRANSLATION_INTERVAL_SEC = 1.0
TRANSLATION_IDLE_INTERVAL_SEC = 5.0
TRANSLATION_BATCH_SIZE = 1 # translate at most 1 article per round
async def translation_loop() -> None:
    """Background worker that translates stored articles at a fixed pace.

    - Runs until cancelled; translation is no longer triggered by the fetch
      pipeline.
    - Candidates are articles whose ``translation_status`` is ``pending`` or
      ``failed``, oldest first (``fetched_at``, then ``id``).
    - After each article it sleeps ``TRANSLATION_INTERVAL_SEC``; when nothing
      is left it sleeps ``TRANSLATION_IDLE_INTERVAL_SEC`` before polling again.
    - The backlog is walked with a ``(fetched_at, id)`` cursor, so every
      candidate is attempted once per pass. Without it, an article that keeps
      ending up ``failed`` would be re-selected as the oldest candidate on
      every round and starve everything behind it.
    """
    logger.info("translation_loop started (interval=%.1fs)", TRANSLATION_INTERVAL_SEC)
    # (fetched_at, id) of the last article attempted in the current pass;
    # None means "start from the oldest candidate".
    cursor = None
    while True:
        try:
            stmt = select(Article).where(
                Article.translation_status.in_(("pending", "failed"))
            )
            if cursor is not None:
                last_fetched_at, last_id = cursor
                # Keyset condition: strictly after the cursor in the
                # (fetched_at, id) ordering used below.
                stmt = stmt.where(
                    (Article.fetched_at > last_fetched_at)
                    | ((Article.fetched_at == last_fetched_at) & (Article.id > last_id))
                )
            stmt = stmt.order_by(Article.fetched_at.asc(), Article.id.asc()).limit(
                TRANSLATION_BATCH_SIZE
            )
            async with AsyncSessionLocal() as session:
                rows = (await session.execute(stmt)).scalars()
                # Read the needed columns while the session is still open.
                batch = [(a.fetched_at, a.id) for a in rows]

            if not batch:
                # Either there is no work at all or this pass reached the end:
                # rewind the cursor and wait longer before polling again.
                cursor = None
                await asyncio.sleep(TRANSLATION_IDLE_INTERVAL_SEC)
                continue

            for fetched_at, aid in batch:
                try:
                    await translate_article(aid)
                except Exception as e:
                    logger.exception("translate_article %s failed: %s", aid, e)
                # NOTE(review): assumes fetched_at is never NULL; if it is,
                # the cursor is dropped and the next round restarts from the
                # oldest candidate.
                cursor = (fetched_at, aid) if fetched_at is not None else None
                # Pacing: one article per interval.
                await asyncio.sleep(TRANSLATION_INTERVAL_SEC)
        except Exception as e:
            # asyncio.CancelledError is not an Exception subclass, so task
            # cancellation still propagates out of the loop.
            logger.exception("translation_loop error: %s", e)
            await asyncio.sleep(TRANSLATION_IDLE_INTERVAL_SEC)

26
scripts/_rebuild_test.py Normal file
View File

@@ -0,0 +1,26 @@
import os, paramiko, json
# The SSH password comes from the environment; a missing REMOTE_PASS raises KeyError here.
PW = os.environ["REMOTE_PASS"]
c = paramiko.SSHClient()
# NOTE(review): AutoAddPolicy accepts unknown host keys without verification — confirm this is acceptable for this host.
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
# Password-only login as root: SSH agent and key-file lookup are both disabled.
c.connect("207.57.129.228", port=19717, username="root", password=PW, timeout=15, allow_agent=False, look_for_keys=False)
def run(cmd, t=15):
    """Run *cmd* on the remote host over the module-level SSH client ``c``.

    stdout is echoed locally and returned as a string. stderr output and a
    non-zero exit status are reported on local stderr instead of being
    silently dropped, so a failed remote step is visible.

    Args:
        cmd: Shell command line to execute remotely.
        t: Timeout in seconds passed to ``exec_command``.
    """
    import sys  # local import: the script's top-level imports are left untouched

    _stdin, stdout, stderr = c.exec_command(cmd, timeout=t)
    out = stdout.read().decode("utf-8", "replace")
    err = stderr.read().decode("utf-8", "replace")
    rc = stdout.channel.recv_exit_status()
    if out:
        print(out, end="")
    if err:
        print(err, end="", file=sys.stderr)
    if rc != 0:
        # The command text is not echoed: callers embed credentials in it.
        print(f"[run] remote command exited with status {rc}", file=sys.stderr)
    return out
# Pull the latest code, then rebuild and recreate the api container.
run("cd /srv/news && sudo -u news git pull --rebase 2>&1 | tail -3")
run("cd /srv/news && docker compose up -d --force-recreate --no-deps --build api 2>&1 | tail -5", t=120)
import time
# Presumably gives the recreated container time to start — TODO confirm 6 s is enough.
time.sleep(6)
# Log in, then fetch one article's detail.
# NOTE(review): the login password is hard-coded in this command — move it to an environment variable or secret store.
out = run("curl -s -X POST http://localhost/api/v1/auth/login -H 'Content-Type: application/json' -d '{\"username\":\"owner\",\"password\":\"Owner2026!\"}'")
# Raises if the login response is not JSON or has no "access_token" key.
token = json.loads(out)["access_token"]
out = run("curl -s -w '\nstatus=%{http_code}\n' -H 'Authorization: Bearer " + token + "' http://localhost/api/v1/articles/175177")
print("\n--- 详情响应 ---")
# Only the first 1000 characters of the response are shown.
print(out[:1000])
c.close()

35
scripts/_trafilatura.py Normal file
View File

@@ -0,0 +1,35 @@
import os, paramiko, base64
# The SSH password comes from the environment; a missing REMOTE_PASS raises KeyError here.
PW = os.environ["REMOTE_PASS"]
c = paramiko.SSHClient()
# NOTE(review): AutoAddPolicy accepts unknown host keys without verification — confirm this is acceptable for this host.
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
# Password-only login as root: SSH agent and key-file lookup are both disabled.
c.connect("207.57.129.228", port=19717, username="root", password=PW, timeout=15, allow_agent=False, look_for_keys=False)
def run(cmd, t=30):
    """Run *cmd* on the remote host over the module-level SSH client ``c``.

    stdout is echoed locally and returned as a string. stderr output and a
    non-zero exit status are reported on local stderr instead of being
    silently dropped, so a failed remote step is visible.

    Args:
        cmd: Shell command line to execute remotely.
        t: Timeout in seconds passed to ``exec_command``.
    """
    import sys  # local import: the script's top-level imports are left untouched

    _stdin, stdout, stderr = c.exec_command(cmd, timeout=t)
    out = stdout.read().decode("utf-8", "replace")
    err = stderr.read().decode("utf-8", "replace")
    rc = stdout.channel.recv_exit_status()
    if out:
        print(out, end="")
    if err:
        print(err, end="", file=sys.stderr)
    if rc != 0:
        print(f"[run] remote command exited with status {rc}", file=sys.stderr)
    return out
# Try trafilatura full-text extraction on an Al Jazeera article.
# The program below is shipped to the worker container and run there, so it
# must be valid standalone Python (indentation included).
script = '''
import asyncio, httpx, trafilatura
async def main():
    url = "https://www.aljazeera.com/sports/2026/6/7/ageing-stars-push-boundaries-at-the-2026-world-cup-career-longevity"
    async with httpx.AsyncClient(follow_redirects=True, timeout=20) as c:
        r = await c.get(url, headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36"})
    print("status:", r.status_code, "len:", len(r.text))
    extracted = trafilatura.extract(r.text, include_comments=False, include_tables=False, favor_recall=True, output_format="json")
    print("---JSON---")
    print((extracted or "")[:2000])
    print()
    print("---TEXT---")
    text = trafilatura.extract(r.text, include_comments=False, include_tables=False, favor_recall=True, output_format="txt")
    print((text or "")[:2000])
asyncio.run(main())
'''
# Base64-encode the program so it survives shell quoting on its way into the container.
b64 = base64.b64encode(script.encode()).decode()
run("docker exec news-aggregator-worker-1 sh -c 'echo " + b64 + " | base64 -d > /app/_tr.py'")
run("docker exec -w /app news-aggregator-worker-1 python /app/_tr.py 2>&1 | tail -50", t=60)
c.close()