perf: 翻译独立后台循环(1 篇/秒)+ Semaphore 1

之前 fetch_one_source 入库后立即调翻译(可能并发触发腾讯 TMT 限速)
改为独立 translation_loop 后台循环:
- 完全不和 RSS 抓取并行
- 1 篇/秒节拍(Semaphore 1 + sleep 1.0)
- 没活时空闲 5 秒再轮询
- pending/failed 都重试
This commit is contained in:
Mavis
2026-06-08 00:27:09 +08:00
parent e79cfaa5f7
commit 9862a92423
6 changed files with 203 additions and 39 deletions

View File

@@ -1,14 +1,27 @@
"""RSS / Atom fetcher(基于 feedparser)。"""
"""RSS / Atom fetcher(基于 feedparser)。
增强:对 content 太短(< BODY_MIN_LEN)的 item,自动去 article URL 抓全文
用 trafilatura 抽取(从 RSS 摘要升级到全文)。
"""
from __future__ import annotations
import logging
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
import feedparser
import httpx
import trafilatura
from bs4 import BeautifulSoup
from dateutil import parser as dtp
from app.services.fetchers.base import BaseFetcher, FetchedItem
logger = logging.getLogger("news.fetcher.rss")
# 如果 RSS 给的 body 不到这个字符数,就自动去 article URL 抓全文
BODY_MIN_LEN = 500
class RSSFetcher(BaseFetcher):
async def fetch(self) -> list[FetchedItem]:
@@ -22,6 +35,9 @@ class RSSFetcher(BaseFetcher):
if feed.bozo and not feed.entries:
# 整篇解析失败
raise RuntimeError(f"RSS parse failed: {feed.bozo_exception}")
# 拿到 fetch 上下文
self._http_client: httpx.AsyncClient | None = None
items: list[FetchedItem] = []
for e in feed.entries:
url = e.get("link") or e.get("id")
@@ -31,41 +47,18 @@ class RSSFetcher(BaseFetcher):
if not title:
continue
body_html = None
body_text = ""
if e.get("content"):
# 选最长 content
contents = sorted(e["content"], key=lambda c: -len(c.get("value", "")))
body_html = contents[0].get("value")
if not body_html:
body_html = e.get("summary")
if body_html:
from bs4 import BeautifulSoup
body_html, body_text = self._extract_from_entry(e)
soup = BeautifulSoup(body_html, "lxml")
# 去 script/style
for tag in soup(["script", "style", "noscript"]):
tag.decompose()
body_text = soup.get_text(separator="\n", strip=True)
# body 太短:去 article URL 抓全文(trafilatura)
if len(body_text) < BODY_MIN_LEN and url:
full_html, full_text = await self._fetch_fulltext(url)
if full_text and len(full_text) > len(body_text):
body_text = full_text
body_html = full_html or body_html
published_at = _parse_dt(e.get("published") or e.get("updated") or e.get("created"))
author = e.get("author")
image_url = None
if e.get("media_content"):
try:
image_url = e["media_content"][0].get("url")
except (IndexError, KeyError, TypeError):
pass
if not image_url and e.get("media_thumbnail"):
try:
image_url = e["media_thumbnail"][0].get("url")
except (IndexError, KeyError, TypeError):
pass
if not image_url and e.get("enclosures"):
for enc in e["enclosures"]:
if enc.get("type", "").startswith("image/"):
image_url = enc.get("href") or enc.get("url")
break
image_url = self._extract_image(e)
items.append(
FetchedItem(
@@ -80,8 +73,67 @@ class RSSFetcher(BaseFetcher):
guid=e.get("id") or e.get("guid"),
)
)
if self._http_client is not None:
await self._http_client.aclose()
return items
@staticmethod
def _extract_from_entry(e) -> tuple[str | None, str]:
body_html = None
if e.get("content"):
contents = sorted(e["content"], key=lambda c: -len(c.get("value", "")))
body_html = contents[0].get("value")
if not body_html:
body_html = e.get("summary")
if not body_html:
return None, ""
soup = BeautifulSoup(body_html, "lxml")
for tag in soup(["script", "style", "noscript"]):
tag.decompose()
text = soup.get_text(separator="\n", strip=True)
return body_html, text
@staticmethod
def _extract_image(e) -> str | None:
if e.get("media_content"):
try:
return e["media_content"][0].get("url")
except (IndexError, KeyError, TypeError):
pass
if e.get("media_thumbnail"):
try:
return e["media_thumbnail"][0].get("url")
except (IndexError, KeyError, TypeError):
pass
if e.get("enclosures"):
for enc in e["enclosures"]:
if enc.get("type", "").startswith("image/"):
return enc.get("href") or enc.get("url")
return None
async def _fetch_fulltext(self, url: str) -> tuple[str | None, str]:
    """Download the article page and extract its main content with trafilatura.

    The HTTP client is created lazily and cached on ``self._http_client`` so
    later calls reuse it.  The call is best-effort: any download or
    extraction failure is logged as a warning and reported as "no full text"
    instead of being raised.

    Args:
        url: Article URL to download.

    Returns:
        ``(html, text)`` as produced by trafilatura (either may be an empty
        string when nothing could be extracted), or ``(None, "")`` when the
        download or the extraction failed.
    """
    try:
        # Tolerate a missing attribute (method called before the client slot
        # was initialised) and a client that has already been closed; in
        # both cases build a fresh client instead of failing the download.
        client = getattr(self, "_http_client", None)
        if client is None or client.is_closed:
            client = httpx.AsyncClient(
                follow_redirects=True,
                timeout=20,
                headers={"User-Agent": "Mozilla/5.0 (compatible; DiaryNews/0.1)"},
            )
            self._http_client = client
        r = await client.get(url)
        r.raise_for_status()
    except Exception as e:
        logger.warning("fulltext fetch failed for %s: %s", url, e)
        return None, ""

    # Same extraction settings for both renderings; only the format differs.
    options = {
        "include_comments": False,
        "include_tables": False,
        "favor_recall": True,
    }
    try:
        page = r.text
        html = trafilatura.extract(page, output_format="html", **options) or ""
        text = trafilatura.extract(page, output_format="txt", **options) or ""
    except Exception as e:
        logger.warning("trafilatura extract failed for %s: %s", url, e)
        return None, ""
    return html, text
def _parse_dt(s: str | None) -> datetime | None:
if not s: