perf: 翻译独立后台循环(1 篇/秒)+ Semaphore 1

之前 fetch_one_source 入库后立即调用翻译(可能并发触发腾讯 TMT 限速)。现改为独立的 translation_loop 后台循环:

- 完全不和 RSS 抓取并行
- 1 篇/秒节拍(Semaphore 1 + sleep 1.0)
- 没活时空闲 5 秒再轮询
- pending/failed 都重试
2026-06-08 00:27:09 +08:00
parent e79cfaa5f7
commit 9862a92423
6 changed files with 203 additions and 39 deletions
--- a/backend/app/services/fetchers/rss.py
+++ b/backend/app/services/fetchers/rss.py
@@ -1,14 +1,27 @@
-"""RSS / Atom fetcher(基于 feedparser)。"""
+"""RSS / Atom fetcher(基于 feedparser)。
+
+增强:对 content 太短(< BODY_MIN_LEN)的 item,自动去 article URL 抓全文
+用 trafilatura 抽取(从 RSS 摘要升级到全文)。
+"""
 from __future__ import annotations

+import logging
 from datetime import datetime, timezone
 from email.utils import parsedate_to_datetime

 import feedparser
+import httpx
+import trafilatura
+from bs4 import BeautifulSoup
 from dateutil import parser as dtp

 from app.services.fetchers.base import BaseFetcher, FetchedItem

+logger = logging.getLogger("news.fetcher.rss")
+
+# 如果 RSS 给的 body 不到这个字符数,就自动去 article URL 抓全文
+BODY_MIN_LEN = 500
+

 class RSSFetcher(BaseFetcher):
    async def fetch(self) -> list[FetchedItem]:
@@ -22,6 +35,9 @@ class RSSFetcher(BaseFetcher):
        if feed.bozo and not feed.entries:
            # 整篇解析失败
            raise RuntimeError(f"RSS parse failed: {feed.bozo_exception}")
+
+        # 拿到 fetch 上下文
+        self._http_client: httpx.AsyncClient | None = None
        items: list[FetchedItem] = []
        for e in feed.entries:
            url = e.get("link") or e.get("id")
@@ -31,41 +47,18 @@ class RSSFetcher(BaseFetcher):
            if not title:
                continue

-            body_html = None
-            body_text = ""
-            if e.get("content"):
-                # 选最长 content
-                contents = sorted(e["content"], key=lambda c: -len(c.get("value", "")))
-                body_html = contents[0].get("value")
-            if not body_html:
-                body_html = e.get("summary")
-            if body_html:
-                from bs4 import BeautifulSoup
+            body_html, body_text = self._extract_from_entry(e)

-                soup = BeautifulSoup(body_html, "lxml")
-                # 去 script/style
-                for tag in soup(["script", "style", "noscript"]):
-                    tag.decompose()
-                body_text = soup.get_text(separator="\n", strip=True)
+            # body 太短:去 article URL 抓全文(trafilatura)
+            if len(body_text) < BODY_MIN_LEN and url:
+                full_html, full_text = await self._fetch_fulltext(url)
+                if full_text and len(full_text) > len(body_text):
+                    body_text = full_text
+                    body_html = full_html or body_html

            published_at = _parse_dt(e.get("published") or e.get("updated") or e.get("created"))
            author = e.get("author")
-            image_url = None
-            if e.get("media_content"):
-                try:
-                    image_url = e["media_content"][0].get("url")
-                except (IndexError, KeyError, TypeError):
-                    pass
-            if not image_url and e.get("media_thumbnail"):
-                try:
-                    image_url = e["media_thumbnail"][0].get("url")
-                except (IndexError, KeyError, TypeError):
-                    pass
-            if not image_url and e.get("enclosures"):
-                for enc in e["enclosures"]:
-                    if enc.get("type", "").startswith("image/"):
-                        image_url = enc.get("href") or enc.get("url")
-                        break
+            image_url = self._extract_image(e)

            items.append(
                FetchedItem(
@@ -80,8 +73,67 @@ class RSSFetcher(BaseFetcher):
                    guid=e.get("id") or e.get("guid"),
                )
            )
+        if self._http_client is not None:
+            await self._http_client.aclose()
        return items

+    @staticmethod
+    def _extract_from_entry(e) -> tuple[str | None, str]:
+        """Return ``(body_html, body_text)`` taken from one feed entry.
+
+        The longest ``content`` value is preferred and ``summary`` is the
+        fallback. When neither yields any HTML the result is ``(None, "")``.
+        The text is the HTML parsed with BeautifulSoup (lxml) after the
+        script, style and noscript elements have been removed.
+        """
+        raw_html = None
+        candidates = e.get("content")
+        if candidates:
+            # max() returns the first of several equally long values, which
+            # is the same element a stable descending sort would put first.
+            longest = max(candidates, key=lambda c: len(c.get("value", "")))
+            raw_html = longest.get("value")
+        raw_html = raw_html or e.get("summary")
+        if not raw_html:
+            return None, ""
+
+        soup = BeautifulSoup(raw_html, "lxml")
+        for node in soup.find_all(["script", "style", "noscript"]):
+            node.decompose()
+        return raw_html, soup.get_text(separator="\n", strip=True)
+
+    @staticmethod
+    def _extract_image(e) -> str | None:
+        """Return the first usable image URL of a feed entry, or ``None``.
+
+        Sources are tried in order: ``media_content``, ``media_thumbnail``,
+        then the first ``enclosures`` item whose type starts with ``image/``.
+        A source that is present but carries no URL does not end the search;
+        the next source is tried instead.
+        """
+        for field in ("media_content", "media_thumbnail"):
+            media = e.get(field)
+            if not media:
+                continue
+            try:
+                found = media[0].get("url")
+            except (IndexError, KeyError, TypeError, AttributeError):
+                # Malformed media list: treat as "no image here" and move on.
+                continue
+            if found:
+                return found
+
+        for enc in e.get("enclosures") or ():
+            # "type" may be missing or None; both count as non-image.
+            if (enc.get("type") or "").startswith("image/"):
+                found = enc.get("href") or enc.get("url")
+                if found:
+                    return found
+        return None
+
+    async def _fetch_fulltext(self, url: str) -> tuple[str | None, str]:
+        """Download the article at *url* and extract its main content.
+
+        The page is fetched with a lazily created ``httpx.AsyncClient`` kept
+        on ``self._http_client`` and the body is extracted with trafilatura.
+
+        Returns ``(html, text)``. Any download or extraction failure is
+        logged as a warning and reported as ``(None, "")``; the same pair is
+        returned when trafilatura finds no text. This method never raises.
+        """
+        try:
+            # getattr: the attribute is only assigned inside fetch(), so a
+            # call outside a fetch() run must not fail with AttributeError.
+            client = getattr(self, "_http_client", None)
+            if client is None:
+                client = self._http_client = httpx.AsyncClient(
+                    follow_redirects=True,
+                    timeout=20,
+                    headers={"User-Agent": "Mozilla/5.0 (compatible; DiaryNews/0.1)"},
+                )
+            r = await client.get(url)
+            r.raise_for_status()
+        except Exception as e:
+            # Best effort: the caller keeps the RSS-provided body on failure.
+            logger.warning("fulltext fetch failed for %s: %s", url, e)
+            return None, ""
+
+        options = {
+            "include_comments": False,
+            "include_tables": False,
+            "favor_recall": True,
+        }
+        # NOTE(review): trafilatura.extract is synchronous and runs on the
+        # event loop here; confirm whether that is acceptable for large pages.
+        try:
+            page = r.text
+            text = trafilatura.extract(page, output_format="txt", **options) or ""
+            if not text:
+                # Nothing extracted: skip the second (HTML) extraction pass.
+                return None, ""
+            html = trafilatura.extract(page, output_format="html", **options) or ""
+        except Exception as e:
+            logger.warning("trafilatura extract failed for %s: %s", url, e)
+            return None, ""
+        return html, text
+

 def _parse_dt(s: str | None) -> datetime | None:
    if not s: