perf: 翻译独立后台循环(1 篇/秒)+ Semaphore 1
之前 fetch_one_source 入库后立即调用翻译(可能并发触发腾讯 TMT 限速)。
现改为独立的 translation_loop 后台循环:
- 完全不和 RSS 抓取并行
- 1 篇/秒节拍(Semaphore 1 + sleep 1.0)
- 没活时空闲 5 秒再轮询
- pending/failed 都重试
This commit is contained in:
@@ -1,14 +1,27 @@
|
||||
"""RSS / Atom fetcher(基于 feedparser)。"""
|
||||
"""RSS / Atom fetcher(基于 feedparser)。
|
||||
|
||||
增强:对 content 太短(< BODY_MIN_LEN)的 item,自动去 article URL 抓全文
|
||||
用 trafilatura 抽取(从 RSS 摘要升级到全文)。
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from email.utils import parsedate_to_datetime
|
||||
|
||||
import feedparser
|
||||
import httpx
|
||||
import trafilatura
|
||||
from bs4 import BeautifulSoup
|
||||
from dateutil import parser as dtp
|
||||
|
||||
from app.services.fetchers.base import BaseFetcher, FetchedItem
|
||||
|
||||
logger = logging.getLogger("news.fetcher.rss")
|
||||
|
||||
# 如果 RSS 给的 body 不到这个字符数,就自动去 article URL 抓全文
|
||||
BODY_MIN_LEN = 500
|
||||
|
||||
|
||||
class RSSFetcher(BaseFetcher):
|
||||
async def fetch(self) -> list[FetchedItem]:
|
||||
@@ -22,6 +35,9 @@ class RSSFetcher(BaseFetcher):
|
||||
if feed.bozo and not feed.entries:
|
||||
# 整篇解析失败
|
||||
raise RuntimeError(f"RSS parse failed: {feed.bozo_exception}")
|
||||
|
||||
# 拿到 fetch 上下文
|
||||
self._http_client: httpx.AsyncClient | None = None
|
||||
items: list[FetchedItem] = []
|
||||
for e in feed.entries:
|
||||
url = e.get("link") or e.get("id")
|
||||
@@ -31,41 +47,18 @@ class RSSFetcher(BaseFetcher):
|
||||
if not title:
|
||||
continue
|
||||
|
||||
body_html = None
|
||||
body_text = ""
|
||||
if e.get("content"):
|
||||
# 选最长 content
|
||||
contents = sorted(e["content"], key=lambda c: -len(c.get("value", "")))
|
||||
body_html = contents[0].get("value")
|
||||
if not body_html:
|
||||
body_html = e.get("summary")
|
||||
if body_html:
|
||||
from bs4 import BeautifulSoup
|
||||
body_html, body_text = self._extract_from_entry(e)
|
||||
|
||||
soup = BeautifulSoup(body_html, "lxml")
|
||||
# 去 script/style
|
||||
for tag in soup(["script", "style", "noscript"]):
|
||||
tag.decompose()
|
||||
body_text = soup.get_text(separator="\n", strip=True)
|
||||
# body 太短:去 article URL 抓全文(trafilatura)
|
||||
if len(body_text) < BODY_MIN_LEN and url:
|
||||
full_html, full_text = await self._fetch_fulltext(url)
|
||||
if full_text and len(full_text) > len(body_text):
|
||||
body_text = full_text
|
||||
body_html = full_html or body_html
|
||||
|
||||
published_at = _parse_dt(e.get("published") or e.get("updated") or e.get("created"))
|
||||
author = e.get("author")
|
||||
image_url = None
|
||||
if e.get("media_content"):
|
||||
try:
|
||||
image_url = e["media_content"][0].get("url")
|
||||
except (IndexError, KeyError, TypeError):
|
||||
pass
|
||||
if not image_url and e.get("media_thumbnail"):
|
||||
try:
|
||||
image_url = e["media_thumbnail"][0].get("url")
|
||||
except (IndexError, KeyError, TypeError):
|
||||
pass
|
||||
if not image_url and e.get("enclosures"):
|
||||
for enc in e["enclosures"]:
|
||||
if enc.get("type", "").startswith("image/"):
|
||||
image_url = enc.get("href") or enc.get("url")
|
||||
break
|
||||
image_url = self._extract_image(e)
|
||||
|
||||
items.append(
|
||||
FetchedItem(
|
||||
@@ -80,8 +73,67 @@ class RSSFetcher(BaseFetcher):
|
||||
guid=e.get("id") or e.get("guid"),
|
||||
)
|
||||
)
|
||||
if self._http_client is not None:
|
||||
await self._http_client.aclose()
|
||||
return items
|
||||
|
||||
@staticmethod
def _extract_from_entry(e) -> tuple[str | None, str]:
    """Pull the body HTML and its plain-text rendering out of a feed entry.

    The longest non-empty ``content`` value is preferred; ``summary`` is the
    fallback when no usable content is present.

    Args:
        e: A feed entry supporting ``.get`` / item access (e.g. one element
            of ``feed.entries``).

    Returns:
        ``(body_html, body_text)``. ``(None, "")`` when the entry carries
        neither usable content nor a summary.
    """
    # A content item whose ``value`` is missing *or* None counts as empty,
    # so len() is never applied to None.
    candidates = [c.get("value") or "" for c in e.get("content") or []]
    # max() yields the first longest candidate -- the same pick as a stable
    # sort by descending length, without sorting the whole list.
    body_html = max(candidates, key=len, default="") or e.get("summary")
    if not body_html:
        return None, ""

    soup = BeautifulSoup(body_html, "lxml")
    # Remove non-content tags so their source does not leak into the text.
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    return body_html, soup.get_text(separator="\n", strip=True)
|
||||
|
||||
@staticmethod
def _extract_image(e) -> str | None:
    """Return the first usable image URL found on a feed entry, or None.

    Sources are tried in order: ``media_content``, ``media_thumbnail``, then
    image-typed ``enclosures``. A source that is present but yields no URL
    does not end the search; the next source is tried instead.

    Args:
        e: A feed entry supporting ``.get`` (e.g. one element of
            ``feed.entries``).

    Returns:
        The image URL, or ``None`` when no source provides one.
    """
    for key in ("media_content", "media_thumbnail"):
        media = e.get(key)
        if not media:
            continue
        try:
            url = media[0].get("url")
        except (IndexError, KeyError, TypeError, AttributeError):
            # Malformed media list: best-effort, move on to the next source.
            continue
        if url:
            return url

    for enc in e.get("enclosures") or []:
        # ``type`` may be absent or explicitly None; normalise before the
        # prefix test so it cannot raise.
        if (enc.get("type") or "").startswith("image/"):
            url = enc.get("href") or enc.get("url")
            if url:
                return url
    return None
|
||||
|
||||
async def _fetch_fulltext(self, url: str) -> tuple[str | None, str]:
    """Download the article at ``url`` and extract its main content with trafilatura.

    The HTTP client is created on first use and stored on
    ``self._http_client`` so later calls reuse it.

    Returns:
        ``(html, text)`` as produced by trafilatura (each an empty string
        when extraction yields nothing), or ``(None, "")`` when the download
        or the extraction raises; the failure is logged as a warning.
    """
    try:
        client = self._http_client
        if client is None:
            client = self._http_client = httpx.AsyncClient(
                follow_redirects=True,
                timeout=20,
                headers={"User-Agent": "Mozilla/5.0 (compatible; DiaryNews/0.1)"},
            )
        response = await client.get(url)
        response.raise_for_status()
    except Exception as exc:
        logger.warning("fulltext fetch failed for %s: %s", url, exc)
        return None, ""

    # Options shared by both extraction passes; only the output format differs.
    extract_opts = {
        "include_comments": False,
        "include_tables": False,
        "favor_recall": True,
    }
    try:
        html, text = (
            trafilatura.extract(response.text, output_format=fmt, **extract_opts) or ""
            for fmt in ("html", "txt")
        )
    except Exception as exc:
        logger.warning("trafilatura extract failed for %s: %s", url, exc)
        return None, ""
    return html, text
|
||||
|
||||
|
||||
def _parse_dt(s: str | None) -> datetime | None:
|
||||
if not s:
|
||||
|
||||
Reference in New Issue
Block a user