From a5548d6e64e6f3339c5d84bd0decd4056d26f025 Mon Sep 17 00:00:00 2001 From: Mavis Date: Mon, 8 Jun 2026 15:55:30 +0800 Subject: [PATCH] =?UTF-8?q?fix(fetcher):=20fulltext=20=E6=8A=93=E5=8F=96?= =?UTF-8?q?=E7=94=A8=E7=9C=9F=E5=AE=9E=E6=B5=8F=E8=A7=88=E5=99=A8=20UA,?= =?UTF-8?q?=E7=BB=95=E8=BF=87=20NHK=20=E7=AD=89=20403?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/app/services/fetchers/rss.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/backend/app/services/fetchers/rss.py b/backend/app/services/fetchers/rss.py index 20fd9c3..eeeef68 100644 --- a/backend/app/services/fetchers/rss.py +++ b/backend/app/services/fetchers/rss.py @@ -115,10 +115,19 @@ class RSSFetcher(BaseFetcher): """去 article URL 抓全文,用 trafilatura 抽正文。""" try: if self._http_client is None: + # 用真实浏览器 UA(很多站[如 NHK news.web]把爬虫 UA 直接 403) self._http_client = httpx.AsyncClient( follow_redirects=True, timeout=20, - headers={"User-Agent": "Mozilla/5.0 (compatible; DiaryNews/0.1)"}, + headers={ + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/120.0.0.0 Safari/537.36" + ), + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language": "ja,en-US;q=0.9,en;q=0.8,zh;q=0.7", + }, ) r = await self._http_client.get(url) r.raise_for_status()