From a5548d6e64e6f3339c5d84bd0decd4056d26f025 Mon Sep 17 00:00:00 2001
From: Mavis <Mavis@local>
Date: Mon, 8 Jun 2026 15:55:30 +0800
Subject: [PATCH] =?UTF-8?q?fix(fetcher):=20fulltext=20=E6=8A=93=E5=8F=96?=
 =?UTF-8?q?=E7=94=A8=E7=9C=9F=E5=AE=9E=E6=B5=8F=E8=A7=88=E5=99=A8=20UA,?=
 =?UTF-8?q?=E7=BB=95=E8=BF=87=20NHK=20=E7=AD=89=20403?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/app/services/fetchers/rss.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/backend/app/services/fetchers/rss.py b/backend/app/services/fetchers/rss.py
index 20fd9c3..eeeef68 100644
--- a/backend/app/services/fetchers/rss.py
+++ b/backend/app/services/fetchers/rss.py
@@ -115,10 +115,19 @@ class RSSFetcher(BaseFetcher):
         """去 article URL 抓全文,用 trafilatura 抽正文。"""
         try:
             if self._http_client is None:
+                # 用真实浏览器 UA(很多站[如 NHK news.web]把爬虫 UA 直接 403)
                 self._http_client = httpx.AsyncClient(
                     follow_redirects=True,
                     timeout=20,
-                    headers={"User-Agent": "Mozilla/5.0 (compatible; DiaryNews/0.1)"},
+                    headers={
+                        "User-Agent": (
+                            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                            "AppleWebKit/537.36 (KHTML, like Gecko) "
+                            "Chrome/120.0.0.0 Safari/537.36"
+                        ),
+                        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+                        "Accept-Language": "ja,en-US;q=0.9,en;q=0.8,zh;q=0.7",
+                    },
                 )
             r = await self._http_client.get(url)
             r.raise_for_status()