fix(fetcher): fulltext 抓取用真实浏览器 UA,绕过 NHK 等 403

2026-06-08 15:55:30 +08:00
parent 6b5828c1c0
commit a5548d6e64
1 changed files with 10 additions and 1 deletions
--- a/backend/app/services/fetchers/rss.py
+++ b/backend/app/services/fetchers/rss.py
@@ -115,10 +115,19 @@ class RSSFetcher(BaseFetcher):
        """去 article URL 抓全文,用 trafilatura 抽正文。"""
        try:
            if self._http_client is None:
+                # 用真实浏览器 UA(很多站[如 NHK news.web]把爬虫 UA 直接 403)
                self._http_client = httpx.AsyncClient(
                    follow_redirects=True,
                    timeout=20,
-                    headers={"User-Agent": "Mozilla/5.0 (compatible; DiaryNews/0.1)"},
+                    headers={
+                        "User-Agent": (
+                            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                            "AppleWebKit/537.36 (KHTML, like Gecko) "
+                            "Chrome/120.0.0.0 Safari/537.36"
+                        ),
+                        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+                        "Accept-Language": "ja,en-US;q=0.9,en;q=0.8,zh;q=0.7",
+                    },
                )
            r = await self._http_client.get(url)
            r.raise_for_status()