diff --git a/backend/app/services/fetchers/rss.py b/backend/app/services/fetchers/rss.py index 20fd9c3..eeeef68 100644 --- a/backend/app/services/fetchers/rss.py +++ b/backend/app/services/fetchers/rss.py @@ -115,10 +115,19 @@ class RSSFetcher(BaseFetcher): """去 article URL 抓全文,用 trafilatura 抽正文。""" try: if self._http_client is None: + # 用真实浏览器 UA(很多站[如 NHK news.web]把爬虫 UA 直接 403) self._http_client = httpx.AsyncClient( follow_redirects=True, timeout=20, - headers={"User-Agent": "Mozilla/5.0 (compatible; DiaryNews/0.1)"}, + headers={ + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/120.0.0.0 Safari/537.36" + ), + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language": "ja,en-US;q=0.9,en;q=0.8,zh;q=0.7", + }, ) r = await self._http_client.get(url) r.raise_for_status()