fix(fetcher): fulltext 抓取用真实浏览器 UA,绕过 NHK 等 403
This commit is contained in:
@@ -115,10 +115,19 @@ class RSSFetcher(BaseFetcher):
|
|||||||
"""去 article URL 抓全文,用 trafilatura 抽正文。"""
|
"""去 article URL 抓全文,用 trafilatura 抽正文。"""
|
||||||
try:
|
try:
|
||||||
if self._http_client is None:
|
if self._http_client is None:
|
||||||
|
# 用真实浏览器 UA(很多站[如 NHK news.web]把爬虫 UA 直接 403)
|
||||||
self._http_client = httpx.AsyncClient(
|
self._http_client = httpx.AsyncClient(
|
||||||
follow_redirects=True,
|
follow_redirects=True,
|
||||||
timeout=20,
|
timeout=20,
|
||||||
headers={"User-Agent": "Mozilla/5.0 (compatible; DiaryNews/0.1)"},
|
headers={
|
||||||
|
"User-Agent": (
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||||
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||||
|
"Chrome/120.0.0.0 Safari/537.36"
|
||||||
|
),
|
||||||
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
||||||
|
"Accept-Language": "ja,en-US;q=0.9,en;q=0.8,zh;q=0.7",
|
||||||
|
},
|
||||||
)
|
)
|
||||||
r = await self._http_client.get(url)
|
r = await self._http_client.get(url)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
|
|||||||
Reference in New Issue
Block a user