2026-06-08 00:27:09 +08:00
|
|
|
"""RSS / Atom fetcher(基于 feedparser)。
|
|
|
|
|
|
|
|
|
|
Enhancement: for items whose content is too short (< BODY_MIN_LEN), the full text is
fetched from the article URL and extracted with trafilatura (upgrading the RSS summary to the full article).
|
|
|
|
|
"""
|
2026-06-07 21:51:01 +08:00
|
|
|
from __future__ import annotations
|
|
|
|
|
|
2026-06-08 00:27:09 +08:00
|
|
|
import logging
|
2026-06-07 21:51:01 +08:00
|
|
|
from datetime import datetime, timezone
|
|
|
|
|
from email.utils import parsedate_to_datetime
|
|
|
|
|
|
|
|
|
|
import feedparser
|
2026-06-08 00:27:09 +08:00
|
|
|
import httpx
|
|
|
|
|
import trafilatura
|
|
|
|
|
from bs4 import BeautifulSoup
|
2026-06-07 21:51:01 +08:00
|
|
|
from dateutil import parser as dtp
|
|
|
|
|
|
|
|
|
|
from app.services.fetchers.base import BaseFetcher, FetchedItem
|
|
|
|
|
|
2026-06-08 00:27:09 +08:00
|
|
|
logger = logging.getLogger("news.fetcher.rss")
|
|
|
|
|
|
|
|
|
|
# If the body supplied by the RSS entry is shorter than this many characters, fetch the full text from the article URL
|
|
|
|
|
BODY_MIN_LEN = 500
|
|
|
|
|
|
2026-06-07 21:51:01 +08:00
|
|
|
|
|
|
|
|
class RSSFetcher(BaseFetcher):
    """Fetcher that parses an RSS / Atom feed with feedparser.

    Entries whose feed-supplied body text is shorter than ``BODY_MIN_LEN``
    are upgraded by downloading the article URL and extracting the main
    content with trafilatura.
    """

    async def fetch(self) -> list[FetchedItem]:
        """Download the feed, parse it and return one item per usable entry.

        Entries without a link/id or without a title are skipped.

        Returns:
            The list of ``FetchedItem`` objects built from the feed entries.

        Raises:
            RuntimeError: if feedparser flags the feed as malformed and no
                entries could be recovered.
        """
        # NOTE(review): ``_http_get`` is not defined in this class; it is
        # presumably provided by BaseFetcher and returns the raw feed bytes.
        raw = await self._http_get()
        # Always hand feedparser a str. ``errors="replace"`` yields the same
        # result as a strict decode for valid UTF-8 and degrades gracefully
        # otherwise, so no separate strict attempt is needed.
        text = raw.decode("utf-8", errors="replace")
        feed = feedparser.parse(text)
        if feed.bozo and not feed.entries:
            # The whole document failed to parse.
            raise RuntimeError(f"RSS parse failed: {feed.bozo_exception}")

        # Per-fetch HTTP client, created lazily by ``_fetch_fulltext``.
        self._http_client: httpx.AsyncClient | None = None

        items: list[FetchedItem] = []
        try:
            for e in feed.entries:
                url = e.get("link") or e.get("id")
                if not url:
                    continue
                title = (e.get("title") or "").strip()
                if not title:
                    continue

                body_html, body_text = self._extract_from_entry(e)

                # Body too short: fetch the article page and extract the
                # full text (trafilatura). Only adopt it when it is longer.
                if len(body_text) < BODY_MIN_LEN:
                    full_html, full_text = await self._fetch_fulltext(url)
                    if full_text and len(full_text) > len(body_text):
                        body_text = full_text
                        body_html = full_html or body_html

                published_at = _parse_dt(
                    e.get("published") or e.get("updated") or e.get("created")
                )

                items.append(
                    FetchedItem(
                        url=url,
                        title=title,
                        body_html=body_html,
                        body_text=body_text,
                        published_at=published_at,
                        lang=e.get("language") or feed.feed.get("language"),
                        author=e.get("author"),
                        image_url=self._extract_image(e),
                        guid=e.get("id") or e.get("guid"),
                    )
                )
        finally:
            # Close the client even when an entry raises, so the connection
            # pool is never leaked; reset the attribute so a closed client is
            # not reused.
            client = self._http_client
            self._http_client = None
            if client is not None:
                await client.aclose()

        return items

    @staticmethod
    def _extract_from_entry(e) -> tuple[str | None, str]:
        """Return ``(body_html, body_text)`` taken from a feed entry.

        The longest ``content`` part is preferred; ``summary`` is the
        fallback. ``(None, "")`` is returned when the entry has neither.
        The text is the HTML with script/style/noscript elements removed.
        """
        body_html = None
        contents = e.get("content")
        if contents:
            # ``max`` returns the first longest part, matching a stable
            # descending sort; a missing/None value counts as empty.
            longest = max(contents, key=lambda c: len(c.get("value") or ""))
            body_html = longest.get("value")
        if not body_html:
            body_html = e.get("summary")
        if not body_html:
            return None, ""

        soup = BeautifulSoup(body_html, "lxml")
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        text = soup.get_text(separator="\n", strip=True)
        return body_html, text

    @staticmethod
    def _extract_image(e) -> str | None:
        """Return the first image URL found on the entry, or ``None``.

        Sources are tried in order: ``media_content``, ``media_thumbnail``,
        then any enclosure whose MIME type starts with ``image/``. A source
        that yields no URL falls through to the next one.
        """
        for key in ("media_content", "media_thumbnail"):
            media = e.get(key)
            if not media:
                continue
            try:
                candidate = media[0].get("url")
            except (IndexError, KeyError, TypeError, AttributeError):
                # Unexpected shape for this media field: try the next source.
                continue
            if candidate:
                return candidate

        for enc in e.get("enclosures") or []:
            # ``type`` may be absent or None, so normalise before startswith.
            if (enc.get("type") or "").startswith("image/"):
                href = enc.get("href") or enc.get("url")
                if href:
                    return href
        return None

    async def _fetch_fulltext(self, url: str) -> tuple[str | None, str]:
        """Download the article at ``url`` and extract its body with trafilatura.

        Best-effort: any download or extraction failure is logged as a
        warning and reported as ``(None, "")`` instead of raising.

        Returns:
            ``(html, text)`` as produced by trafilatura; either may be an
            empty string when nothing could be extracted.
        """
        try:
            if self._http_client is None:
                # Use a real browser UA (many sites, e.g. NHK news.web,
                # answer crawler UAs with a plain 403).
                self._http_client = httpx.AsyncClient(
                    follow_redirects=True,
                    timeout=20,
                    headers={
                        "User-Agent": (
                            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/120.0.0.0 Safari/537.36"
                        ),
                        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
                        "Accept-Language": "ja,en-US;q=0.9,en;q=0.8,zh;q=0.7",
                    },
                )
            r = await self._http_client.get(url)
            r.raise_for_status()
        except Exception as e:
            # Deliberately broad: one unreachable article must not abort the
            # whole feed.
            logger.warning("fulltext fetch failed for %s: %s", url, e)
            return None, ""

        try:
            page = r.text
            html = trafilatura.extract(
                page,
                include_comments=False,
                include_tables=False,
                favor_recall=True,
                output_format="html",
            ) or ""
            text = trafilatura.extract(
                page,
                include_comments=False,
                include_tables=False,
                favor_recall=True,
                output_format="txt",
            ) or ""
        except Exception as e:
            logger.warning("trafilatura extract failed for %s: %s", url, e)
            return None, ""

        return html, text
|
|
|
|
|
|
2026-06-07 21:51:01 +08:00
|
|
|
|
|
|
|
|
def _parse_dt(s: str | None) -> datetime | None:
|
|
|
|
|
if not s:
|
|
|
|
|
return None
|
|
|
|
|
try:
|
|
|
|
|
dt = dtp.parse(s)
|
|
|
|
|
except (ValueError, TypeError, dtp.ParserError):
|
|
|
|
|
try:
|
|
|
|
|
dt = parsedate_to_datetime(s)
|
|
|
|
|
except Exception:
|
|
|
|
|
return None
|
|
|
|
|
if dt is None:
|
|
|
|
|
return None
|
|
|
|
|
if dt.tzinfo is None:
|
|
|
|
|
dt = dt.replace(tzinfo=timezone.utc)
|
|
|
|
|
return dt.astimezone(timezone.utc)
|