"""Fetcher 抽象基类 + 通用工具。""" from __future__ import annotations import hashlib from abc import ABC, abstractmethod from dataclasses import dataclass, field from datetime import datetime from typing import Any import httpx from app.config import settings def normalize_url(url: str) -> str: """去 utm_*、fragment、尾斜杠,用于 url_hash。""" from urllib.parse import urlsplit, urlunsplit, parse_qsl, urlencode sp = urlsplit(url.strip()) # 去掉 fragment fragment = "" # 过滤 utm_* qs = [(k, v) for k, v in parse_qsl(sp.query, keep_blank_values=True) if not k.lower().startswith("utm_")] query = urlencode(qs) # 路径末尾 / path = sp.path.rstrip("/") or "/" return urlunsplit((sp.scheme.lower(), sp.netloc.lower(), path, query, fragment)) def url_hash(url: str) -> str: return hashlib.sha1(normalize_url(url).encode()).hexdigest() @dataclass class FetchedItem: """统一返回结构:一个待入库的条目。""" url: str title: str body_html: str | None = None body_text: str = "" published_at: datetime | None = None lang: str | None = None author: str | None = None image_url: str | None = None guid: str | None = None raw: dict[str, Any] = field(default_factory=dict) class BaseFetcher(ABC): def __init__(self, url: str, headers: dict | None = None): self.url = url self.headers = headers or {"User-Agent": "DiaryNews/0.1 (+https://github.com/)"} @abstractmethod async def fetch(self) -> list[FetchedItem]: """拉取并解析,返回 FetchedItem 列表。""" async def _http_get(self) -> bytes: async with httpx.AsyncClient( timeout=settings.fetch_timeout, follow_redirects=True, headers=self.headers, ) as client: r = await client.get(self.url) r.raise_for_status() return r.content