backend/app/services/fetchers/base.py

"""Fetcher 抽象基类 + 通用工具。"""
from __future__ import annotations

import hashlib
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any

import httpx

from app.config import settings


def normalize_url(url: str) -> str:
    """Return a canonical form of *url*, used as the input to ``url_hash``.

    The normalization:

    * strips surrounding whitespace,
    * lowercases the scheme and the network location,
    * drops every query parameter whose name starts with ``utm_``
      (case-insensitive) and re-encodes the remaining pairs in order,
    * removes trailing slashes from the path (an empty path becomes ``/``),
    * discards the fragment.
    """
    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    parts = urlsplit(url.strip())

    # Keep every query pair except tracking parameters; blank values are
    # preserved so "?q=" does not silently lose its key.
    kept_pairs = []
    for key, value in parse_qsl(parts.query, keep_blank_values=True):
        if key.lower().startswith("utm_"):
            continue
        kept_pairs.append((key, value))

    # Strip trailing slashes; a path that ends up empty collapses to the root.
    trimmed_path = parts.path.rstrip("/")
    if not trimmed_path:
        trimmed_path = "/"

    return urlunsplit(
        (
            parts.scheme.lower(),
            parts.netloc.lower(),
            trimmed_path,
            urlencode(kept_pairs),
            "",  # the fragment is always dropped
        )
    )


def url_hash(url: str) -> str:
    """Return the SHA-1 hex digest of the normalized form of *url*."""
    canonical = normalize_url(url)
    digest = hashlib.sha1(canonical.encode())
    return digest.hexdigest()


@dataclass
class FetchedItem:
    """Unified return structure: one item waiting to be stored.

    Only ``url`` and ``title`` are required; every other field has a default.
    """

    url: str
    title: str
    body_html: str | None = None
    body_text: str = ""
    # NOTE(review): nothing here enforces naive vs. timezone-aware values —
    # confirm the expectation with the code that fills this field.
    published_at: datetime | None = None
    lang: str | None = None
    author: str | None = None
    image_url: str | None = None
    guid: str | None = None
    # default_factory gives each instance its own dict rather than a shared one.
    raw: dict[str, Any] = field(default_factory=dict)


class BaseFetcher(ABC):
    """Abstract base for fetchers bound to a single URL.

    Subclasses implement ``fetch``; ``_http_get`` is a shared helper that
    downloads the raw response body of ``self.url``.
    """

    def __init__(self, url: str, headers: dict | None = None):
        """Store the target URL and the request headers.

        A falsy ``headers`` (``None`` or an empty dict) is replaced by a
        default dict that carries only the project User-Agent.
        """
        self.url = url
        if headers:
            self.headers = headers
        else:
            self.headers = {"User-Agent": "DiaryNews/0.1 (+https://github.com/)"}

    @abstractmethod
    async def fetch(self) -> list[FetchedItem]:
        """Fetch and parse the source, returning a list of FetchedItem."""

    async def _http_get(self) -> bytes:
        """GET ``self.url`` and return the response body as bytes.

        Redirects are followed and the timeout comes from
        ``settings.fetch_timeout``. ``raise_for_status()`` is called before
        returning, so an error status propagates as an exception.
        """
        client_options = {
            "timeout": settings.fetch_timeout,
            "follow_redirects": True,
            "headers": self.headers,
        }
        # A fresh client is opened per call and closed by the context manager.
        async with httpx.AsyncClient(**client_options) as session:
            response = await session.get(self.url)
            response.raise_for_status()
            return response.content
feat: initial MVP - FastAPI backend + Vue3 frontend + docker-compose - backend: FastAPI + SQLAlchemy 2.0(async) + asyncpg + Alembic - 7 API routes: auth/me/articles/sources/bookmarks/subscriptions/admin - models: User/Source/Article/Bookmark/Subscription/ApiToken - services: RSS fetcher (feedparser) + Tencent TMT translator with quota + cache + local NLLB fallback - workers: APScheduler + asyncio pipeline (fetch -> dedupe -> insert -> translate) - seed scripts: create_user, seed_sources (5 RSS: Reuters/BBC/Al Jazeera/NHK/DW) - frontend: Vue 3 + Vite + Naive UI + Pinia + vue-router - pages: Login, Feed (24h), ArticleDetail, Sources, Bookmarks, AdminSources - deploy: docker-compose (postgres/redis/api/worker/frontend/caddy) - docs: README, DEPLOY, architecture, acceptance 2026-06-07 21:51:01 +08:00			`"""Fetcher 抽象基类 + 通用工具。"""`
			`from __future__ import annotations`

			`import hashlib`
			`from abc import ABC, abstractmethod`
			`from dataclasses import dataclass, field`
			`from datetime import datetime`
			`from typing import Any`

			`import httpx`

			`from app.config import settings`


			`def normalize_url(url: str) -> str:`
			`"""去 utm_*、fragment、尾斜杠,用于 url_hash。"""`
			`from urllib.parse import urlsplit, urlunsplit, parse_qsl, urlencode`

			`sp = urlsplit(url.strip())`
			`# 去掉 fragment`
			`fragment = ""`
			`# 过滤 utm_*`
			`qs = [(k, v) for k, v in parse_qsl(sp.query, keep_blank_values=True) if not k.lower().startswith("utm_")]`
			`query = urlencode(qs)`
			`# 路径末尾 /`
			`path = sp.path.rstrip("/") or "/"`
			`return urlunsplit((sp.scheme.lower(), sp.netloc.lower(), path, query, fragment))`


			`def url_hash(url: str) -> str:`
			`return hashlib.sha1(normalize_url(url).encode()).hexdigest()`


			`@dataclass`
			`class FetchedItem:`
			`"""统一返回结构:一个待入库的条目。"""`

			`url: str`
			`title: str`
			`body_html: str \| None = None`
			`body_text: str = ""`
			`published_at: datetime \| None = None`
			`lang: str \| None = None`
			`author: str \| None = None`
			`image_url: str \| None = None`
			`guid: str \| None = None`
			`raw: dict[str, Any] = field(default_factory=dict)`


			`class BaseFetcher(ABC):`
			`def __init__(self, url: str, headers: dict \| None = None):`
			`self.url = url`
			`self.headers = headers or {"User-Agent": "DiaryNews/0.1 (+https://github.com/)"}`

			`@abstractmethod`
			`async def fetch(self) -> list[FetchedItem]:`
			`"""拉取并解析,返回 FetchedItem 列表。"""`

			`async def _http_get(self) -> bytes:`
			`async with httpx.AsyncClient(`
			`timeout=settings.fetch_timeout,`
			`follow_redirects=True,`
			`headers=self.headers,`
			`) as client:`
			`r = await client.get(self.url)`
			`r.raise_for_status()`
			`return r.content`