Files
diary-news/backend/app/services/fetchers/base.py
Mavis 60b062daf2 feat: initial MVP - FastAPI backend + Vue3 frontend + docker-compose
- backend: FastAPI + SQLAlchemy 2.0(async) + asyncpg + Alembic
- 7 API routes: auth/me/articles/sources/bookmarks/subscriptions/admin
- models: User/Source/Article/Bookmark/Subscription/ApiToken
- services: RSS fetcher (feedparser) + Tencent TMT translator with quota + cache + local NLLB fallback
- workers: APScheduler + asyncio pipeline (fetch -> dedupe -> insert -> translate)
- seed scripts: create_user, seed_sources (5 RSS: Reuters/BBC/Al Jazeera/NHK/DW)
- frontend: Vue 3 + Vite + Naive UI + Pinia + vue-router
- pages: Login, Feed (24h), ArticleDetail, Sources, Bookmarks, AdminSources
- deploy: docker-compose (postgres/redis/api/worker/frontend/caddy)
- docs: README, DEPLOY, architecture, acceptance
2026-06-07 21:51:01 +08:00

68 lines
1.9 KiB
Python

"""Fetcher 抽象基类 + 通用工具。"""
from __future__ import annotations
import hashlib
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any
import httpx
from app.config import settings
def normalize_url(url: str) -> str:
    """Return a canonical form of *url*, used as the input to ``url_hash``.

    Normalization steps:

    * surrounding whitespace is stripped;
    * the scheme and the host[:port] part are lower-cased;
    * the fragment is dropped;
    * query parameters whose name starts with ``utm_`` (case-insensitive)
      are removed; the remaining ones keep their order and blank values,
      and are re-encoded with ``urlencode``;
    * trailing slashes are removed from the path (an empty path becomes ``/``).

    Args:
        url: The URL to normalize.

    Returns:
        The normalized URL string.
    """
    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    parts = urlsplit(url.strip())

    # Only the host is case-insensitive. Any "user:password@" prefix is
    # case-sensitive, so it must not be lower-cased with the host.
    userinfo, at_sign, host_port = parts.netloc.rpartition("@")
    netloc = f"{userinfo}{at_sign}{host_port.lower()}"

    # Drop utm_* tracking parameters; keep everything else, including blanks.
    kept_params = [
        (name, value)
        for name, value in parse_qsl(parts.query, keep_blank_values=True)
        if not name.lower().startswith("utm_")
    ]
    query = urlencode(kept_params)

    # Remove trailing slashes, but never leave the path empty.
    path = parts.path.rstrip("/") or "/"

    # An empty fragment component drops whatever followed '#'.
    return urlunsplit((parts.scheme.lower(), netloc, path, query, ""))
def url_hash(url: str) -> str:
    """Return the hexadecimal SHA-1 digest of the normalized form of *url*."""
    canonical = normalize_url(url)
    digest = hashlib.sha1(canonical.encode())
    return digest.hexdigest()
@dataclass
class FetchedItem:
    """Uniform return structure: one item waiting to be stored."""
    # Required fields (no defaults); they are the first two positional arguments.
    url: str
    title: str
    # Body content: the HTML form is optional, the text form defaults to "".
    body_html: str | None = None
    body_text: str = ""
    # Optional metadata; each defaults to None when not provided.
    published_at: datetime | None = None
    lang: str | None = None
    author: str | None = None
    image_url: str | None = None
    guid: str | None = None
    # Free-form extra data; default_factory gives each instance its own dict.
    # NOTE(review): presumably the original feed entry — confirm against fetchers.
    raw: dict[str, Any] = field(default_factory=dict)
class BaseFetcher(ABC):
    """Abstract base class for fetchers that download and parse one URL.

    Subclasses implement ``fetch``; ``_http_get`` is a shared helper that
    downloads the raw bytes of ``self.url``.
    """

    DEFAULT_USER_AGENT = "DiaryNews/0.1 (+https://github.com/)"

    def __init__(self, url: str, headers: dict | None = None):
        """Store the target URL and the request headers.

        Args:
            url: The URL this fetcher downloads.
            headers: Optional request headers. If they do not contain a
                User-Agent (matched case-insensitively), the default
                DiaryNews User-Agent is added.
        """
        self.url = url
        # Copy so the caller's dict is neither mutated nor aliased.
        merged = dict(headers or {})
        has_user_agent = any(
            name.lower() in ("user-agent", b"user-agent") for name in merged
        )
        if not has_user_agent:
            merged["User-Agent"] = self.DEFAULT_USER_AGENT
        self.headers = merged

    @abstractmethod
    async def fetch(self) -> list[FetchedItem]:
        """Fetch and parse the source, returning a list of FetchedItem."""

    async def _http_get(self) -> bytes:
        """Download ``self.url`` and return the response body as bytes.

        Uses ``settings.fetch_timeout`` as the timeout, follows redirects and
        sends ``self.headers``.

        Raises:
            httpx.HTTPStatusError: If the response status is 4xx or 5xx.
        """
        async with httpx.AsyncClient(
            timeout=settings.fetch_timeout,
            follow_redirects=True,
            headers=self.headers,
        ) as client:
            response = await client.get(self.url)
            response.raise_for_status()
            return response.content