feat: initial MVP - FastAPI backend + Vue3 frontend + docker-compose
- backend: FastAPI + SQLAlchemy 2.0 (async) + asyncpg + Alembic
- 7 API routes: auth/me/articles/sources/bookmarks/subscriptions/admin
- models: User/Source/Article/Bookmark/Subscription/ApiToken
- services: RSS fetcher (feedparser) + Tencent TMT translator with quota + cache + local NLLB fallback
- workers: APScheduler + asyncio pipeline (fetch -> dedupe -> insert -> translate)
- seed scripts: create_user, seed_sources (5 RSS: Reuters/BBC/Al Jazeera/NHK/DW)
- frontend: Vue 3 + Vite + Naive UI + Pinia + vue-router
- pages: Login, Feed (24h), ArticleDetail, Sources, Bookmarks, AdminSources
- deploy: docker-compose (postgres/redis/api/worker/frontend/caddy)
- docs: README, DEPLOY, architecture, acceptance
This commit is contained in:
12
backend/app/services/fetchers/__init__.py
Normal file
12
backend/app/services/fetchers/__init__.py
Normal file
@@ -0,0 +1,12 @@
|
||||
"""Fetcher implementations."""
|
||||
from app.services.fetchers.base import BaseFetcher, FetchedItem
|
||||
from app.services.fetchers.rss import RSSFetcher
|
||||
|
||||
__all__ = ["BaseFetcher", "FetchedItem", "RSSFetcher"]
|
||||
|
||||
|
||||
def get_fetcher(kind: str, **kwargs) -> BaseFetcher:
    """Build the fetcher that handles sources of the given ``kind``.

    Only ``"rss"`` is supported; the keyword arguments are forwarded
    unchanged to the fetcher's constructor.

    Raises:
        NotImplementedError: for every other ``kind``.
    """
    if kind != "rss":
        # html_list / tg_channel are planned for Phase 2; raise until then.
        raise NotImplementedError(f"fetcher not implemented for kind={kind}")
    return RSSFetcher(**kwargs)
|
||||
67
backend/app/services/fetchers/base.py
Normal file
67
backend/app/services/fetchers/base.py
Normal file
@@ -0,0 +1,67 @@
|
||||
"""Fetcher 抽象基类 + 通用工具。"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
|
||||
from app.config import settings
|
||||
|
||||
|
||||
def normalize_url(url: str) -> str:
    """Return a canonical form of ``url``, used as the input of ``url_hash``.

    The fragment and every ``utm_*`` query parameter are dropped, trailing
    slashes are stripped from the path (an empty path becomes ``/``), and
    the scheme and network location are lower-cased.
    """
    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    parts = urlsplit(url.strip())

    # Keep every query parameter except the utm_* tracking ones.
    kept_params = []
    for key, value in parse_qsl(parts.query, keep_blank_values=True):
        if key.lower().startswith("utm_"):
            continue
        kept_params.append((key, value))

    # Strip trailing slashes but never leave the path empty.
    trimmed_path = parts.path.rstrip("/")
    if not trimmed_path:
        trimmed_path = "/"

    return urlunsplit(
        (
            parts.scheme.lower(),
            parts.netloc.lower(),
            trimmed_path,
            urlencode(kept_params),
            "",  # the fragment is always discarded
        )
    )
|
||||
|
||||
|
||||
def url_hash(url: str) -> str:
    """Return the hexadecimal SHA-1 digest of the normalized form of ``url``."""
    canonical = normalize_url(url)
    digest = hashlib.sha1(canonical.encode())
    return digest.hexdigest()
|
||||
|
||||
|
||||
@dataclass
|
||||
class FetchedItem:
|
||||
"""统一返回结构:一个待入库的条目。"""
|
||||
|
||||
url: str
|
||||
title: str
|
||||
body_html: str | None = None
|
||||
body_text: str = ""
|
||||
published_at: datetime | None = None
|
||||
lang: str | None = None
|
||||
author: str | None = None
|
||||
image_url: str | None = None
|
||||
guid: str | None = None
|
||||
raw: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
class BaseFetcher(ABC):
    """Abstract base class for source fetchers.

    Holds the target URL and the request headers and provides a shared
    HTTP GET helper; subclasses implement :meth:`fetch`.
    """

    def __init__(self, url: str, headers: dict | None = None):
        """Store the source URL and the headers to send with requests.

        A missing or empty ``headers`` mapping is replaced by a default
        ``User-Agent`` header.
        """
        self.url = url
        if headers:
            self.headers = headers
        else:
            self.headers = {"User-Agent": "DiaryNews/0.1 (+https://github.com/)"}

    @abstractmethod
    async def fetch(self) -> list[FetchedItem]:
        """Download and parse the source, returning a list of FetchedItem."""

    async def _http_get(self) -> bytes:
        """GET ``self.url`` and return the raw response body.

        Redirects are followed and the timeout is taken from
        ``settings.fetch_timeout``. ``raise_for_status`` is called on the
        response before its content is returned.
        """
        client_options = {
            "timeout": settings.fetch_timeout,
            "follow_redirects": True,
            "headers": self.headers,
        }
        async with httpx.AsyncClient(**client_options) as client:
            response = await client.get(self.url)
            response.raise_for_status()
            return response.content
|
||||
100
backend/app/services/fetchers/rss.py
Normal file
100
backend/app/services/fetchers/rss.py
Normal file
@@ -0,0 +1,100 @@
|
||||
"""RSS / Atom fetcher(基于 feedparser)。"""
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from email.utils import parsedate_to_datetime
|
||||
|
||||
import feedparser
|
||||
from dateutil import parser as dtp
|
||||
|
||||
from app.services.fetchers.base import BaseFetcher, FetchedItem
|
||||
|
||||
|
||||
class RSSFetcher(BaseFetcher):
    """RSS / Atom fetcher built on feedparser."""

    async def fetch(self) -> list[FetchedItem]:
        """Download the feed and convert its entries into FetchedItem objects.

        Entries without a link/id, or whose title is missing or blank, are
        skipped.

        Returns:
            One FetchedItem per usable entry, in feed order.

        Raises:
            RuntimeError: if feedparser flags the document as malformed and
                no entry could be recovered from it.
        """
        from io import BytesIO

        raw = await self._http_get()
        # Hand feedparser the undecoded bytes so it can apply its own
        # encoding detection instead of us forcing UTF-8 on feeds that
        # declare another charset. Wrapping them in a stream also makes sure
        # the response body is only ever read as feed data, never
        # interpreted as a URL or a file name.
        feed = feedparser.parse(BytesIO(raw))
        if feed.bozo and not feed.entries:
            # The whole document failed to parse.
            raise RuntimeError(f"RSS parse failed: {feed.bozo_exception}")

        feed_lang = feed.feed.get("language")
        items: list[FetchedItem] = []
        for entry in feed.entries:
            url = entry.get("link") or entry.get("id")
            if not url:
                continue
            title = (entry.get("title") or "").strip()
            if not title:
                continue

            body_html, body_text = self._extract_body(entry)
            published_at = _parse_dt(
                entry.get("published") or entry.get("updated") or entry.get("created")
            )
            items.append(
                FetchedItem(
                    url=url,
                    title=title,
                    body_html=body_html,
                    body_text=body_text,
                    published_at=published_at,
                    lang=entry.get("language") or feed_lang,
                    author=entry.get("author"),
                    image_url=self._extract_image_url(entry),
                    guid=entry.get("id") or entry.get("guid"),
                )
            )
        return items

    @staticmethod
    def _extract_body(entry) -> tuple[str | None, str]:
        """Return ``(body_html, body_text)`` for one feed entry.

        The longest ``content`` value is preferred, falling back to
        ``summary``. The text form is the HTML with script/style/noscript
        elements removed; it is ``""`` when there is no HTML.
        """
        body_html = None
        contents = entry.get("content")
        if contents:
            # A missing or None value counts as empty instead of raising.
            longest = max(contents, key=lambda c: len(c.get("value") or ""))
            body_html = longest.get("value")
        if not body_html:
            body_html = entry.get("summary")
        if not body_html:
            return body_html, ""

        from bs4 import BeautifulSoup

        soup = BeautifulSoup(body_html, "lxml")
        # Drop script/style/noscript so their contents do not leak into the text.
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        return body_html, soup.get_text(separator="\n", strip=True)

    @staticmethod
    def _extract_image_url(entry) -> str | None:
        """Return the first image URL found in the entry's media metadata.

        Checked in order: ``media_content``, ``media_thumbnail``, then the
        first enclosure whose type starts with ``image/``.
        """
        for key in ("media_content", "media_thumbnail"):
            media = entry.get(key)
            if not media:
                continue
            try:
                image_url = media[0].get("url")
            except (IndexError, KeyError, TypeError, AttributeError):
                # Malformed media metadata is not worth failing the entry for.
                continue
            if image_url:
                return image_url

        for enclosure in entry.get("enclosures") or ():
            # ``type`` may be absent or None; treat both as "not an image".
            if (enclosure.get("type") or "").startswith("image/"):
                return enclosure.get("href") or enclosure.get("url")
        return None
|
||||
|
||||
|
||||
def _parse_dt(s: str | None) -> datetime | None:
|
||||
if not s:
|
||||
return None
|
||||
try:
|
||||
dt = dtp.parse(s)
|
||||
except (ValueError, TypeError, dtp.ParserError):
|
||||
try:
|
||||
dt = parsedate_to_datetime(s)
|
||||
except Exception:
|
||||
return None
|
||||
if dt is None:
|
||||
return None
|
||||
if dt.tzinfo is None:
|
||||
dt = dt.replace(tzinfo=timezone.utc)
|
||||
return dt.astimezone(timezone.utc)
|
||||
Reference in New Issue
Block a user