feat: initial MVP - FastAPI backend + Vue3 frontend + docker-compose

- backend: FastAPI + SQLAlchemy 2.0(async) + asyncpg + Alembic
- 7 API routes: auth/me/articles/sources/bookmarks/subscriptions/admin
- models: User/Source/Article/Bookmark/Subscription/ApiToken
- services: RSS fetcher (feedparser) + Tencent TMT translator with quota + cache + local NLLB fallback
- workers: APScheduler + asyncio pipeline (fetch -> dedupe -> insert -> translate)
- seed scripts: create_user, seed_sources (5 RSS: Reuters/BBC/Al Jazeera/NHK/DW)
- frontend: Vue 3 + Vite + Naive UI + Pinia + vue-router
- pages: Login, Feed (24h), ArticleDetail, Sources, Bookmarks, AdminSources
- deploy: docker-compose (postgres/redis/api/worker/frontend/caddy)
- docs: README, DEPLOY, architecture, acceptance
This commit is contained in:
Mavis
2026-06-07 21:51:01 +08:00
commit 60b062daf2
81 changed files with 5540 additions and 0 deletions

View File

@@ -0,0 +1,12 @@
"""Fetcher implementations."""
from app.services.fetchers.base import BaseFetcher, FetchedItem
from app.services.fetchers.rss import RSSFetcher
# Public API of the fetchers package. The factory function defined below is
# listed together with the classes so that star-imports expose it as well.
__all__ = ["BaseFetcher", "FetchedItem", "RSSFetcher", "get_fetcher"]
def get_fetcher(kind: str, **kwargs) -> BaseFetcher:
    """Create the fetcher that handles sources of the given ``kind``.

    Args:
        kind: Source type identifier; only ``"rss"`` is supported.
        **kwargs: Forwarded unchanged to the fetcher constructor.

    Returns:
        A new ``RSSFetcher`` built from ``kwargs``.

    Raises:
        NotImplementedError: If ``kind`` is anything other than ``"rss"``.
    """
    if kind != "rss":
        # html_list / tg_channel are planned for Phase 2; fail loudly until then.
        raise NotImplementedError(f"fetcher not implemented for kind={kind}")
    return RSSFetcher(**kwargs)

View File

@@ -0,0 +1,67 @@
"""Fetcher 抽象基类 + 通用工具。"""
from __future__ import annotations
import hashlib
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any
import httpx
from app.config import settings
def normalize_url(url: str) -> str:
    """Return a canonical form of ``url`` for use when computing ``url_hash``.

    Surrounding whitespace is stripped, the scheme and network location are
    lower-cased, every query parameter whose name starts with ``utm_``
    (case-insensitive) is dropped, the fragment is removed, and trailing
    slashes are stripped from the path (an empty path becomes ``/``).
    """
    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    parts = urlsplit(url.strip())

    # Blank-valued parameters are kept; only utm_* parameters are discarded.
    kept_params = []
    for name, value in parse_qsl(parts.query, keep_blank_values=True):
        if name.lower().startswith("utm_"):
            continue
        kept_params.append((name, value))

    # A path made only of slashes (or no path at all) collapses to "/".
    trimmed_path = parts.path.rstrip("/")
    if not trimmed_path:
        trimmed_path = "/"

    return urlunsplit(
        (
            parts.scheme.lower(),
            parts.netloc.lower(),
            trimmed_path,
            urlencode(kept_params),
            "",  # the fragment is always discarded
        )
    )
def url_hash(url: str) -> str:
    """Return the SHA-1 hex digest of the normalized form of ``url``."""
    canonical = normalize_url(url)
    digest = hashlib.sha1(canonical.encode())
    return digest.hexdigest()
@dataclass
class FetchedItem:
    """Uniform record returned by fetchers: one entry waiting to be stored."""

    # Required fields.
    url: str
    title: str

    # Entry body: the markup as delivered (if any) and a plain-text form.
    body_html: str | None = None
    body_text: str = ""

    # Optional metadata; each defaults to None when not provided.
    published_at: datetime | None = None
    lang: str | None = None
    author: str | None = None
    image_url: str | None = None
    guid: str | None = None

    # Free-form extra data; every instance gets its own fresh dict.
    raw: dict[str, Any] = field(default_factory=dict)
class BaseFetcher(ABC):
    """Abstract base class for fetchers.

    Holds the target URL and the request headers, and offers a shared HTTP
    download helper; subclasses supply the parsing in :meth:`fetch`.
    """

    def __init__(self, url: str, headers: dict | None = None):
        """Store ``url`` and ``headers``.

        When ``headers`` is omitted or empty, a default ``User-Agent``
        header is used instead.
        """
        if not headers:
            headers = {"User-Agent": "DiaryNews/0.1 (+https://github.com/)"}
        self.url = url
        self.headers = headers

    @abstractmethod
    async def fetch(self) -> list[FetchedItem]:
        """Download and parse the source, returning a list of FetchedItem."""

    async def _http_get(self) -> bytes:
        """GET ``self.url`` and return the raw response body as bytes.

        Redirects are followed, the timeout is taken from
        ``settings.fetch_timeout``, and ``raise_for_status()`` is called on
        the response before its content is returned.
        """
        client_options = {
            "timeout": settings.fetch_timeout,
            "follow_redirects": True,
            "headers": self.headers,
        }
        async with httpx.AsyncClient(**client_options) as client:
            response = await client.get(self.url)
            response.raise_for_status()
            return response.content

View File

@@ -0,0 +1,100 @@
"""RSS / Atom fetcher(基于 feedparser)。"""
from __future__ import annotations
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
import feedparser
from dateutil import parser as dtp
from app.services.fetchers.base import BaseFetcher, FetchedItem
class RSSFetcher(BaseFetcher):
    """Fetcher for RSS / Atom feeds, parsed with feedparser."""

    async def fetch(self) -> list[FetchedItem]:
        """Download the feed and convert its entries into FetchedItem objects.

        Entries without a link/id or without a non-blank title are skipped.

        Returns:
            One FetchedItem per usable entry, in feed order.

        Raises:
            RuntimeError: If feedparser flags the document as malformed and
                no entries could be extracted from it.
        """
        raw = await self._http_get()
        # Hand feedparser the raw bytes so that its own encoding detection
        # (BOM / XML declaration) is applied. Decoding as UTF-8 up front
        # would corrupt feeds published in any other encoding.
        feed = feedparser.parse(raw)
        if feed.bozo and not feed.entries:
            # The whole document failed to parse.
            raise RuntimeError(f"RSS parse failed: {feed.bozo_exception}")

        feed_lang = feed.feed.get("language")
        items: list[FetchedItem] = []
        for entry in feed.entries:
            url = entry.get("link") or entry.get("id")
            if not url:
                continue
            title = (entry.get("title") or "").strip()
            if not title:
                continue

            body_html, body_text = self._extract_body(entry)
            published_at = _parse_dt(
                entry.get("published") or entry.get("updated") or entry.get("created")
            )
            items.append(
                FetchedItem(
                    url=url,
                    title=title,
                    body_html=body_html,
                    body_text=body_text,
                    published_at=published_at,
                    lang=entry.get("language") or feed_lang,
                    author=entry.get("author"),
                    image_url=self._extract_image(entry),
                    guid=entry.get("id") or entry.get("guid"),
                )
            )
        return items

    @staticmethod
    def _extract_body(entry: dict) -> tuple[str | None, str]:
        """Return ``(body_html, body_text)`` for one feed entry.

        The longest ``content`` value is preferred, with ``summary`` as the
        fallback. The text form is the markup with script/style/noscript
        elements removed; it is ``""`` when the entry has no body.
        """
        body_html = None
        contents = entry.get("content")
        if contents:
            # ``value`` may be present but None, hence the ``or ""``.
            longest = max(contents, key=lambda c: len(c.get("value") or ""))
            body_html = longest.get("value")
        if not body_html:
            body_html = entry.get("summary")
        if not body_html:
            return body_html, ""

        # Imported lazily so bs4 is only loaded when there is markup to parse.
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(body_html, "lxml")
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        return body_html, soup.get_text(separator="\n", strip=True)

    @staticmethod
    def _extract_image(entry: dict) -> str | None:
        """Return an image URL for one feed entry, or None if there is none.

        Sources are tried in order: the first ``media_content`` element, the
        first ``media_thumbnail`` element, then the first enclosure whose
        type starts with ``image/``.
        """
        for key in ("media_content", "media_thumbnail"):
            media = entry.get(key)
            if not media:
                continue
            try:
                image_url = media[0].get("url")
            except (IndexError, KeyError, TypeError, AttributeError):
                # Unexpected structure: ignore it and try the next source.
                continue
            if image_url:
                return image_url

        for enclosure in entry.get("enclosures") or ():
            # ``type`` may be missing or None on an enclosure.
            if (enclosure.get("type") or "").startswith("image/"):
                return enclosure.get("href") or enclosure.get("url")
        return None
def _parse_dt(s: str | None) -> datetime | None:
if not s:
return None
try:
dt = dtp.parse(s)
except (ValueError, TypeError, dtp.ParserError):
try:
dt = parsedate_to_datetime(s)
except Exception:
return None
if dt is None:
return None
if dt.tzinfo is None:
dt = dt.replace(tzinfo=timezone.utc)
return dt.astimezone(timezone.utc)