feat: initial MVP - FastAPI backend + Vue3 frontend + docker-compose

- backend: FastAPI + SQLAlchemy 2.0(async) + asyncpg + Alembic
- 7 API routes: auth/me/articles/sources/bookmarks/subscriptions/admin
- models: User/Source/Article/Bookmark/Subscription/ApiToken
- services: RSS fetcher (feedparser) + Tencent TMT translator with quota + cache + local NLLB fallback
- workers: APScheduler + asyncio pipeline (fetch -> dedupe -> insert -> translate)
- seed scripts: create_user, seed_sources (5 RSS: Reuters/BBC/Al Jazeera/NHK/DW)
- frontend: Vue 3 + Vite + Naive UI + Pinia + vue-router
- pages: Login, Feed (24h), ArticleDetail, Sources, Bookmarks, AdminSources
- deploy: docker-compose (postgres/redis/api/worker/frontend/caddy)
- docs: README, DEPLOY, architecture, acceptance
This commit is contained in:
Mavis
2026-06-07 21:51:01 +08:00
commit 60b062daf2
81 changed files with 5540 additions and 0 deletions

View File

@@ -0,0 +1 @@
"""Services (fetchers / translation)."""

View File

@@ -0,0 +1,12 @@
"""Fetcher implementations."""
from app.services.fetchers.base import BaseFetcher, FetchedItem
from app.services.fetchers.rss import RSSFetcher
# Names re-exported as this package's public API.
__all__ = ["BaseFetcher", "FetchedItem", "RSSFetcher"]
def get_fetcher(kind: str, **kwargs) -> BaseFetcher:
    """Build the fetcher that handles the given source kind.

    Args:
        kind: Source kind identifier. Only ``"rss"`` is handled here.
        **kwargs: Forwarded unchanged to the fetcher constructor.

    Returns:
        An ``RSSFetcher`` built from ``kwargs`` when ``kind`` is ``"rss"``.

    Raises:
        NotImplementedError: For every other kind.
    """
    if kind != "rss":
        # html_list / tg_channel are planned for Phase 2; fail loudly until then.
        raise NotImplementedError(f"fetcher not implemented for kind={kind}")
    return RSSFetcher(**kwargs)

View File

@@ -0,0 +1,67 @@
"""Abstract fetcher base class plus shared URL helpers."""
from __future__ import annotations
import hashlib
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any
import httpx
from app.config import settings
def normalize_url(url: str) -> str:
    """Canonicalise a URL so that equivalent links hash to the same value.

    The result has surrounding whitespace stripped, scheme and host
    lower-cased, the fragment removed, every ``utm_*`` query parameter
    dropped (case-insensitive match on the parameter name) and trailing
    slashes removed from the path. An empty path becomes ``"/"``.

    Args:
        url: The URL to normalise.

    Returns:
        The normalised URL string.
    """
    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    parts = urlsplit(url.strip())

    # Keep blank-valued parameters; only tracking parameters are discarded.
    kept_params = []
    for key, value in parse_qsl(parts.query, keep_blank_values=True):
        if key.lower().startswith("utm_"):
            continue
        kept_params.append((key, value))

    trimmed_path = parts.path.rstrip("/")
    if not trimmed_path:
        trimmed_path = "/"

    # The last tuple slot is the fragment, which is always dropped.
    return urlunsplit(
        (
            parts.scheme.lower(),
            parts.netloc.lower(),
            trimmed_path,
            urlencode(kept_params),
            "",
        )
    )
def url_hash(url: str) -> str:
    """Return the hex SHA-1 digest of the normalised form of ``url``."""
    canonical = normalize_url(url)
    digest = hashlib.sha1(canonical.encode())
    return digest.hexdigest()
@dataclass
class FetchedItem:
    """One fetched entry, in the common shape every fetcher returns.

    Only ``url`` and ``title`` are required; every other field has a
    default so a fetcher can fill in just what its source provides.
    """

    # --- required ---
    url: str
    title: str

    # --- body: original markup and its plain-text rendering ---
    body_html: str | None = None
    body_text: str = ""

    # --- optional metadata ---
    published_at: datetime | None = None
    lang: str | None = None
    author: str | None = None
    image_url: str | None = None
    guid: str | None = None

    # Free-form extra data; every instance gets its own empty dict.
    raw: dict[str, Any] = field(default_factory=dict)
class BaseFetcher(ABC):
    """Common base for source fetchers.

    Stores the target URL and request headers, and offers an HTTP GET
    helper; subclasses implement ``fetch``.
    """

    def __init__(self, url: str, headers: dict | None = None):
        """Remember the source URL and the headers sent with each request.

        Args:
            url: Address of the source to fetch.
            headers: Request headers. A missing or empty mapping falls back
                to a default ``User-Agent`` header.
        """
        default_headers = {"User-Agent": "DiaryNews/0.1 (+https://github.com/)"}
        self.url = url
        self.headers = headers if headers else default_headers

    @abstractmethod
    async def fetch(self) -> list[FetchedItem]:
        """Download and parse the source, returning the items found."""

    async def _http_get(self) -> bytes:
        """GET ``self.url`` and return the raw response body.

        A fresh ``httpx.AsyncClient`` is used per call, configured with
        ``settings.fetch_timeout``, redirect following and ``self.headers``.

        Raises:
            httpx.HTTPStatusError: If the response status is 4xx or 5xx.
        """
        client_options = {
            "timeout": settings.fetch_timeout,
            "follow_redirects": True,
            "headers": self.headers,
        }
        async with httpx.AsyncClient(**client_options) as client:
            response = await client.get(self.url)
            response.raise_for_status()
            return response.content

View File

@@ -0,0 +1,100 @@
"""RSS / Atom fetcher (built on feedparser)."""
from __future__ import annotations
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
import feedparser
from dateutil import parser as dtp
from app.services.fetchers.base import BaseFetcher, FetchedItem
class RSSFetcher(BaseFetcher):
    """Fetcher for RSS / Atom feeds, parsed with feedparser."""

    async def fetch(self) -> list[FetchedItem]:
        """Download the feed at ``self.url`` and convert its entries.

        Entries that have neither a link nor an id, or whose title is
        missing or blank, are skipped.

        Returns:
            One ``FetchedItem`` per usable entry, in feed order.

        Raises:
            RuntimeError: If feedparser flags the feed as malformed and no
                entries could be extracted at all.
        """
        raw = await self._http_get()
        # Hand feedparser the undecoded bytes: it runs its own encoding
        # detection (XML declaration, BOM, ...). Forcing UTF-8 with
        # replacement characters first would corrupt feeds served in any
        # other encoding.
        feed = feedparser.parse(raw)
        if feed.bozo and not feed.entries:
            # Nothing at all could be parsed.
            raise RuntimeError(f"RSS parse failed: {feed.bozo_exception}")

        feed_lang = feed.feed.get("language")
        items: list[FetchedItem] = []
        for e in feed.entries:
            url = e.get("link") or e.get("id")
            if not url:
                continue
            title = (e.get("title") or "").strip()
            if not title:
                continue
            body_html, body_text = self._extract_body(e)
            items.append(
                FetchedItem(
                    url=url,
                    title=title,
                    body_html=body_html,
                    body_text=body_text,
                    published_at=_parse_dt(
                        e.get("published") or e.get("updated") or e.get("created")
                    ),
                    lang=e.get("language") or feed_lang,
                    author=e.get("author"),
                    image_url=self._extract_image(e),
                    guid=e.get("id") or e.get("guid"),
                )
            )
        return items

    @staticmethod
    def _extract_body(entry) -> tuple[str | None, str]:
        """Return ``(body_html, body_text)`` for one feed entry.

        The longest ``content`` variant wins; ``summary`` is the fallback.
        The text form is the markup with script/style/noscript removed.
        """
        body_html = None
        contents = entry.get("content")
        if contents:
            # On ties ``max`` keeps the first variant, like a stable sort would.
            longest = max(contents, key=lambda c: len(c.get("value") or ""))
            body_html = longest.get("value")
        if not body_html:
            body_html = entry.get("summary")
        if not body_html:
            return body_html, ""

        # Imported lazily so the module can be imported without bs4 loaded.
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(body_html, "lxml")
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        return body_html, soup.get_text(separator="\n", strip=True)

    @staticmethod
    def _extract_image(entry) -> str | None:
        """Return the first image URL found on the entry, or ``None``.

        Lookup order: ``media_content``, ``media_thumbnail``, then the first
        enclosure whose MIME type starts with ``image/``.
        """
        for key in ("media_content", "media_thumbnail"):
            media = entry.get(key)
            if not media:
                continue
            try:
                candidate = media[0].get("url")
            except (IndexError, KeyError, TypeError, AttributeError):
                candidate = None
            if candidate:
                return candidate
        for enc in entry.get("enclosures") or []:
            # ``type`` may be absent or None on malformed enclosures.
            if (enc.get("type") or "").startswith("image/"):
                return enc.get("href") or enc.get("url")
        return None
def _parse_dt(s: str | None) -> datetime | None:
    """Parse a feed date string into a timezone-aware UTC ``datetime``.

    dateutil is tried first, then the RFC 2822 parser from ``email.utils``.
    A value that carries no timezone is taken to be UTC.

    Args:
        s: Raw date string from the feed, or ``None``.

    Returns:
        The parsed moment converted to UTC, or ``None`` when ``s`` is empty,
        cannot be parsed, or falls outside the range ``datetime`` can hold.
    """
    if not s:
        return None
    try:
        dt = dtp.parse(s)
    except (ValueError, TypeError, OverflowError):
        # dateutil's ParserError is a ValueError subclass; OverflowError is
        # raised for out-of-range numbers. Neither should abort a whole feed.
        try:
            dt = parsedate_to_datetime(s)
        except Exception:
            # Deliberate best-effort: an unparseable date means "unknown".
            return None
    if dt is None:
        return None
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    try:
        return dt.astimezone(timezone.utc)
    except (OverflowError, ValueError):
        # Shifting a boundary date (e.g. year 1 or 9999) can leave the valid range.
        return None

View File

@@ -0,0 +1 @@
"""Translation services."""

View File

@@ -0,0 +1,26 @@
"""Abstract interface for translation backends."""
from __future__ import annotations
from abc import ABC, abstractmethod
from dataclasses import dataclass
@dataclass
class TranslationResult:
    """Outcome of a single translation request."""

    # Text handed back to the caller.
    text: str
    # Label of whatever produced the text.
    engine: str
    # Character count attributed to the request.
    chars: int
    # Set by callers that served the text from a cache.
    cached: bool = False
class BaseTranslator(ABC):
    """Interface that every translation backend implements."""

    # Engine label; concrete backends override it.
    name: str = "base"

    @abstractmethod
    async def translate(
        self, text: str, source: str = "auto", target: str = "zh"
    ) -> TranslationResult:
        """Translate ``text`` from ``source`` into ``target``.

        This is a coroutine; implementations report failure by raising.
        """
def count_chars(s: str) -> int:
    """Return the length of ``s`` in Unicode code points.

    Used as an approximate character count.
    NOTE(review): the original note says Tencent TMT bills per character;
    confirm that against the current billing rules.
    """
    code_points = len(s)
    return code_points

View File

@@ -0,0 +1,62 @@
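"""Local translation (fallback; needs transformers plus the model files).
Disabled by default. To enable:
- set LOCAL_TRANSLATE_ENABLED=true
- pre-install the model inside the container (volume mount)
"""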
from __future__ import annotations
import logging
from app.config import settings
from app.services.translation.base import BaseTranslator, TranslationResult
# Module-level logger for the local translation backend.
logger = logging.getLogger("news.translate.local")
class LocalTranslator(BaseTranslator):
    """Offline fallback translator built on a ``transformers`` pipeline.

    The model named by ``settings.local_translate_model`` is loaded on the
    first translation rather than at construction or import time.
    """

    name = "nllb"

    # Short language code -> NLLB (FLORES-200) code. Codes that are not
    # listed are passed to the model unchanged.
    _NLLB_CODES = {
        "en": "eng_Latn",
        "zh": "zho_Hans",
        "ja": "jpn_Jpan",
        "ko": "kor_Hang",
        "fr": "fra_Latn",
        "de": "deu_Latn",
        "es": "spa_Latn",
        "ru": "rus_Cyrl",
        "ar": "arb_Arab",
    }

    def __init__(self):
        """Create an unloaded translator.

        Raises:
            RuntimeError: If local translation is disabled in settings.
        """
        if not settings.local_translate_enabled:
            raise RuntimeError("LocalTranslator disabled in settings")
        import threading

        # Lazy model load: keeps import and construction cheap.
        self._pipe = None
        # Loading happens on worker threads, so it must be serialised.
        self._load_lock = threading.Lock()

    def _ensure_loaded(self):
        """Build the translation pipeline once; later calls are no-ops."""
        if self._pipe is not None:
            return
        with self._load_lock:
            if self._pipe is not None:
                # Another thread finished loading while this one waited.
                return
            from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

            model_name = settings.local_translate_model
            logger.info("loading local translation model: %s", model_name)
            tok = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
            self._pipe = pipeline(
                "translation",
                model=model,
                tokenizer=tok,
                device=settings.local_translate_device,
            )

    async def translate(
        self, text: str, source: str = "auto", target: str = "zh"
    ) -> TranslationResult:
        """Translate ``text`` with the local model.

        Args:
            text: Text to translate. Blank text is returned untouched.
            source: Short source code; ``"auto"`` is treated as English.
            target: Short target code; ``"zh"`` means Simplified Chinese.

        Returns:
            The translation, tagged with this engine's name and the input
            length in characters (``chars=0`` for blank input).
        """
        if not text.strip():
            return TranslationResult(text=text, engine=self.name, chars=0)

        import asyncio

        # No language detection is available locally: "auto" means English.
        src = "eng_Latn" if source == "auto" else self._NLLB_CODES.get(source, source)
        tgt = self._NLLB_CODES.get(target, target)

        def _run():
            # Model loading and inference are both blocking, so both run in
            # the executor instead of stalling the event loop.
            self._ensure_loaded()
            return self._pipe(text, src_lang=src, tgt_lang=tgt, max_length=2000)

        out = await asyncio.get_running_loop().run_in_executor(None, _run)
        return TranslationResult(
            text=out[0]["translation_text"], engine=self.name, chars=len(text)
        )

View File

@@ -0,0 +1,146 @@
"""Translation service facade: quota check + cache + engine selection + monthly usage counter."""
from __future__ import annotations
import asyncio
import hashlib
import logging
from datetime import datetime, timezone
from typing import Protocol
from app.config import settings
from app.redis_client import get_redis
from app.services.translation.base import BaseTranslator, TranslationResult
from app.services.translation.local import LocalTranslator
from app.services.translation.tencent import TencentTranslator
# Module-level logger for the translation service facade.
logger = logging.getLogger("news.translate.service")
# Cache key helpers.
def _cache_key(text: str, src: str, tgt: str) -> str:
    """Build the cache key for translating ``text`` from ``src`` to ``tgt``.

    The key is ``translation:cache:`` followed by the hex SHA-1 of
    ``"<src>|<tgt>|<text>"``.
    """
    fingerprint = "|".join((src, tgt, text))
    digest = hashlib.sha1(fingerprint.encode()).hexdigest()
    return "translation:cache:" + digest
def _month_key() -> str:
    """Return the usage-counter key for the current UTC month.

    The key has the form ``translation:month:YYYYMM``.
    """
    stamp = datetime.now(timezone.utc).strftime("%Y%m")
    return "translation:month:" + stamp
class TranslationService:
    """Facade over the translation engines.

    Handles the cache lookup, the monthly Tencent quota check, engine
    selection with a local fallback, and usage accounting.
    """

    def __init__(self):
        # Engines are created lazily, on first use.
        self._tencent: BaseTranslator | None = None
        self._local: BaseTranslator | None = None
        # Caps the number of engine calls running at the same time.
        self._sem = asyncio.Semaphore(3)

    def _primary(self) -> BaseTranslator:
        """Return the Tencent translator, constructing it on first use."""
        if self._tencent is None:
            self._tencent = TencentTranslator()
        return self._tencent

    def _fallback(self) -> BaseTranslator | None:
        """Return the local translator, or ``None`` when it is unavailable.

        Construction is attempted only while local translation is enabled in
        settings; a failed construction is logged and retried on a later call.
        """
        if self._local is None and settings.local_translate_enabled:
            try:
                self._local = LocalTranslator()
            except Exception as e:
                logger.warning("local translator init failed: %s", e)
                self._local = None
        return self._local

    async def can_use_tencent(self, chars: int) -> bool:
        """Tell whether ``chars`` more characters fit in this month's quota.

        Returns ``False`` straight away when either Tencent credential is
        missing, so the caller never tries to build a client that would
        refuse to initialise.
        """
        if not settings.tencentcloud_secret_id or not settings.tencentcloud_secret_key:
            return False
        r = get_redis()
        used = int(await r.get(_month_key()) or 0)
        # Usable quota is the monthly quota minus the safety buffer fraction.
        buffered = int(
            settings.tencent_tmt_quota_month * (1 - settings.tencent_tmt_quota_buffer)
        )
        return (used + chars) <= buffered

    async def add_usage(self, chars: int) -> None:
        """Add ``chars`` to this month's usage counter and refresh its TTL."""
        key = _month_key()
        now = datetime.now(timezone.utc)
        if now.month == 12:
            next_month = now.replace(year=now.year + 1, month=1, day=1)
        else:
            next_month = now.replace(month=now.month + 1, day=1)
        # Keep the counter until one day past the 1st of next month, so it
        # outlives the month it counts and then expires on its own.
        ttl = int((next_month - now).total_seconds()) + 86400
        r = get_redis()
        async with r.pipeline(transaction=False) as pipe:
            pipe.incrby(key, chars)
            pipe.expire(key, ttl)
            await pipe.execute()

    async def translate(
        self, text: str, source: str = "auto", target: str = "zh"
    ) -> TranslationResult:
        """Translate ``text``, using the cache and falling back as needed.

        Order of operations: cache lookup, engine choice (Tencent while
        quota remains, otherwise the local engine), the call itself with
        one fallback attempt on failure, cache write, usage accounting.

        Returns:
            The translation. When no engine can be used, or every attempt
            fails, the original text is returned with a bracketed notice
            appended and ``engine="skip"``. Such results are never cached,
            so a later call can try again.
        """
        if not text.strip():
            return TranslationResult(text=text, engine="skip", chars=0)
        chars = len(text)

        # 1) Cache.
        r = get_redis()
        ck = _cache_key(text, source, target)
        cached = await r.get(ck)
        if cached is not None:
            if isinstance(cached, bytes):
                # The client may be configured without response decoding.
                cached = cached.decode("utf-8")
            return TranslationResult(text=cached, engine="cache", chars=chars, cached=True)

        # 2) Pick the engine.
        engine: BaseTranslator
        if await self.can_use_tencent(chars):
            engine = self._primary()
        else:
            fb = self._fallback()
            if fb is None:
                # No local engine: hand back the original text with a notice.
                return TranslationResult(
                    text=text + "\n\n[本条未翻译:配额耗尽且未启用本地翻译]",
                    engine="skip",
                    chars=chars,
                )
            engine = fb
            logger.info("fallback to local translator for %d chars", chars)

        # 3) Call it.
        async with self._sem:
            try:
                res = await engine.translate(text, source=source, target=target)
            except Exception as e:
                # Primary attempt failed: degrade to the fallback engine.
                logger.exception("translate failed with %s: %s", engine.name, e)
                res = None
                fb = self._fallback()
                if fb is not None and engine is not fb:
                    try:
                        res = await fb.translate(text, source=source, target=target)
                    except Exception as fb_err:
                        # The fallback may fail too; report it, do not crash.
                        logger.exception(
                            "fallback translate failed with %s: %s", fb.name, fb_err
                        )
                if res is None:
                    res = TranslationResult(
                        text=text + f"\n\n[翻译失败: {e}]",
                        engine="skip",
                        chars=chars,
                    )

        # 4) Cache real translations only. Storing a "skip" result would pin
        #    the failure notice for the whole 30-day cache lifetime.
        if res.engine != "skip":
            try:
                await r.set(ck, res.text, ex=60 * 60 * 24 * 30)  # 30 days
            except Exception as e:
                # Best-effort: a cache outage must not lose the translation.
                logger.warning("translation cache write failed: %s", e)

        # 5) Only Tencent calls count against the quota.
        if res.engine == "tencent":
            try:
                await self.add_usage(res.chars or chars)
            except Exception as e:
                logger.warning("add_usage failed: %s", e)
        return res
# Global singleton, created when this module is imported.
service = TranslationService()
# Lets backend workers call the translator directly.
class _Protocol(Protocol):
    """Structural type: any object exposing this ``translate`` coroutine."""

    async def translate(
        self, text: str, source: str = "auto", target: str = "zh"
    ) -> TranslationResult:
        ...

View File

@@ -0,0 +1,74 @@
"""Tencent Cloud text translation (TMT)."""
from __future__ import annotations
import asyncio
import logging
import random
from typing import Any
from tencentcloud.common import credential
from tencentcloud.common.exception.tencent_cloud_sdk_exception import (
TencentCloudSDKException,
)
from tencentcloud.tmt.v20180321 import models, tmt_client
from app.config import settings
from app.services.translation.base import BaseTranslator, TranslationResult
# Module-level logger for the Tencent translation backend.
logger = logging.getLogger("news.translate.tencent")
# Mapping for common language codes; every entry currently maps to itself.
_LANG_MAP = {
    code: code
    for code in ("en", "zh", "ja", "ko", "fr", "de", "es", "ru", "ar")
}
class TencentTranslator(BaseTranslator):
    """Translator backed by the Tencent Cloud TMT ``TextTranslate`` call."""

    name = "tencent"

    def __init__(self):
        """Create the SDK credential and client from settings.

        Raises:
            RuntimeError: If the secret id or the secret key is not set.
        """
        secret_id = settings.tencentcloud_secret_id
        secret_key = settings.tencentcloud_secret_key
        if not (secret_id and secret_key):
            raise RuntimeError("Tencent Cloud credentials missing")
        self.cred = credential.Credential(secret_id, secret_key)
        self.client = tmt_client.TmtClient(self.cred, settings.tencentcloud_region)

    async def translate(
        self, text: str, source: str = "auto", target: str = "zh"
    ) -> TranslationResult:
        """Translate ``text`` through TMT, retrying once on an SDK error.

        Args:
            text: Text to translate. Blank text is returned untouched.
            source: Source language code, or ``"auto"``.
            target: Target language code.

        Returns:
            The translated text (empty if the response carries none),
            tagged with this engine's name and the input length.

        Raises:
            TencentCloudSDKException: If the second attempt fails as well.
        """
        if not text.strip():
            return TranslationResult(text=text, engine=self.name, chars=0)

        # Codes missing from the map, including "auto", pass through as-is.
        src_code = _LANG_MAP.get(source, source)
        tgt_code = _LANG_MAP.get(target, target)

        def _build_request():
            request = models.TextTranslateRequest()
            request.SourceText = text
            request.Source = src_code
            request.Target = tgt_code
            request.ProjectId = 0
            return request

        # Simple retry: two attempts with a jittered pause in between.
        for attempt in range(2):
            try:
                req = _build_request()
                # The SDK call is synchronous, so it runs in a worker thread.
                resp: Any = await asyncio.to_thread(self.client.TextTranslate, req)
            except TencentCloudSDKException as e:
                logger.warning("tencent translate attempt %s failed: %s", attempt, e)
                if attempt != 0:
                    raise
                await asyncio.sleep(0.5 + random.random())
            else:
                translated = getattr(resp, "TargetText", "") or ""
                return TranslationResult(
                    text=translated, engine=self.name, chars=len(text), cached=False
                )
        raise RuntimeError("unreachable")