feat: initial MVP - FastAPI backend + Vue3 frontend + docker-compose
- backend: FastAPI + SQLAlchemy 2.0(async) + asyncpg + Alembic - 7 API routes: auth/me/articles/sources/bookmarks/subscriptions/admin - models: User/Source/Article/Bookmark/Subscription/ApiToken - services: RSS fetcher (feedparser) + Tencent TMT translator with quota + cache + local NLLB fallback - workers: APScheduler + asyncio pipeline (fetch -> dedupe -> insert -> translate) - seed scripts: create_user, seed_sources (5 RSS: Reuters/BBC/Al Jazeera/NHK/DW) - frontend: Vue 3 + Vite + Naive UI + Pinia + vue-router - pages: Login, Feed (24h), ArticleDetail, Sources, Bookmarks, AdminSources - deploy: docker-compose (postgres/redis/api/worker/frontend/caddy) - docs: README, DEPLOY, architecture, acceptance
This commit is contained in:
1
backend/app/services/__init__.py
Normal file
1
backend/app/services/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Services (fetchers / translation)."""
|
||||
12
backend/app/services/fetchers/__init__.py
Normal file
12
backend/app/services/fetchers/__init__.py
Normal file
@@ -0,0 +1,12 @@
|
||||
"""Fetcher implementations."""
|
||||
from app.services.fetchers.base import BaseFetcher, FetchedItem
|
||||
from app.services.fetchers.rss import RSSFetcher
|
||||
|
||||
__all__ = ["BaseFetcher", "FetchedItem", "RSSFetcher"]
|
||||
|
||||
|
||||
def get_fetcher(kind: str, **kwargs) -> BaseFetcher:
    """Build the fetcher responsible for sources of the given ``kind``.

    Only ``"rss"`` is handled here; the keyword arguments are forwarded
    unchanged to ``RSSFetcher``.

    Raises:
        NotImplementedError: for every other kind.
    """
    if kind != "rss":
        # html_list / tg_channel are planned for Phase 2; fail loudly until then.
        raise NotImplementedError(f"fetcher not implemented for kind={kind}")
    return RSSFetcher(**kwargs)
|
||||
67
backend/app/services/fetchers/base.py
Normal file
67
backend/app/services/fetchers/base.py
Normal file
@@ -0,0 +1,67 @@
|
||||
"""Fetcher 抽象基类 + 通用工具。"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
|
||||
from app.config import settings
|
||||
|
||||
|
||||
def normalize_url(url: str) -> str:
    """Canonicalize ``url`` so equivalent links produce the same ``url_hash``.

    Strips surrounding whitespace, drops the fragment, removes every query
    parameter whose name starts with ``utm_`` (case-insensitive), trims
    trailing slashes from the path (an empty path becomes ``/``) and
    lower-cases the scheme and the network location.
    """
    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    parts = urlsplit(url.strip())

    # Keep every query pair except the utm_* tracking parameters.
    kept_params = []
    for key, value in parse_qsl(parts.query, keep_blank_values=True):
        if key.lower().startswith("utm_"):
            continue
        kept_params.append((key, value))

    # Trailing slashes are insignificant, but the root path must stay "/".
    trimmed_path = parts.path.rstrip("/")
    if not trimmed_path:
        trimmed_path = "/"

    # The last component (fragment) is always emitted empty.
    return urlunsplit(
        (
            parts.scheme.lower(),
            parts.netloc.lower(),
            trimmed_path,
            urlencode(kept_params),
            "",
        )
    )
|
||||
|
||||
|
||||
def url_hash(url: str) -> str:
    """Return the hexadecimal SHA-1 digest of the normalized form of ``url``."""
    canonical = normalize_url(url)
    return hashlib.sha1(canonical.encode()).hexdigest()
|
||||
|
||||
|
||||
@dataclass
class FetchedItem:
    """Uniform fetcher result: one entry waiting to be stored."""

    # --- required ---
    url: str
    title: str

    # --- content (both optional) ---
    body_html: str | None = None
    body_text: str = ""

    # --- optional metadata ---
    published_at: datetime | None = None
    lang: str | None = None
    author: str | None = None
    image_url: str | None = None
    guid: str | None = None

    # Free-form extra data; every instance gets its own fresh dict.
    raw: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
class BaseFetcher(ABC):
    """Abstract base class for fetchers; stores the target URL and request headers."""

    def __init__(self, url: str, headers: dict | None = None):
        """Remember ``url`` and ``headers``; a default User-Agent is used when none are given."""
        self.url = url
        if not headers:
            headers = {"User-Agent": "DiaryNews/0.1 (+https://github.com/)"}
        self.headers = headers

    @abstractmethod
    async def fetch(self) -> list[FetchedItem]:
        """Download and parse the source, returning a list of ``FetchedItem``."""

    async def _http_get(self) -> bytes:
        """GET ``self.url`` and return the raw response body.

        Redirects are followed, the timeout comes from
        ``settings.fetch_timeout`` and an error status raises via
        ``raise_for_status()``.
        """
        client_options = {
            "timeout": settings.fetch_timeout,
            "follow_redirects": True,
            "headers": self.headers,
        }
        async with httpx.AsyncClient(**client_options) as client:
            response = await client.get(self.url)
            response.raise_for_status()
            return response.content
|
||||
100
backend/app/services/fetchers/rss.py
Normal file
100
backend/app/services/fetchers/rss.py
Normal file
@@ -0,0 +1,100 @@
|
||||
"""RSS / Atom fetcher(基于 feedparser)。"""
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from email.utils import parsedate_to_datetime
|
||||
|
||||
import feedparser
|
||||
from dateutil import parser as dtp
|
||||
|
||||
from app.services.fetchers.base import BaseFetcher, FetchedItem
|
||||
|
||||
|
||||
class RSSFetcher(BaseFetcher):
    """RSS / Atom fetcher built on feedparser."""

    async def fetch(self) -> list[FetchedItem]:
        """Download the feed and convert its entries to ``FetchedItem`` objects.

        Entries without a link/id or without a non-blank title are skipped.

        Raises:
            RuntimeError: when feedparser flags the document as malformed
                and no entry could be extracted at all.
        """
        raw = await self._http_get()
        # Hand feedparser the raw bytes so it can apply its own character
        # encoding detection; force-decoding as UTF-8 beforehand turned every
        # non-UTF-8 feed into replacement characters.
        feed = feedparser.parse(raw)
        if feed.bozo and not feed.entries:
            # Nothing usable came out of the document.
            raise RuntimeError(f"RSS parse failed: {feed.bozo_exception}")

        items: list[FetchedItem] = []
        for e in feed.entries:
            url = e.get("link") or e.get("id")
            title = (e.get("title") or "").strip()
            if not url or not title:
                continue

            body_html = self._entry_html(e)
            items.append(
                FetchedItem(
                    url=url,
                    title=title,
                    body_html=body_html,
                    body_text=self._html_to_text(body_html),
                    published_at=_parse_dt(
                        e.get("published") or e.get("updated") or e.get("created")
                    ),
                    lang=e.get("language") or feed.feed.get("language"),
                    author=e.get("author"),
                    image_url=self._entry_image(e),
                    guid=e.get("id") or e.get("guid"),
                )
            )
        return items

    @staticmethod
    def _entry_html(entry) -> str | None:
        """Return the longest ``content`` value of the entry, else its ``summary``."""
        html = None
        contents = entry.get("content")
        if contents:
            # ``or ""`` tolerates a content element whose value is None.
            longest = max(contents, key=lambda c: len(c.get("value") or ""))
            html = longest.get("value")
        return html or entry.get("summary")

    @staticmethod
    def _html_to_text(html: str | None) -> str:
        """Strip markup (including script/style/noscript content) from ``html``."""
        if not html:
            return ""
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html, "lxml")
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        return soup.get_text(separator="\n", strip=True)

    @staticmethod
    def _entry_image(entry) -> str | None:
        """Pick an image URL: media_content, then media_thumbnail, then an image enclosure."""
        for key in ("media_content", "media_thumbnail"):
            media = entry.get(key)
            if not media:
                continue
            try:
                candidate = media[0].get("url")
            except (IndexError, KeyError, TypeError, AttributeError):
                candidate = None
            if candidate:
                return candidate

        for enc in entry.get("enclosures") or []:
            # ``or ""`` tolerates an enclosure whose type is present but None.
            if (enc.get("type") or "").startswith("image/"):
                # Only the first image enclosure is considered.
                return enc.get("href") or enc.get("url")
        return None
|
||||
|
||||
|
||||
def _parse_dt(s: str | None) -> datetime | None:
    """Parse a feed date string into a timezone-aware UTC datetime.

    dateutil is tried first, then the RFC 2822 parser from ``email.utils``.
    A value without timezone information is assumed to be UTC.

    Returns:
        The parsed datetime converted to UTC, or ``None`` when ``s`` is
        empty, unparseable, or cannot be represented after conversion.
    """
    if not s:
        return None
    try:
        dt = dtp.parse(s)
    except (ValueError, TypeError, OverflowError):
        # dateutil's ParserError is a ValueError subclass, so it is covered
        # here; OverflowError is raised for out-of-range numeric fields and
        # must not abort the whole feed.
        try:
            dt = parsedate_to_datetime(s)
        except Exception:
            return None
    if dt is None:
        return None
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    try:
        return dt.astimezone(timezone.utc)
    except (OverflowError, ValueError):
        # Dates at the edge of the datetime range cannot be shifted to UTC.
        return None
|
||||
1
backend/app/services/translation/__init__.py
Normal file
1
backend/app/services/translation/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Translation services."""
|
||||
26
backend/app/services/translation/base.py
Normal file
26
backend/app/services/translation/base.py
Normal file
@@ -0,0 +1,26 @@
|
||||
"""翻译后端抽象。"""
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class TranslationResult:
    """Outcome of a single translation request."""

    # Text handed back to the caller.
    text: str
    # Label of whatever produced the result.
    engine: str
    # Character count recorded for the request.
    chars: int
    # Defaults to False; set to True by callers that serve a stored result.
    cached: bool = False
|
||||
|
||||
|
||||
class BaseTranslator(ABC):
    """Abstract interface every translation backend implements."""

    # Engine label; concrete backends override it.
    name: str = "base"

    @abstractmethod
    async def translate(
        self,
        text: str,
        source: str = "auto",
        target: str = "zh",
    ) -> TranslationResult:
        """Translate ``text`` from ``source`` to ``target``; raise on failure."""
|
||||
|
||||
|
||||
def count_chars(s: str) -> int:
    """Approximate character count, measured in Unicode code points.

    Tencent TMT bills by number of characters.
    """
    code_points = len(s)
    return code_points
|
||||
62
backend/app/services/translation/local.py
Normal file
62
backend/app/services/translation/local.py
Normal file
@@ -0,0 +1,62 @@
|
||||
"""本地翻译(降级用,需要 transformers + 模型文件)。
|
||||
|
||||
默认关闭。启用方式:
|
||||
- LOCAL_TRANSLATE_ENABLED=true
|
||||
- 容器内预装模型(Volume 挂载)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from app.config import settings
|
||||
from app.services.translation.base import BaseTranslator, TranslationResult
|
||||
|
||||
logger = logging.getLogger("news.translate.local")
|
||||
|
||||
|
||||
class LocalTranslator(BaseTranslator):
    """Local fallback translator backed by a transformers translation pipeline."""

    name = "nllb"

    def __init__(self):
        """Refuse to initialize unless local translation is enabled in settings.

        Raises:
            RuntimeError: when ``settings.local_translate_enabled`` is falsy.
        """
        if not settings.local_translate_enabled:
            raise RuntimeError("LocalTranslator disabled in settings")
        import threading

        # The model is loaded lazily so importing this module stays cheap.
        self._pipe = None
        # Loading runs in worker threads; the lock prevents two concurrent
        # first calls from loading the model twice.
        self._load_lock = threading.Lock()

    def _ensure_loaded(self):
        """Build the translation pipeline on first use (blocking)."""
        if self._pipe is not None:
            return
        with self._load_lock:
            if self._pipe is not None:
                # Another thread finished loading while we waited.
                return
            from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

            model_name = settings.local_translate_model
            logger.info("loading local translation model: %s", model_name)
            tok = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
            self._pipe = pipeline(
                "translation",
                model=model,
                tokenizer=tok,
                device=settings.local_translate_device,
            )

    async def translate(
        self, text: str, source: str = "auto", target: str = "zh"
    ) -> TranslationResult:
        """Translate ``text`` with the local model without blocking the event loop.

        Blank input is returned unchanged with ``chars=0``. ``"en"``/``"auto"``
        map to ``eng_Latn`` and ``"zh"`` maps to ``zho_Hans``; any other code
        is passed to the pipeline as-is.
        """
        if not text.strip():
            return TranslationResult(text=text, engine=self.name, chars=0)
        import asyncio

        src = "eng_Latn" if source in ("en", "auto") else source
        tgt = "zho_Hans" if target == "zh" else target

        def _run():
            # Loading the model is slow, so it happens here in the executor
            # thread rather than inside the coroutine.
            self._ensure_loaded()
            return self._pipe(text, src_lang=src, tgt_lang=tgt, max_length=2000)

        out = await asyncio.get_running_loop().run_in_executor(None, _run)
        return TranslationResult(
            text=out[0]["translation_text"], engine=self.name, chars=len(text)
        )
|
||||
146
backend/app/services/translation/service.py
Normal file
146
backend/app/services/translation/service.py
Normal file
@@ -0,0 +1,146 @@
|
||||
"""翻译服务门面:配额检查 + 缓存 + 引擎选择 + 月度计数。"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import hashlib
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from typing import Protocol
|
||||
|
||||
from app.config import settings
|
||||
from app.redis_client import get_redis
|
||||
from app.services.translation.base import BaseTranslator, TranslationResult
|
||||
from app.services.translation.local import LocalTranslator
|
||||
from app.services.translation.tencent import TencentTranslator
|
||||
|
||||
logger = logging.getLogger("news.translate.service")
|
||||
|
||||
|
||||
# 缓存 key
|
||||
def _cache_key(text: str, src: str, tgt: str) -> str:
    """Build the cache key for one (source, target, text) combination.

    The key is ``translation:cache:`` followed by the hex SHA-1 digest of
    ``src|tgt|text``.
    """
    payload = "{}|{}|{}".format(src, tgt, text)
    digest = hashlib.sha1(payload.encode()).hexdigest()
    return "translation:cache:" + digest
|
||||
|
||||
|
||||
def _month_key() -> str:
    """Return the usage-counter key for the current UTC month (``translation:month:YYYYMM``)."""
    stamp = datetime.now(timezone.utc).strftime("%Y%m")
    return "translation:month:" + stamp
|
||||
|
||||
|
||||
class TranslationService:
    """Translation facade: cache lookup, quota check, engine selection and monthly usage counting."""

    def __init__(self):
        # Engines are created lazily on first use.
        self._tencent: BaseTranslator | None = None
        self._local: BaseTranslator | None = None
        self._sem = asyncio.Semaphore(3)  # caps concurrent engine calls

    def _primary(self) -> BaseTranslator:
        """Return the Tencent translator, creating it on first use."""
        if self._tencent is None:
            self._tencent = TencentTranslator()
        return self._tencent

    def _fallback(self) -> BaseTranslator | None:
        """Return the local translator, or ``None`` when disabled or failing to initialize."""
        if self._local is None and settings.local_translate_enabled:
            try:
                self._local = LocalTranslator()
            except Exception as e:
                logger.warning("local translator init failed: %s", e)
                self._local = None
        return self._local

    async def can_use_tencent(self, chars: int) -> bool:
        """Tell whether ``chars`` more characters fit in this month's buffered quota.

        Returns ``False`` when either half of the Tencent credential is
        missing: ``TencentTranslator`` refuses to initialize without both,
        so checking only the id would let ``translate`` crash later.
        """
        if not (settings.tencentcloud_secret_id and settings.tencentcloud_secret_key):
            return False
        r = get_redis()
        used = int(await r.get(_month_key()) or 0)
        # Keep a safety margin below the hard monthly quota.
        buffered = int(
            settings.tencent_tmt_quota_month * (1 - settings.tencent_tmt_quota_buffer)
        )
        return (used + chars) <= buffered

    async def add_usage(self, chars: int) -> None:
        """Add ``chars`` to the current month's usage counter and refresh its TTL."""
        r = get_redis()
        key = _month_key()
        async with r.pipeline(transaction=False) as pipe:
            pipe.incrby(key, chars)
            # Let the counter expire about one day into the following month.
            now = datetime.now(timezone.utc)
            if now.month == 12:
                next_month = now.replace(year=now.year + 1, month=1, day=1)
            else:
                next_month = now.replace(month=now.month + 1, day=1)
            ttl = int((next_month - now).total_seconds()) + 86400
            pipe.expire(key, ttl)
            await pipe.execute()

    async def translate(
        self, text: str, source: str = "auto", target: str = "zh"
    ) -> TranslationResult:
        """Translate ``text``, using the cache and falling back between engines.

        Order of operations: blank input is returned as-is; a cache hit is
        returned immediately; otherwise Tencent is used while quota allows,
        else the local engine. When no engine is available or every engine
        fails, the original text plus a bracketed notice is returned with
        ``engine="skip"``. Only real translations are cached, and only
        Tencent results count towards the monthly usage.
        """
        if not text.strip():
            return TranslationResult(text=text, engine="skip", chars=0)

        chars = len(text)

        # 1) Cache lookup.
        r = get_redis()
        ck = _cache_key(text, source, target)
        cached = await r.get(ck)
        if cached is not None:
            return TranslationResult(text=cached, engine="cache", chars=chars, cached=True)

        # 2) Engine selection.
        engine: BaseTranslator
        if await self.can_use_tencent(chars):
            engine = self._primary()
        else:
            fb = self._fallback()
            if fb is None:
                # No local engine: hand back the original text with a notice.
                return TranslationResult(
                    text=text + "\n\n[本条未翻译:配额耗尽且未启用本地翻译]",
                    engine="skip",
                    chars=chars,
                )
            engine = fb
            logger.info("fallback to local translator for %d chars", chars)

        # 3) Call the engine, degrading to the fallback on failure.
        async with self._sem:
            try:
                res = await engine.translate(text, source=source, target=target)
            except Exception as e:
                logger.exception("translate failed with %s: %s", engine.name, e)
                res = await self._degrade(engine, text, source, target, chars, e)

        # 4) Cache real translations only. Caching a "skip" placeholder would
        #    serve a transient failure message for the next 30 days.
        if res.engine != "skip":
            try:
                await r.set(ck, res.text, ex=60 * 60 * 24 * 30)  # 30 days
            except Exception as e:
                # Best effort: a failed cache write must not fail the request.
                logger.warning("translation cache write failed: %s", e)

        # 5) Usage counting (Tencent only).
        if res.engine == "tencent":
            try:
                await self.add_usage(res.chars or chars)
            except Exception as e:
                logger.warning("add_usage failed: %s", e)

        return res

    async def _degrade(
        self,
        failed: BaseTranslator,
        text: str,
        source: str,
        target: str,
        chars: int,
        error: Exception,
    ) -> TranslationResult:
        """Retry on the fallback engine, else return the original text with a failure notice."""
        fb = self._fallback()
        if fb is not None and failed is not fb:
            try:
                return await fb.translate(text, source=source, target=target)
            except Exception as fb_error:
                # The fallback failing too must not escape as an exception.
                logger.exception("translate failed with %s: %s", fb.name, fb_error)
                error = fb_error
        return TranslationResult(
            text=text + f"\n\n[翻译失败: {error}]",
            engine="skip",
            chars=chars,
        )
|
||||
|
||||
|
||||
# 全局单例
|
||||
service = TranslationService()
|
||||
|
||||
|
||||
# 让后端 worker 直接调
|
||||
class _Protocol(Protocol):
    """Structural type for any object exposing an async ``translate`` method."""

    async def translate(
        self,
        text: str,
        source: str = "auto",
        target: str = "zh",
    ) -> TranslationResult:
        ...
|
||||
74
backend/app/services/translation/tencent.py
Normal file
74
backend/app/services/translation/tencent.py
Normal file
@@ -0,0 +1,74 @@
|
||||
"""腾讯云文本翻译 TMT。"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
from typing import Any
|
||||
|
||||
from tencentcloud.common import credential
|
||||
from tencentcloud.common.exception.tencent_cloud_sdk_exception import (
|
||||
TencentCloudSDKException,
|
||||
)
|
||||
from tencentcloud.tmt.v20180321 import models, tmt_client
|
||||
|
||||
from app.config import settings
|
||||
from app.services.translation.base import BaseTranslator, TranslationResult
|
||||
|
||||
logger = logging.getLogger("news.translate.tencent")
|
||||
|
||||
# 常见语种映射
|
||||
_LANG_MAP = {
|
||||
"en": "en",
|
||||
"zh": "zh",
|
||||
"ja": "ja",
|
||||
"ko": "ko",
|
||||
"fr": "fr",
|
||||
"de": "de",
|
||||
"es": "es",
|
||||
"ru": "ru",
|
||||
"ar": "ar",
|
||||
}
|
||||
|
||||
|
||||
class TencentTranslator(BaseTranslator):
    """Translation backend that calls Tencent Cloud TMT ``TextTranslate``."""

    name = "tencent"

    def __init__(self):
        """Create the TMT client from the credentials and region in settings.

        Raises:
            RuntimeError: when the secret id or the secret key is missing.
        """
        secret_id = settings.tencentcloud_secret_id
        secret_key = settings.tencentcloud_secret_key
        if not (secret_id and secret_key):
            raise RuntimeError("Tencent Cloud credentials missing")
        self.cred = credential.Credential(secret_id, secret_key)
        self.client = tmt_client.TmtClient(self.cred, settings.tencentcloud_region)

    async def translate(
        self, text: str, source: str = "auto", target: str = "zh"
    ) -> TranslationResult:
        """Translate ``text`` through TMT, retrying once after an SDK error.

        Blank input is returned unchanged with ``chars=0``. The second
        consecutive ``TencentCloudSDKException`` is re-raised to the caller.
        """
        if not text.strip():
            return TranslationResult(text=text, engine=self.name, chars=0)

        # Codes missing from the table (including "auto") pass through as-is.
        src_code = _LANG_MAP.get(source, source)
        tgt_code = _LANG_MAP.get(target, target)

        final_attempt = 1
        for attempt in (0, 1):
            try:
                request = models.TextTranslateRequest()
                request.SourceText = text
                request.Source = src_code
                request.Target = tgt_code
                request.ProjectId = 0
                # The SDK call is synchronous, so it runs in a worker thread.
                response: Any = await asyncio.to_thread(
                    self.client.TextTranslate, request
                )
            except TencentCloudSDKException as exc:
                logger.warning("tencent translate attempt %s failed: %s", attempt, exc)
                if attempt == final_attempt:
                    raise
                # Short randomized pause before the single retry.
                await asyncio.sleep(0.5 + random.random())
            else:
                translated = getattr(response, "TargetText", "") or ""
                return TranslationResult(
                    text=translated, engine=self.name, chars=len(text), cached=False
                )
        raise RuntimeError("unreachable")
|
||||
Reference in New Issue
Block a user