feat: initial MVP - FastAPI backend + Vue3 frontend + docker-compose

- backend: FastAPI + SQLAlchemy 2.0(async) + asyncpg + Alembic
- 7 API routes: auth/me/articles/sources/bookmarks/subscriptions/admin
- models: User/Source/Article/Bookmark/Subscription/ApiToken
- services: RSS fetcher (feedparser) + Tencent TMT translator with quota + cache + local NLLB fallback
- workers: APScheduler + asyncio pipeline (fetch -> dedupe -> insert -> translate)
- seed scripts: create_user, seed_sources (5 RSS: Reuters/BBC/Al Jazeera/NHK/DW)
- frontend: Vue 3 + Vite + Naive UI + Pinia + vue-router
- pages: Login, Feed (24h), ArticleDetail, Sources, Bookmarks, AdminSources
- deploy: docker-compose (postgres/redis/api/worker/frontend/caddy)
- docs: README, DEPLOY, architecture, acceptance
This commit is contained in:
Mavis
2026-06-07 21:51:01 +08:00
commit 60b062daf2
81 changed files with 5540 additions and 0 deletions

11
backend/.dockerignore Normal file
View File

@@ -0,0 +1,11 @@
__pycache__
*.pyc
.pytest_cache
.mypy_cache
.ruff_cache
.venv
venv
.env
*.egg-info
build
dist

36
backend/Dockerfile Normal file
View File

@@ -0,0 +1,36 @@
FROM python:3.12-slim
ENV PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1 \
PIP_NO_CACHE_DIR=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1 \
TZ=Asia/Hong_Kong
# System packages (build toolchain, libpq headers, curl, CA certificates, tzdata);
# the same RUN step also pins the container's local time to Asia/Hong_Kong
# and removes the apt package lists afterwards.
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
libpq-dev \
curl \
ca-certificates \
tzdata \
&& ln -sf /usr/share/zoneinfo/Asia/Hong_Kong /etc/localtime \
&& echo "Asia/Hong_Kong" > /etc/timezone \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Install dependencies first so this layer stays cached until pyproject.toml changes.
# NOTE(review): the editable install runs before app/ is copied - confirm the
# build backend tolerates the package directory being absent at this point.
COPY pyproject.toml ./
RUN pip install --upgrade pip && \
pip install -e .
# Application code (overridden by a volume mount during development; a copy is kept in the image too).
COPY app ./app
COPY alembic ./alembic
COPY alembic.ini ./
EXPOSE 8000
# Default command starts uvicorn; the worker container in docker-compose uses a different command.
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

42
backend/alembic.ini Normal file
View File

@@ -0,0 +1,42 @@
[alembic]
script_location = alembic
prepend_sys_path = .
version_path_separator = os
# sqlalchemy.url is injected by env.py at runtime; it is intentionally left empty here
sqlalchemy.url =
[post_write_hooks]
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARN
handlers = console
qualname =
[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S

59
backend/alembic/env.py Normal file
View File

@@ -0,0 +1,59 @@
"""Alembic 环境配置:从 app.config 读取 URL,启用 autogenerate。"""
from __future__ import annotations
import sys
from logging.config import fileConfig
from pathlib import Path
from alembic import context
from sqlalchemy import engine_from_config, pool
# 让 alembic 能 import app
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
from app.config import settings # noqa: E402
from app.database import Base # noqa: E402
from app.models import * # noqa: F401, F403, E402
# Alembic Config object for the current run.
config = context.config
# Set up Python logging from the .ini file when one was supplied.
if config.config_file_name is not None:
fileConfig(config.config_file_name)
# Replace the empty sqlalchemy.url from alembic.ini with the application's synchronous URL.
config.set_main_option("sqlalchemy.url", settings.sync_database_url)
# Metadata handed to context.configure() below as the migration target.
target_metadata = Base.metadata
def run_migrations_offline() -> None:
    """Run migrations in offline mode.

    The Alembic context is configured from the database URL only (no
    connection is opened here), with bound parameters rendered as literals.
    """
    offline_options = {
        "url": config.get_main_option("sqlalchemy.url"),
        "target_metadata": target_metadata,
        "literal_binds": True,
        "dialect_opts": {"paramstyle": "named"},
        "compare_type": True,
    }
    context.configure(**offline_options)

    with context.begin_transaction():
        context.run_migrations()
def run_migrations_online() -> None:
    """Run migrations in online mode over a live database connection.

    An engine is built from the ``sqlalchemy.*`` keys of the active ini
    section, using ``NullPool`` so no connections are pooled.
    """
    ini_section = config.get_section(config.config_ini_section, {})
    engine = engine_from_config(
        ini_section,
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with engine.connect() as conn:
        context.configure(
            connection=conn,
            target_metadata=target_metadata,
            compare_type=True,
        )
        with context.begin_transaction():
            context.run_migrations()
# Module entry point: pick the migration mode Alembic was invoked with.
if not context.is_offline_mode():
    run_migrations_online()
else:
    run_migrations_offline()

View File

@@ -0,0 +1,24 @@
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
revision: str = ${repr(up_revision)}
down_revision: Union[str, None] = ${repr(down_revision)}
branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
def upgrade() -> None:
${upgrades if upgrades else "pass"}
def downgrade() -> None:
${downgrades if downgrades else "pass"}

View File

@@ -0,0 +1,180 @@
"""initial schema
Revision ID: 0001
Revises:
Create Date: 2026-06-07
"""
from __future__ import annotations
from typing import Sequence, Union
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
# Revision identifiers read by Alembic.
revision: str = "0001"
# None: this revision has no parent.
down_revision: Union[str, None] = None
branch_labels = None
depends_on = None
# Create the initial schema: three enum types and the users, sources, articles,
# bookmarks, subscriptions and api_tokens tables with their indexes.
# Tables are created parent-first so every foreign key target already exists.
def upgrade() -> None:
# === Users ===
# The enum type is created explicitly (checkfirst avoids failing if it already exists);
# the column below references it with create_type=False so it is not created a second time.
user_role = postgresql.ENUM("owner", "member", name="user_role", create_type=True)
user_role.create(op.get_bind(), checkfirst=True)
op.create_table(
"users",
sa.Column("id", sa.Integer, primary_key=True),
sa.Column("username", sa.String(64), unique=True, index=True, nullable=False),
sa.Column("email", sa.String(255), unique=True, index=True),
sa.Column("password_hash", sa.String(255), nullable=False),
sa.Column(
"role",
postgresql.ENUM("owner", "member", name="user_role", create_type=False),
nullable=False,
),
sa.Column("is_active", sa.Boolean, nullable=False, server_default=sa.text("true")),
sa.Column("display_name", sa.String(128)),
sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
sa.Column("last_login_at", sa.DateTime(timezone=True)),
)
# === Sources ===
source_kind = postgresql.ENUM("rss", "html_list", "tg_channel", name="source_kind", create_type=True)
source_kind.create(op.get_bind(), checkfirst=True)
op.create_table(
"sources",
sa.Column("id", sa.Integer, primary_key=True),
sa.Column("name", sa.String(128), nullable=False),
sa.Column("slug", sa.String(128), unique=True, index=True, nullable=False),
sa.Column(
"kind",
postgresql.ENUM("rss", "html_list", "tg_channel", name="source_kind", create_type=False),
nullable=False,
),
sa.Column("url", sa.Text, nullable=False),
sa.Column("detail_selector", postgresql.JSONB),
sa.Column("fetch_interval_min", sa.Integer, nullable=False, server_default="60"),
sa.Column("fetch_cron", sa.String(64)),
sa.Column("translate_to", sa.String(8), nullable=False, server_default="zh"),
sa.Column("enabled", sa.Boolean, nullable=False, server_default=sa.text("true")),
sa.Column("region", sa.String(32), index=True),
sa.Column("language_src", sa.String(8)),
sa.Column("priority", sa.Integer, nullable=False, server_default="50", index=True),
sa.Column("headers_json", postgresql.JSONB),
sa.Column("last_fetched_at", sa.DateTime(timezone=True)),
sa.Column("last_status", sa.String(64)),
sa.Column("consecutive_failures", sa.Integer, nullable=False, server_default="0"),
sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
sa.Column(
"updated_at",
sa.DateTime(timezone=True),
server_default=sa.func.now(),
nullable=False,
),
)
# === Articles ===
op.create_table(
"articles",
sa.Column("id", sa.BigInteger, primary_key=True),
sa.Column("source_id", sa.Integer, sa.ForeignKey("sources.id", ondelete="CASCADE"), nullable=False),
sa.Column("url", sa.Text, nullable=False),
# NOTE(review): presumably a 40-character hex digest of the URL - confirm against the code that fills it.
sa.Column("url_hash", sa.String(40), unique=True, nullable=False, index=True),
sa.Column("guid", sa.String(255), index=True),
sa.Column("title", sa.Text, nullable=False),
sa.Column("body_html", sa.Text),
sa.Column("body_text", sa.Text, nullable=False, server_default=""),
sa.Column("lang_src", sa.String(8)),
sa.Column("author", sa.String(255)),
sa.Column("image_url", sa.Text),
sa.Column("title_zh", sa.Text),
sa.Column("body_zh_html", sa.Text),
sa.Column("body_zh_text", sa.Text),
sa.Column("summary_zh", sa.Text),
sa.Column("translation_status", sa.String(16), nullable=False, server_default="pending"),
sa.Column("translation_engine", sa.String(16)),
sa.Column("translation_chars", sa.Integer, nullable=False, server_default="0"),
sa.Column("translated_at", sa.DateTime(timezone=True)),
sa.Column("category", sa.String(32), index=True),
sa.Column("commentary", sa.Text),
sa.Column("entities", postgresql.JSONB),
sa.Column("sentiment", sa.Float),
sa.Column("topic_id", sa.String(64), index=True),
sa.Column("bias", sa.String(16)),
# Self-referencing key; the reference is set to NULL when the referenced article is deleted.
sa.Column("duplicate_of", sa.BigInteger, sa.ForeignKey("articles.id", ondelete="SET NULL")),
sa.Column("published_at", sa.DateTime(timezone=True), index=True),
sa.Column("fetched_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
)
# Composite indexes on (source_id, published_at) and (translation_status, published_at).
op.create_index("ix_articles_source_published", "articles", ["source_id", "published_at"])
op.create_index("ix_articles_status_published", "articles", ["translation_status", "published_at"])
# === Bookmarks ===
op.create_table(
"bookmarks",
sa.Column("id", sa.Integer, primary_key=True),
sa.Column("user_id", sa.Integer, sa.ForeignKey("users.id", ondelete="CASCADE"), nullable=False),
sa.Column("article_id", sa.BigInteger, sa.ForeignKey("articles.id", ondelete="CASCADE"), nullable=False),
sa.Column("note", sa.Text),
sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
# At most one bookmark per (user, article) pair.
sa.UniqueConstraint("user_id", "article_id", name="uq_bookmark_user_article"),
)
op.create_index("ix_bookmarks_user_id", "bookmarks", ["user_id"])
op.create_index("ix_bookmarks_article_id", "bookmarks", ["article_id"])
# === Subscriptions ===
subscription_match = postgresql.ENUM("any", "title", "body", name="subscription_match", create_type=True)
subscription_match.create(op.get_bind(), checkfirst=True)
op.create_table(
"subscriptions",
sa.Column("id", sa.Integer, primary_key=True),
sa.Column("user_id", sa.Integer, sa.ForeignKey("users.id", ondelete="CASCADE"), nullable=False),
sa.Column("keyword", sa.String(255), nullable=False),
sa.Column(
"match_in",
postgresql.ENUM("any", "title", "body", name="subscription_match", create_type=False),
nullable=False,
),
sa.Column("channel", sa.String(32), nullable=False, server_default="telegram"),
sa.Column("target", sa.Text),
sa.Column("enabled", sa.Boolean, nullable=False, server_default=sa.text("true")),
sa.Column("last_hit_at", sa.DateTime(timezone=True)),
sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
)
op.create_index("ix_subscriptions_user_id", "subscriptions", ["user_id"])
# === API tokens ===
op.create_table(
"api_tokens",
sa.Column("id", sa.Integer, primary_key=True),
sa.Column("user_id", sa.Integer, sa.ForeignKey("users.id", ondelete="CASCADE"), nullable=False),
sa.Column("name", sa.String(64), nullable=False),
sa.Column("token_hash", sa.String(128), unique=True, nullable=False),
sa.Column("last_used_at", sa.DateTime(timezone=True)),
sa.Column("expires_at", sa.DateTime(timezone=True)),
sa.Column("revoked_at", sa.DateTime(timezone=True)),
sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
)
op.create_index("ix_api_tokens_user_id", "api_tokens", ["user_id"])
def downgrade() -> None:
    """Revert the initial schema.

    Objects are removed in reverse dependency order (children before the
    tables they reference). Every index created with ``op.create_index`` in
    ``upgrade`` is dropped *before* its table: dropping the table first
    removes the index implicitly and a later ``drop_index`` would then fail
    because the index no longer exists.
    """
    op.drop_index("ix_api_tokens_user_id", table_name="api_tokens")
    op.drop_table("api_tokens")

    op.drop_index("ix_subscriptions_user_id", table_name="subscriptions")
    op.drop_table("subscriptions")

    op.drop_index("ix_bookmarks_article_id", table_name="bookmarks")
    op.drop_index("ix_bookmarks_user_id", table_name="bookmarks")
    op.drop_table("bookmarks")

    op.drop_index("ix_articles_status_published", table_name="articles")
    op.drop_index("ix_articles_source_published", table_name="articles")
    op.drop_table("articles")

    op.drop_table("sources")
    op.drop_table("users")

    # Enum types are not removed with their tables; drop them last, once no
    # column references them any more.
    op.execute("DROP TYPE IF EXISTS subscription_match")
    op.execute("DROP TYPE IF EXISTS source_kind")
    op.execute("DROP TYPE IF EXISTS user_role")

3
backend/app/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
"""News Aggregator backend."""
__version__ = "0.1.0"

View File

@@ -0,0 +1 @@
"""API routes."""

199
backend/app/api/admin.py Normal file
View File

@@ -0,0 +1,199 @@
"""Admin API(仅 owner)。
- 源管理 CRUD
- 手动触发抓取 / 重译
- 源健康看板
- 翻译配额管理
"""
from __future__ import annotations
from datetime import datetime, timedelta, timezone
from typing import Any
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, status
from pydantic import BaseModel
from sqlalchemy import func, select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.asyncio import AsyncSession
from app.core.deps import require_owner
from app.database import get_session
from app.models.article import Article
from app.models.source import Source
from app.models.user import User
from app.schemas.source import SourceIn, SourceOut, SourceUpdate
# Every route on this router is mounted under /admin and runs the require_owner dependency.
router = APIRouter(prefix="/admin", tags=["admin"], dependencies=[Depends(require_owner)])
# === Source CRUD ===
@router.get("/sources", response_model=list[SourceOut])
async def list_sources_all(session: AsyncSession = Depends(get_session)):
    """Return every source, ordered by primary key."""
    query = select(Source).order_by(Source.id)
    result = await session.execute(query)
    return list(map(SourceOut.model_validate, result.scalars()))
@router.post("/sources", response_model=SourceOut, status_code=status.HTTP_201_CREATED)
async def create_source(body: SourceIn, session: AsyncSession = Depends(get_session)):
    """Create a source from the request body and return the stored row.

    Raises:
        HTTPException: 409 when the commit fails with an integrity error,
            reported to the client as a duplicate slug.
    """
    # Fields copied unchanged from the payload; ``url`` is handled separately
    # because it is stored as its string form.
    passthrough_fields = (
        "name",
        "slug",
        "kind",
        "detail_selector",
        "region",
        "language_src",
        "priority",
        "fetch_interval_min",
        "translate_to",
        "enabled",
        "headers_json",
    )
    column_values = {field: getattr(body, field) for field in passthrough_fields}
    new_source = Source(url=str(body.url), **column_values)

    session.add(new_source)
    try:
        await session.commit()
    except IntegrityError as exc:
        await session.rollback()
        raise HTTPException(
            status.HTTP_409_CONFLICT, f"slug '{body.slug}' already exists"
        ) from exc

    await session.refresh(new_source)
    return SourceOut.model_validate(new_source)
@router.patch("/sources/{source_id}", response_model=SourceOut)
async def update_source(
    source_id: int,
    body: SourceUpdate,
    session: AsyncSession = Depends(get_session),
):
    """Apply a partial update to one source and return the stored row.

    Only fields explicitly present in the request body are written.

    Raises:
        HTTPException: 404 when the source does not exist; 409 when the
            update violates a database constraint (for example a slug that
            is already taken), mirroring ``create_source``.
    """
    src = (
        await session.execute(select(Source).where(Source.id == source_id))
    ).scalar_one_or_none()
    if src is None:
        raise HTTPException(status.HTTP_404_NOT_FOUND, "Source not found")

    changes = body.model_dump(exclude_unset=True)
    # create_source stores ``str(body.url)``; do the same here so a URL object
    # never reaches the ORM. NOTE(review): assumes SourceUpdate.url may be a
    # pydantic URL type like SourceIn.url appears to be; str() is a no-op
    # when it is already a plain string.
    if changes.get("url") is not None:
        changes["url"] = str(changes["url"])

    for field, value in changes.items():
        setattr(src, field, value)

    try:
        await session.commit()
    except IntegrityError as exc:
        # Without this a duplicate slug surfaced as an unhandled 500.
        await session.rollback()
        raise HTTPException(
            status.HTTP_409_CONFLICT,
            "Update conflicts with an existing source or violates a constraint",
        ) from exc

    await session.refresh(src)
    return SourceOut.model_validate(src)
@router.delete("/sources/{source_id}", status_code=status.HTTP_204_NO_CONTENT)
async def delete_source(source_id: int, session: AsyncSession = Depends(get_session)):
    """Delete one source by id.

    Raises:
        HTTPException: 404 when no source has that id.
    """
    lookup = await session.execute(select(Source).where(Source.id == source_id))
    target = lookup.scalar_one_or_none()
    if not target:
        raise HTTPException(status.HTTP_404_NOT_FOUND, "Source not found")

    await session.delete(target)
    await session.commit()
    return None
# === Manual triggers ===
# Response body returned by the manual refresh / re-translation endpoints.
class TriggerResponse(BaseModel):
# Set to True by both endpoints once the background task has been queued.
triggered: bool
# Free-text description of what was queued; empty by default.
detail: str = ""
@router.post("/refresh/{source_id}", response_model=TriggerResponse)
async def refresh_source(
    source_id: int,
    background: BackgroundTasks,
    session: AsyncSession = Depends(get_session),
):
    """Queue an immediate fetch of one enabled source.

    Raises:
        HTTPException: 404 when the source does not exist, 400 when it is
            disabled.
    """
    lookup = await session.execute(select(Source).where(Source.id == source_id))
    src = lookup.scalar_one_or_none()
    if not src:
        raise HTTPException(status.HTTP_404_NOT_FOUND, "Source not found")
    if not src.enabled:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "Source disabled")

    # Imported at call time, as in the rest of this module.
    from app.workers.pipeline import fetch_one_source

    # Hand the fetch to the background task queue; the result is not awaited.
    background.add_task(fetch_one_source, source_id)
    message = f"queued fetch for {src.slug}"
    return TriggerResponse(triggered=True, detail=message)
async def _run_fetch(source_id: int) -> None:
    """(deprecated) Thin wrapper intended for background execution; see refresh_source."""
    from app.workers.pipeline import fetch_one_source as _fetch

    await _fetch(source_id)
@router.post("/translation/rerun/{article_id}", response_model=TriggerResponse)
async def rerun_translation(
    article_id: int,
    background: BackgroundTasks,
    session: AsyncSession = Depends(get_session),
):
    """Reset an article's translation fields and queue it for translation again.

    Raises:
        HTTPException: 404 when the article does not exist.
    """
    lookup = await session.execute(select(Article).where(Article.id == article_id))
    art = lookup.scalar_one_or_none()
    if not art:
        raise HTTPException(status.HTTP_404_NOT_FOUND, "Article not found")

    # Put the article back into the "pending" state and clear previous output.
    art.translation_status = "pending"
    cleared_columns = (
        "title_zh",
        "body_zh_text",
        "body_zh_html",
        "translated_at",
        "translation_engine",
    )
    for column in cleared_columns:
        setattr(art, column, None)
    await session.commit()

    from app.workers.pipeline import translate_article

    background.add_task(translate_article, article_id)
    message = f"queued translation for article {article_id}"
    return TriggerResponse(triggered=True, detail=message)
# === Health dashboard ===
# One entry of the list returned by GET /admin/health; most fields are copied
# straight from the Source row.
class HealthOut(BaseModel):
source_id: int
slug: str
name: str
enabled: bool
last_fetched_at: datetime | None
last_status: str | None
consecutive_failures: int
fetch_interval_min: int
# Count of this source's articles whose fetched_at falls within the last 24 hours.
article_count_24h: int
@router.get("/health", response_model=list[HealthOut])
async def health(session: AsyncSession = Depends(get_session)):
    """Return a health summary for every source, highest priority first.

    Each entry carries the source's fetch bookkeeping fields plus the number
    of its articles fetched during the last 24 hours.
    """
    # Timezone-aware cutoff, computed once for the whole request.
    # ``articles.fetched_at`` is created as a timestamptz column by the
    # initial migration; a naive value would be interpreted in the process's
    # local timezone by the driver and silently shift the 24-hour window.
    cutoff = datetime.now(timezone.utc) - timedelta(hours=24)

    # One grouped query instead of one COUNT query per source.
    count_rows = (
        await session.execute(
            select(Article.source_id, func.count(Article.id))
            .where(Article.fetched_at >= cutoff)
            .group_by(Article.source_id)
        )
    ).all()
    recent_counts = {row[0]: row[1] for row in count_rows}

    sources = (
        await session.execute(select(Source).order_by(Source.priority.desc()))
    ).scalars()

    return [
        HealthOut(
            source_id=s.id,
            slug=s.slug,
            name=s.name,
            enabled=s.enabled,
            last_fetched_at=s.last_fetched_at,
            last_status=s.last_status,
            consecutive_failures=s.consecutive_failures,
            fetch_interval_min=s.fetch_interval_min,
            # Sources without recent articles are absent from the grouped result.
            article_count_24h=recent_counts.get(s.id, 0),
        )
        for s in sources
    ]
# === Translation quota (admin view) ===
# Request body for POST /admin/translation/quota/reset.
class QuotaReset(BaseModel):
# Value written to the current month's usage counter; 0 when omitted.
used_chars: int = 0
@router.post("/translation/quota/reset")
async def reset_quota(payload: QuotaReset) -> dict[str, Any]:
    """Overwrite the current month's translation usage counter in Redis.

    Returns:
        The Redis key that was written and the value stored under it.
    """
    from app.redis_client import get_redis

    client = get_redis()
    now = datetime.now(timezone.utc)
    # The counter key is scoped to the current UTC year and month.
    key = f"translation:month:{now:%Y%m}"
    new_value = payload.used_chars
    await client.set(key, new_value)
    return {"key": key, "value": new_value}

194
backend/app/api/articles.py Normal file
View File

@@ -0,0 +1,194 @@
"""/articles 列表与详情。"""
from __future__ import annotations
import base64
import json
from datetime import datetime
from typing import Annotated
from fastapi import APIRouter, Depends, HTTPException, Query, status
from sqlalchemy import and_, desc, func, or_, select
from sqlalchemy.ext.asyncio import AsyncSession
from app.core.deps import get_current_user
from app.database import get_session
from app.models.article import Article
from app.models.bookmark import Bookmark
from app.models.source import Source
from app.models.user import User
from app.schemas.article import (
ArticleDetail,
ArticleListItem,
ArticleListResponse,
SourceBrief,
)
# Routes in this module are mounted under /articles.
router = APIRouter(prefix="/articles", tags=["articles"])
def _encode_cursor(article: Article) -> str:
    """Pack an article's id and fetched_at (whole epoch seconds) into a URL-safe base64 JSON token."""
    fields = {"id": article.id, "ts": int(article.fetched_at.timestamp())}
    raw = json.dumps(fields).encode()
    token = base64.urlsafe_b64encode(raw)
    return token.decode()
def _decode_cursor(cur: str) -> tuple[int, datetime]:
    """Decode a pagination cursor produced by ``_encode_cursor``.

    Args:
        cur: URL-safe base64 text wrapping a JSON object with ``id`` and
            ``ts`` (epoch seconds) keys.

    Returns:
        ``(article_id, timestamp)`` where the timestamp is timezone-aware
        UTC, so it round-trips the epoch value regardless of the server's
        local timezone.

    Raises:
        HTTPException: 400 when the cursor is malformed in any way.
    """
    from datetime import timezone

    try:
        data = json.loads(base64.urlsafe_b64decode(cur.encode()).decode())
        article_id = int(data["id"])
        stamp = datetime.fromtimestamp(int(data["ts"]), tz=timezone.utc)
    except (ValueError, KeyError, TypeError, OverflowError, OSError) as exc:
        # ValueError covers bad base64, bad UTF-8, bad JSON and non-numeric
        # fields; TypeError covers a payload that is not a JSON object;
        # OverflowError/OSError cover out-of-range timestamps.
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "Invalid cursor") from exc
    return article_id, stamp
@router.get("", response_model=ArticleListResponse)
async def list_articles(
    since: datetime | None = Query(default=None, description="起时间 UTC"),
    until: datetime | None = Query(default=None, description="止时间 UTC"),
    source: str | None = Query(default=None, description="逗号分隔 source slug"),
    category: str | None = None,
    q: str | None = Query(default=None, description="标题/正文搜索"),
    lang: Annotated[str, Query(pattern=r"^(src|zh|both)$")] = "both",
    limit: int = Query(default=50, ge=1, le=200),
    cursor: str | None = None,
    starred_only: bool = False,
    user: User = Depends(get_current_user),
    session: AsyncSession = Depends(get_session),
):
    """List non-duplicate articles, newest ``published_at`` first, with cursor paging.

    Args:
        since / until: inclusive bounds on ``published_at``. When neither
            bound nor a cursor is given, the list defaults to the last 24h.
        source: comma-separated source slugs to include.
        category: exact category match.
        q: case-insensitive substring searched in the original title and
            body text; ``%`` and ``_`` are matched literally.
        lang: ``zh`` keeps only articles that already have a translated
            title; ``src`` and ``both`` apply no extra filter.
        limit: page size.
        cursor: opaque token from a previous page's ``next_cursor``.
        starred_only: restrict to articles bookmarked by the current user.

    Raises:
        HTTPException: 400 when the cursor is malformed or points at an
            article that no longer exists.
    """
    stmt = (
        select(Article, Source)
        .join(Source, Source.id == Article.source_id)
        .where(Article.duplicate_of.is_(None))
    )

    # Default window: the past 24 hours.
    if since is None and until is None and cursor is None:
        since = _default_since_24h()
    if since is not None:
        stmt = stmt.where(Article.published_at >= since)
    if until is not None:
        stmt = stmt.where(Article.published_at <= until)
    if category:
        stmt = stmt.where(Article.category == category)
    if source:
        slugs = [s.strip() for s in source.split(",") if s.strip()]
        if slugs:
            stmt = stmt.where(Source.slug.in_(slugs))
    if q:
        # Escape LIKE metacharacters so user input is matched literally.
        escaped = q.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        like = f"%{escaped}%"
        stmt = stmt.where(
            or_(
                Article.title.ilike(like, escape="\\"),
                Article.body_text.ilike(like, escape="\\"),
            )
        )

    # Language filter: "src" and "both" need no extra condition because the
    # original text is always present.
    if lang == "zh":
        stmt = stmt.where(Article.title_zh.is_not(None))

    if cursor:
        last_id, _ = _decode_cursor(cursor)
        # Keyset pagination must follow the ORDER BY below. Filtering on
        # ``id < last_id`` alone skipped or repeated rows whenever id order
        # and published_at order disagreed, so look up the sort key of the
        # last row of the previous page and continue strictly after it.
        pivot_row = (
            await session.execute(
                select(Article.published_at).where(Article.id == last_id)
            )
        ).first()
        if pivot_row is None:
            raise HTTPException(status.HTTP_400_BAD_REQUEST, "Invalid cursor")
        pivot_published = pivot_row[0]
        if pivot_published is None:
            # NULL published_at rows sort first: what remains is the rest of
            # the NULL group (smaller ids) plus every dated row.
            stmt = stmt.where(
                or_(Article.published_at.is_not(None), Article.id < last_id)
            )
        else:
            stmt = stmt.where(
                or_(
                    Article.published_at < pivot_published,
                    and_(
                        Article.published_at == pivot_published,
                        Article.id < last_id,
                    ),
                )
            )

    if starred_only:
        stmt = stmt.join(
            Bookmark,
            and_(Bookmark.article_id == Article.id, Bookmark.user_id == user.id),
        )

    # NULLS FIRST is PostgreSQL's default for DESC; it is spelled out because
    # the cursor conditions above rely on it.
    stmt = stmt.order_by(
        Article.published_at.desc().nulls_first(), desc(Article.id)
    ).limit(limit + 1)  # one extra row tells us whether another page exists

    rows = (await session.execute(stmt)).all()
    has_more = len(rows) > limit
    rows = rows[:limit]

    # Resolve is_starred for the whole page with a single query.
    ids = [a.id for a, _ in rows]
    starred_ids: set[int] = set()
    if ids:
        bm_rows = (
            await session.execute(
                select(Bookmark.article_id).where(
                    Bookmark.user_id == user.id, Bookmark.article_id.in_(ids)
                )
            )
        ).all()
        starred_ids = {b[0] for b in bm_rows}

    items = [
        ArticleListItem(
            id=art.id,
            source=SourceBrief.model_validate(src),
            title=art.title,
            title_zh=art.title_zh,
            summary_zh=art.summary_zh,
            lang_src=art.lang_src,
            translation_status=art.translation_status,
            category=art.category,
            published_at=art.published_at,
            fetched_at=art.fetched_at,
            image_url=art.image_url,
            is_starred=art.id in starred_ids,
        )
        for art, src in rows
    ]

    next_cursor = _encode_cursor(rows[-1][0]) if has_more and rows else None
    return ArticleListResponse(items=items, next_cursor=next_cursor, total=None)
@router.get("/{article_id}", response_model=ArticleDetail)
async def get_article(
    article_id: int,
    user: User = Depends(get_current_user),
    session: AsyncSession = Depends(get_session),
):
    """Return one article with its source and the caller's bookmark state.

    Raises:
        HTTPException: 404 when the article does not exist.
    """
    # The parentheses around ``await session.execute(...)`` matter: ``await``
    # binds looser than attribute access, so without them ``.first()`` would
    # be called on the un-awaited coroutine and raise AttributeError.
    row = (
        await session.execute(
            select(Article, Source)
            .join(Source, Source.id == Article.source_id)
            .where(Article.id == article_id)
        )
    ).first()
    if row is None:
        raise HTTPException(status.HTTP_404_NOT_FOUND, "Article not found")
    article, source = row

    is_starred = (
        await session.execute(
            select(Bookmark.id).where(
                Bookmark.user_id == user.id, Bookmark.article_id == article.id
            )
        )
    ).first() is not None

    return ArticleDetail(
        id=article.id,
        source=SourceBrief.model_validate(source),
        url=article.url,
        title=article.title,
        body_html=article.body_html,
        body_text=article.body_text,
        title_zh=article.title_zh,
        body_zh_html=article.body_zh_html,
        body_zh_text=article.body_zh_text,
        summary_zh=article.summary_zh,
        lang_src=article.lang_src,
        author=article.author,
        image_url=article.image_url,
        translation_status=article.translation_status,
        translation_engine=article.translation_engine,
        translated_at=article.translated_at,
        category=article.category,
        commentary=article.commentary,
        entities=article.entities,
        sentiment=article.sentiment,
        duplicate_of=article.duplicate_of,
        published_at=article.published_at,
        fetched_at=article.fetched_at,
        is_starred=is_starred,
    )
def _default_since_24h() -> datetime:
    """Return the timezone-aware UTC instant exactly 24 hours before now.

    ``datetime.utcnow()`` is deprecated and yields a naive value; when that
    is compared with the timestamptz ``published_at`` column it gets
    interpreted in the process's local timezone, shifting the window.
    """
    from datetime import timedelta, timezone

    return datetime.now(timezone.utc) - timedelta(hours=24)

65
backend/app/api/auth.py Normal file
View File

@@ -0,0 +1,65 @@
"""登录/刷新/登出。"""
from __future__ import annotations
from datetime import datetime, timezone
from fastapi import APIRouter, Depends, HTTPException, status
from jwt.exceptions import InvalidTokenError
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import settings
from app.core.security import (
create_access_token,
create_refresh_token,
decode_token,
verify_password,
)
from app.database import get_session
from app.models.user import User
from app.schemas.auth import LoginRequest, RefreshRequest, TokenPair
# Routes in this module are mounted under /auth.
router = APIRouter(prefix="/auth", tags=["auth"])
def _pair_for(user: User) -> TokenPair:
    """Issue a new access/refresh token pair for *user*.

    The access token carries the user's role as an extra claim;
    ``expires_in`` is the configured access-token lifetime in seconds.
    """
    role_claim = {"role": user.role.value}
    lifetime_seconds = settings.access_token_ttl_min * 60
    return TokenPair(
        access_token=create_access_token(user.id, extra=role_claim),
        refresh_token=create_refresh_token(user.id),
        expires_in=lifetime_seconds,
    )
@router.post("/login", response_model=TokenPair)
async def login(body: LoginRequest, session: AsyncSession = Depends(get_session)):
    """Authenticate by username and password and return a token pair.

    Raises:
        HTTPException: 401 when the user is unknown, inactive, or the
            password does not verify.
    """
    # ``await`` must wrap only ``session.execute(...)``; chaining
    # ``.scalars().first()`` directly after it would be evaluated on the
    # un-awaited coroutine and raise AttributeError.
    result = await session.execute(select(User).where(User.username == body.username))
    user = result.scalars().first()

    if not user or not user.is_active or not verify_password(body.password, user.password_hash):
        raise HTTPException(status.HTTP_401_UNAUTHORIZED, "Invalid credentials")

    user.last_login_at = datetime.now(timezone.utc)
    # Build the tokens before committing so no ORM attribute is read after
    # the commit (which would need a reload if the session expires objects
    # on commit - NOTE(review): depends on the sessionmaker configuration).
    tokens = _pair_for(user)
    await session.commit()
    return tokens
@router.post("/refresh", response_model=TokenPair)
async def refresh(body: RefreshRequest, session: AsyncSession = Depends(get_session)):
    """Exchange a valid refresh token for a new token pair.

    Raises:
        HTTPException: 401 when the token is invalid, is not a refresh
            token, or its user no longer exists or is inactive.
    """
    try:
        payload = decode_token(body.refresh_token)
        if payload.get("type") != "refresh":
            raise InvalidTokenError("wrong type")
        uid = int(payload["sub"])
    except (InvalidTokenError, KeyError, ValueError, TypeError) as exc:
        # TypeError: a "sub" claim that is present but not int-convertible
        # (for example null) is just another invalid token.
        raise HTTPException(status.HTTP_401_UNAUTHORIZED, "Invalid refresh token") from exc

    # ``await`` must wrap only ``session.execute(...)``; chaining
    # ``.scalars().first()`` directly after it would be evaluated on the
    # un-awaited coroutine and raise AttributeError.
    result = await session.execute(
        select(User).where(User.id == uid, User.is_active.is_(True))
    )
    user = result.scalars().first()
    if not user:
        raise HTTPException(status.HTTP_401_UNAUTHORIZED, "User not found")
    return _pair_for(user)

View File

@@ -0,0 +1,73 @@
"""/bookmarks 收藏。"""
from __future__ import annotations
from fastapi import APIRouter, Depends, HTTPException, status
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.core.deps import get_current_user
from app.database import get_session
from app.models.article import Article
from app.models.bookmark import Bookmark
from app.models.user import User
from app.schemas.misc import BookmarkIn, BookmarkOut
# Routes in this module are mounted under /bookmarks.
router = APIRouter(prefix="/bookmarks", tags=["bookmarks"])
@router.post("", response_model=BookmarkOut, status_code=status.HTTP_201_CREATED)
async def add(
    body: BookmarkIn,
    user: User = Depends(get_current_user),
    session: AsyncSession = Depends(get_session),
):
    """Bookmark an article for the current user (idempotent).

    If the bookmark already exists it is returned unchanged, including when
    a concurrent request creates it between the existence check and the
    insert.

    Raises:
        HTTPException: 404 when the article does not exist.
    """
    from sqlalchemy.exc import IntegrityError

    # Read the id up front: a rollback expires ORM attributes and reloading
    # them afterwards would need extra I/O.
    user_id = user.id

    art = (
        await session.execute(select(Article).where(Article.id == body.article_id))
    ).scalar_one_or_none()
    if not art:
        raise HTTPException(status.HTTP_404_NOT_FOUND, "Article not found")

    async def _find_existing() -> Bookmark | None:
        """Return this user's bookmark for the requested article, if any."""
        return (
            await session.execute(
                select(Bookmark).where(
                    Bookmark.user_id == user_id, Bookmark.article_id == body.article_id
                )
            )
        ).scalar_one_or_none()

    # Already bookmarked: return the existing row.
    existing = await _find_existing()
    if existing:
        return BookmarkOut.model_validate(existing)

    bm = Bookmark(user_id=user_id, article_id=body.article_id, note=body.note)
    session.add(bm)
    try:
        await session.commit()
    except IntegrityError:
        # Most likely a concurrent request inserted the same (user, article)
        # pair and tripped uq_bookmark_user_article; treat it like the
        # "already exists" case instead of surfacing a 500.
        await session.rollback()
        existing = await _find_existing()
        if existing is None:
            # Some other constraint failed; propagate it unchanged.
            raise
        return BookmarkOut.model_validate(existing)

    await session.refresh(bm)
    return BookmarkOut.model_validate(bm)
@router.delete("/{article_id}", status_code=status.HTTP_204_NO_CONTENT)
async def remove(
    article_id: int,
    user: User = Depends(get_current_user),
    session: AsyncSession = Depends(get_session),
):
    """Remove the current user's bookmark for an article; a no-op when none exists."""
    query = select(Bookmark).where(
        Bookmark.user_id == user.id, Bookmark.article_id == article_id
    )
    bookmark = (await session.execute(query)).scalar_one_or_none()
    if not bookmark:
        return None

    await session.delete(bookmark)
    await session.commit()
    return None
@router.get("", response_model=list[BookmarkOut])
async def list_mine(
    user: User = Depends(get_current_user),
    session: AsyncSession = Depends(get_session),
):
    """Return the current user's bookmarks, most recently created first."""
    query = (
        select(Bookmark)
        .where(Bookmark.user_id == user.id)
        .order_by(Bookmark.created_at.desc())
    )
    result = await session.execute(query)
    return list(map(BookmarkOut.model_validate, result.scalars()))

68
backend/app/api/me.py Normal file
View File

@@ -0,0 +1,68 @@
"""/me 当前用户信息 + 翻译配额。"""
from __future__ import annotations
from datetime import datetime, timezone
from fastapi import APIRouter, Depends
from pydantic import BaseModel
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import settings
from app.core.deps import get_current_user
from app.database import get_session
from app.models.user import User
from app.redis_client import get_redis
# Routes in this module are mounted under /me.
router = APIRouter(prefix="/me", tags=["me"])
# Profile of the authenticated user as returned by GET /me.
class MeOut(BaseModel):
id: int
username: str
email: str | None
# Filled from the string value of the user's role enum.
role: str
display_name: str | None
created_at: datetime
# Translation quota usage for the current month as returned by GET /me/usage.
class UsageOut(BaseModel):
# Month label formatted as YYYYMM.
month: str
used_chars: int
quota_chars: int
# quota_chars minus used_chars, never below zero.
remaining_chars: int
# quota_chars reduced by the configured buffer fraction.
buffered_quota: int
# used_chars as a percentage of quota_chars, rounded to two decimals.
pct_used: float
@router.get("", response_model=MeOut)
async def me(user: User = Depends(get_current_user)):
    """Return the authenticated user's profile."""
    profile = {
        "id": user.id,
        "username": user.username,
        "email": user.email,
        "role": user.role.value,
        "display_name": user.display_name,
        "created_at": user.created_at,
    }
    return MeOut(**profile)
@router.get("/usage", response_model=UsageOut)
async def usage(
    user: User = Depends(get_current_user),
    session: AsyncSession = Depends(get_session),  # noqa: ARG001
):
    """Report the current month's translation character usage against the quota."""
    client = get_redis()
    now = datetime.now(timezone.utc)
    key = f"translation:month:{now:%Y%m}"

    # A missing counter is treated as zero usage.
    stored = await client.get(key)
    used = int(stored or 0)

    quota = settings.tencent_tmt_quota_month
    buffered = int(quota * (1 - settings.tencent_tmt_quota_buffer))
    if quota:
        pct_used = round(used / quota * 100, 2)
    else:
        # Avoid dividing by a zero quota.
        pct_used = 0.0

    return UsageOut(
        month=f"{now:%Y%m}",
        used_chars=used,
        quota_chars=quota,
        remaining_chars=max(0, quota - used),
        buffered_quota=buffered,
        pct_used=pct_used,
    )

View File

@@ -0,0 +1,25 @@
"""/sources 源列表(只读,所有登录用户可看)。"""
from __future__ import annotations
from fastapi import APIRouter, Depends
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.core.deps import get_current_user
from app.database import get_session
from app.models.source import Source
from app.models.user import User
from app.schemas.source import SourceOut
# Routes in this module are mounted under /sources.
router = APIRouter(prefix="/sources", tags=["sources"])
@router.get("", response_model=list[SourceOut])
async def list_sources(
    user: User = Depends(get_current_user),  # noqa: ARG001
    session: AsyncSession = Depends(get_session),
):
    """Return all sources, highest priority first, then by name."""
    query = select(Source).order_by(Source.priority.desc(), Source.name)
    result = await session.execute(query)
    return list(map(SourceOut.model_validate, result.scalars()))

View File

@@ -0,0 +1,68 @@
"""/subscriptions 关键词订阅。"""
from __future__ import annotations
from fastapi import APIRouter, Depends, HTTPException, status
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.core.deps import get_current_user
from app.database import get_session
from app.models.subscription import Subscription
from app.models.user import User
from app.schemas.misc import SubscriptionIn, SubscriptionOut
# Routes in this module are mounted under /subscriptions.
router = APIRouter(prefix="/subscriptions", tags=["subscriptions"])
@router.post("", response_model=SubscriptionOut, status_code=status.HTTP_201_CREATED)
async def create(
    body: SubscriptionIn,
    user: User = Depends(get_current_user),
    session: AsyncSession = Depends(get_session),
):
    """Create a keyword subscription owned by the current user."""
    payload_fields = ("keyword", "match_in", "channel", "target")
    values = {name: getattr(body, name) for name in payload_fields}
    record = Subscription(user_id=user.id, **values)

    session.add(record)
    await session.commit()
    await session.refresh(record)
    return SubscriptionOut.model_validate(record)
@router.get("", response_model=list[SubscriptionOut])
async def list_mine(
    user: User = Depends(get_current_user),
    session: AsyncSession = Depends(get_session),
):
    """Return the current user's subscriptions, most recently created first."""
    query = (
        select(Subscription)
        .where(Subscription.user_id == user.id)
        .order_by(Subscription.created_at.desc())
    )
    result = await session.execute(query)
    return list(map(SubscriptionOut.model_validate, result.scalars()))
@router.delete("/{sub_id}", status_code=status.HTTP_204_NO_CONTENT)
async def delete(
    sub_id: int,
    user: User = Depends(get_current_user),
    session: AsyncSession = Depends(get_session),
):
    """Delete one of the current user's subscriptions.

    Raises:
        HTTPException: 404 when no subscription with that id belongs to the
            current user.
    """
    query = select(Subscription).where(
        Subscription.id == sub_id, Subscription.user_id == user.id
    )
    target = (await session.execute(query)).scalar_one_or_none()
    if not target:
        raise HTTPException(status.HTTP_404_NOT_FOUND, "Subscription not found")

    await session.delete(target)
    await session.commit()
    return None

104
backend/app/config.py Normal file
View File

@@ -0,0 +1,104 @@
"""应用配置:从 .env / 环境变量读取,集中管理所有开关。"""
from __future__ import annotations
from functools import lru_cache
from pathlib import Path
from pydantic import Field, field_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Central application settings, read from environment variables and ``.env``.

    Variable names are matched case-insensitively and unknown variables are
    ignored. Fields declared without a default must be provided.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # ===== General =====
    tz: str = "Asia/Hong_Kong"
    log_level: str = "INFO"

    # ===== Database =====
    postgres_user: str
    postgres_password: str
    postgres_db: str
    postgres_host: str = "postgres"
    postgres_port: int = 5432

    @property
    def database_url(self) -> str:
        """Return the async SQLAlchemy URL (asyncpg driver).

        User name and password are percent-encoded so that characters such as
        ``@``, ``:``, ``/`` or ``%`` in the credentials cannot corrupt the URL.
        """
        from urllib.parse import quote

        user = quote(self.postgres_user, safe="")
        password = quote(self.postgres_password, safe="")
        return (
            f"postgresql+asyncpg://{user}:{password}"
            f"@{self.postgres_host}:{self.postgres_port}/{self.postgres_db}"
        )

    @property
    def sync_database_url(self) -> str:
        """Return the synchronous SQLAlchemy URL (psycopg2 driver) used by Alembic.

        Credentials are percent-encoded exactly as in ``database_url``.
        """
        # NOTE(review): if this value is passed to Alembic through
        # ``config.set_main_option``, any "%" must be doubled for ConfigParser —
        # confirm against alembic/env.py.
        from urllib.parse import quote

        user = quote(self.postgres_user, safe="")
        password = quote(self.postgres_password, safe="")
        return (
            f"postgresql+psycopg2://{user}:{password}"
            f"@{self.postgres_host}:{self.postgres_port}/{self.postgres_db}"
        )

    # ===== Redis =====
    redis_host: str = "redis"
    redis_port: int = 6379
    redis_password: str
    redis_db: int = 0

    @property
    def redis_url(self) -> str:
        """Return the Redis URL (password only, empty user name), password percent-encoded."""
        from urllib.parse import quote

        password = quote(self.redis_password, safe="")
        return (
            f"redis://:{password}@{self.redis_host}:{self.redis_port}/{self.redis_db}"
        )

    # ===== JWT =====
    jwt_secret: str
    jwt_algorithm: str = "HS256"
    access_token_ttl_min: int = 60
    refresh_token_ttl_day: int = 14

    # ===== Tencent Cloud TMT (machine translation) =====
    tencentcloud_secret_id: str = ""
    tencentcloud_secret_key: str = ""
    tencentcloud_region: str = "ap-hongkong"
    tencent_tmt_endpoint: str = "tmt.tencentcloudapi.com"
    tencent_tmt_quota_month: int = 5_000_000
    tencent_tmt_quota_buffer: float = 0.05
    tencent_tmt_max_chars_per_req: int = 4500

    @field_validator("tencent_tmt_quota_buffer")
    @classmethod
    def _check_buffer(cls, v: float) -> float:
        """Reject quota buffers outside the inclusive range 0.0..0.5."""
        if not 0.0 <= v <= 0.5:
            raise ValueError("buffer 必须在 0~0.5")
        return v

    # ===== Local translation =====
    local_translate_enabled: bool = False
    local_translate_model: str = "nllb-200-distilled-600M"
    local_translate_device: str = "cpu"

    # ===== Fetching =====
    fetch_global_qps: int = 4
    fetch_timeout: int = 20
    fetch_fail_pause_threshold: int = 3
    fetch_max_retries: int = 2

    # ===== Caddy / domain =====
    domain: str = ""
    acme_email: str = ""

    # ===== Internal paths (adjustable after deployment) =====
    # Default: the directory two levels above the one containing this file.
    project_root: Path = Path(__file__).resolve().parents[2]
@lru_cache
def get_settings() -> Settings:
    """Build the Settings object once and return the cached instance on later calls."""
    return Settings()  # type: ignore[call-arg]


# Module-level instance imported by the rest of the application; created at import time.
settings = get_settings()

View File

@@ -0,0 +1 @@
"""core utilities."""

77
backend/app/core/deps.py Normal file
View File

@@ -0,0 +1,77 @@
"""通用依赖:获取当前用户、要求 owner。"""
from __future__ import annotations
from datetime import datetime, timezone
from fastapi import Depends, HTTPException, status
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from jwt.exceptions import InvalidTokenError
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.core.security import decode_token, hash_api_token
from app.database import get_session
from app.models.api_token import ApiToken
from app.models.user import User, UserRole
# auto_error=False: a missing Authorization header yields None so that
# _resolve_user can raise its own 401 instead of FastAPI's default error.
_bearer = HTTPBearer(auto_error=False)
async def _resolve_user(
    creds: HTTPAuthorizationCredentials | None = Depends(_bearer),
    session: AsyncSession = Depends(get_session),
) -> User:
    """Resolve the bearer credential to an active user.

    The credential is first looked up as an API token (by its hash, among
    non-revoked tokens); if that does not produce an active user it is decoded
    as a JWT access token.

    Raises:
        HTTPException: 401 when credentials are missing, the API token has
            expired, the JWT is invalid or not an access token, or the user
            does not exist or is inactive.
    """
    if creds is None or not creds.credentials:
        raise HTTPException(status.HTTP_401_UNAUTHORIZED, "Missing credentials")
    token = creds.credentials

    # 1) Try an API token first (compared by its sha256 hash).
    h = hash_api_token(token)
    # ``await`` binds looser than attribute access, so the awaited call must be
    # parenthesised before chaining ``.scalars()`` on its result.
    api_row = (
        (
            await session.execute(
                select(ApiToken).where(ApiToken.token_hash == h, ApiToken.revoked_at.is_(None))
            )
        )
        .scalars()
        .first()
    )
    if api_row:
        if api_row.expires_at and api_row.expires_at < datetime.now(timezone.utc):
            raise HTTPException(status.HTTP_401_UNAUTHORIZED, "Token expired")
        user = (
            (await session.execute(select(User).where(User.id == api_row.user_id)))
            .scalars()
            .first()
        )
        if user and user.is_active:
            # Record the usage time; a missing or inactive owner falls through to the JWT path.
            api_row.last_used_at = datetime.now(timezone.utc)
            await session.commit()
            return user

    # 2) Fall back to a JWT access token.
    try:
        payload = decode_token(token)
        if payload.get("type") != "access":
            raise InvalidTokenError("wrong type")
        uid = int(payload["sub"])
    except (InvalidTokenError, KeyError, ValueError):
        # The client only needs to know the token was rejected, not why.
        raise HTTPException(status.HTTP_401_UNAUTHORIZED, "Invalid token") from None

    user = (
        (await session.execute(select(User).where(User.id == uid, User.is_active.is_(True))))
        .scalars()
        .first()
    )
    if user is None:
        raise HTTPException(status.HTTP_401_UNAUTHORIZED, "User not found or inactive")
    return user
async def get_current_user(user: User = Depends(_resolve_user)) -> User:
    """Public dependency: return the user resolved from the request's bearer credential."""
    return user
async def require_owner(user: User = Depends(get_current_user)) -> User:
    """Dependency that lets only users with the OWNER role through.

    Raises:
        HTTPException: 403 when the authenticated user is not an owner.
    """
    if user.role == UserRole.OWNER:
        return user
    raise HTTPException(status.HTTP_403_FORBIDDEN, "Owner only")

View File

@@ -0,0 +1,73 @@
"""鉴权核心:密码哈希 + JWT 编解码 + API Token。"""
from __future__ import annotations
import hashlib
import hmac
import secrets
from datetime import datetime, timedelta, timezone
from typing import Any
import jwt
from passlib.context import CryptContext
from app.config import settings
# bcrypt 4.0.1 is compatible with passlib 1.7.4.
# Shared password-hashing context: bcrypt with 12 rounds.
pwd_ctx = CryptContext(schemes=["bcrypt"], deprecated="auto", bcrypt__rounds=12)
def hash_password(plain: str) -> str:
    """Return the hash of ``plain`` produced by the module's password context."""
    return pwd_ctx.hash(plain)
def verify_password(plain: str, hashed: str) -> bool:
    """Check ``plain`` against ``hashed``.

    Any error raised during verification is treated as a failed match rather
    than propagated.
    """
    try:
        matched = pwd_ctx.verify(plain, hashed)
    except Exception:
        return False
    return matched
# === JWT ===
def create_access_token(subject: str | int, extra: dict[str, Any] | None = None) -> str:
    """Issue a signed JWT of type ``access`` for ``subject``.

    The token expires ``settings.access_token_ttl_min`` minutes after issue.
    Entries in ``extra`` are merged into the claims last, so they override the
    standard claims on key collision.
    """
    issued = datetime.now(timezone.utc)
    expires = issued + timedelta(minutes=settings.access_token_ttl_min)
    claims: dict[str, Any] = {
        "sub": str(subject),
        "type": "access",
        "iat": int(issued.timestamp()),
        "exp": int(expires.timestamp()),
    }
    claims.update(extra or {})
    return jwt.encode(claims, settings.jwt_secret, algorithm=settings.jwt_algorithm)
def create_refresh_token(subject: str | int) -> str:
    """Issue a signed JWT of type ``refresh`` for ``subject``.

    The token expires ``settings.refresh_token_ttl_day`` days after issue and
    carries a random ``jti`` claim.
    """
    issued = datetime.now(timezone.utc)
    expires = issued + timedelta(days=settings.refresh_token_ttl_day)
    claims = {
        "sub": str(subject),
        "type": "refresh",
        "iat": int(issued.timestamp()),
        "exp": int(expires.timestamp()),
        "jti": secrets.token_urlsafe(16),
    }
    return jwt.encode(claims, settings.jwt_secret, algorithm=settings.jwt_algorithm)
def decode_token(token: str) -> dict[str, Any]:
    """Decode ``token`` with the configured secret and algorithm and return its claims.

    Errors raised by ``jwt.decode`` are propagated to the caller.
    """
    return jwt.decode(token, settings.jwt_secret, algorithms=[settings.jwt_algorithm])
# === API Token(给 Android 用)===
def generate_api_token() -> tuple[str, str]:
    """Return ``(raw_token, token_hash)``; the raw token is meant to be shown only once."""
    raw = secrets.token_urlsafe(32)
    return raw, hash_api_token(raw)
def hash_api_token(raw: str) -> str:
    """Return the hex-encoded SHA-256 digest of ``raw``."""
    # A plain sha256 is sufficient here: the token itself is already random.
    digest = hashlib.sha256()
    digest.update(raw.encode())
    return digest.hexdigest()
def constant_time_eq(a: str, b: str) -> bool:
    """Compare two strings with ``hmac.compare_digest``.

    Both strings are encoded to UTF-8 first because ``hmac.compare_digest``
    raises ``TypeError`` for ``str`` arguments containing non-ASCII characters.
    """
    return hmac.compare_digest(a.encode("utf-8"), b.encode("utf-8"))

52
backend/app/database.py Normal file
View File

@@ -0,0 +1,52 @@
"""异步 SQLAlchemy 数据库连接。"""
from __future__ import annotations
from collections.abc import AsyncGenerator
from sqlalchemy.ext.asyncio import (
AsyncSession,
async_sessionmaker,
create_async_engine,
)
from sqlalchemy.orm import DeclarativeBase
from app.config import settings
class Base(DeclarativeBase):
    """Base class for all ORM models."""
# Process-wide async engine, created at import time from the configured database URL.
engine = create_async_engine(
    settings.database_url,
    echo=False,
    pool_size=5,
    max_overflow=10,
    pool_pre_ping=True,
    pool_recycle=1800,
)

# Session factory bound to the engine. expire_on_commit=False keeps loaded
# attributes usable after commit; autoflush is disabled.
AsyncSessionLocal = async_sessionmaker(
    bind=engine,
    class_=AsyncSession,
    expire_on_commit=False,
    autoflush=False,
)
async def get_session() -> AsyncGenerator[AsyncSession, None]:
    """FastAPI dependency: yield one session per request."""
    async with AsyncSessionLocal() as session:
        try:
            yield session
        finally:
            # Explicit close on the way out, whether or not the consumer raised.
            await session.close()
async def init_db() -> None:
    """Create all tables from the ORM metadata. For development only; use Alembic in production."""
    # import models to register them
    # NOTE(review): only three modules are named here; the remaining models are
    # presumably registered by app.models' package __init__ — confirm.
    from app.models import article, source, user  # noqa: F401

    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)

117
backend/app/main.py Normal file
View File

@@ -0,0 +1,117 @@
"""FastAPI 入口。
- 注册路由
- 启动 / 关闭事件:连接池、调度器
- CORS
- 全局异常处理
"""
from __future__ import annotations
import logging
from contextlib import asynccontextmanager
from fastapi import FastAPI, Request
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from starlette.exceptions import HTTPException as StarletteHTTPException
from app.api import admin, articles, auth, bookmarks, me, sources, subscriptions
from app.config import settings
from app.database import engine
from app.redis_client import close_redis, get_redis
# Logger for the API process.
logger = logging.getLogger("news.api")
# Root logging configuration; the level comes from settings.log_level.
logging.basicConfig(
    level=settings.log_level,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: check Redis on startup, release resources on shutdown.

    Startup pings Redis, so an unreachable Redis aborts startup. Shutdown
    closes the Redis client and disposes the database engine; both run in
    ``finally`` blocks so they happen even if the lifespan context exits with
    an error or closing Redis fails.
    """
    # Startup
    logger.info("api starting, tz=%s", settings.tz)
    # Force the Redis connection to be established now.
    await get_redis().ping()
    try:
        yield
    finally:
        # Shutdown
        logger.info("api shutting down")
        try:
            await close_redis()
        finally:
            await engine.dispose()
# The FastAPI application. Swagger UI is served at /api/docs only when
# settings.log_level is exactly "DEBUG"; ReDoc is always disabled.
app = FastAPI(
    title="Diary News",
    description="Private news aggregator",
    version="0.1.0",
    default_response_class=JSONResponse,
    lifespan=lifespan,
    docs_url="/api/docs" if settings.log_level == "DEBUG" else None,
    redoc_url=None,
)
# CORS: web page + Android client, simply left open (private deployment).
# NOTE(review): a wildcard origin together with allow_credentials=True is very
# permissive — confirm before exposing this publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # open for the MVP; tighten for production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# === 全局异常处理(RFC 7807) ===
@app.exception_handler(StarletteHTTPException)
async def http_exc_handler(request: Request, exc: StarletteHTTPException):
    """Render HTTP exceptions as a problem-details style JSON body.

    The exception's ``detail`` becomes the title when it is a string; any
    other detail type is replaced by the generic title "Error".
    """
    if isinstance(exc.detail, str):
        title = exc.detail
    else:
        title = "Error"
    problem = {
        "type": "about:blank",
        "title": title,
        "status": exc.status_code,
        "instance": str(request.url),
    }
    return JSONResponse(
        status_code=exc.status_code,
        content=problem,
        headers=exc.headers or None,
    )
@app.exception_handler(RequestValidationError)
async def validation_exc_handler(request: Request, exc: RequestValidationError):
    """Render request-validation failures as a 422 problem-details style JSON body.

    The individual validation errors are included under the ``errors`` key.
    """
    problem = {
        "type": "about:blank",
        "title": "Validation Error",
        "status": 422,
        "errors": exc.errors(),
        "instance": str(request.url),
    }
    return JSONResponse(status_code=422, content=problem)
# === 路由 ===
# Every feature router is mounted under the same versioned prefix.
API_PREFIX = "/api/v1"
app.include_router(auth.router, prefix=API_PREFIX)
app.include_router(me.router, prefix=API_PREFIX)
app.include_router(articles.router, prefix=API_PREFIX)
app.include_router(sources.router, prefix=API_PREFIX)
app.include_router(bookmarks.router, prefix=API_PREFIX)
app.include_router(subscriptions.router, prefix=API_PREFIX)
app.include_router(admin.router, prefix=API_PREFIX)
# === 健康检查 ===
@app.get("/healthz", include_in_schema=False)
async def healthz():
    """Health check: 200 with status "ok" when Redis answers a ping, else 503 "degraded"."""
    try:
        await get_redis().ping()
    except Exception as err:
        degraded = {"status": "degraded", "redis": str(err)}
        return JSONResponse(degraded, status_code=503)
    else:
        return {"status": "ok"}
@app.get("/", include_in_schema=False)
async def root():
    """Return the service name, version and the docs URL.

    ``docs`` reflects the application's actual ``docs_url``, so it is ``None``
    when the interactive docs are disabled instead of pointing at a path that
    does not exist.
    """
    return {"name": "diary-news", "version": app.version, "docs": app.docs_url}

View File

@@ -0,0 +1,21 @@
"""所有 ORM 模型。
新模型请在这里 import,确保 Alembic 自动发现。
"""
from app.models.api_token import ApiToken # noqa: F401
from app.models.article import Article # noqa: F401
from app.models.bookmark import Bookmark # noqa: F401
from app.models.source import Source, SourceKind # noqa: F401
from app.models.subscription import Subscription # noqa: F401
from app.models.user import User, UserRole # noqa: F401
# Public names re-exported by this package (every model and enum imported above).
__all__ = [
    "ApiToken",
    "Article",
    "Bookmark",
    "Source",
    "SourceKind",
    "Subscription",
    "User",
    "UserRole",
]

View File

@@ -0,0 +1,27 @@
"""API Token(给 Android 用,可独立撤销)。"""
from __future__ import annotations
from datetime import datetime
from sqlalchemy import DateTime, ForeignKey, String, func
from sqlalchemy.orm import Mapped, mapped_column
from app.database import Base
class ApiToken(Base):
    """API token owned by a user; only a hash of the token is persisted."""

    __tablename__ = "api_tokens"

    id: Mapped[int] = mapped_column(primary_key=True)
    # Owning user; the foreign key is declared with ON DELETE CASCADE.
    user_id: Mapped[int] = mapped_column(
        ForeignKey("users.id", ondelete="CASCADE"), nullable=False, index=True
    )
    name: Mapped[str] = mapped_column(String(64), nullable=False)  # label, e.g. "Xiaomi-14"
    token_hash: Mapped[str] = mapped_column(String(128), unique=True, nullable=False, index=True)
    # Only the hash is stored; the raw token is returned to the user once.

    last_used_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
    expires_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
    revoked_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )

View File

@@ -0,0 +1,91 @@
"""文章主表:原文 + 译文 + ML 字段预留。"""
from __future__ import annotations
from datetime import datetime
from sqlalchemy import (
BigInteger,
DateTime,
Float,
ForeignKey,
Index,
Integer,
String,
Text,
func,
)
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.database import Base
class Article(Base):
    """Main article table: original text, translation, and reserved ML fields."""

    __tablename__ = "articles"

    id: Mapped[int] = mapped_column(BigInteger, primary_key=True)

    # === Source ===
    source_id: Mapped[int] = mapped_column(
        ForeignKey("sources.id", ondelete="CASCADE"), nullable=False, index=True
    )
    # The owning source is loaded with a join together with the article.
    source: Mapped["Source"] = relationship(back_populates="articles", lazy="joined")  # noqa: F821

    # === Original identifiers ===
    url: Mapped[str] = mapped_column(Text, nullable=False)
    url_hash: Mapped[str] = mapped_column(String(40), unique=True, nullable=False, index=True)
    guid: Mapped[str | None] = mapped_column(String(255), index=True)  # ID supplied by the source site

    # === Original content ===
    title: Mapped[str] = mapped_column(Text, nullable=False)
    body_html: Mapped[str | None] = mapped_column(Text)  # extracted HTML, structure preserved
    body_text: Mapped[str] = mapped_column(Text, nullable=False, default="")
    lang_src: Mapped[str | None] = mapped_column(String(8))
    author: Mapped[str | None] = mapped_column(String(255))
    image_url: Mapped[str | None] = mapped_column(Text)

    # === Translation ===
    title_zh: Mapped[str | None] = mapped_column(Text)
    body_zh_html: Mapped[str | None] = mapped_column(Text)
    body_zh_text: Mapped[str | None] = mapped_column(Text)
    summary_zh: Mapped[str | None] = mapped_column(Text)

    # === Translation status ===
    translation_status: Mapped[str] = mapped_column(
        String(16), default="pending", nullable=False, index=True
    )
    # pending / ok / partial / failed / n/a
    translation_engine: Mapped[str | None] = mapped_column(String(16))
    # tencent / nllb / cache / skip
    translation_chars: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    translated_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))

    # === ML fields (reserved; all null in the MVP) ===
    category: Mapped[str | None] = mapped_column(String(32), index=True)
    commentary: Mapped[str | None] = mapped_column(Text)
    entities: Mapped[dict | None] = mapped_column(JSONB)
    sentiment: Mapped[float | None] = mapped_column(Float)
    topic_id: Mapped[str | None] = mapped_column(String(64), index=True)
    bias: Mapped[str | None] = mapped_column(String(16))  # left/center/right

    # === Deduplication ===
    # Self-reference to another article row; set to NULL if that row is deleted.
    duplicate_of: Mapped[int | None] = mapped_column(
        ForeignKey("articles.id", ondelete="SET NULL"), index=True
    )

    # === Timestamps ===
    published_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), index=True)
    fetched_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False, index=True
    )
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )

    # Composite indexes pairing source / translation status with publication time.
    __table_args__ = (
        Index("ix_articles_source_published", "source_id", "published_at"),
        Index("ix_articles_status_published", "translation_status", "published_at"),
    )

    def __repr__(self) -> str:
        """Return a short debug representation (id, source id, translation status)."""
        return f"<Article id={self.id} src={self.source_id} status={self.translation_status}>"

View File

@@ -0,0 +1,27 @@
"""收藏。"""
from __future__ import annotations
from datetime import datetime
from sqlalchemy import DateTime, ForeignKey, UniqueConstraint, func
from sqlalchemy.orm import Mapped, mapped_column
from app.database import Base
class Bookmark(Base):
    """A user's bookmark of an article, with an optional note."""

    __tablename__ = "bookmarks"

    id: Mapped[int] = mapped_column(primary_key=True)
    user_id: Mapped[int] = mapped_column(
        ForeignKey("users.id", ondelete="CASCADE"), nullable=False, index=True
    )
    article_id: Mapped[int] = mapped_column(
        ForeignKey("articles.id", ondelete="CASCADE"), nullable=False, index=True
    )
    note: Mapped[str | None] = mapped_column()
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )

    # A user can bookmark a given article at most once.
    __table_args__ = (UniqueConstraint("user_id", "article_id", name="uq_bookmark_user_article"),)

View File

@@ -0,0 +1,64 @@
"""采集源模型。"""
from __future__ import annotations
import enum
from datetime import datetime
from sqlalchemy import (
JSON,
Boolean,
DateTime,
Enum,
Integer,
String,
Text,
func,
)
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.database import Base
class SourceKind(str, enum.Enum):
    """Kind of collection source; members are also plain strings."""

    RSS = "rss"
    HTML_LIST = "html_list"
    TG_CHANNEL = "tg_channel"
class Source(Base):
    """A collection source: where to fetch, how often, and its last fetch state."""

    __tablename__ = "sources"

    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str] = mapped_column(String(128), nullable=False)
    slug: Mapped[str] = mapped_column(String(128), unique=True, index=True, nullable=False)
    kind: Mapped[SourceKind] = mapped_column(
        Enum(SourceKind, name="source_kind"),
        default=SourceKind.RSS,
        nullable=False,
    )
    url: Mapped[str] = mapped_column(Text, nullable=False)
    detail_selector: Mapped[dict | None] = mapped_column(JSON)

    fetch_interval_min: Mapped[int] = mapped_column(Integer, default=60, nullable=False)
    fetch_cron: Mapped[str | None] = mapped_column(String(64))  # 5-field cron expression
    translate_to: Mapped[str] = mapped_column(String(8), default="zh", nullable=False)
    enabled: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False)

    region: Mapped[str | None] = mapped_column(String(32), index=True)
    language_src: Mapped[str | None] = mapped_column(String(8))
    priority: Mapped[int] = mapped_column(Integer, default=50, nullable=False, index=True)
    headers_json: Mapped[dict | None] = mapped_column(JSON)

    last_fetched_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
    last_status: Mapped[str | None] = mapped_column(String(64))
    consecutive_failures: Mapped[int] = mapped_column(Integer, default=0, nullable=False)

    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False
    )

    # lazy="noload": the articles collection is not loaded when it is accessed.
    articles: Mapped[list["Article"]] = relationship(  # noqa: F821
        back_populates="source", cascade="all, delete-orphan", lazy="noload"
    )

    def __repr__(self) -> str:
        """Return a short debug representation (id, slug, kind)."""
        return f"<Source id={self.id} slug={self.slug} kind={self.kind.value}>"

View File

@@ -0,0 +1,48 @@
"""关键词订阅(命中即通知)。"""
from __future__ import annotations
import enum
from datetime import datetime
from sqlalchemy import (
Boolean,
DateTime,
Enum,
ForeignKey,
String,
Text,
func,
)
from sqlalchemy.orm import Mapped, mapped_column
from app.database import Base
class SubscriptionMatch(str, enum.Enum):
    """Which part of an article a subscription keyword is matched against."""

    ANY = "any"  # title or body
    TITLE = "title"
    BODY = "body"
class Subscription(Base):
    """Keyword subscription of a user, with the channel and target used for notification."""

    __tablename__ = "subscriptions"

    id: Mapped[int] = mapped_column(primary_key=True)
    user_id: Mapped[int] = mapped_column(
        ForeignKey("users.id", ondelete="CASCADE"), nullable=False, index=True
    )
    keyword: Mapped[str] = mapped_column(String(255), nullable=False)
    # Plain keyword; matching is meant to use ILIKE '%kw%'. Regex/Lucene may be added later.
    match_in: Mapped[SubscriptionMatch] = mapped_column(
        Enum(SubscriptionMatch, name="subscription_match"),
        default=SubscriptionMatch.ANY,
        nullable=False,
    )
    channel: Mapped[str] = mapped_column(String(32), default="telegram", nullable=False)
    # telegram / email / web
    target: Mapped[str | None] = mapped_column(Text)  # chat_id / email
    enabled: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False)
    last_hit_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )

View File

@@ -0,0 +1,41 @@
"""用户模型。
Phase 1 仅 owner + member 两级,后续扩展。
"""
from __future__ import annotations
import enum
from datetime import datetime
from sqlalchemy import Boolean, DateTime, Enum, String, func
from sqlalchemy.orm import Mapped, mapped_column
from app.database import Base
class UserRole(str, enum.Enum):
    """User role; members are also plain strings."""

    OWNER = "owner"
    MEMBER = "member"
class User(Base):
    """Application user with login credentials, role and activity flag."""

    __tablename__ = "users"

    id: Mapped[int] = mapped_column(primary_key=True)
    username: Mapped[str] = mapped_column(String(64), unique=True, index=True, nullable=False)
    email: Mapped[str | None] = mapped_column(String(255), unique=True, index=True)
    password_hash: Mapped[str] = mapped_column(String(255), nullable=False)
    # New users default to the MEMBER role.
    role: Mapped[UserRole] = mapped_column(
        Enum(UserRole, name="user_role"),
        default=UserRole.MEMBER,
        nullable=False,
    )
    is_active: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False)
    display_name: Mapped[str | None] = mapped_column(String(128))
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
    last_login_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))

    def __repr__(self) -> str:
        """Return a short debug representation (id, username, role)."""
        return f"<User id={self.id} username={self.username} role={self.role.value}>"

View File

@@ -0,0 +1,31 @@
"""Redis 客户端(单例)。用于:
- 翻译缓存
- 翻译字符配额(月度)
- 限流(后续)
"""
from __future__ import annotations
import redis.asyncio as redis_async
from app.config import settings
# Lazily created module-level Redis client; None until get_redis() is first called.
_pool: redis_async.Redis | None = None
def get_redis() -> redis_async.Redis:
    """Return the shared Redis client, creating it from ``settings.redis_url`` on first use."""
    global _pool
    if _pool is not None:
        return _pool
    _pool = redis_async.from_url(
        settings.redis_url,
        encoding="utf-8",
        decode_responses=True,
        max_connections=20,
    )
    return _pool
async def close_redis() -> None:
    """Close the shared Redis client, if one was created, and forget it."""
    global _pool
    if _pool is None:
        return
    await _pool.aclose()
    _pool = None

View File

@@ -0,0 +1 @@
"""Pydantic schemas for API I/O."""

View File

@@ -0,0 +1,83 @@
"""Article schemas."""
from __future__ import annotations
from datetime import datetime
from pydantic import BaseModel, ConfigDict, Field
class SourceBrief(BaseModel):
    """Minimal source information embedded in article responses."""

    # from_attributes: instances can be built from attribute-bearing objects.
    model_config = ConfigDict(from_attributes=True)
    id: int
    name: str
    slug: str
    region: str | None = None
class ArticleListItem(BaseModel):
    """List item: a reduced set of article fields."""

    model_config = ConfigDict(from_attributes=True)
    id: int
    source: SourceBrief
    title: str
    title_zh: str | None = None
    summary_zh: str | None = None
    lang_src: str | None = None
    translation_status: str
    category: str | None = None
    published_at: datetime | None = None
    fetched_at: datetime
    image_url: str | None = None
    is_starred: bool = False
class ArticleDetail(BaseModel):
    """Full article payload: original text, translation, status and ML fields."""

    model_config = ConfigDict(from_attributes=True)
    id: int
    source: SourceBrief
    url: str
    title: str
    body_html: str | None = None
    body_text: str
    title_zh: str | None = None
    body_zh_html: str | None = None
    body_zh_text: str | None = None
    summary_zh: str | None = None
    lang_src: str | None = None
    author: str | None = None
    image_url: str | None = None
    translation_status: str
    translation_engine: str | None = None
    translated_at: datetime | None = None
    category: str | None = None
    commentary: str | None = None
    entities: dict | None = None
    sentiment: float | None = None
    duplicate_of: int | None = None
    published_at: datetime | None = None
    fetched_at: datetime
    is_starred: bool = False
class ArticleListResponse(BaseModel):
    """A page of article list items with an optional cursor and total."""

    items: list[ArticleListItem]
    next_cursor: str | None = None
    total: int | None = None
class ArticleQuery(BaseModel):
    """Reference model for the list query parameters (the routes use FastAPI ``Query`` directly)."""

    since: datetime | None = None
    until: datetime | None = None
    source: str | None = None  # comma-separated slugs
    category: str | None = None
    q: str | None = None
    lang: str = Field(default="both", pattern=r"^(src|zh|both)$")
    limit: int = Field(default=50, ge=1, le=200)
    cursor: str | None = None
    starred_only: bool = False

View File

@@ -0,0 +1,20 @@
"""Auth schemas."""
from __future__ import annotations
from pydantic import BaseModel, Field
class LoginRequest(BaseModel):
    """Username/password payload; the password must be 6 to 128 characters long."""

    username: str = Field(min_length=1, max_length=64)
    password: str = Field(min_length=6, max_length=128)
class TokenPair(BaseModel):
    """Access/refresh token pair returned to the client."""

    access_token: str
    refresh_token: str
    token_type: str = "bearer"
    expires_in: int  # seconds
class RefreshRequest(BaseModel):
    """Payload carrying a refresh token."""

    refresh_token: str

View File

@@ -0,0 +1,43 @@
"""Bookmark / Subscription schemas."""
from __future__ import annotations
from datetime import datetime
from pydantic import BaseModel, ConfigDict, Field
from app.models.subscription import SubscriptionMatch
class BookmarkIn(BaseModel):
    """Input for creating a bookmark: the article and an optional note."""

    article_id: int
    note: str | None = None
class BookmarkOut(BaseModel):
    """Bookmark as returned by the API."""

    model_config = ConfigDict(from_attributes=True)
    id: int
    user_id: int
    article_id: int
    note: str | None = None
    created_at: datetime
class SubscriptionIn(BaseModel):
    """Input for creating a keyword subscription.

    String lengths mirror the column sizes of the subscriptions table
    (keyword 255, channel 32) so oversized values are rejected during
    validation instead of failing at the database.
    """

    keyword: str = Field(min_length=1, max_length=255)
    match_in: SubscriptionMatch = SubscriptionMatch.ANY
    channel: str = Field(default="telegram", min_length=1, max_length=32)
    target: str | None = None
class SubscriptionOut(BaseModel):
    """Subscription as returned by the API (the owning user id is not included)."""

    model_config = ConfigDict(from_attributes=True)
    id: int
    keyword: str
    match_in: SubscriptionMatch
    channel: str
    target: str | None = None
    enabled: bool
    last_hit_at: datetime | None = None
    created_at: datetime

View File

@@ -0,0 +1,51 @@
"""Source schemas."""
from __future__ import annotations
from datetime import datetime
from pydantic import BaseModel, ConfigDict, Field, HttpUrl
from app.models.source import SourceKind
class SourceOut(BaseModel):
    """Source as returned by the API, including its last fetch state."""

    model_config = ConfigDict(from_attributes=True)
    id: int
    name: str
    slug: str
    kind: SourceKind
    url: str
    enabled: bool
    region: str | None = None
    language_src: str | None = None
    priority: int
    fetch_interval_min: int
    translate_to: str
    last_fetched_at: datetime | None = None
    last_status: str | None = None
    consecutive_failures: int = 0
class SourceIn(BaseModel):
    """Input for creating a source.

    String lengths mirror the column sizes of the sources table (name/slug
    128, region 32, language_src 8, translate_to 8) so oversized values are
    rejected during validation instead of failing at the database.
    """

    name: str = Field(min_length=1, max_length=128)
    # Slugs are restricted to lowercase letters, digits and hyphens.
    slug: str = Field(min_length=1, max_length=128, pattern=r"^[a-z0-9-]+$")
    kind: SourceKind = SourceKind.RSS
    url: HttpUrl
    region: str | None = Field(default=None, max_length=32)
    language_src: str | None = Field(default=None, max_length=8)
    priority: int = Field(default=50, ge=1, le=100)
    fetch_interval_min: int = Field(default=60, ge=5, le=1440)
    translate_to: str = Field(default="zh", min_length=1, max_length=8)
    enabled: bool = True
    detail_selector: dict | None = None
    headers_json: dict | None = None
class SourceUpdate(BaseModel):
    """Partial update of a source; every field is optional.

    Constraints match ``SourceIn`` and the column sizes of the sources table
    (name 128, region 32, translate_to 8), so an update cannot store a value
    that creation would have rejected.
    """

    name: str | None = Field(default=None, min_length=1, max_length=128)
    enabled: bool | None = None
    priority: int | None = Field(default=None, ge=1, le=100)
    fetch_interval_min: int | None = Field(default=None, ge=5, le=1440)
    region: str | None = Field(default=None, max_length=32)
    translate_to: str | None = Field(default=None, min_length=1, max_length=8)

View File

@@ -0,0 +1 @@
"""命令行脚本集合。"""

View File

@@ -0,0 +1,56 @@
"""创建用户(默认 owner)。"""
from __future__ import annotations
import argparse
import asyncio
import sys
from getpass import getpass
from sqlalchemy import select
from app.core.security import hash_password
from app.database import AsyncSessionLocal
from app.models.user import User, UserRole
async def main(username: str, password: str, email: str | None, role: UserRole) -> int:
    """Create an active user unless the username is already taken.

    Returns:
        0 after the user has been created, 1 when a user with ``username``
        already exists (reported on stderr).
    """
    async with AsyncSessionLocal() as session:
        lookup = select(User).where(User.username == username)
        exists = (await session.execute(lookup)).scalar_one_or_none()
        if exists:
            print(f"user '{username}' already exists (id={exists.id})", file=sys.stderr)
            return 1

        # Only the password hash is stored.
        account = User(
            username=username,
            email=email,
            password_hash=hash_password(password),
            role=role,
            is_active=True,
        )
        session.add(account)
        await session.commit()
        await session.refresh(account)
        print(f"created user id={account.id} username={account.username} role={account.role.value}")
        return 0
def cli() -> None:
    """Parse command-line arguments, obtain a password and create the user.

    When ``--password`` is omitted the password is read interactively twice.
    The minimum length of 6 characters is enforced for both the interactive
    and the ``--password`` path, so a user cannot be created with a password
    that the login schema (min_length=6) would reject. Exits with status 2 on
    a password problem, otherwise with the return code of ``main``.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--username", required=True)
    p.add_argument("--password", default=None, help="缺省则交互输入")
    p.add_argument("--email", default=None)
    p.add_argument("--role", choices=["owner", "member"], default="member")
    args = p.parse_args()

    password = args.password
    if not password:
        pw1 = getpass("password: ")
        pw2 = getpass("password (again): ")
        if pw1 != pw2:
            print("passwords differ or too short", file=sys.stderr)
            sys.exit(2)
        password = pw1
    # Applies to both input paths.
    if len(password) < 6:
        print("passwords differ or too short", file=sys.stderr)
        sys.exit(2)

    rc = asyncio.run(main(args.username, password, args.email, UserRole(args.role)))
    sys.exit(rc)
if __name__ == "__main__":
cli()

View File

@@ -0,0 +1,114 @@
"""种子:导入 MVP 5 源。
- Reuters World
- BBC World
- Al Jazeera
- NHK World
- DW
RSS 链接为公开 feed,实际链接可能变更;若 fetch 失败,先看 /admin/health。
"""
from __future__ import annotations
import asyncio
import sys
from sqlalchemy import select
from sqlalchemy.dialects.postgresql import insert as pg_insert
from sqlalchemy.exc import IntegrityError
from app.database import AsyncSessionLocal
from app.models.source import Source, SourceKind
# Seed rows for the five MVP sources; each dict is passed as column values to an
# INSERT into the sources table.
SEEDS = [
    {
        "name": "Reuters World",
        "slug": "reuters-world",
        "kind": SourceKind.RSS,
        # NOTE(review): this Reuters feed host may no longer be served — confirm the URL.
        "url": "https://feeds.reuters.com/Reuters/worldNews",
        "region": "global",
        "language_src": "en",
        "priority": 90,
        "fetch_interval_min": 30,
        "translate_to": "zh",
        "enabled": True,
    },
    {
        "name": "BBC World",
        "slug": "bbc-world",
        "kind": SourceKind.RSS,
        "url": "https://feeds.bbci.co.uk/news/world/rss.xml",
        "region": "global",
        "language_src": "en",
        "priority": 85,
        "fetch_interval_min": 30,
        "translate_to": "zh",
        "enabled": True,
    },
    {
        "name": "Al Jazeera",
        "slug": "aljazeera",
        "kind": SourceKind.RSS,
        "url": "https://www.aljazeera.com/xml/rss/all.xml",
        "region": "mena",
        "language_src": "en",
        "priority": 80,
        "fetch_interval_min": 45,
        "translate_to": "zh",
        "enabled": True,
    },
    {
        "name": "NHK World",
        "slug": "nhk-world",
        "kind": SourceKind.RSS,
        # NOTE(review): cat0.xml looks like NHK's Japanese-language feed while
        # language_src below is "en" — confirm the URL/language pairing.
        "url": "https://www3.nhk.or.jp/rss/news/cat0.xml",
        "region": "asia",
        "language_src": "en",
        "priority": 70,
        "fetch_interval_min": 60,
        "translate_to": "zh",
        "enabled": True,
    },
    {
        "name": "DW (Deutsche Welle)",
        "slug": "dw",
        "kind": SourceKind.RSS,
        "url": "https://rss.dw.com/xml/rss-en-all",
        "region": "eu",
        "language_src": "en",
        "priority": 70,
        "fetch_interval_min": 60,
        "translate_to": "zh",
        "enabled": True,
    },
]
async def main() -> int:
    """Insert the seed sources, skipping slugs that already exist.

    Each row is inserted inside its own SAVEPOINT, so an ``IntegrityError`` on
    one row rolls back only that row. Previously the whole transaction was
    rolled back, silently discarding earlier rows that had already been
    counted and reported as inserted.

    Returns:
        Always 0.
    """
    async with AsyncSessionLocal() as session:
        inserted = 0
        for row in SEEDS:
            # ON CONFLICT DO NOTHING on slug: RETURNING yields no row for an existing slug.
            stmt = (
                pg_insert(Source)
                .values(**row)
                .on_conflict_do_nothing(index_elements=["slug"])
                .returning(Source.id)
            )
            try:
                async with session.begin_nested():
                    r = await session.execute(stmt)
                    rid = r.scalar_one_or_none()
            except IntegrityError as e:
                # The savepoint has been rolled back; earlier rows are kept.
                print(f"  ! {row['slug']}: {e}", file=sys.stderr)
                continue
            if rid is not None:
                inserted += 1
                print(f"  + {row['slug']} (id={rid})")
            else:
                print(f"  = {row['slug']} (already exists)")
        await session.commit()
        print(f"seeded {inserted} new source(s)")
    return 0
if __name__ == "__main__":
sys.exit(asyncio.run(main()))

View File

@@ -0,0 +1 @@
"""Services (fetchers / translation)."""

View File

@@ -0,0 +1,12 @@
"""Fetcher implementations."""
from app.services.fetchers.base import BaseFetcher, FetchedItem
from app.services.fetchers.rss import RSSFetcher
__all__ = ["BaseFetcher", "FetchedItem", "RSSFetcher"]
def get_fetcher(kind: str, **kwargs) -> BaseFetcher:
    """Build the fetcher for ``kind``, forwarding ``kwargs`` to its constructor.

    Raises:
        NotImplementedError: for every kind other than "rss"
            (html_list / tg_channel are planned for Phase 2).
    """
    if kind != "rss":
        raise NotImplementedError(f"fetcher not implemented for kind={kind}")
    return RSSFetcher(**kwargs)

View File

@@ -0,0 +1,67 @@
"""Fetcher 抽象基类 + 通用工具。"""
from __future__ import annotations
import hashlib
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any
import httpx
from app.config import settings
def normalize_url(url: str) -> str:
    """Return a canonical form of ``url`` for hashing.

    Surrounding whitespace, the fragment, ``utm_*`` query parameters
    (case-insensitive) and trailing slashes of the path are removed; scheme
    and network location are lower-cased. An empty path becomes "/".
    """
    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    parts = urlsplit(url.strip())

    # Keep every query pair except tracking parameters; blank values are preserved.
    kept = []
    for key, value in parse_qsl(parts.query, keep_blank_values=True):
        if key.lower().startswith("utm_"):
            continue
        kept.append((key, value))

    trimmed_path = parts.path.rstrip("/")
    if not trimmed_path:
        trimmed_path = "/"

    # The last component (fragment) is always dropped.
    return urlunsplit(
        (parts.scheme.lower(), parts.netloc.lower(), trimmed_path, urlencode(kept), "")
    )
def url_hash(url: str) -> str:
    """Return the hex SHA-1 digest of the normalized URL."""
    return hashlib.sha1(normalize_url(url).encode()).hexdigest()
@dataclass
class FetchedItem:
    """Uniform result structure: one entry waiting to be stored."""

    url: str
    title: str
    body_html: str | None = None
    body_text: str = ""
    published_at: datetime | None = None
    lang: str | None = None
    author: str | None = None
    image_url: str | None = None
    guid: str | None = None
    # Each instance gets its own empty dict by default.
    raw: dict[str, Any] = field(default_factory=dict)
class BaseFetcher(ABC):
    """Abstract fetcher bound to one URL, with a shared HTTP GET helper."""

    def __init__(self, url: str, headers: dict | None = None):
        """Store the URL and request headers; a default User-Agent is used when none are given."""
        self.url = url
        if headers:
            self.headers = headers
        else:
            self.headers = {"User-Agent": "DiaryNews/0.1 (+https://github.com/)"}

    @abstractmethod
    async def fetch(self) -> list[FetchedItem]:
        """Fetch and parse the source, returning a list of FetchedItem."""

    async def _http_get(self) -> bytes:
        """GET ``self.url`` and return the response body.

        Redirects are followed, the timeout comes from ``settings.fetch_timeout``
        and an error HTTP status raises via ``raise_for_status``.
        """
        client_options = {
            "timeout": settings.fetch_timeout,
            "follow_redirects": True,
            "headers": self.headers,
        }
        async with httpx.AsyncClient(**client_options) as client:
            response = await client.get(self.url)
            response.raise_for_status()
            return response.content

View File

@@ -0,0 +1,100 @@
"""RSS / Atom fetcher(基于 feedparser)。"""
from __future__ import annotations
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
import feedparser
from dateutil import parser as dtp
from app.services.fetchers.base import BaseFetcher, FetchedItem
class RSSFetcher(BaseFetcher):
    """Fetcher for RSS / Atom feeds, parsed with feedparser."""

    async def fetch(self) -> list[FetchedItem]:
        """Download the feed and convert every usable entry into a FetchedItem.

        Entries without a link/id or without a title are skipped.

        Raises:
            RuntimeError: when feedparser reports a malformed feed and no
                entry could be recovered from it.
        """
        raw = await self._http_get()
        # Hand feedparser the raw bytes so it can detect the encoding itself
        # (BOM / XML declaration). Force-decoding as UTF-8 beforehand corrupts
        # feeds published in any other encoding.
        feed = feedparser.parse(raw)
        if feed.bozo and not feed.entries:
            # Nothing at all could be parsed.
            raise RuntimeError(f"RSS parse failed: {feed.bozo_exception}")

        feed_lang = feed.feed.get("language")
        items: list[FetchedItem] = []
        for e in feed.entries:
            url = e.get("link") or e.get("id")
            if not url:
                continue
            title = (e.get("title") or "").strip()
            if not title:
                continue
            body_html, body_text = self._extract_body(e)
            items.append(
                FetchedItem(
                    url=url,
                    title=title,
                    body_html=body_html,
                    body_text=body_text,
                    published_at=_parse_dt(
                        e.get("published") or e.get("updated") or e.get("created")
                    ),
                    lang=e.get("language") or feed_lang,
                    author=e.get("author"),
                    image_url=self._extract_image(e),
                    guid=e.get("id") or e.get("guid"),
                )
            )
        return items

    @staticmethod
    def _extract_body(entry) -> tuple[str | None, str]:
        """Return ``(body_html, body_text)`` for one feed entry."""
        body_html = None
        contents = entry.get("content")
        if contents:
            # Prefer the longest content variant; a missing value counts as empty.
            longest = max(contents, key=lambda c: len(c.get("value") or ""))
            body_html = longest.get("value")
        if not body_html:
            body_html = entry.get("summary")

        body_text = ""
        if body_html:
            from bs4 import BeautifulSoup

            soup = BeautifulSoup(body_html, "lxml")
            # Drop non-content elements before extracting text.
            for tag in soup(["script", "style", "noscript"]):
                tag.decompose()
            body_text = soup.get_text(separator="\n", strip=True)
        return body_html, body_text

    @staticmethod
    def _extract_image(entry) -> str | None:
        """Return the first image URL found in media content, thumbnails or enclosures."""
        for key in ("media_content", "media_thumbnail"):
            media = entry.get(key)
            if not media:
                continue
            try:
                candidate = media[0].get("url")
            except (IndexError, KeyError, TypeError):
                candidate = None
            if candidate:
                return candidate

        for enc in entry.get("enclosures") or []:
            # The first image enclosure wins, even if it carries no URL.
            if (enc.get("type") or "").startswith("image/"):
                return enc.get("href") or enc.get("url")
        return None
def _parse_dt(s: str | None) -> datetime | None:
if not s:
return None
try:
dt = dtp.parse(s)
except (ValueError, TypeError, dtp.ParserError):
try:
dt = parsedate_to_datetime(s)
except Exception:
return None
if dt is None:
return None
if dt.tzinfo is None:
dt = dt.replace(tzinfo=timezone.utc)
return dt.astimezone(timezone.utc)

View File

@@ -0,0 +1 @@
"""Translation services."""

View File

@@ -0,0 +1,26 @@
"""翻译后端抽象。"""
from __future__ import annotations
from abc import ABC, abstractmethod
from dataclasses import dataclass
@dataclass
class TranslationResult:
    """Outcome of a single translation call."""

    # Resulting text.
    text: str
    # Label of the engine that produced the result.
    engine: str
    # Character count associated with the request.
    chars: int
    # True when the result was served from a cache.
    cached: bool = False
class BaseTranslator(ABC):
    """Interface implemented by every translation backend."""

    # Engine identifier; concrete backends override it.
    name: str = "base"

    @abstractmethod
    async def translate(
        self,
        text: str,
        source: str = "auto",
        target: str = "zh",
    ) -> TranslationResult:
        """Translate *text* from *source* to *target*; raise an exception on failure."""
def count_chars(s: str) -> int:
    """Return the number of Unicode code points in *s*.

    Used as an approximate character count; Tencent TMT bills by characters.
    """
    code_points = len(s)
    return code_points

View File

@@ -0,0 +1,62 @@
"""本地翻译(降级用,需要 transformers + 模型文件)。
默认关闭。启用方式:
- LOCAL_TRANSLATE_ENABLED=true
- 容器内预装模型(Volume 挂载)
"""
from __future__ import annotations
import logging
from app.config import settings
from app.services.translation.base import BaseTranslator, TranslationResult
logger = logging.getLogger("news.translate.local")
class LocalTranslator(BaseTranslator):
    """Local fallback translator backed by a transformers translation pipeline.

    Construction fails unless ``settings.local_translate_enabled`` is set.
    The model itself is loaded lazily on the first translation.
    """

    name = "nllb"

    # Two-letter language codes -> FLORES-200 codes expected by NLLB models.
    _NLLB_CODES = {
        "en": "eng_Latn",
        "zh": "zho_Hans",
        "ja": "jpn_Jpan",
        "ko": "kor_Hang",
        "fr": "fra_Latn",
        "de": "deu_Latn",
        "es": "spa_Latn",
        "ru": "rus_Cyrl",
        "ar": "arb_Arab",
    }

    def __init__(self):
        if not settings.local_translate_enabled:
            raise RuntimeError("LocalTranslator disabled in settings")
        # Lazy model handle: avoids loading a large model at import/construction time.
        self._pipe = None

    def _ensure_loaded(self):
        """Load tokenizer, model and pipeline once; later calls are no-ops."""
        if self._pipe is not None:
            return
        from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

        model_name = settings.local_translate_model
        logger.info("loading local translation model: %s", model_name)
        tok = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        self._pipe = pipeline(
            "translation",
            model=model,
            tokenizer=tok,
            device=settings.local_translate_device,
        )

    @classmethod
    def _to_nllb(cls, code: str, default: str) -> str:
        """Map a language tag such as ``en``, ``en-US`` or ``zh_TW`` to an NLLB code.

        Empty or ``auto`` yields *default*. Tags that are not recognised
        (including codes already in NLLB form like ``eng_Latn``) are returned
        unchanged.
        """
        if not code or code == "auto":
            return default
        tag = code.strip().replace("_", "-").lower()
        if tag in ("zh-tw", "zh-hk", "zh-hant"):
            return "zho_Hant"
        primary = tag.split("-", 1)[0]
        return cls._NLLB_CODES.get(primary, code)

    async def translate(
        self, text: str, source: str = "auto", target: str = "zh"
    ) -> TranslationResult:
        """Translate *text* with the local model, running inference in the default executor.

        Blank input is returned unchanged with ``chars=0``. An ``auto`` source
        is treated as English, matching the previous behaviour.
        """
        if not text.strip():
            return TranslationResult(text=text, engine=self.name, chars=0)
        self._ensure_loaded()
        import asyncio

        loop = asyncio.get_running_loop()
        src = self._to_nllb(source, default="eng_Latn")
        tgt = self._to_nllb(target, default="zho_Hans")
        out = await loop.run_in_executor(
            None,
            lambda: self._pipe(text, src_lang=src, tgt_lang=tgt, max_length=2000),
        )
        return TranslationResult(
            text=out[0]["translation_text"], engine=self.name, chars=len(text)
        )

View File

@@ -0,0 +1,146 @@
"""翻译服务门面:配额检查 + 缓存 + 引擎选择 + 月度计数。"""
from __future__ import annotations
import asyncio
import hashlib
import logging
from datetime import datetime, timezone
from typing import Protocol
from app.config import settings
from app.redis_client import get_redis
from app.services.translation.base import BaseTranslator, TranslationResult
from app.services.translation.local import LocalTranslator
from app.services.translation.tencent import TencentTranslator
logger = logging.getLogger("news.translate.service")
# 缓存 key
def _cache_key(text: str, src: str, tgt: str) -> str:
h = hashlib.sha1(f"{src}|{tgt}|{text}".encode()).hexdigest()
return f"translation:cache:{h}"
def _month_key() -> str:
now = datetime.now(timezone.utc)
return f"translation:month:{now:%Y%m}"
class TranslationService:
    """Translation facade: cache lookup, quota check, engine selection, monthly usage counting."""

    def __init__(self):
        self._tencent: BaseTranslator | None = None
        self._local: BaseTranslator | None = None
        self._sem = asyncio.Semaphore(3)  # caps concurrent engine calls

    def _primary(self) -> BaseTranslator:
        """Return the lazily created Tencent translator."""
        if self._tencent is None:
            self._tencent = TencentTranslator()
        return self._tencent

    def _fallback(self) -> BaseTranslator | None:
        """Return the local translator, or None when it is disabled or failed to initialise."""
        if self._local is None and settings.local_translate_enabled:
            try:
                self._local = LocalTranslator()
            except Exception as e:
                logger.warning("local translator init failed: %s", e)
                self._local = None
        return self._local

    async def can_use_tencent(self, chars: int) -> bool:
        """Return True when credentials exist and *chars* still fits the buffered monthly quota."""
        # TencentTranslator needs both id and key; checking both here keeps
        # _primary() from raising when only one of them is configured.
        if not settings.tencentcloud_secret_id or not settings.tencentcloud_secret_key:
            return False
        r = get_redis()
        used = int(await r.get(_month_key()) or 0)
        buffered = int(
            settings.tencent_tmt_quota_month * (1 - settings.tencent_tmt_quota_buffer)
        )
        return (used + chars) <= buffered

    async def add_usage(self, chars: int) -> None:
        """Add *chars* to this month's usage counter and refresh its expiry."""
        r = get_redis()
        key = _month_key()
        now = datetime.now(timezone.utc)
        if now.month == 12:
            next_month = now.replace(year=now.year + 1, month=1, day=1)
        else:
            next_month = now.replace(month=now.month + 1, day=1)
        # Keep the counter until one day past the 1st of next month.
        ttl = int((next_month - now).total_seconds()) + 86400
        async with r.pipeline(transaction=False) as pipe:
            pipe.incrby(key, chars)
            pipe.expire(key, ttl)
            await pipe.execute()

    async def translate(
        self, text: str, source: str = "auto", target: str = "zh"
    ) -> TranslationResult:
        """Translate *text*, preferring the cache, then Tencent, then the local engine.

        Returns a result with ``engine="skip"`` (original text plus a marker)
        when no engine is available or every attempt failed. Such placeholder
        results are never cached, so a later call can retry.
        """
        if not text.strip():
            return TranslationResult(text=text, engine="skip", chars=0)
        chars = len(text)

        # 1) cache
        r = get_redis()
        ck = _cache_key(text, source, target)
        cached = await r.get(ck)
        if cached is not None:
            if isinstance(cached, bytes):
                # The client may be configured without response decoding.
                cached = cached.decode("utf-8")
            return TranslationResult(text=cached, engine="cache", chars=chars, cached=True)

        # 2) choose an engine
        engine: BaseTranslator
        if await self.can_use_tencent(chars):
            engine = self._primary()
        else:
            fb = self._fallback()
            if fb is None:
                # No local engine: return the original text with a marker.
                return TranslationResult(
                    text=text + "\n\n[本条未翻译:配额耗尽且未启用本地翻译]",
                    engine="skip",
                    chars=chars,
                )
            engine = fb
            logger.info("fallback to local translator for %d chars", chars)

        # 3) call the engine, degrading to the fallback on failure
        async with self._sem:
            try:
                res = await engine.translate(text, source=source, target=target)
            except Exception as e:
                logger.exception("translate failed with %s: %s", engine.name, e)
                fb = self._fallback()
                if fb is not None and engine is not fb:
                    res = await fb.translate(text, source=source, target=target)
                else:
                    res = TranslationResult(
                        text=text + f"\n\n[翻译失败: {e}]",
                        engine="skip",
                        chars=chars,
                    )

        # 4) cache real translations only; caching a failure placeholder would
        #    pin the failure for 30 days.
        if res.engine != "skip":
            try:
                await r.set(ck, res.text, ex=60 * 60 * 24 * 30)  # 30 days
            except Exception as e:
                # Best effort: a cache outage must not fail the translation.
                logger.warning("translation cache write failed: %s", e)

        # 5) usage is counted for Tencent only
        if res.engine == "tencent":
            try:
                await self.add_usage(res.chars or chars)
            except Exception as e:
                logger.warning("add_usage failed: %s", e)
        return res
# Module-level singleton, created when this module is imported.
service = TranslationService()
# 让后端 worker 直接调
class _Protocol(Protocol):
async def translate(self, text: str, source: str = "auto", target: str = "zh") -> TranslationResult: ...

View File

@@ -0,0 +1,74 @@
"""腾讯云文本翻译 TMT。"""
from __future__ import annotations
import asyncio
import logging
import random
from typing import Any
from tencentcloud.common import credential
from tencentcloud.common.exception.tencent_cloud_sdk_exception import (
TencentCloudSDKException,
)
from tencentcloud.tmt.v20180321 import models, tmt_client
from app.config import settings
from app.services.translation.base import BaseTranslator, TranslationResult
logger = logging.getLogger("news.translate.tencent")
# 常见语种映射
_LANG_MAP = {
"en": "en",
"zh": "zh",
"ja": "ja",
"ko": "ko",
"fr": "fr",
"de": "de",
"es": "es",
"ru": "ru",
"ar": "ar",
}
class TencentTranslator(BaseTranslator):
    """Translator backed by the Tencent Cloud TMT ``TextTranslate`` API."""

    name = "tencent"

    def __init__(self):
        if not settings.tencentcloud_secret_id or not settings.tencentcloud_secret_key:
            raise RuntimeError("Tencent Cloud credentials missing")
        self.cred = credential.Credential(
            settings.tencentcloud_secret_id, settings.tencentcloud_secret_key
        )
        self.client = tmt_client.TmtClient(self.cred, settings.tencentcloud_region)

    @staticmethod
    def _normalize_lang(code: str | None) -> str:
        """Reduce a language tag to the short code sent to the API.

        Feeds often declare tags such as ``en-us`` or ``en_GB``; those are
        reduced to their primary subtag. Empty input and ``auto`` yield
        ``auto``; traditional-Chinese tags yield ``zh-TW``. Codes missing from
        ``_LANG_MAP`` are passed through as their primary subtag.
        """
        tag = (code or "").strip().replace("_", "-").lower()
        if not tag or tag == "auto":
            return "auto"
        if tag in ("zh-tw", "zh-hk", "zh-hant"):
            return "zh-TW"
        primary = tag.split("-", 1)[0]
        return _LANG_MAP.get(primary, primary)

    async def translate(
        self, text: str, source: str = "auto", target: str = "zh"
    ) -> TranslationResult:
        """Translate *text*, retrying once after a short random delay.

        Blank input is returned unchanged with ``chars=0``.

        Raises:
            TencentCloudSDKException: when the second attempt also fails.
        """
        if not text.strip():
            return TranslationResult(text=text, engine=self.name, chars=0)

        req = models.TextTranslateRequest()
        req.SourceText = text
        req.Source = self._normalize_lang(source)
        req.Target = self._normalize_lang(target)
        req.ProjectId = 0

        for attempt in range(2):
            try:
                # The SDK call is blocking, so run it in a worker thread.
                resp: Any = await asyncio.to_thread(self.client.TextTranslate, req)
            except TencentCloudSDKException as e:
                logger.warning("tencent translate attempt %s failed: %s", attempt, e)
                if attempt == 0:
                    await asyncio.sleep(0.5 + random.random())
                    continue
                raise
            out = getattr(resp, "TargetText", "") or ""
            return TranslationResult(
                text=out, engine=self.name, chars=len(text), cached=False
            )
        raise RuntimeError("unreachable")

View File

@@ -0,0 +1 @@
"""Background workers (fetch + translate + scheduler)."""

View File

@@ -0,0 +1,112 @@
"""Worker 入口:启动调度器 + 异步任务。
`docker compose exec worker python -m app.workers`
"""
from __future__ import annotations
import asyncio
import logging
import signal
from datetime import datetime, timezone
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger
from apscheduler.triggers.interval import IntervalTrigger
from sqlalchemy import select
from app.config import settings
from app.database import AsyncSessionLocal
from app.models.source import Source
from app.workers.pipeline import fetch_one_source, run_once
# Logger used by the worker entry point.
logger = logging.getLogger("news.worker")
# Root logging is configured at import time from settings.log_level.
logging.basicConfig(
    level=settings.log_level,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
async def _rebuild_jobs(scheduler: AsyncIOScheduler) -> None:
    """Rebuild the per-source fetch jobs from the enabled rows of the sources table.

    Only jobs whose id starts with ``src:`` are replaced. Other jobs on the
    same scheduler (for example the periodic job that calls this function)
    are left alone; removing them would stop future rebuilds.
    """
    # Read the database first, so a failed query leaves the schedule untouched.
    async with AsyncSessionLocal() as s:
        rows = (await s.execute(select(Source).where(Source.enabled.is_(True)))).scalars()
        sources = list(rows)

    for job in scheduler.get_jobs():
        if job.id.startswith("src:"):
            scheduler.remove_job(job.id)

    if not sources:
        logger.warning("no enabled sources; scheduler idle")
        return

    for src in sources:
        try:
            trigger = (
                CronTrigger.from_crontab(src.fetch_cron)
                if src.fetch_cron
                else IntervalTrigger(minutes=src.fetch_interval_min)
            )
        except (ValueError, TypeError) as exc:
            # One malformed schedule must not prevent the other sources from running.
            logger.error("invalid schedule for source %s: %s; source skipped", src.slug, exc)
            continue
        scheduler.add_job(
            fetch_one_source,
            trigger=trigger,
            args=[src.id],
            id=f"src:{src.slug}",
            replace_existing=True,
            max_instances=1,
            coalesce=True,
            misfire_grace_time=300,
        )
        logger.info("scheduled %s every %s", src.slug, src.fetch_cron or f"{src.fetch_interval_min}m")
async def _daily_rebuild() -> None:
"""每天 00:30 重建 job 列表(支持运行时新增源)。"""
scheduler = AsyncIOScheduler()
# 临时实例,只为重建用
# 实际用全局 scheduler 实例
pass
def build_scheduler() -> AsyncIOScheduler:
    """Create the asyncio scheduler, configured for the Asia/Hong_Kong timezone."""
    return AsyncIOScheduler(timezone="Asia/Hong_Kong")
async def main() -> None:
    """Start the scheduler, run one full fetch immediately, then wait for SIGINT/SIGTERM."""
    scheduler = build_scheduler()
    await _rebuild_jobs(scheduler)

    # Rebuild the per-source jobs every day at 00:30.
    scheduler.add_job(
        _rebuild_jobs,
        trigger=CronTrigger(hour=0, minute=30),
        args=[scheduler],
        id="rebuild_jobs",
        replace_existing=True,
    )

    # Run the full pipeline exactly once, right after start-up. A one-shot
    # date trigger is used because APScheduler turns a zero-length interval
    # into a 1-second interval, which would re-run this job every second.
    scheduler.add_job(
        run_once,
        trigger="date",
        run_date=datetime.now(timezone.utc),
        id="startup_run",
        replace_existing=True,
        misfire_grace_time=300,  # tolerate a slow scheduler start
    )

    scheduler.start()
    logger.info("scheduler started with %d jobs", len(scheduler.get_jobs()))

    stop = asyncio.Event()

    def _signal_handler():
        logger.info("shutdown signal received")
        stop.set()

    loop = asyncio.get_running_loop()
    for sig in (signal.SIGINT, signal.SIGTERM):
        try:
            loop.add_signal_handler(sig, _signal_handler)
        except NotImplementedError:
            # Not supported on some platforms (e.g. Windows).
            pass

    await stop.wait()
    logger.info("stopping scheduler")
    scheduler.shutdown(wait=False)
# Script entry point: run the worker's main coroutine until it returns.
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,274 @@
"""核心 pipeline:
- 抓取(去重 + 入库)
- 翻译(分块 + 配额管理)
- 手动 run_once / fetch_one_source / translate_article
"""
from __future__ import annotations
import asyncio
import logging
from datetime import datetime, timezone
from sqlalchemy import select
from sqlalchemy.dialects.postgresql import insert as pg_insert
from app.config import settings
from app.database import AsyncSessionLocal
from app.models.article import Article
from app.models.source import Source, SourceKind
from app.services.fetchers import get_fetcher
from app.services.fetchers.base import FetchedItem, url_hash
from app.services.translation.service import service as translation_service
logger = logging.getLogger("news.pipeline")
TRANSLATE_BODY_MAX = 8000  # maximum number of body characters translated per article
# NOTE(review): this is a single module-level semaphore, so it caps the number
# of concurrent fetch_one_source runs across ALL sources, not per source.
SEM_PER_SOURCE = asyncio.Semaphore(2)
# === 抓取 + 入库 ===
async def fetch_one_source(source_id: int) -> None:
    """Fetch one source, store its new articles, then translate its pending ones.

    A missing or disabled source is only logged. A fetch error is logged and
    recorded on the source via ``_mark_failure``.
    """
    async with SEM_PER_SOURCE:
        lookup = select(Source).where(Source.id == source_id)
        async with AsyncSessionLocal() as session:
            src = (await session.execute(lookup)).scalar_one_or_none()

        if not src or not src.enabled:
            logger.info("source %s disabled or missing", source_id)
            return

        try:
            fetcher = get_fetcher(src.kind.value, url=src.url, headers=src.headers_json)
            items = await fetcher.fetch()
        except Exception as e:
            logger.exception("fetch failed for %s: %s", src.slug, e)
            await _mark_failure(source_id, f"fetch: {type(e).__name__}: {e}")
            return

        if not items:
            await _mark_success(source_id, n_new=0)
            return

        inserted = await _bulk_insert(src, items)
        await _mark_success(source_id, n_new=inserted)
        logger.info("source %s: %d new articles", src.slug, inserted)

        # After storing, start translating this source's pending articles.
        await _translate_recent_for_source(source_id, max_n=20)
async def _mark_failure(source_id: int, status: str) -> None:
    """Record a failed fetch on the source and back off once failures pile up.

    When the consecutive-failure count reaches
    ``settings.fetch_fail_pause_threshold`` the fetch interval is doubled,
    capped at 720 minutes.
    """
    lookup = select(Source).where(Source.id == source_id)
    async with AsyncSessionLocal() as session:
        source = (await session.execute(lookup)).scalar_one_or_none()
        if not source:
            return

        source.last_status = status
        source.consecutive_failures += 1
        source.last_fetched_at = datetime.now(timezone.utc)

        threshold_reached = (
            source.consecutive_failures >= settings.fetch_fail_pause_threshold
        )
        if threshold_reached:
            doubled = source.fetch_interval_min * 2
            source.fetch_interval_min = min(720, doubled)
            logger.warning(
                "source %s paused, interval bumped to %dm",
                source.slug,
                source.fetch_interval_min,
            )

        await session.commit()
async def _mark_success(source_id: int, n_new: int) -> None:
    """Record a successful fetch: status text, reset failure count, fetch timestamp."""
    lookup = select(Source).where(Source.id == source_id)
    async with AsyncSessionLocal() as session:
        source = (await session.execute(lookup)).scalar_one_or_none()
        if not source:
            return

        source.last_status = f"ok:new={n_new}"
        source.consecutive_failures = 0
        source.last_fetched_at = datetime.now(timezone.utc)
        await session.commit()
async def _bulk_insert(src: Source, items: list[FetchedItem]) -> int:
    """Insert *items* as articles of *src*, skipping rows whose ``url_hash`` already exists.

    Uses PostgreSQL ``ON CONFLICT DO NOTHING`` and returns the number of rows
    actually inserted. Items without a title or URL are ignored.
    """
    if not items:
        return 0

    rows = [
        {
            "source_id": src.id,
            "url": item.url,
            "url_hash": url_hash(item.url),
            "guid": item.guid,
            "title": item.title[:512],
            "body_html": (item.body_html or "")[:65535],
            "body_text": (item.body_text or "")[:65535],
            "lang_src": item.lang or src.language_src,
            "author": item.author,
            "image_url": item.image_url,
            "published_at": item.published_at,
            "translation_status": "pending",
            "translate_to": src.translate_to,
        }
        for item in items
        if item.title and item.url
    ]
    if not rows:
        return 0

    insert_stmt = pg_insert(Article).values(rows)
    insert_stmt = insert_stmt.on_conflict_do_nothing(index_elements=["url_hash"])
    insert_stmt = insert_stmt.returning(Article.id)

    async with AsyncSessionLocal() as session:
        outcome = await session.execute(insert_stmt)
        # One returned row per article that was actually inserted.
        new_rows = outcome.all()
        await session.commit()
    return len(new_rows)
# === 翻译 ===
async def _translate_recent_for_source(source_id: int, max_n: int = 20) -> None:
    """Translate up to *max_n* pending articles of a source, newest first, one at a time."""
    pending_query = (
        select(Article)
        .where(Article.source_id == source_id, Article.translation_status == "pending")
        .order_by(Article.published_at.desc().nullslast(), Article.id.desc())
        .limit(max_n)
    )
    async with AsyncSessionLocal() as session:
        found = await session.execute(pending_query)
        pending_ids = [article.id for article in found.scalars()]

    for pending_id in pending_ids:
        await translate_article(pending_id)
async def translate_article(article_id: int) -> None:
    """Translate the title and body of one article and store the result.

    Only articles whose status is ``pending`` or ``failed`` are processed.
    The body is truncated to ``TRANSLATE_BODY_MAX`` characters and translated
    chunk by chunk. The stored engine label and status reflect what actually
    happened: placeholder results (engine ``skip``) downgrade the status to
    ``partial``, or to ``failed`` when nothing at all was translated.
    """
    async with AsyncSessionLocal() as session:
        art = (
            await session.execute(select(Article).where(Article.id == article_id))
        ).scalar_one_or_none()
        if not art:
            return
        if art.translation_status not in ("pending", "failed"):
            return
        title = art.title
        body_text = (art.body_text or "")[:TRANSLATE_BODY_MAX]
        lang_src = art.lang_src or "auto"
    target = "zh"

    if not body_text and not title:
        return

    results = []
    translated_chunks: list[str] = []
    try:
        tr_title = await translation_service.translate(title, source=lang_src, target=target)
        results.append(tr_title)
        # Split the body on paragraph boundaries, translate, then re-join.
        chunks = _chunk_text(body_text, max_chars=settings.tencent_tmt_max_chars_per_req)
        for ch in chunks:
            tr = await translation_service.translate(ch, source=lang_src, target=target)
            results.append(tr)
            translated_chunks.append(tr.text)
    except Exception as e:
        logger.exception("translate article %s failed: %s", article_id, e)
        async with AsyncSessionLocal() as session:
            art = (
                await session.execute(select(Article).where(Article.id == article_id))
            ).scalar_one_or_none()
            if art:
                art.translation_status = "failed"
                await session.commit()
        return

    tr_body = "\n\n".join(translated_chunks)
    total_chars = sum(r.chars for r in results)

    # Derive the engine label from the engines that actually ran.
    engines = [r.engine for r in results]
    real_engines = [name for name in engines if name not in ("cache", "skip")]
    if real_engines:
        engine_label = real_engines[0]
    elif "cache" in engines:
        engine_label = "cache"
    else:
        engine_label = "skip"

    # A non-empty "skip" result is untranslated source text plus a marker.
    attempted = [r for r in results if r.text]
    skipped = [r for r in attempted if r.engine == "skip"]
    if attempted and len(skipped) == len(attempted):
        status = "failed"
    elif skipped:
        status = "partial"
    else:
        status = "ok" if (tr_title.text and tr_body) else "partial"

    # Write the result back.
    async with AsyncSessionLocal() as session:
        art = (
            await session.execute(select(Article).where(Article.id == article_id))
        ).scalar_one_or_none()
        if art:
            art.title_zh = tr_title.text if tr_title.text else None
            art.body_zh_text = tr_body or None
            art.body_zh_html = _wrap_html(tr_body) if tr_body else None
            art.translation_status = status
            art.translation_engine = engine_label
            art.translation_chars = total_chars
            art.translated_at = datetime.now(timezone.utc)
            await session.commit()
    logger.info("article %s translated: %d chars, %s", article_id, total_chars, engine_label)
def _chunk_text(text: str, max_chars: int) -> list[str]:
if not text:
return []
paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
chunks: list[str] = []
cur = ""
for p in paragraphs:
if len(p) > max_chars:
# 单段过长:按句号切
sentences = _split_long_para(p, max_chars)
for s in sentences:
if len(cur) + len(s) + 2 > max_chars:
if cur:
chunks.append(cur)
cur = s
else:
cur = (cur + "\n\n" + s).strip() if cur else s
else:
if len(cur) + len(p) + 2 > max_chars:
if cur:
chunks.append(cur)
cur = p
else:
cur = (cur + "\n\n" + p).strip() if cur else p
if cur:
chunks.append(cur)
return chunks
def _split_long_para(para: str, max_chars: int) -> list[str]:
parts: list[str] = []
cur = ""
for ch in para:
cur += ch
if ch in ".!?。!?" and len(cur) >= max_chars // 2:
parts.append(cur.strip())
cur = ""
if cur.strip():
parts.append(cur.strip())
if not parts:
return [para[:max_chars]]
return parts
def _wrap_html(text: str) -> str:
"""把译文包成 HTML 段落。"""
from bs4 import BeautifulSoup
parts = [f"<p>{p.strip()}</p>" for p in text.split("\n\n") if p.strip()]
return "\n".join(parts) if parts else ""
# === 全量跑(供测试 / 手动触发) ===
async def run_once() -> None:
    """Fetch every enabled source once, concurrently (for tests and manual triggering).

    A failure in one source does not cancel the others; each failure is logged.
    """
    async with AsyncSessionLocal() as session:
        rows = (await session.execute(select(Source).where(Source.enabled.is_(True)))).scalars()
        targets = [(s.id, s.slug) for s in rows]
    logger.info("run_once: %d enabled sources", len(targets))

    tasks = [fetch_one_source(source_id) for source_id, _ in targets]
    outcomes = await asyncio.gather(*tasks, return_exceptions=True)

    # return_exceptions=True hands exceptions back as values; without this
    # loop they would disappear without a trace.
    for (_, slug), outcome in zip(targets, outcomes):
        if isinstance(outcome, BaseException):
            logger.error("run_once: source %s failed: %r", slug, outcome, exc_info=outcome)

72
backend/pyproject.toml Normal file
View File

@@ -0,0 +1,72 @@
[project]
name = "news-aggregator"
version = "0.1.0"
description = "Private news aggregator with multi-source RSS, translation, web + Android clients"
requires-python = ">=3.12"
dependencies = [
# web
"fastapi>=0.115.0",
"uvicorn[standard]>=0.32.0",
"pydantic>=2.9.0",
"pydantic-settings>=2.6.0",
"python-multipart>=0.0.12",
# db
"sqlalchemy[asyncio]>=2.0.36",
"asyncpg>=0.30.0",
"alembic>=1.14.0",
"psycopg2-binary>=2.9.10", # alembic sync driver
# cache / queue
"redis>=5.2.0",
# auth
"passlib[bcrypt]>=1.7.4",
"bcrypt==4.0.1", # 锁版本,passlib 与新版 bcrypt 不兼容
"pyjwt>=2.10.0",
# fetch / parse
"feedparser>=6.0.11",
"httpx>=0.28.0",
"trafilatura>=2.0.0",
"beautifulsoup4>=4.12.3",
"lxml>=5.3.0",
"python-dateutil>=2.9.0",
# translation
"tencentcloud-sdk-python>=3.0.1200",
# scheduling
"apscheduler>=3.10.4",
# observability
"structlog>=24.4.0",
"orjson>=3.10.10",
# util
"pydantic-extra-types>=2.10.0",
"email-validator>=2.2.0",
"python-slugify>=8.0.4",
]
[project.optional-dependencies]
dev = [
"pytest>=8.3.0",
"pytest-asyncio>=0.24.0",
"ruff>=0.7.0",
"mypy>=1.13.0",
]
[tool.ruff]
line-length = 110
target-version = "py312"
[tool.ruff.lint]
select = ["E", "F", "I", "B", "UP", "W"]
ignore = ["E501"]
[tool.mypy]
python_version = "3.12"
ignore_missing_imports = true
strict_optional = true
warn_unused_ignores = true
[build-system]
requires = ["setuptools>=68"]
build-backend = "setuptools.build_meta"
[tool.setuptools.packages.find]
where = ["."]
include = ["app*"]