commit 60b062daf25fa922a26e26ead0bf4f3a0892007c Author: Mavis Date: Sun Jun 7 21:51:01 2026 +0800 feat: initial MVP - FastAPI backend + Vue3 frontend + docker-compose - backend: FastAPI + SQLAlchemy 2.0(async) + asyncpg + Alembic - 7 API routes: auth/me/articles/sources/bookmarks/subscriptions/admin - models: User/Source/Article/Bookmark/Subscription/ApiToken - services: RSS fetcher (feedparser) + Tencent TMT translator with quota + cache + local NLLB fallback - workers: APScheduler + asyncio pipeline (fetch -> dedupe -> insert -> translate) - seed scripts: create_user, seed_sources (5 RSS: Reuters/BBC/Al Jazeera/NHK/DW) - frontend: Vue 3 + Vite + Naive UI + Pinia + vue-router - pages: Login, Feed (24h), ArticleDetail, Sources, Bookmarks, AdminSources - deploy: docker-compose (postgres/redis/api/worker/frontend/caddy) - docs: README, DEPLOY, architecture, acceptance diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..56bb65e --- /dev/null +++ b/.env.example @@ -0,0 +1,58 @@ +# ===== 通用 ===== +TZ=Asia/Hong_Kong +LOG_LEVEL=INFO + +# ===== 数据库 ===== +POSTGRES_USER=news +POSTGRES_PASSWORD=change_me_strong_password +POSTGRES_DB=news +POSTGRES_HOST=postgres +POSTGRES_PORT=5432 + +# ===== Redis ===== +REDIS_HOST=redis +REDIS_PORT=6379 +REDIS_PASSWORD=change_me_redis_password +REDIS_DB=0 + +# ===== JWT ===== +# openssl rand -hex 64 生成 +JWT_SECRET=change_me_to_a_64byte_random_hex +JWT_ALGORITHM=HS256 +ACCESS_TOKEN_TTL_MIN=60 +REFRESH_TOKEN_TTL_DAY=14 + +# ===== 腾讯云 TMT 翻译 ===== +# https://console.cloud.tencent.com/cam/capi 申请 +TENCENTCLOUD_SECRET_ID=your_tencent_secret_id +TENCENTCLOUD_SECRET_KEY=your_tencent_secret_key +TENCENTCLOUD_REGION=ap-hongkong +TENCENT_TMT_ENDPOINT=tmt.tencentcloudapi.com +# 字符配额(月度,5000000 = 500 万) +TENCENT_TMT_QUOTA_MONTH=5000000 +# 缓冲比例(0.05 = 95% 触发后切本地) +TENCENT_TMT_QUOTA_BUFFER=0.05 +# 单次请求最大字符 +TENCENT_TMT_MAX_CHARS_PER_REQ=4500 + +# ===== 本地翻译(降级) ===== +# 不启用就留空:不会用本地模型 +LOCAL_TRANSLATE_ENABLED=false 
+LOCAL_TRANSLATE_MODEL=nllb-200-distilled-600M +LOCAL_TRANSLATE_DEVICE=cpu + +# ===== 抓取 ===== +# 全局 QPS 上限 +FETCH_GLOBAL_QPS=4 +# 单源超时(秒) +FETCH_TIMEOUT=20 +# 单源失败连续次数后暂停 +FETCH_FAIL_PAUSE_THRESHOLD=3 +# 单源 fetch 最大重试次数 +FETCH_MAX_RETRIES=2 + +# ===== Caddy / 域名 ===== +# 留空走 IP 自签证书;有域名走自动 HTTPS +DOMAIN= +# 邮箱(Let's Encrypt 用) +ACME_EMAIL=you@example.com diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6f77b8f --- /dev/null +++ b/.gitignore @@ -0,0 +1,54 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +.venv/ +venv/ +env/ +ENV/ +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.coverage +htmlcov/ +*.egg-info/ +dist/ +build/ + +# Node +node_modules/ +.npm +.pnpm-store/ +*.log +.npmrc.local + +# 编辑器 +.idea/ +.vscode/ +*.swp +.DS_Store +Thumbs.db + +# 项目 +.env +.env.local +*.sqlite +*.sqlite3 +data/ +backups/ +logs/ +*.log +alembic/versions/__pycache__/ + +# 打包产物 / 临时 +*.zip +*.tar.gz +*.7z +*.bak + +# 敏感 +secrets/ +*.pem +*.key diff --git a/Caddyfile b/Caddyfile new file mode 100644 index 0000000..7d3c410 --- /dev/null +++ b/Caddyfile @@ -0,0 +1,32 @@ +{ + # 全局选项 + auto_https off + admin off + log { + level info + output stdout + } +} + +# 如果 DOMAIN 为空,直接走 80 端口 HTTP(自签证书后面再补) +# 如果有域名,改用下面的 https 配置块 + +http://{$DOMAIN:NEWS_DOMAIN_FALLBACK} { + reverse_proxy /api/* api:8000 + reverse_proxy /* frontend:80 + + encode gzip zstd + + # 日志 + log { + output stdout + format console + } +} + +# 如果有域名,启用自动 HTTPS(取消下面注释,并把上面块注释) +# {$DOMAIN} { +# reverse_proxy /api/* api:8000 +# reverse_proxy /* frontend:80 +# encode gzip zstd +# } diff --git a/DEPLOY.md b/DEPLOY.md new file mode 100644 index 0000000..13b1a41 --- /dev/null +++ b/DEPLOY.md @@ -0,0 +1,165 @@ +# 部署指南 · DEPLOY + +目标:从一台全新的 Ubuntu 24 香港 VPS,到能访问的私人新闻系统。 + +## 0. 准备 + +- 香港 VPS(最低 2C2G 30G) +- 域名(可选,没域名走 IP + 自签证书) +- 腾讯云账号 + 已开通「文本翻译 TMT」 + +## 1. 
服务器初始化 + +```bash +# SSH 登录 +ssh root@YOUR_SERVER_IP + +# 创建非 root 用户 +adduser news +usermod -aG sudo news + +# 基础包 +apt update && apt -y upgrade +apt -y install curl git ufw fail2ban + +# 防火墙 +ufw allow OpenSSH +ufw allow 80/tcp +ufw allow 443/tcp +ufw enable + +# Docker +curl -fsSL https://get.docker.com | sh +usermod -aG docker news + +# 退出 root,切换到 news +exit +ssh news@YOUR_SERVER_IP +``` + +## 2. 拉代码 + +```bash +sudo mkdir -p /srv/news +sudo chown news:news /srv/news +cd /srv/news +git clone <你的仓库地址> . +# 或者 scp 上传 +``` + +## 3. 配置环境变量 + +```bash +cp .env.example .env +nano .env +``` + +**必填字段:** + +| 字段 | 怎么填 | +| --- | --- | +| `POSTGRES_PASSWORD` | `openssl rand -hex 24` | +| `REDIS_PASSWORD` | `openssl rand -hex 24` | +| `JWT_SECRET` | `openssl rand -hex 64` | +| `TENCENTCLOUD_SECRET_ID` | 腾讯云控制台 → 访问管理 → API 密钥 | +| `TENCENTCLOUD_SECRET_KEY` | 同上 | +| `TENCENTCLOUD_REGION` | `ap-hongkong` | +| `DOMAIN` | 域名(可选,留空走 IP) | + +## 4. 启动 + +```bash +docker compose up -d --build +# 等 30 秒 +docker compose ps +# 全部 healthy 即可 +``` + +## 5. 初始化 + +```bash +# 5.1 数据库迁移 +docker compose exec api alembic upgrade head + +# 5.2 创建 owner 账号 +docker compose exec api python -m app.scripts.create_user \ + --username owner --password YOUR_STRONG_PASS + +# 5.3 导入 5 个种子源 +docker compose exec api python -m app.scripts.seed_sources + +# 5.4 手动触发一次抓取(看效果) +docker compose exec worker python -c "import asyncio; from app.workers.pipeline import run_once; asyncio.run(run_once())" + +# 等 1~3 分钟,刷一下 +docker compose exec postgres psql -U $POSTGRES_USER -d $POSTGRES_DB -c "SELECT count(*) FROM articles;" +``` + +## 6. 验证清单 + +- [ ] 浏览器打开 `http://YOUR_IP/` 看到登录页 +- [ ] 用 owner 登录成功 +- [ ] Feed 列表显示 24h 内新闻(标题中英对照) +- [ ] 详情页原文+译文并列 +- [ ] `/admin/sources` 能看到 5 个源 +- [ ] 翻译配额仪表盘显示已用字符 +- [ ] 等到凌晨,worker 自动跑批,文章数持续增长 + +## 7. 域名 + HTTPS(可选) + +1. 域名 A 记录指向服务器 IP +2. 编辑 `.env` 填 `DOMAIN=news.example.com` + `ACME_EMAIL=you@example.com` +3. 
编辑 `Caddyfile`,把 `http://{$DOMAIN}` 改成 `{$DOMAIN}`(取消注释下面块),并删掉全局选项里的 `auto_https off`(否则 Caddy 不会申请证书) +4. `docker compose up -d caddy`(`restart` 不会重新读取 `.env` 里改过的变量) +5. Caddy 自动申请 Let's Encrypt 证书 + +## 8. 备份 + +```bash +# 每天凌晨 4 点备份到本地 +mkdir -p /srv/news/scripts +cat > /srv/news/scripts/backup.sh <<'EOF' +#!/bin/bash +# Fail on errors, unset variables, and a failing pg_dump inside the pipeline. +set -euo pipefail +# cron does not start in the project directory; docker compose needs it to find the compose file. +cd /srv/news +# cron has no POSTGRES_* variables; export them from .env. +set -a; . ./.env; set +a +BACKUP_DIR=/srv/news/backups/$(date +%Y%m%d) +mkdir -p "$BACKUP_DIR" +docker compose exec -T postgres pg_dump -U "$POSTGRES_USER" "$POSTGRES_DB" | gzip > "$BACKUP_DIR/db.sql.gz" +# 保留 7 天 +find /srv/news/backups -mindepth 1 -maxdepth 1 -type d -mtime +7 -exec rm -rf {} + +EOF +chmod +x /srv/news/scripts/backup.sh + +# 加 cron +crontab -e +# 添加一行: +# 0 4 * * * /srv/news/scripts/backup.sh +``` + +**强烈建议**:把 `/srv/news/backups/` 同步到腾讯云 COS / 阿里云 OSS,做异地灾备。 + +## 9. 升级 + +```bash +cd /srv/news +git pull +docker compose pull +docker compose up -d --build +docker compose exec api alembic upgrade head +``` + +## 10. 常见问题 + +**Q: 某个源一直 fail?** +A: 看 `docker compose logs worker | grep <源的 slug>`,90% 是 RSS URL 失效或者被反爬。在 `sources` 表里 `enabled=false` 暂停。 + +**Q: 翻译字符超 500 万?** +A: 配 `.env` 的 `TENCENT_TMT_QUOTA_BUFFER=0.05`,系统在 475 万字符后自动切本地 NLLB(需启用 `LOCAL_TRANSLATE_ENABLED=true`)。 +未启用本地翻译时,系统会在原文末尾标 `[本条未翻译]`。 + +**Q: 30G 硬盘快满了?** +A: 执行冷热分层 cron: +```sql +DELETE FROM articles WHERE published_at < now() - interval '90 day' AND duplicate_of IS NULL; +``` + +**Q: 怎么加新源?** +A: 网页登录 owner → `/admin/sources` → 新增。填 name / kind=rss / url=RSS 链接 / 优先级 / 抓取频率。保存后 worker 下个轮询周期自动拉。 diff --git a/README.md b/README.md new file mode 100644 index 0000000..64912c3 --- /dev/null +++ b/README.md @@ -0,0 +1,75 @@ +# Diary News · 私人新闻汇总系统 + +> 抓取境外权威源 → 自动翻译 → 网页 + Android 双端展示。 +> 跑在一台 30G 香港 VPS 上,自用 + 家人/小圈子。 + +完整方案见 [`docs/architecture.md`](./docs/architecture.md),部署步骤见 [`DEPLOY.md`](./DEPLOY.md)。 + +## 仓库结构 + +``` +diary-news/ +├── backend/ # FastAPI 后端 + worker + scheduler +│ ├── app/ +│ │ ├── api/ # 路由 +│ │ ├── core/ # 安全 / 依赖 +│ │ ├── models/ # SQLAlchemy 模型 +│ │ ├── schemas/ # Pydantic schemas +│ │ ├── services/ # 采集 / 翻译 +│ │ ├── workers/ # 抓取 / 翻译 pipeline + APScheduler +│ │ ├── scripts/ # 
初始化脚本 +│ │ ├── config.py # Pydantic Settings +│ │ ├── database.py # 异步 SQLAlchemy +│ │ └── main.py # FastAPI 入口 +│ ├── alembic/ # 迁移 +│ ├── Dockerfile +│ └── pyproject.toml +├── frontend/ # Vue 3 + Vite + Naive UI +├── docs/ +│ └── architecture.md +├── Caddyfile # 反代 +├── docker-compose.yml +├── .env.example +├── DEPLOY.md +└── README.md +``` + +## 快速开始(本地开发) + +```bash +# 1. 准备环境 +cp .env.example .env +# 编辑 .env 填入密钥 + +# 2. 启动 +docker compose up -d + +# 3. 初始化数据库 + 创建 owner 账号 + 导入 5 个种子源 +docker compose exec api alembic upgrade head +docker compose exec api python -m app.scripts.create_user --username owner --password YOUR_PASS +docker compose exec api python -m app.scripts.seed_sources + +# 4. 触发一次抓取(看效果) +docker compose exec api python -c "import asyncio; from app.workers.pipeline import run_once; asyncio.run(run_once())" + +# 5. 打开 +# http://localhost/ +``` + +## 设计原则 + +- **轻量**:单机 30G 能跑,不堆重型服务 +- **可控**:源管理 / 翻译配额 / 抓取调度全部可视化 +- **可扩展**:ML 字段已建好(分类/点评/实体),不需改表 +- **不反爬对抗**:愿意被 ban IP 就 ban,优先合规 + +## 当前阶段 + +**Phase 1 · MVP(本仓库)** +- ✅ 5 个权威 RSS 源采集(Reuters/BBC/Al Jazeera/NHK/DW) +- ✅ 腾讯云 TMT 翻译 + 字符配额监控 + 降级 +- ✅ 网页:登录 / 24h 列表 / 详情 / 源管理 +- ✅ 凌晨分波次调度 +- ⏳ Android(Phase 3) +- ⏳ 智能分类/点评(Phase 4) +- ⏳ PWA 离线 / 推送(Phase 2) diff --git a/backend/.dockerignore b/backend/.dockerignore new file mode 100644 index 0000000..08c853f --- /dev/null +++ b/backend/.dockerignore @@ -0,0 +1,11 @@ +__pycache__ +*.pyc +.pytest_cache +.mypy_cache +.ruff_cache +.venv +venv +.env +*.egg-info +build +dist diff --git a/backend/Dockerfile b/backend/Dockerfile new file mode 100644 index 0000000..b7c113d --- /dev/null +++ b/backend/Dockerfile @@ -0,0 +1,36 @@ +FROM python:3.12-slim + +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + TZ=Asia/Hong_Kong + +# 系统依赖 +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + libpq-dev \ + curl \ + ca-certificates \ + tzdata \ + && ln -sf 
/usr/share/zoneinfo/Asia/Hong_Kong /etc/localtime \ + && echo "Asia/Hong_Kong" > /etc/timezone \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# 先装依赖(利用 Docker 缓存) +COPY pyproject.toml ./ +RUN pip install --upgrade pip && \ + pip install -e . + +# 代码(开发期用 volume 覆盖,这里也保留一份) +COPY app ./app +COPY alembic ./alembic +COPY alembic.ini ./ + +EXPOSE 8000 + +# 默认启动 uvicorn;docker-compose 中 worker 容器会用别的 command +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/backend/alembic.ini b/backend/alembic.ini new file mode 100644 index 0000000..9d7c7de --- /dev/null +++ b/backend/alembic.ini @@ -0,0 +1,42 @@ +[alembic] +script_location = alembic +prepend_sys_path = . +version_path_separator = os +# sqlalchemy.url 从 env.py 注入,这里留空 +sqlalchemy.url = + +[post_write_hooks] + +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/backend/alembic/env.py b/backend/alembic/env.py new file mode 100644 index 0000000..2502686 --- /dev/null +++ b/backend/alembic/env.py @@ -0,0 +1,59 @@ +"""Alembic 环境配置:从 app.config 读取 URL,启用 autogenerate。""" +from __future__ import annotations + +import sys +from logging.config import fileConfig +from pathlib import Path + +from alembic import context +from sqlalchemy import engine_from_config, pool + +# 让 alembic 能 import app +sys.path.insert(0, str(Path(__file__).resolve().parents[1])) + +from app.config import settings # noqa: E402 +from app.database import Base # noqa: E402 +from app.models import * # noqa: F401, F403, E402 + +config 
= context.config +if config.config_file_name is not None: + fileConfig(config.config_file_name) + +config.set_main_option("sqlalchemy.url", settings.sync_database_url) + +target_metadata = Base.metadata + + +def run_migrations_offline() -> None: + url = config.get_main_option("sqlalchemy.url") + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + compare_type=True, + ) + with context.begin_transaction(): + context.run_migrations() + + +def run_migrations_online() -> None: + connectable = engine_from_config( + config.get_section(config.config_ini_section, {}), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + with connectable.connect() as connection: + context.configure( + connection=connection, + target_metadata=target_metadata, + compare_type=True, + ) + with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/backend/alembic/script.py.mako b/backend/alembic/script.py.mako new file mode 100644 index 0000000..590f5b3 --- /dev/null +++ b/backend/alembic/script.py.mako @@ -0,0 +1,24 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +revision: str = ${repr(up_revision)} +down_revision: Union[str, None] = ${repr(down_revision)} +branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} +depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} + + +def upgrade() -> None: + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + ${downgrades if downgrades else "pass"} diff --git a/backend/alembic/versions/0001_initial.py b/backend/alembic/versions/0001_initial.py new file mode 100644 index 0000000..4286e78 --- /dev/null +++ 
b/backend/alembic/versions/0001_initial.py @@ -0,0 +1,180 @@ +"""initial schema + +Revision ID: 0001 +Revises: +Create Date: 2026-06-07 +""" +from __future__ import annotations + +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.dialects import postgresql + +revision: str = "0001" +down_revision: Union[str, None] = None +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # === 用户 === + user_role = postgresql.ENUM("owner", "member", name="user_role", create_type=True) + user_role.create(op.get_bind(), checkfirst=True) + + op.create_table( + "users", + sa.Column("id", sa.Integer, primary_key=True), + sa.Column("username", sa.String(64), unique=True, index=True, nullable=False), + sa.Column("email", sa.String(255), unique=True, index=True), + sa.Column("password_hash", sa.String(255), nullable=False), + sa.Column( + "role", + postgresql.ENUM("owner", "member", name="user_role", create_type=False), + nullable=False, + ), + sa.Column("is_active", sa.Boolean, nullable=False, server_default=sa.text("true")), + sa.Column("display_name", sa.String(128)), + sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False), + sa.Column("last_login_at", sa.DateTime(timezone=True)), + ) + + # === 源 === + source_kind = postgresql.ENUM("rss", "html_list", "tg_channel", name="source_kind", create_type=True) + source_kind.create(op.get_bind(), checkfirst=True) + + op.create_table( + "sources", + sa.Column("id", sa.Integer, primary_key=True), + sa.Column("name", sa.String(128), nullable=False), + sa.Column("slug", sa.String(128), unique=True, index=True, nullable=False), + sa.Column( + "kind", + postgresql.ENUM("rss", "html_list", "tg_channel", name="source_kind", create_type=False), + nullable=False, + ), + sa.Column("url", sa.Text, nullable=False), + sa.Column("detail_selector", postgresql.JSONB), + sa.Column("fetch_interval_min", sa.Integer, nullable=False, 
server_default="60"), + sa.Column("fetch_cron", sa.String(64)), + sa.Column("translate_to", sa.String(8), nullable=False, server_default="zh"), + sa.Column("enabled", sa.Boolean, nullable=False, server_default=sa.text("true")), + sa.Column("region", sa.String(32), index=True), + sa.Column("language_src", sa.String(8)), + sa.Column("priority", sa.Integer, nullable=False, server_default="50", index=True), + sa.Column("headers_json", postgresql.JSONB), + sa.Column("last_fetched_at", sa.DateTime(timezone=True)), + sa.Column("last_status", sa.String(64)), + sa.Column("consecutive_failures", sa.Integer, nullable=False, server_default="0"), + sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.func.now(), + nullable=False, + ), + ) + + # === 文章 === + op.create_table( + "articles", + sa.Column("id", sa.BigInteger, primary_key=True), + sa.Column("source_id", sa.Integer, sa.ForeignKey("sources.id", ondelete="CASCADE"), nullable=False), + sa.Column("url", sa.Text, nullable=False), + sa.Column("url_hash", sa.String(40), unique=True, nullable=False, index=True), + sa.Column("guid", sa.String(255), index=True), + sa.Column("title", sa.Text, nullable=False), + sa.Column("body_html", sa.Text), + sa.Column("body_text", sa.Text, nullable=False, server_default=""), + sa.Column("lang_src", sa.String(8)), + sa.Column("author", sa.String(255)), + sa.Column("image_url", sa.Text), + sa.Column("title_zh", sa.Text), + sa.Column("body_zh_html", sa.Text), + sa.Column("body_zh_text", sa.Text), + sa.Column("summary_zh", sa.Text), + sa.Column("translation_status", sa.String(16), nullable=False, server_default="pending"), + sa.Column("translation_engine", sa.String(16)), + sa.Column("translation_chars", sa.Integer, nullable=False, server_default="0"), + sa.Column("translated_at", sa.DateTime(timezone=True)), + sa.Column("category", sa.String(32), index=True), + 
sa.Column("commentary", sa.Text), + sa.Column("entities", postgresql.JSONB), + sa.Column("sentiment", sa.Float), + sa.Column("topic_id", sa.String(64), index=True), + sa.Column("bias", sa.String(16)), + sa.Column("duplicate_of", sa.BigInteger, sa.ForeignKey("articles.id", ondelete="SET NULL")), + sa.Column("published_at", sa.DateTime(timezone=True), index=True), + sa.Column("fetched_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False), + sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False), + ) + op.create_index("ix_articles_source_published", "articles", ["source_id", "published_at"]) + op.create_index("ix_articles_status_published", "articles", ["translation_status", "published_at"]) + + # === 收藏 === + op.create_table( + "bookmarks", + sa.Column("id", sa.Integer, primary_key=True), + sa.Column("user_id", sa.Integer, sa.ForeignKey("users.id", ondelete="CASCADE"), nullable=False), + sa.Column("article_id", sa.BigInteger, sa.ForeignKey("articles.id", ondelete="CASCADE"), nullable=False), + sa.Column("note", sa.Text), + sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False), + sa.UniqueConstraint("user_id", "article_id", name="uq_bookmark_user_article"), + ) + op.create_index("ix_bookmarks_user_id", "bookmarks", ["user_id"]) + op.create_index("ix_bookmarks_article_id", "bookmarks", ["article_id"]) + + # === 订阅 === + subscription_match = postgresql.ENUM("any", "title", "body", name="subscription_match", create_type=True) + subscription_match.create(op.get_bind(), checkfirst=True) + + op.create_table( + "subscriptions", + sa.Column("id", sa.Integer, primary_key=True), + sa.Column("user_id", sa.Integer, sa.ForeignKey("users.id", ondelete="CASCADE"), nullable=False), + sa.Column("keyword", sa.String(255), nullable=False), + sa.Column( + "match_in", + postgresql.ENUM("any", "title", "body", name="subscription_match", create_type=False), + nullable=False, + ), 
+ sa.Column("channel", sa.String(32), nullable=False, server_default="telegram"), + sa.Column("target", sa.Text), + sa.Column("enabled", sa.Boolean, nullable=False, server_default=sa.text("true")), + sa.Column("last_hit_at", sa.DateTime(timezone=True)), + sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False), + ) + op.create_index("ix_subscriptions_user_id", "subscriptions", ["user_id"]) + + # === API Token === + op.create_table( + "api_tokens", + sa.Column("id", sa.Integer, primary_key=True), + sa.Column("user_id", sa.Integer, sa.ForeignKey("users.id", ondelete="CASCADE"), nullable=False), + sa.Column("name", sa.String(64), nullable=False), + sa.Column("token_hash", sa.String(128), unique=True, nullable=False), + sa.Column("last_used_at", sa.DateTime(timezone=True)), + sa.Column("expires_at", sa.DateTime(timezone=True)), + sa.Column("revoked_at", sa.DateTime(timezone=True)), + sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False), + ) + op.create_index("ix_api_tokens_user_id", "api_tokens", ["user_id"]) + + +def downgrade() -> None: + op.drop_table("api_tokens") + op.drop_index("ix_subscriptions_user_id", table_name="subscriptions") # indexes first: DROP TABLE removes them, so a later drop_index fails + op.drop_table("subscriptions") + op.drop_index("ix_bookmarks_user_id", table_name="bookmarks") + op.drop_index("ix_bookmarks_article_id", table_name="bookmarks") + op.drop_table("bookmarks") + op.drop_index("ix_articles_status_published", table_name="articles") + op.drop_index("ix_articles_source_published", table_name="articles") + op.drop_table("articles") + op.drop_table("sources") + op.drop_table("users") + + op.execute("DROP TYPE IF EXISTS subscription_match") + op.execute("DROP TYPE IF EXISTS source_kind") + op.execute("DROP TYPE IF EXISTS user_role") diff --git a/backend/app/__init__.py b/backend/app/__init__.py new file mode 100644 index 0000000..1e2f9e7 --- /dev/null +++ b/backend/app/__init__.py @@ -0,0 +1,3 @@ +"""News Aggregator 
backend.""" + +__version__ = "0.1.0" diff --git a/backend/app/api/__init__.py b/backend/app/api/__init__.py new file mode 100644 index 0000000..1ce04c3 --- /dev/null +++ b/backend/app/api/__init__.py @@ -0,0 +1 @@ +"""API routes.""" diff --git a/backend/app/api/admin.py b/backend/app/api/admin.py new file mode 100644 index 0000000..3b3d0f3 --- /dev/null +++ b/backend/app/api/admin.py @@ -0,0 +1,199 @@ +"""Admin API(仅 owner)。 + +- 源管理 CRUD +- 手动触发抓取 / 重译 +- 源健康看板 +- 翻译配额管理 +""" +from __future__ import annotations + +from datetime import datetime, timedelta, timezone +from typing import Any + +from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, status +from pydantic import BaseModel +from sqlalchemy import func, select +from sqlalchemy.exc import IntegrityError +from sqlalchemy.ext.asyncio import AsyncSession + +from app.core.deps import require_owner +from app.database import get_session +from app.models.article import Article +from app.models.source import Source +from app.models.user import User +from app.schemas.source import SourceIn, SourceOut, SourceUpdate + +router = APIRouter(prefix="/admin", tags=["admin"], dependencies=[Depends(require_owner)]) + + +# === Source CRUD === +@router.get("/sources", response_model=list[SourceOut]) +async def list_sources_all(session: AsyncSession = Depends(get_session)): + rows = (await session.execute(select(Source).order_by(Source.id))).scalars() + return [SourceOut.model_validate(s) for s in rows] + + +@router.post("/sources", response_model=SourceOut, status_code=status.HTTP_201_CREATED) +async def create_source(body: SourceIn, session: AsyncSession = Depends(get_session)): + src = Source( + name=body.name, + slug=body.slug, + kind=body.kind, + url=str(body.url), + detail_selector=body.detail_selector, + region=body.region, + language_src=body.language_src, + priority=body.priority, + fetch_interval_min=body.fetch_interval_min, + translate_to=body.translate_to, + enabled=body.enabled, + 
headers_json=body.headers_json, + ) + session.add(src) + try: + await session.commit() + except IntegrityError as e: + await session.rollback() + raise HTTPException(status.HTTP_409_CONFLICT, f"slug '{body.slug}' already exists") from e + await session.refresh(src) + return SourceOut.model_validate(src) + + +@router.patch("/sources/{source_id}", response_model=SourceOut) +async def update_source( + source_id: int, + body: SourceUpdate, + session: AsyncSession = Depends(get_session), +): + src = (await session.execute(select(Source).where(Source.id == source_id))).scalar_one_or_none() + if not src: + raise HTTPException(status.HTTP_404_NOT_FOUND, "Source not found") + for k, v in body.model_dump(exclude_unset=True).items(): + setattr(src, k, v) + await session.commit() + await session.refresh(src) + return SourceOut.model_validate(src) + + +@router.delete("/sources/{source_id}", status_code=status.HTTP_204_NO_CONTENT) +async def delete_source(source_id: int, session: AsyncSession = Depends(get_session)): + src = (await session.execute(select(Source).where(Source.id == source_id))).scalar_one_or_none() + if not src: + raise HTTPException(status.HTTP_404_NOT_FOUND, "Source not found") + await session.delete(src) + await session.commit() + return None + + +# === 手动触发 === +class TriggerResponse(BaseModel): + triggered: bool + detail: str = "" + + +@router.post("/refresh/{source_id}", response_model=TriggerResponse) +async def refresh_source( + source_id: int, + background: BackgroundTasks, + session: AsyncSession = Depends(get_session), +): + src = (await session.execute(select(Source).where(Source.id == source_id))).scalar_one_or_none() + if not src: + raise HTTPException(status.HTTP_404_NOT_FOUND, "Source not found") + if not src.enabled: + raise HTTPException(status.HTTP_400_BAD_REQUEST, "Source disabled") + + # 走 background,不等结果 + from app.workers.pipeline import fetch_one_source + + background.add_task(fetch_one_source, source_id) + return 
TriggerResponse(triggered=True, detail=f"queued fetch for {src.slug}") + + +async def _run_fetch(source_id: int) -> None: + """(deprecated) 走 background 用的薄包装,见 refresh_source。""" + from app.workers.pipeline import fetch_one_source + + await fetch_one_source(source_id) + + +@router.post("/translation/rerun/{article_id}", response_model=TriggerResponse) +async def rerun_translation( + article_id: int, + background: BackgroundTasks, + session: AsyncSession = Depends(get_session), +): + art = (await session.execute(select(Article).where(Article.id == article_id))).scalar_one_or_none() + if not art: + raise HTTPException(status.HTTP_404_NOT_FOUND, "Article not found") + art.translation_status = "pending" + art.title_zh = None + art.body_zh_text = None + art.body_zh_html = None + art.translated_at = None + art.translation_engine = None + await session.commit() + + from app.workers.pipeline import translate_article + + background.add_task(translate_article, article_id) + return TriggerResponse(triggered=True, detail=f"queued translation for article {article_id}") + + +# === 健康看板 === +class HealthOut(BaseModel): + source_id: int + slug: str + name: str + enabled: bool + last_fetched_at: datetime | None + last_status: str | None + consecutive_failures: int + fetch_interval_min: int + article_count_24h: int + + +@router.get("/health", response_model=list[HealthOut]) +async def health(session: AsyncSession = Depends(get_session)): + rows = (await session.execute(select(Source).order_by(Source.priority.desc()))).scalars() + out: list[HealthOut] = [] + for s in rows: + c24 = ( + await session.execute( + select(func.count(Article.id)).where( + Article.source_id == s.id, + Article.fetched_at >= datetime.now(timezone.utc).replace(tzinfo=None) + - timedelta(hours=24), + ) + ) + ).scalar_one() + out.append( + HealthOut( + source_id=s.id, + slug=s.slug, + name=s.name, + enabled=s.enabled, + last_fetched_at=s.last_fetched_at, + last_status=s.last_status, + 
consecutive_failures=s.consecutive_failures, + fetch_interval_min=s.fetch_interval_min, + article_count_24h=c24 or 0, + ) + ) + return out + + +# === 翻译配额(管理员视图) === +class QuotaReset(BaseModel): + used_chars: int = 0 + + +@router.post("/translation/quota/reset") +async def reset_quota(payload: QuotaReset) -> dict[str, Any]: + from app.redis_client import get_redis + + r = get_redis() + now = datetime.now(timezone.utc) + key = f"translation:month:{now:%Y%m}" + await r.set(key, payload.used_chars) + return {"key": key, "value": payload.used_chars} diff --git a/backend/app/api/articles.py b/backend/app/api/articles.py new file mode 100644 index 0000000..a3a77f7 --- /dev/null +++ b/backend/app/api/articles.py @@ -0,0 +1,194 @@ +"""/articles 列表与详情。""" +from __future__ import annotations + +import base64 +import json +from datetime import datetime +from typing import Annotated + +from fastapi import APIRouter, Depends, HTTPException, Query, status +from sqlalchemy import and_, desc, func, or_, select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.core.deps import get_current_user +from app.database import get_session +from app.models.article import Article +from app.models.bookmark import Bookmark +from app.models.source import Source +from app.models.user import User +from app.schemas.article import ( + ArticleDetail, + ArticleListItem, + ArticleListResponse, + SourceBrief, +) + +router = APIRouter(prefix="/articles", tags=["articles"]) + + +def _encode_cursor(article: Article) -> str: + payload = {"id": article.id, "ts": int(article.fetched_at.timestamp())} + return base64.urlsafe_b64encode(json.dumps(payload).encode()).decode() + + +def _decode_cursor(cur: str) -> tuple[int, datetime]: + try: + data = json.loads(base64.urlsafe_b64decode(cur.encode()).decode()) + return int(data["id"]), datetime.fromtimestamp(int(data["ts"])) + except Exception: + raise HTTPException(status.HTTP_400_BAD_REQUEST, "Invalid cursor") + + +@router.get("", 
response_model=ArticleListResponse) +async def list_articles( + since: datetime | None = Query(default=None, description="起时间 UTC"), + until: datetime | None = Query(default=None, description="止时间 UTC"), + source: str | None = Query(default=None, description="逗号分隔 source slug"), + category: str | None = None, + q: str | None = Query(default=None, description="标题/正文搜索"), + lang: Annotated[str, Query(pattern=r"^(src|zh|both)$")] = "both", + limit: int = Query(default=50, ge=1, le=200), + cursor: str | None = None, + starred_only: bool = False, + user: User = Depends(get_current_user), + session: AsyncSession = Depends(get_session), +): + stmt = ( + select(Article, Source) + .join(Source, Source.id == Article.source_id) + .where(Article.duplicate_of.is_(None)) + ) + + # 默认过去 24h + if since is None and until is None and cursor is None: + since = _default_since_24h() + + if since: + stmt = stmt.where(Article.published_at >= since) + if until: + stmt = stmt.where(Article.published_at <= until) + if category: + stmt = stmt.where(Article.category == category) + + if source: + slugs = [s.strip() for s in source.split(",") if s.strip()] + if slugs: + stmt = stmt.where(Source.slug.in_(slugs)) + + if q: + like = f"%{q}%" + stmt = stmt.where(or_(Article.title.ilike(like), Article.body_text.ilike(like))) + + # 语言过滤 + if lang == "zh": + stmt = stmt.where(Article.title_zh.is_not(None)) + elif lang == "src": + # 只要原文已有 + pass + + if cursor: + last_id, _ = _decode_cursor(cursor) + stmt = stmt.where(Article.id < last_id) + + if starred_only: + stmt = stmt.join(Bookmark, and_(Bookmark.article_id == Article.id, Bookmark.user_id == user.id)) + + stmt = stmt.order_by(desc(Article.published_at), desc(Article.id)).limit(limit + 1) + + rows = (await session.execute(stmt)).all() + has_more = len(rows) > limit + rows = rows[:limit] + + # 标记 is_starred(批量) + ids = [a.id for a, _ in rows] + starred_ids: set[int] = set() + if ids: + bm_rows = ( + await session.execute( + 
+ select(Bookmark.article_id).where( + Bookmark.user_id == user.id, Bookmark.article_id.in_(ids) + ) + ) + ).all() + starred_ids = {b[0] for b in bm_rows} + + items = [] + for art, src in rows: + item = ArticleListItem( + id=art.id, + source=SourceBrief.model_validate(src), + title=art.title, + title_zh=art.title_zh, + summary_zh=art.summary_zh, + lang_src=art.lang_src, + translation_status=art.translation_status, + category=art.category, + published_at=art.published_at, + fetched_at=art.fetched_at, + image_url=art.image_url, + is_starred=art.id in starred_ids, + ) + items.append(item) + + next_cursor = _encode_cursor(rows[-1][0]) if has_more and rows else None + return ArticleListResponse(items=items, next_cursor=next_cursor, total=None) + + +@router.get("/{article_id}", response_model=ArticleDetail) +async def get_article( + article_id: int, + user: User = Depends(get_current_user), + session: AsyncSession = Depends(get_session), +): + # Attribute access binds tighter than `await`, so the Result is awaited into a + # variable first; chaining .first() directly would call it on the coroutine. + result = await session.execute( + select(Article, Source) + .join(Source, Source.id == Article.source_id) + .where(Article.id == article_id) + ) + art = result.first() + if not art: + raise HTTPException(status.HTTP_404_NOT_FOUND, "Article not found") + article, source = art + + is_starred = ( + await session.execute( + select(Bookmark.id).where( + Bookmark.user_id == user.id, Bookmark.article_id == article.id + ) + ) + ).first() is not None + + return ArticleDetail( + id=article.id, + source=SourceBrief.model_validate(source), + url=article.url, + title=article.title, + body_html=article.body_html, + body_text=article.body_text, + title_zh=article.title_zh, + body_zh_html=article.body_zh_html, + body_zh_text=article.body_zh_text, + summary_zh=article.summary_zh, + lang_src=article.lang_src, + author=article.author, + image_url=article.image_url, + translation_status=article.translation_status, + translation_engine=article.translation_engine, + translated_at=article.translated_at, + category=article.category, 
commentary=article.commentary, + entities=article.entities, + sentiment=article.sentiment, + duplicate_of=article.duplicate_of, + published_at=article.published_at, + fetched_at=article.fetched_at, + is_starred=is_starred, + ) + + +def _default_since_24h() -> datetime: + from datetime import timedelta + + return datetime.utcnow() - timedelta(hours=24) diff --git a/backend/app/api/auth.py b/backend/app/api/auth.py new file mode 100644 index 0000000..fe2326a --- /dev/null +++ b/backend/app/api/auth.py @@ -0,0 +1,65 @@ +"""登录/刷新/登出。""" +from __future__ import annotations + +from datetime import datetime, timezone + +from fastapi import APIRouter, Depends, HTTPException, status +from jwt.exceptions import InvalidTokenError +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.config import settings +from app.core.security import ( + create_access_token, + create_refresh_token, + decode_token, + verify_password, +) +from app.database import get_session +from app.models.user import User +from app.schemas.auth import LoginRequest, RefreshRequest, TokenPair + +router = APIRouter(prefix="/auth", tags=["auth"]) + + +def _pair_for(user: User) -> TokenPair: + access = create_access_token(user.id, extra={"role": user.role.value}) + refresh = create_refresh_token(user.id) + return TokenPair( + access_token=access, + refresh_token=refresh, + expires_in=settings.access_token_ttl_min * 60, + ) + + +@router.post("/login", response_model=TokenPair) +async def login(body: LoginRequest, session: AsyncSession = Depends(get_session)): + user = ( + await session.execute(select(User).where(User.username == body.username)) + .scalars() + .first() + ) + if not user or not user.is_active or not verify_password(body.password, user.password_hash): + raise HTTPException(status.HTTP_401_UNAUTHORIZED, "Invalid credentials") + user.last_login_at = datetime.now(timezone.utc) + await session.commit() + return _pair_for(user) + + +@router.post("/refresh", 
response_model=TokenPair) +async def refresh(body: RefreshRequest, session: AsyncSession = Depends(get_session)): + try: + payload = decode_token(body.refresh_token) + if payload.get("type") != "refresh": + raise InvalidTokenError("wrong type") + uid = int(payload["sub"]) + except (InvalidTokenError, KeyError, ValueError): + raise HTTPException(status.HTTP_401_UNAUTHORIZED, "Invalid refresh token") + user = ( + await session.execute(select(User).where(User.id == uid, User.is_active.is_(True))) + .scalars() + .first() + ) + if not user: + raise HTTPException(status.HTTP_401_UNAUTHORIZED, "User not found") + return _pair_for(user) diff --git a/backend/app/api/bookmarks.py b/backend/app/api/bookmarks.py new file mode 100644 index 0000000..139521d --- /dev/null +++ b/backend/app/api/bookmarks.py @@ -0,0 +1,73 @@ +"""/bookmarks 收藏。""" +from __future__ import annotations + +from fastapi import APIRouter, Depends, HTTPException, status +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.core.deps import get_current_user +from app.database import get_session +from app.models.article import Article +from app.models.bookmark import Bookmark +from app.models.user import User +from app.schemas.misc import BookmarkIn, BookmarkOut + +router = APIRouter(prefix="/bookmarks", tags=["bookmarks"]) + + +@router.post("", response_model=BookmarkOut, status_code=status.HTTP_201_CREATED) +async def add( + body: BookmarkIn, + user: User = Depends(get_current_user), + session: AsyncSession = Depends(get_session), +): + art = (await session.execute(select(Article).where(Article.id == body.article_id))).scalar_one_or_none() + if not art: + raise HTTPException(status.HTTP_404_NOT_FOUND, "Article not found") + # 已存在则直接返回 + existing = ( + await session.execute( + select(Bookmark).where( + Bookmark.user_id == user.id, Bookmark.article_id == body.article_id + ) + ) + ).scalar_one_or_none() + if existing: + return BookmarkOut.model_validate(existing) + bm = 
Bookmark(user_id=user.id, article_id=body.article_id, note=body.note) + session.add(bm) + await session.commit() + await session.refresh(bm) + return BookmarkOut.model_validate(bm) + + +@router.delete("/{article_id}", status_code=status.HTTP_204_NO_CONTENT) +async def remove( + article_id: int, + user: User = Depends(get_current_user), + session: AsyncSession = Depends(get_session), +): + bm = ( + await session.execute( + select(Bookmark).where( + Bookmark.user_id == user.id, Bookmark.article_id == article_id + ) + ) + ).scalar_one_or_none() + if bm: + await session.delete(bm) + await session.commit() + return None + + +@router.get("", response_model=list[BookmarkOut]) +async def list_mine( + user: User = Depends(get_current_user), + session: AsyncSession = Depends(get_session), +): + rows = ( + await session.execute( + select(Bookmark).where(Bookmark.user_id == user.id).order_by(Bookmark.created_at.desc()) + ) + ).scalars() + return [BookmarkOut.model_validate(b) for b in rows] diff --git a/backend/app/api/me.py b/backend/app/api/me.py new file mode 100644 index 0000000..66a0b71 --- /dev/null +++ b/backend/app/api/me.py @@ -0,0 +1,68 @@ +"""/me 当前用户信息 + 翻译配额。""" +from __future__ import annotations + +from datetime import datetime, timezone + +from fastapi import APIRouter, Depends +from pydantic import BaseModel +from sqlalchemy.ext.asyncio import AsyncSession + +from app.config import settings +from app.core.deps import get_current_user +from app.database import get_session +from app.models.user import User +from app.redis_client import get_redis + +router = APIRouter(prefix="/me", tags=["me"]) + + +class MeOut(BaseModel): + id: int + username: str + email: str | None + role: str + display_name: str | None + created_at: datetime + + +class UsageOut(BaseModel): + month: str + used_chars: int + quota_chars: int + remaining_chars: int + buffered_quota: int + pct_used: float + + +@router.get("", response_model=MeOut) +async def me(user: User = 
Depends(get_current_user)): + return MeOut( + id=user.id, + username=user.username, + email=user.email, + role=user.role.value, + display_name=user.display_name, + created_at=user.created_at, + ) + + +@router.get("/usage", response_model=UsageOut) +async def usage( + user: User = Depends(get_current_user), + session: AsyncSession = Depends(get_session), # noqa: ARG001 +): + r = get_redis() + now = datetime.now(timezone.utc) + key = f"translation:month:{now:%Y%m}" + used = int(await r.get(key) or 0) + quota = settings.tencent_tmt_quota_month + buffered = int(quota * (1 - settings.tencent_tmt_quota_buffer)) + remaining = max(0, quota - used) + return UsageOut( + month=f"{now:%Y%m}", + used_chars=used, + quota_chars=quota, + remaining_chars=remaining, + buffered_quota=buffered, + pct_used=round(used / quota * 100, 2) if quota else 0.0, + ) diff --git a/backend/app/api/sources.py b/backend/app/api/sources.py new file mode 100644 index 0000000..eebb1b3 --- /dev/null +++ b/backend/app/api/sources.py @@ -0,0 +1,25 @@ +"""/sources 源列表(只读,所有登录用户可看)。""" +from __future__ import annotations + +from fastapi import APIRouter, Depends +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.core.deps import get_current_user +from app.database import get_session +from app.models.source import Source +from app.models.user import User +from app.schemas.source import SourceOut + +router = APIRouter(prefix="/sources", tags=["sources"]) + + +@router.get("", response_model=list[SourceOut]) +async def list_sources( + user: User = Depends(get_current_user), # noqa: ARG001 + session: AsyncSession = Depends(get_session), +): + rows = ( + await session.execute(select(Source).order_by(Source.priority.desc(), Source.name)) + ).scalars() + return [SourceOut.model_validate(s) for s in rows] diff --git a/backend/app/api/subscriptions.py b/backend/app/api/subscriptions.py new file mode 100644 index 0000000..7ec64e2 --- /dev/null +++ 
b/backend/app/api/subscriptions.py @@ -0,0 +1,68 @@ +"""/subscriptions 关键词订阅。""" +from __future__ import annotations + +from fastapi import APIRouter, Depends, HTTPException, status +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.core.deps import get_current_user +from app.database import get_session +from app.models.subscription import Subscription +from app.models.user import User +from app.schemas.misc import SubscriptionIn, SubscriptionOut + +router = APIRouter(prefix="/subscriptions", tags=["subscriptions"]) + + +@router.post("", response_model=SubscriptionOut, status_code=status.HTTP_201_CREATED) +async def create( + body: SubscriptionIn, + user: User = Depends(get_current_user), + session: AsyncSession = Depends(get_session), +): + sub = Subscription( + user_id=user.id, + keyword=body.keyword, + match_in=body.match_in, + channel=body.channel, + target=body.target, + ) + session.add(sub) + await session.commit() + await session.refresh(sub) + return SubscriptionOut.model_validate(sub) + + +@router.get("", response_model=list[SubscriptionOut]) +async def list_mine( + user: User = Depends(get_current_user), + session: AsyncSession = Depends(get_session), +): + rows = ( + await session.execute( + select(Subscription) + .where(Subscription.user_id == user.id) + .order_by(Subscription.created_at.desc()) + ) + ).scalars() + return [SubscriptionOut.model_validate(s) for s in rows] + + +@router.delete("/{sub_id}", status_code=status.HTTP_204_NO_CONTENT) +async def delete( + sub_id: int, + user: User = Depends(get_current_user), + session: AsyncSession = Depends(get_session), +): + sub = ( + await session.execute( + select(Subscription).where( + Subscription.id == sub_id, Subscription.user_id == user.id + ) + ) + ).scalar_one_or_none() + if not sub: + raise HTTPException(status.HTTP_404_NOT_FOUND, "Subscription not found") + await session.delete(sub) + await session.commit() + return None diff --git a/backend/app/config.py 
b/backend/app/config.py new file mode 100644 index 0000000..42f8ed8 --- /dev/null +++ b/backend/app/config.py @@ -0,0 +1,104 @@ +"""应用配置:从 .env / 环境变量读取,集中管理所有开关。""" +from __future__ import annotations + +from functools import lru_cache +from pathlib import Path + +from pydantic import Field, field_validator +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class Settings(BaseSettings): + model_config = SettingsConfigDict( + env_file=".env", + env_file_encoding="utf-8", + case_sensitive=False, + extra="ignore", + ) + + # ===== 通用 ===== + tz: str = "Asia/Hong_Kong" + log_level: str = "INFO" + + # ===== 数据库 ===== + postgres_user: str + postgres_password: str + postgres_db: str + postgres_host: str = "postgres" + postgres_port: int = 5432 + + @property + def database_url(self) -> str: + # asyncpg + return ( + f"postgresql+asyncpg://{self.postgres_user}:{self.postgres_password}" + f"@{self.postgres_host}:{self.postgres_port}/{self.postgres_db}" + ) + + @property + def sync_database_url(self) -> str: + # alembic 用的同步 URL + return ( + f"postgresql+psycopg2://{self.postgres_user}:{self.postgres_password}" + f"@{self.postgres_host}:{self.postgres_port}/{self.postgres_db}" + ) + + # ===== Redis ===== + redis_host: str = "redis" + redis_port: int = 6379 + redis_password: str + redis_db: int = 0 + + @property + def redis_url(self) -> str: + return ( + f"redis://:{self.redis_password}@{self.redis_host}:{self.redis_port}/{self.redis_db}" + ) + + # ===== JWT ===== + jwt_secret: str + jwt_algorithm: str = "HS256" + access_token_ttl_min: int = 60 + refresh_token_ttl_day: int = 14 + + # ===== 腾讯云 TMT ===== + tencentcloud_secret_id: str = "" + tencentcloud_secret_key: str = "" + tencentcloud_region: str = "ap-hongkong" + tencent_tmt_endpoint: str = "tmt.tencentcloudapi.com" + tencent_tmt_quota_month: int = 5_000_000 + tencent_tmt_quota_buffer: float = 0.05 + tencent_tmt_max_chars_per_req: int = 4500 + + @field_validator("tencent_tmt_quota_buffer") + @classmethod + 
async def _resolve_user(
    creds: HTTPAuthorizationCredentials | None = Depends(_bearer),
    session: AsyncSession = Depends(get_session),
) -> User:
    """Resolve the request's bearer credential to an active ``User``.

    The credential is tried first as an API token (looked up by its hash among
    non-revoked ``ApiToken`` rows) and, if that does not yield an active user,
    as a JWT whose ``type`` claim must be ``"access"``.

    Args:
        creds: Parsed bearer credentials, or ``None`` when the header is absent
            (``_bearer`` is created with ``auto_error=False``).
        session: Request-scoped async database session.

    Returns:
        The active user owning the credential.

    Raises:
        HTTPException: 401 when credentials are missing, the matching API token
            has expired, the JWT is invalid / of the wrong type / lacks a usable
            ``sub`` claim, or the referenced user does not exist or is inactive.
    """
    if creds is None or not creds.credentials:
        raise HTTPException(status.HTTP_401_UNAUTHORIZED, "Missing credentials")

    token = creds.credentials

    # 1) Try the credential as an API token first (compared by hash).
    h = hash_api_token(token)
    # NOTE: the result must be awaited *before* chaining ``.scalars()``.
    # ``await session.execute(...).scalars()`` binds as
    # ``await (session.execute(...).scalars())`` and fails on the coroutine.
    token_result = await session.execute(
        select(ApiToken).where(ApiToken.token_hash == h, ApiToken.revoked_at.is_(None))
    )
    api_row = token_result.scalars().first()
    if api_row:
        if api_row.expires_at and api_row.expires_at < datetime.now(timezone.utc):
            raise HTTPException(status.HTTP_401_UNAUTHORIZED, "Token expired")
        owner_result = await session.execute(select(User).where(User.id == api_row.user_id))
        user = owner_result.scalars().first()
        if user and user.is_active:
            # Record usage so stale tokens can be spotted and revoked.
            api_row.last_used_at = datetime.now(timezone.utc)
            await session.commit()
            return user
        # Token matched but its owner is missing or inactive: fall through to
        # the JWT path, as the original code did.

    # 2) Otherwise treat the credential as a JWT access token.
    try:
        payload = decode_token(token)
        if payload.get("type") != "access":
            raise InvalidTokenError("wrong type")
        uid = int(payload["sub"])
    except (InvalidTokenError, KeyError, ValueError) as exc:
        # Chain explicitly so the root cause stays visible in server-side traces.
        raise HTTPException(status.HTTP_401_UNAUTHORIZED, "Invalid token") from exc

    user_result = await session.execute(
        select(User).where(User.id == uid, User.is_active.is_(True))
    )
    user = user_result.scalars().first()
    if user is None:
        raise HTTPException(status.HTTP_401_UNAUTHORIZED, "User not found or inactive")
    return user
hash_password(plain: str) -> str: + return pwd_ctx.hash(plain) + + +def verify_password(plain: str, hashed: str) -> bool: + try: + return pwd_ctx.verify(plain, hashed) + except Exception: + return False + + +# === JWT === +def create_access_token(subject: str | int, extra: dict[str, Any] | None = None) -> str: + now = datetime.now(timezone.utc) + payload: dict[str, Any] = { + "sub": str(subject), + "type": "access", + "iat": int(now.timestamp()), + "exp": int((now + timedelta(minutes=settings.access_token_ttl_min)).timestamp()), + } + if extra: + payload.update(extra) + return jwt.encode(payload, settings.jwt_secret, algorithm=settings.jwt_algorithm) + + +def create_refresh_token(subject: str | int) -> str: + now = datetime.now(timezone.utc) + payload = { + "sub": str(subject), + "type": "refresh", + "iat": int(now.timestamp()), + "exp": int((now + timedelta(days=settings.refresh_token_ttl_day)).timestamp()), + "jti": secrets.token_urlsafe(16), + } + return jwt.encode(payload, settings.jwt_secret, algorithm=settings.jwt_algorithm) + + +def decode_token(token: str) -> dict[str, Any]: + return jwt.decode(token, settings.jwt_secret, algorithms=[settings.jwt_algorithm]) + + +# === API Token(给 Android 用)=== +def generate_api_token() -> tuple[str, str]: + """返回 (raw_token, token_hash)。raw_token 只显示一次。""" + raw = secrets.token_urlsafe(32) + return raw, hash_api_token(raw) + + +def hash_api_token(raw: str) -> str: + # 简单 sha256 即可(随机性已经够) + return hashlib.sha256(raw.encode()).hexdigest() + + +def constant_time_eq(a: str, b: str) -> bool: + return hmac.compare_digest(a, b) diff --git a/backend/app/database.py b/backend/app/database.py new file mode 100644 index 0000000..2ad8e88 --- /dev/null +++ b/backend/app/database.py @@ -0,0 +1,52 @@ +"""异步 SQLAlchemy 数据库连接。""" +from __future__ import annotations + +from collections.abc import AsyncGenerator + +from sqlalchemy.ext.asyncio import ( + AsyncSession, + async_sessionmaker, + create_async_engine, +) +from sqlalchemy.orm 
import DeclarativeBase + +from app.config import settings + + +class Base(DeclarativeBase): + """所有 ORM 模型的基类。""" + + +engine = create_async_engine( + settings.database_url, + echo=False, + pool_size=5, + max_overflow=10, + pool_pre_ping=True, + pool_recycle=1800, +) + +AsyncSessionLocal = async_sessionmaker( + bind=engine, + class_=AsyncSession, + expire_on_commit=False, + autoflush=False, +) + + +async def get_session() -> AsyncGenerator[AsyncSession, None]: + """FastAPI 依赖:请求级 session。""" + async with AsyncSessionLocal() as session: + try: + yield session + finally: + await session.close() + + +async def init_db() -> None: + """开发期用,生产请用 alembic。""" + # import models to register them + from app.models import article, source, user # noqa: F401 + + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.create_all) diff --git a/backend/app/main.py b/backend/app/main.py new file mode 100644 index 0000000..c9b91f0 --- /dev/null +++ b/backend/app/main.py @@ -0,0 +1,117 @@ +"""FastAPI 入口。 + +- 注册路由 +- 启动 / 关闭事件:连接池、调度器 +- CORS +- 全局异常处理 +""" +from __future__ import annotations + +import logging +from contextlib import asynccontextmanager + +from fastapi import FastAPI, Request +from fastapi.exceptions import RequestValidationError +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import JSONResponse +from starlette.exceptions import HTTPException as StarletteHTTPException + +from app.api import admin, articles, auth, bookmarks, me, sources, subscriptions +from app.config import settings +from app.database import engine +from app.redis_client import close_redis, get_redis + +logger = logging.getLogger("news.api") +logging.basicConfig( + level=settings.log_level, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", +) + + +@asynccontextmanager +async def lifespan(app: FastAPI): + # 启动 + logger.info("api starting, tz=%s", settings.tz) + # 触发 redis 连接 + await get_redis().ping() + yield + # 关闭 + logger.info("api shutting 
down") + await close_redis() + await engine.dispose() + + +app = FastAPI( + title="Diary News", + description="Private news aggregator", + version="0.1.0", + default_response_class=JSONResponse, + lifespan=lifespan, + docs_url="/api/docs" if settings.log_level == "DEBUG" else None, + redoc_url=None, +) + +# CORS:网页 + Android,简单放开(私有) +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], # MVP 放开,生产收紧 + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +# === 全局异常处理(RFC 7807) === +@app.exception_handler(StarletteHTTPException) +async def http_exc_handler(request: Request, exc: StarletteHTTPException): + return JSONResponse( + status_code=exc.status_code, + content={ + "type": "about:blank", + "title": exc.detail if isinstance(exc.detail, str) else "Error", + "status": exc.status_code, + "instance": str(request.url), + }, + headers=exc.headers or None, + ) + + +@app.exception_handler(RequestValidationError) +async def validation_exc_handler(request: Request, exc: RequestValidationError): + return JSONResponse( + status_code=422, + content={ + "type": "about:blank", + "title": "Validation Error", + "status": 422, + "errors": exc.errors(), + "instance": str(request.url), + }, + ) + + +# === 路由 === +API_PREFIX = "/api/v1" + +app.include_router(auth.router, prefix=API_PREFIX) +app.include_router(me.router, prefix=API_PREFIX) +app.include_router(articles.router, prefix=API_PREFIX) +app.include_router(sources.router, prefix=API_PREFIX) +app.include_router(bookmarks.router, prefix=API_PREFIX) +app.include_router(subscriptions.router, prefix=API_PREFIX) +app.include_router(admin.router, prefix=API_PREFIX) + + +# === 健康检查 === +@app.get("/healthz", include_in_schema=False) +async def healthz(): + try: + await get_redis().ping() + except Exception as e: + return JSONResponse({"status": "degraded", "redis": str(e)}, status_code=503) + return {"status": "ok"} + + +@app.get("/", include_in_schema=False) +async def root(): + return {"name": 
"diary-news", "version": app.version, "docs": "/api/docs"} diff --git a/backend/app/models/__init__.py b/backend/app/models/__init__.py new file mode 100644 index 0000000..85a2040 --- /dev/null +++ b/backend/app/models/__init__.py @@ -0,0 +1,21 @@ +"""所有 ORM 模型。 + +新模型请在这里 import,确保 Alembic 自动发现。 +""" +from app.models.api_token import ApiToken # noqa: F401 +from app.models.article import Article # noqa: F401 +from app.models.bookmark import Bookmark # noqa: F401 +from app.models.source import Source, SourceKind # noqa: F401 +from app.models.subscription import Subscription # noqa: F401 +from app.models.user import User, UserRole # noqa: F401 + +__all__ = [ + "ApiToken", + "Article", + "Bookmark", + "Source", + "SourceKind", + "Subscription", + "User", + "UserRole", +] diff --git a/backend/app/models/api_token.py b/backend/app/models/api_token.py new file mode 100644 index 0000000..fc39e43 --- /dev/null +++ b/backend/app/models/api_token.py @@ -0,0 +1,27 @@ +"""API Token(给 Android 用,可独立撤销)。""" +from __future__ import annotations + +from datetime import datetime + +from sqlalchemy import DateTime, ForeignKey, String, func +from sqlalchemy.orm import Mapped, mapped_column + +from app.database import Base + + +class ApiToken(Base): + __tablename__ = "api_tokens" + + id: Mapped[int] = mapped_column(primary_key=True) + user_id: Mapped[int] = mapped_column( + ForeignKey("users.id", ondelete="CASCADE"), nullable=False, index=True + ) + name: Mapped[str] = mapped_column(String(64), nullable=False) # "Xiaomi-14" + token_hash: Mapped[str] = mapped_column(String(128), unique=True, nullable=False, index=True) + # 只存 hash,原始 token 一次性返回给用户 + last_used_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True)) + expires_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True)) + revoked_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True)) + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), server_default=func.now(), 
nullable=False + ) diff --git a/backend/app/models/article.py b/backend/app/models/article.py new file mode 100644 index 0000000..369c6cb --- /dev/null +++ b/backend/app/models/article.py @@ -0,0 +1,91 @@ +"""文章主表:原文 + 译文 + ML 字段预留。""" +from __future__ import annotations + +from datetime import datetime + +from sqlalchemy import ( + BigInteger, + DateTime, + Float, + ForeignKey, + Index, + Integer, + String, + Text, + func, +) +from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy.orm import Mapped, mapped_column, relationship + +from app.database import Base + + +class Article(Base): + __tablename__ = "articles" + + id: Mapped[int] = mapped_column(BigInteger, primary_key=True) + + # === 来源 === + source_id: Mapped[int] = mapped_column( + ForeignKey("sources.id", ondelete="CASCADE"), nullable=False, index=True + ) + source: Mapped["Source"] = relationship(back_populates="articles", lazy="joined") # noqa: F821 + + # === 原文标识 === + url: Mapped[str] = mapped_column(Text, nullable=False) + url_hash: Mapped[str] = mapped_column(String(40), unique=True, nullable=False, index=True) + guid: Mapped[str | None] = mapped_column(String(255), index=True) # 源站给的 ID + + # === 原文内容 === + title: Mapped[str] = mapped_column(Text, nullable=False) + body_html: Mapped[str | None] = mapped_column(Text) # 抽取后保留结构 + body_text: Mapped[str] = mapped_column(Text, nullable=False, default="") + lang_src: Mapped[str | None] = mapped_column(String(8)) + author: Mapped[str | None] = mapped_column(String(255)) + image_url: Mapped[str | None] = mapped_column(Text) + + # === 译文 === + title_zh: Mapped[str | None] = mapped_column(Text) + body_zh_html: Mapped[str | None] = mapped_column(Text) + body_zh_text: Mapped[str | None] = mapped_column(Text) + summary_zh: Mapped[str | None] = mapped_column(Text) + + # === 翻译状态 === + translation_status: Mapped[str] = mapped_column( + String(16), default="pending", nullable=False, index=True + ) + # pending / ok / partial / failed / n/a + 
translation_engine: Mapped[str | None] = mapped_column(String(16)) + # tencent / nllb / cache / skip + translation_chars: Mapped[int] = mapped_column(Integer, default=0, nullable=False) + translated_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True)) + + # === ML 字段(预留,MVP 全 null)=== + category: Mapped[str | None] = mapped_column(String(32), index=True) + commentary: Mapped[str | None] = mapped_column(Text) + entities: Mapped[dict | None] = mapped_column(JSONB) + sentiment: Mapped[float | None] = mapped_column(Float) + topic_id: Mapped[str | None] = mapped_column(String(64), index=True) + bias: Mapped[str | None] = mapped_column(String(16)) # left/center/right + + # === 去重 === + duplicate_of: Mapped[int | None] = mapped_column( + ForeignKey("articles.id", ondelete="SET NULL"), index=True + ) + + # === 时间 === + published_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), index=True) + fetched_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), server_default=func.now(), nullable=False, index=True + ) + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), server_default=func.now(), nullable=False + ) + + __table_args__ = ( + Index("ix_articles_source_published", "source_id", "published_at"), + Index("ix_articles_status_published", "translation_status", "published_at"), + ) + + def __repr__(self) -> str: + return f"
" diff --git a/backend/app/models/bookmark.py b/backend/app/models/bookmark.py new file mode 100644 index 0000000..b76870f --- /dev/null +++ b/backend/app/models/bookmark.py @@ -0,0 +1,27 @@ +"""收藏。""" +from __future__ import annotations + +from datetime import datetime + +from sqlalchemy import DateTime, ForeignKey, UniqueConstraint, func +from sqlalchemy.orm import Mapped, mapped_column + +from app.database import Base + + +class Bookmark(Base): + __tablename__ = "bookmarks" + + id: Mapped[int] = mapped_column(primary_key=True) + user_id: Mapped[int] = mapped_column( + ForeignKey("users.id", ondelete="CASCADE"), nullable=False, index=True + ) + article_id: Mapped[int] = mapped_column( + ForeignKey("articles.id", ondelete="CASCADE"), nullable=False, index=True + ) + note: Mapped[str | None] = mapped_column() + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), server_default=func.now(), nullable=False + ) + + __table_args__ = (UniqueConstraint("user_id", "article_id", name="uq_bookmark_user_article"),) diff --git a/backend/app/models/source.py b/backend/app/models/source.py new file mode 100644 index 0000000..0ba41c7 --- /dev/null +++ b/backend/app/models/source.py @@ -0,0 +1,64 @@ +"""采集源模型。""" +from __future__ import annotations + +import enum +from datetime import datetime + +from sqlalchemy import ( + JSON, + Boolean, + DateTime, + Enum, + Integer, + String, + Text, + func, +) +from sqlalchemy.orm import Mapped, mapped_column, relationship + +from app.database import Base + + +class SourceKind(str, enum.Enum): + RSS = "rss" + HTML_LIST = "html_list" + TG_CHANNEL = "tg_channel" + + +class Source(Base): + __tablename__ = "sources" + + id: Mapped[int] = mapped_column(primary_key=True) + name: Mapped[str] = mapped_column(String(128), nullable=False) + slug: Mapped[str] = mapped_column(String(128), unique=True, index=True, nullable=False) + kind: Mapped[SourceKind] = mapped_column( + Enum(SourceKind, name="source_kind"), + 
default=SourceKind.RSS, + nullable=False, + ) + url: Mapped[str] = mapped_column(Text, nullable=False) + detail_selector: Mapped[dict | None] = mapped_column(JSON) + fetch_interval_min: Mapped[int] = mapped_column(Integer, default=60, nullable=False) + fetch_cron: Mapped[str | None] = mapped_column(String(64)) # 5 段 cron + translate_to: Mapped[str] = mapped_column(String(8), default="zh", nullable=False) + enabled: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False) + region: Mapped[str | None] = mapped_column(String(32), index=True) + language_src: Mapped[str | None] = mapped_column(String(8)) + priority: Mapped[int] = mapped_column(Integer, default=50, nullable=False, index=True) + headers_json: Mapped[dict | None] = mapped_column(JSON) + last_fetched_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True)) + last_status: Mapped[str | None] = mapped_column(String(64)) + consecutive_failures: Mapped[int] = mapped_column(Integer, default=0, nullable=False) + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), server_default=func.now(), nullable=False + ) + updated_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False + ) + + articles: Mapped[list["Article"]] = relationship( # noqa: F821 + back_populates="source", cascade="all, delete-orphan", lazy="noload" + ) + + def __repr__(self) -> str: + return f"" diff --git a/backend/app/models/subscription.py b/backend/app/models/subscription.py new file mode 100644 index 0000000..1a0990d --- /dev/null +++ b/backend/app/models/subscription.py @@ -0,0 +1,48 @@ +"""关键词订阅(命中即通知)。""" +from __future__ import annotations + +import enum +from datetime import datetime + +from sqlalchemy import ( + Boolean, + DateTime, + Enum, + ForeignKey, + String, + Text, + func, +) +from sqlalchemy.orm import Mapped, mapped_column + +from app.database import Base + + +class SubscriptionMatch(str, enum.Enum): + ANY = "any" # 
标题或正文 + TITLE = "title" + BODY = "body" + + +class Subscription(Base): + __tablename__ = "subscriptions" + + id: Mapped[int] = mapped_column(primary_key=True) + user_id: Mapped[int] = mapped_column( + ForeignKey("users.id", ondelete="CASCADE"), nullable=False, index=True + ) + keyword: Mapped[str] = mapped_column(String(255), nullable=False) + # 简单关键词,匹配走 ILIKE '%kw%';后续可加 regex/lucene + match_in: Mapped[SubscriptionMatch] = mapped_column( + Enum(SubscriptionMatch, name="subscription_match"), + default=SubscriptionMatch.ANY, + nullable=False, + ) + channel: Mapped[str] = mapped_column(String(32), default="telegram", nullable=False) + # telegram / email / web + target: Mapped[str | None] = mapped_column(Text) # chat_id / email + enabled: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False) + last_hit_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True)) + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), server_default=func.now(), nullable=False + ) diff --git a/backend/app/models/user.py b/backend/app/models/user.py new file mode 100644 index 0000000..533db37 --- /dev/null +++ b/backend/app/models/user.py @@ -0,0 +1,41 @@ +"""用户模型。 + +Phase 1 仅 owner + member 两级,后续扩展。 +""" +from __future__ import annotations + +import enum +from datetime import datetime + +from sqlalchemy import Boolean, DateTime, Enum, String, func +from sqlalchemy.orm import Mapped, mapped_column + +from app.database import Base + + +class UserRole(str, enum.Enum): + OWNER = "owner" + MEMBER = "member" + + +class User(Base): + __tablename__ = "users" + + id: Mapped[int] = mapped_column(primary_key=True) + username: Mapped[str] = mapped_column(String(64), unique=True, index=True, nullable=False) + email: Mapped[str | None] = mapped_column(String(255), unique=True, index=True) + password_hash: Mapped[str] = mapped_column(String(255), nullable=False) + role: Mapped[UserRole] = mapped_column( + Enum(UserRole, name="user_role"), + 
default=UserRole.MEMBER, + nullable=False, + ) + is_active: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False) + display_name: Mapped[str | None] = mapped_column(String(128)) + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), server_default=func.now(), nullable=False + ) + last_login_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True)) + + def __repr__(self) -> str: + return f"" diff --git a/backend/app/redis_client.py b/backend/app/redis_client.py new file mode 100644 index 0000000..57dd7c6 --- /dev/null +++ b/backend/app/redis_client.py @@ -0,0 +1,31 @@ +"""Redis 客户端(单例)。用于: +- 翻译缓存 +- 翻译字符配额(月度) +- 限流(后续) +""" +from __future__ import annotations + +import redis.asyncio as redis_async + +from app.config import settings + +_pool: redis_async.Redis | None = None + + +def get_redis() -> redis_async.Redis: + global _pool + if _pool is None: + _pool = redis_async.from_url( + settings.redis_url, + encoding="utf-8", + decode_responses=True, + max_connections=20, + ) + return _pool + + +async def close_redis() -> None: + global _pool + if _pool is not None: + await _pool.aclose() + _pool = None diff --git a/backend/app/schemas/__init__.py b/backend/app/schemas/__init__.py new file mode 100644 index 0000000..c86db6a --- /dev/null +++ b/backend/app/schemas/__init__.py @@ -0,0 +1 @@ +"""Pydantic schemas for API I/O.""" diff --git a/backend/app/schemas/article.py b/backend/app/schemas/article.py new file mode 100644 index 0000000..7bf8bee --- /dev/null +++ b/backend/app/schemas/article.py @@ -0,0 +1,83 @@ +"""Article schemas.""" +from __future__ import annotations + +from datetime import datetime + +from pydantic import BaseModel, ConfigDict, Field + + +class SourceBrief(BaseModel): + model_config = ConfigDict(from_attributes=True) + + id: int + name: str + slug: str + region: str | None = None + + +class ArticleListItem(BaseModel): + """列表项:精简字段。""" + + model_config = ConfigDict(from_attributes=True) + + id: int + 
source: SourceBrief + title: str + title_zh: str | None = None + summary_zh: str | None = None + lang_src: str | None = None + translation_status: str + category: str | None = None + published_at: datetime | None = None + fetched_at: datetime + image_url: str | None = None + is_starred: bool = False + + +class ArticleDetail(BaseModel): + model_config = ConfigDict(from_attributes=True) + + id: int + source: SourceBrief + url: str + title: str + body_html: str | None = None + body_text: str + title_zh: str | None = None + body_zh_html: str | None = None + body_zh_text: str | None = None + summary_zh: str | None = None + lang_src: str | None = None + author: str | None = None + image_url: str | None = None + translation_status: str + translation_engine: str | None = None + translated_at: datetime | None = None + category: str | None = None + commentary: str | None = None + entities: dict | None = None + sentiment: float | None = None + duplicate_of: int | None = None + published_at: datetime | None = None + fetched_at: datetime + is_starred: bool = False + + +class ArticleListResponse(BaseModel): + items: list[ArticleListItem] + next_cursor: str | None = None + total: int | None = None + + +class ArticleQuery(BaseModel): + """用作 ?query= 解析参考(实际 FastAPI 直接用 Query)。""" + + since: datetime | None = None + until: datetime | None = None + source: str | None = None # 逗号分隔 slug + category: str | None = None + q: str | None = None + lang: str = Field(default="both", pattern=r"^(src|zh|both)$") + limit: int = Field(default=50, ge=1, le=200) + cursor: str | None = None + starred_only: bool = False diff --git a/backend/app/schemas/auth.py b/backend/app/schemas/auth.py new file mode 100644 index 0000000..4bc26e6 --- /dev/null +++ b/backend/app/schemas/auth.py @@ -0,0 +1,20 @@ +"""Auth schemas.""" +from __future__ import annotations + +from pydantic import BaseModel, Field + + +class LoginRequest(BaseModel): + username: str = Field(min_length=1, max_length=64) + password: str = 
Field(min_length=6, max_length=128) + + +class TokenPair(BaseModel): + access_token: str + refresh_token: str + token_type: str = "bearer" + expires_in: int # seconds + + +class RefreshRequest(BaseModel): + refresh_token: str diff --git a/backend/app/schemas/misc.py b/backend/app/schemas/misc.py new file mode 100644 index 0000000..e0643c3 --- /dev/null +++ b/backend/app/schemas/misc.py @@ -0,0 +1,43 @@ +"""Bookmark / Subscription schemas.""" +from __future__ import annotations + +from datetime import datetime + +from pydantic import BaseModel, ConfigDict, Field + +from app.models.subscription import SubscriptionMatch + + +class BookmarkIn(BaseModel): + article_id: int + note: str | None = None + + +class BookmarkOut(BaseModel): + model_config = ConfigDict(from_attributes=True) + + id: int + user_id: int + article_id: int + note: str | None = None + created_at: datetime + + +class SubscriptionIn(BaseModel): + keyword: str = Field(min_length=1, max_length=255) + match_in: SubscriptionMatch = SubscriptionMatch.ANY + channel: str = "telegram" + target: str | None = None + + +class SubscriptionOut(BaseModel): + model_config = ConfigDict(from_attributes=True) + + id: int + keyword: str + match_in: SubscriptionMatch + channel: str + target: str | None = None + enabled: bool + last_hit_at: datetime | None = None + created_at: datetime diff --git a/backend/app/schemas/source.py b/backend/app/schemas/source.py new file mode 100644 index 0000000..a3422a8 --- /dev/null +++ b/backend/app/schemas/source.py @@ -0,0 +1,51 @@ +"""Source schemas.""" +from __future__ import annotations + +from datetime import datetime + +from pydantic import BaseModel, ConfigDict, Field, HttpUrl + +from app.models.source import SourceKind + + +class SourceOut(BaseModel): + model_config = ConfigDict(from_attributes=True) + + id: int + name: str + slug: str + kind: SourceKind + url: str + enabled: bool + region: str | None = None + language_src: str | None = None + priority: int + fetch_interval_min: 
int + translate_to: str + last_fetched_at: datetime | None = None + last_status: str | None = None + consecutive_failures: int = 0 + + +class SourceIn(BaseModel): + name: str = Field(min_length=1, max_length=128) + slug: str = Field(min_length=1, max_length=128, pattern=r"^[a-z0-9-]+$") + kind: SourceKind = SourceKind.RSS + url: HttpUrl + region: str | None = None + language_src: str | None = None + priority: int = Field(default=50, ge=1, le=100) + fetch_interval_min: int = Field(default=60, ge=5, le=1440) + translate_to: str = "zh" + enabled: bool = True + detail_selector: dict | None = None + headers_json: dict | None = None + + +class SourceUpdate(BaseModel): + name: str | None = None + enabled: bool | None = None + priority: int | None = Field(default=None, ge=1, le=100) + fetch_interval_min: int | None = Field(default=None, ge=5, le=1440) + region: str | None = None + translate_to: str | None = None diff --git a/backend/app/scripts/__init__.py b/backend/app/scripts/__init__.py new file mode 100644 index 0000000..fed8839 --- /dev/null +++ b/backend/app/scripts/__init__.py @@ -0,0 +1 @@ +"""命令行脚本集合。""" diff --git a/backend/app/scripts/create_user.py b/backend/app/scripts/create_user.py new file mode 100644 index 0000000..a630a3e --- /dev/null +++ b/backend/app/scripts/create_user.py @@ -0,0 +1,56 @@ +"""创建用户(默认 owner)。""" +from __future__ import annotations + +import argparse +import asyncio +import sys +from getpass import getpass + +from sqlalchemy import select + +from app.core.security import hash_password +from app.database import AsyncSessionLocal +from app.models.user import User, UserRole + + +async def main(username: str, password: str, email: str | None, role: UserRole) -> int: + async with AsyncSessionLocal() as session: + exists = (await session.execute(select(User).where(User.username == username))).scalar_one_or_none() + if exists: + print(f"user '{username}' already exists (id={exists.id})", file=sys.stderr) + return 1 + u = User( + 
username=username, + email=email, + password_hash=hash_password(password), + role=role, + is_active=True, + ) + session.add(u) + await session.commit() + await session.refresh(u) + print(f"created user id={u.id} username={u.username} role={u.role.value}") + return 0 + + +def cli() -> None: + p = argparse.ArgumentParser() + p.add_argument("--username", required=True) + p.add_argument("--password", default=None, help="缺省则交互输入") + p.add_argument("--email", default=None) + p.add_argument("--role", choices=["owner", "member"], default="member") + args = p.parse_args() + password = args.password + if not password: + pw1 = getpass("password: ") + pw2 = getpass("password (again): ") + if pw1 != pw2 or len(pw1) < 6: + print("passwords differ or too short", file=sys.stderr) + sys.exit(2) + password = pw1 + rc = asyncio.run(main(args.username, password, args.email, UserRole(args.role))) + sys.exit(rc) + + +if __name__ == "__main__": + cli() diff --git a/backend/app/scripts/seed_sources.py b/backend/app/scripts/seed_sources.py new file mode 100644 index 0000000..89ca006 --- /dev/null +++ b/backend/app/scripts/seed_sources.py @@ -0,0 +1,114 @@ +"""种子:导入 MVP 5 源。 + +- Reuters World +- BBC World +- Al Jazeera +- NHK World +- DW + +RSS 链接为公开 feed,实际链接可能变更;若 fetch 失败,先看 /admin/health。 +""" +from __future__ import annotations + +import asyncio +import sys + +from sqlalchemy import select +from sqlalchemy.dialects.postgresql import insert as pg_insert +from sqlalchemy.exc import IntegrityError + +from app.database import AsyncSessionLocal +from app.models.source import Source, SourceKind + +SEEDS = [ + { + "name": "Reuters World", + "slug": "reuters-world", + "kind": SourceKind.RSS, + "url": "https://feeds.reuters.com/Reuters/worldNews", + "region": "global", + "language_src": "en", + "priority": 90, + "fetch_interval_min": 30, + "translate_to": "zh", + "enabled": True, + }, + { + "name": "BBC World", + "slug": "bbc-world", + "kind": SourceKind.RSS, + "url": 
"https://feeds.bbci.co.uk/news/world/rss.xml", + "region": "global", + "language_src": "en", + "priority": 85, + "fetch_interval_min": 30, + "translate_to": "zh", + "enabled": True, + }, + { + "name": "Al Jazeera", + "slug": "aljazeera", + "kind": SourceKind.RSS, + "url": "https://www.aljazeera.com/xml/rss/all.xml", + "region": "mena", + "language_src": "en", + "priority": 80, + "fetch_interval_min": 45, + "translate_to": "zh", + "enabled": True, + }, + { + "name": "NHK World", + "slug": "nhk-world", + "kind": SourceKind.RSS, + "url": "https://www3.nhk.or.jp/rss/news/cat0.xml", + "region": "asia", + "language_src": "en", + "priority": 70, + "fetch_interval_min": 60, + "translate_to": "zh", + "enabled": True, + }, + { + "name": "DW (Deutsche Welle)", + "slug": "dw", + "kind": SourceKind.RSS, + "url": "https://rss.dw.com/xml/rss-en-all", + "region": "eu", + "language_src": "en", + "priority": 70, + "fetch_interval_min": 60, + "translate_to": "zh", + "enabled": True, + }, +] + + +async def main() -> int: + async with AsyncSessionLocal() as session: + inserted = 0 + for row in SEEDS: + stmt = ( + pg_insert(Source) + .values(**row) + .on_conflict_do_nothing(index_elements=["slug"]) + .returning(Source.id) + ) + try: + r = await session.execute(stmt) + rid = r.scalar_one_or_none() + if rid is not None: + inserted += 1 + print(f" + {row['slug']} (id={rid})") + else: + print(f" = {row['slug']} (already exists)") + except IntegrityError as e: + print(f" ! 
{row['slug']}: {e}", file=sys.stderr) + await session.rollback() + await session.commit() + print(f"seeded {inserted} new source(s)") + return 0 + + +if __name__ == "__main__": + sys.exit(asyncio.run(main())) diff --git a/backend/app/services/__init__.py b/backend/app/services/__init__.py new file mode 100644 index 0000000..950d236 --- /dev/null +++ b/backend/app/services/__init__.py @@ -0,0 +1 @@ +"""Services (fetchers / translation).""" diff --git a/backend/app/services/fetchers/__init__.py b/backend/app/services/fetchers/__init__.py new file mode 100644 index 0000000..7078681 --- /dev/null +++ b/backend/app/services/fetchers/__init__.py @@ -0,0 +1,12 @@ +"""Fetcher implementations.""" +from app.services.fetchers.base import BaseFetcher, FetchedItem +from app.services.fetchers.rss import RSSFetcher + +__all__ = ["BaseFetcher", "FetchedItem", "RSSFetcher"] + + +def get_fetcher(kind: str, **kwargs) -> BaseFetcher: + if kind == "rss": + return RSSFetcher(**kwargs) + # html_list / tg_channel: Phase 2 实现,这里抛错 + raise NotImplementedError(f"fetcher not implemented for kind={kind}") diff --git a/backend/app/services/fetchers/base.py b/backend/app/services/fetchers/base.py new file mode 100644 index 0000000..ec09fd0 --- /dev/null +++ b/backend/app/services/fetchers/base.py @@ -0,0 +1,67 @@ +"""Fetcher 抽象基类 + 通用工具。""" +from __future__ import annotations + +import hashlib +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from datetime import datetime +from typing import Any + +import httpx + +from app.config import settings + + +def normalize_url(url: str) -> str: + """去 utm_*、fragment、尾斜杠,用于 url_hash。""" + from urllib.parse import urlsplit, urlunsplit, parse_qsl, urlencode + + sp = urlsplit(url.strip()) + # 去掉 fragment + fragment = "" + # 过滤 utm_* + qs = [(k, v) for k, v in parse_qsl(sp.query, keep_blank_values=True) if not k.lower().startswith("utm_")] + query = urlencode(qs) + # 路径末尾 / + path = sp.path.rstrip("/") or "/" + return 
urlunsplit((sp.scheme.lower(), sp.netloc.lower(), path, query, fragment)) + + +def url_hash(url: str) -> str: + return hashlib.sha1(normalize_url(url).encode()).hexdigest() + + +@dataclass +class FetchedItem: + """统一返回结构:一个待入库的条目。""" + + url: str + title: str + body_html: str | None = None + body_text: str = "" + published_at: datetime | None = None + lang: str | None = None + author: str | None = None + image_url: str | None = None + guid: str | None = None + raw: dict[str, Any] = field(default_factory=dict) + + +class BaseFetcher(ABC): + def __init__(self, url: str, headers: dict | None = None): + self.url = url + self.headers = headers or {"User-Agent": "DiaryNews/0.1 (+https://github.com/)"} + + @abstractmethod + async def fetch(self) -> list[FetchedItem]: + """拉取并解析,返回 FetchedItem 列表。""" + + async def _http_get(self) -> bytes: + async with httpx.AsyncClient( + timeout=settings.fetch_timeout, + follow_redirects=True, + headers=self.headers, + ) as client: + r = await client.get(self.url) + r.raise_for_status() + return r.content diff --git a/backend/app/services/fetchers/rss.py b/backend/app/services/fetchers/rss.py new file mode 100644 index 0000000..63a1e05 --- /dev/null +++ b/backend/app/services/fetchers/rss.py @@ -0,0 +1,100 @@ +"""RSS / Atom fetcher(基于 feedparser)。""" +from __future__ import annotations + +from datetime import datetime, timezone +from email.utils import parsedate_to_datetime + +import feedparser +from dateutil import parser as dtp + +from app.services.fetchers.base import BaseFetcher, FetchedItem + + +class RSSFetcher(BaseFetcher): + async def fetch(self) -> list[FetchedItem]: + raw = await self._http_get() + # feedparser 在不同 Python 下处理 bytes/str + try: + text = raw.decode("utf-8") + except UnicodeDecodeError: + text = raw.decode("utf-8", errors="replace") + feed = feedparser.parse(text) + if feed.bozo and not feed.entries: + # 整篇解析失败 + raise RuntimeError(f"RSS parse failed: {feed.bozo_exception}") + items: list[FetchedItem] = [] + for e 
in feed.entries: + url = e.get("link") or e.get("id") + if not url: + continue + title = (e.get("title") or "").strip() + if not title: + continue + + body_html = None + body_text = "" + if e.get("content"): + # 选最长 content + contents = sorted(e["content"], key=lambda c: -len(c.get("value", ""))) + body_html = contents[0].get("value") + if not body_html: + body_html = e.get("summary") + if body_html: + from bs4 import BeautifulSoup + + soup = BeautifulSoup(body_html, "lxml") + # 去 script/style + for tag in soup(["script", "style", "noscript"]): + tag.decompose() + body_text = soup.get_text(separator="\n", strip=True) + + published_at = _parse_dt(e.get("published") or e.get("updated") or e.get("created")) + author = e.get("author") + image_url = None + if e.get("media_content"): + try: + image_url = e["media_content"][0].get("url") + except (IndexError, KeyError, TypeError): + pass + if not image_url and e.get("media_thumbnail"): + try: + image_url = e["media_thumbnail"][0].get("url") + except (IndexError, KeyError, TypeError): + pass + if not image_url and e.get("enclosures"): + for enc in e["enclosures"]: + if enc.get("type", "").startswith("image/"): + image_url = enc.get("href") or enc.get("url") + break + + items.append( + FetchedItem( + url=url, + title=title, + body_html=body_html, + body_text=body_text, + published_at=published_at, + lang=e.get("language") or feed.feed.get("language"), + author=author, + image_url=image_url, + guid=e.get("id") or e.get("guid"), + ) + ) + return items + + +def _parse_dt(s: str | None) -> datetime | None: + if not s: + return None + try: + dt = dtp.parse(s) + except (ValueError, TypeError, dtp.ParserError): + try: + dt = parsedate_to_datetime(s) + except Exception: + return None + if dt is None: + return None + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt.astimezone(timezone.utc) diff --git a/backend/app/services/translation/__init__.py b/backend/app/services/translation/__init__.py new file mode 
100644 index 0000000..becb758 --- /dev/null +++ b/backend/app/services/translation/__init__.py @@ -0,0 +1 @@ +"""Translation services.""" diff --git a/backend/app/services/translation/base.py b/backend/app/services/translation/base.py new file mode 100644 index 0000000..247ad4d --- /dev/null +++ b/backend/app/services/translation/base.py @@ -0,0 +1,26 @@ +"""翻译后端抽象。""" +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass + + +@dataclass +class TranslationResult: + text: str + engine: str + chars: int + cached: bool = False + + +class BaseTranslator(ABC): + name: str = "base" + + @abstractmethod + async def translate(self, text: str, source: str = "auto", target: str = "zh") -> TranslationResult: + """同步调用,失败抛异常。""" + + +def count_chars(s: str) -> int: + """近似的字符计数(Unicode 码点)。腾讯 TMT 按字符数计费。""" + return len(s) diff --git a/backend/app/services/translation/local.py b/backend/app/services/translation/local.py new file mode 100644 index 0000000..d426f0f --- /dev/null +++ b/backend/app/services/translation/local.py @@ -0,0 +1,62 @@ +"""本地翻译(降级用,需要 transformers + 模型文件)。 + +默认关闭。启用方式: +- LOCAL_TRANSLATE_ENABLED=true +- 容器内预装模型(Volume 挂载) +""" +from __future__ import annotations + +import logging + +from app.config import settings +from app.services.translation.base import BaseTranslator, TranslationResult + +logger = logging.getLogger("news.translate.local") + + +class LocalTranslator(BaseTranslator): + name = "nllb" + + def __init__(self): + if not settings.local_translate_enabled: + raise RuntimeError("LocalTranslator disabled in settings") + # 模型懒加载(避免 import 时加载大模型) + self._pipe = None + + def _ensure_loaded(self): + if self._pipe is not None: + return + from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline + + model_name = settings.local_translate_model + logger.info("loading local translation model: %s", model_name) + tok = AutoTokenizer.from_pretrained(model_name) + model = 
AutoModelForSeq2SeqLM.from_pretrained(model_name) + self._pipe = pipeline( + "translation", + model=model, + tokenizer=tok, + device=settings.local_translate_device, + ) + + async def translate( + self, text: str, source: str = "auto", target: str = "zh" + ) -> TranslationResult: + if not text.strip(): + return TranslationResult(text=text, engine=self.name, chars=0) + self._ensure_loaded() + import asyncio + + loop = asyncio.get_running_loop() + # NLLB 的 src_lang/tgt_lang 比较长,简单按约定:en→zh_Hans + src = "eng_Latn" if source in ("en", "auto") else source + tgt = "zho_Hans" if target == "zh" else target + out = await loop.run_in_executor( + None, + lambda: self._pipe( + text, src_lang=src, tgt_lang=tgt, max_length=2000 + ), + ) + return TranslationResult( + text=out[0]["translation_text"], engine=self.name, chars=len(text) + ) diff --git a/backend/app/services/translation/service.py b/backend/app/services/translation/service.py new file mode 100644 index 0000000..bdeaec5 --- /dev/null +++ b/backend/app/services/translation/service.py @@ -0,0 +1,146 @@ +"""翻译服务门面:配额检查 + 缓存 + 引擎选择 + 月度计数。""" +from __future__ import annotations + +import asyncio +import hashlib +import logging +from datetime import datetime, timezone +from typing import Protocol + +from app.config import settings +from app.redis_client import get_redis +from app.services.translation.base import BaseTranslator, TranslationResult +from app.services.translation.local import LocalTranslator +from app.services.translation.tencent import TencentTranslator + +logger = logging.getLogger("news.translate.service") + + +# 缓存 key +def _cache_key(text: str, src: str, tgt: str) -> str: + h = hashlib.sha1(f"{src}|{tgt}|{text}".encode()).hexdigest() + return f"translation:cache:{h}" + + +def _month_key() -> str: + now = datetime.now(timezone.utc) + return f"translation:month:{now:%Y%m}" + + +class TranslationService: + def __init__(self): + self._tencent: BaseTranslator | None = None + self._local: BaseTranslator | None = 
None + self._sem = asyncio.Semaphore(3) # 并发限流 + + def _primary(self) -> BaseTranslator: + if self._tencent is None: + self._tencent = TencentTranslator() + return self._tencent + + def _fallback(self) -> BaseTranslator | None: + if self._local is None and settings.local_translate_enabled: + try: + self._local = LocalTranslator() + except Exception as e: + logger.warning("local translator init failed: %s", e) + self._local = None + return self._local + + async def can_use_tencent(self, chars: int) -> bool: + if not settings.tencentcloud_secret_id: + return False + r = get_redis() + used = int(await r.get(_month_key()) or 0) + buffered = int( + settings.tencent_tmt_quota_month * (1 - settings.tencent_tmt_quota_buffer) + ) + return (used + chars) <= buffered + + async def add_usage(self, chars: int) -> None: + r = get_redis() + # 用 INCRBY + EXPIRE 月初;简单做法:每次 set + 设 TTL + key = _month_key() + async with r.pipeline(transaction=False) as pipe: + pipe.incrby(key, chars) + # 月底过期(下下月 1 日) + now = datetime.now(timezone.utc) + if now.month == 12: + next_month = now.replace(year=now.year + 1, month=1, day=1) + else: + next_month = now.replace(month=now.month + 1, day=1) + ttl = int((next_month - now).total_seconds()) + 86400 + pipe.expire(key, ttl) + await pipe.execute() + + async def translate( + self, text: str, source: str = "auto", target: str = "zh" + ) -> TranslationResult: + if not text.strip(): + return TranslationResult(text=text, engine="skip", chars=0) + + chars = len(text) + # 1) 缓存 + r = get_redis() + ck = _cache_key(text, source, target) + cached = await r.get(ck) + if cached is not None: + return TranslationResult(text=cached, engine="cache", chars=chars, cached=True) + + # 2) 选引擎 + use_tencent = await self.can_use_tencent(chars) + engine: BaseTranslator + if use_tencent: + engine = self._primary() + else: + fb = self._fallback() + if fb is None: + # 没本地:返回原文 + 标记 + return TranslationResult( + text=text + "\n\n[本条未翻译:配额耗尽且未启用本地翻译]", + engine="skip", + 
chars=chars, + ) + engine = fb + logger.info("fallback to local translator for %d chars", chars) + + # 3) 调用 + async with self._sem: + try: + res = await engine.translate(text, source=source, target=target) + except Exception as e: + # 失败:降级 + logger.exception("translate failed with %s: %s", engine.name, e) + fb = self._fallback() + if fb is not None and engine is not fb: + res = await fb.translate(text, source=source, target=target) + else: + res = TranslationResult( + text=text + f"\n\n[翻译失败: {e}]", + engine="skip", + chars=chars, + ) + + # 4) 写缓存(无论引擎) + try: + await r.set(ck, res.text, ex=60 * 60 * 24 * 30) # 30 天 + except Exception: + pass + + # 5) 计数(只在 tencent 上计) + if res.engine == "tencent": + try: + await self.add_usage(res.chars or chars) + except Exception as e: + logger.warning("add_usage failed: %s", e) + + return res + + +# 全局单例 +service = TranslationService() + + +# 让后端 worker 直接调 +class _Protocol(Protocol): + async def translate(self, text: str, source: str = "auto", target: str = "zh") -> TranslationResult: ... 
diff --git a/backend/app/services/translation/tencent.py b/backend/app/services/translation/tencent.py new file mode 100644 index 0000000..41d48f6 --- /dev/null +++ b/backend/app/services/translation/tencent.py @@ -0,0 +1,74 @@ +"""腾讯云文本翻译 TMT。""" +from __future__ import annotations + +import asyncio +import logging +import random +from typing import Any + +from tencentcloud.common import credential +from tencentcloud.common.exception.tencent_cloud_sdk_exception import ( + TencentCloudSDKException, +) +from tencentcloud.tmt.v20180321 import models, tmt_client + +from app.config import settings +from app.services.translation.base import BaseTranslator, TranslationResult + +logger = logging.getLogger("news.translate.tencent") + +# 常见语种映射 +_LANG_MAP = { + "en": "en", + "zh": "zh", + "ja": "ja", + "ko": "ko", + "fr": "fr", + "de": "de", + "es": "es", + "ru": "ru", + "ar": "ar", +} + + +class TencentTranslator(BaseTranslator): + name = "tencent" + + def __init__(self): + if not settings.tencentcloud_secret_id or not settings.tencentcloud_secret_key: + raise RuntimeError("Tencent Cloud credentials missing") + self.cred = credential.Credential( + settings.tencentcloud_secret_id, settings.tencentcloud_secret_key + ) + self.client = tmt_client.TmtClient(self.cred, settings.tencentcloud_region) + + async def translate( + self, text: str, source: str = "auto", target: str = "zh" + ) -> TranslationResult: + if not text.strip(): + return TranslationResult(text=text, engine=self.name, chars=0) + + source = _LANG_MAP.get(source, source if source != "auto" else "auto") + target = _LANG_MAP.get(target, target) + + # 简单重试 + for attempt in range(2): + try: + req = models.TextTranslateRequest() + req.SourceText = text + req.Source = source + req.Target = target + req.ProjectId = 0 + # SDK 同步调用 → 放线程池 + resp: Any = await asyncio.to_thread(self.client.TextTranslate, req) + out = getattr(resp, "TargetText", "") or "" + return TranslationResult( + text=out, engine=self.name, 
chars=len(text), cached=False + ) + except TencentCloudSDKException as e: + logger.warning("tencent translate attempt %s failed: %s", attempt, e) + if attempt == 0: + await asyncio.sleep(0.5 + random.random()) + else: + raise + raise RuntimeError("unreachable") diff --git a/backend/app/workers/__init__.py b/backend/app/workers/__init__.py new file mode 100644 index 0000000..b718bc9 --- /dev/null +++ b/backend/app/workers/__init__.py @@ -0,0 +1 @@ +"""Background workers (fetch + translate + scheduler).""" diff --git a/backend/app/workers/__main__.py b/backend/app/workers/__main__.py new file mode 100644 index 0000000..eb066ce --- /dev/null +++ b/backend/app/workers/__main__.py @@ -0,0 +1,112 @@ +"""Worker 入口:启动调度器 + 异步任务。 + +`docker compose exec worker python -m app.workers` +""" +from __future__ import annotations + +import asyncio +import logging +import signal +from datetime import datetime, timezone + +from apscheduler.schedulers.asyncio import AsyncIOScheduler +from apscheduler.triggers.cron import CronTrigger +from apscheduler.triggers.interval import IntervalTrigger +from sqlalchemy import select + +from app.config import settings +from app.database import AsyncSessionLocal +from app.models.source import Source +from app.workers.pipeline import fetch_one_source, run_once + +logger = logging.getLogger("news.worker") +logging.basicConfig( + level=settings.log_level, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", +) + + +async def _rebuild_jobs(scheduler: AsyncIOScheduler) -> None: + """从 sources 表动态构建 job(可热更新)。""" + scheduler.remove_all_jobs() + async with AsyncSessionLocal() as s: + rows = (await s.execute(select(Source).where(Source.enabled.is_(True)))).scalars() + sources = list(rows) + if not sources: + logger.warning("no enabled sources; scheduler idle") + return + for src in sources: + trigger = ( + CronTrigger.from_crontab(src.fetch_cron) + if src.fetch_cron + else IntervalTrigger(minutes=src.fetch_interval_min) + ) + scheduler.add_job( + 
fetch_one_source, + trigger=trigger, + args=[src.id], + id=f"src:{src.slug}", + replace_existing=True, + max_instances=1, + coalesce=True, + misfire_grace_time=300, + ) + logger.info("scheduled %s every %s", src.slug, src.fetch_cron or f"{src.fetch_interval_min}m") + + +async def _daily_rebuild() -> None: + """每天 00:30 重建 job 列表(支持运行时新增源)。""" + scheduler = AsyncIOScheduler() + # 临时实例,只为重建用 + # 实际用全局 scheduler 实例 + pass + + +def build_scheduler() -> AsyncIOScheduler: + sched = AsyncIOScheduler(timezone="Asia/Hong_Kong") + return sched + + +async def main() -> None: + scheduler = build_scheduler() + await _rebuild_jobs(scheduler) + # 每天 00:30 重建一次 + scheduler.add_job( + _rebuild_jobs, + trigger=CronTrigger(hour=0, minute=30), + args=[scheduler], + id="rebuild_jobs", + replace_existing=True, + ) + # 启动时立即跑一次 + scheduler.add_job( + run_once, + trigger=IntervalTrigger(minutes=0), + id="startup_run", + next_run_time=datetime.now(timezone.utc), + ) + + scheduler.start() + logger.info("scheduler started with %d jobs", len(scheduler.get_jobs())) + + stop = asyncio.Event() + + def _signal_handler(): + logger.info("shutdown signal received") + stop.set() + + loop = asyncio.get_running_loop() + for sig in (signal.SIGINT, signal.SIGTERM): + try: + loop.add_signal_handler(sig, _signal_handler) + except NotImplementedError: + # Windows 等不支持 + pass + + await stop.wait() + logger.info("stopping scheduler") + scheduler.shutdown(wait=False) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/backend/app/workers/pipeline.py b/backend/app/workers/pipeline.py new file mode 100644 index 0000000..bf7aab7 --- /dev/null +++ b/backend/app/workers/pipeline.py @@ -0,0 +1,274 @@ +"""核心 pipeline: +- 抓取(去重 + 入库) +- 翻译(分块 + 配额管理) +- 手动 run_once / fetch_one_source / translate_article +""" +from __future__ import annotations + +import asyncio +import logging +from datetime import datetime, timezone + +from sqlalchemy import select +from sqlalchemy.dialects.postgresql import insert as 
pg_insert + +from app.config import settings +from app.database import AsyncSessionLocal +from app.models.article import Article +from app.models.source import Source, SourceKind +from app.services.fetchers import get_fetcher +from app.services.fetchers.base import FetchedItem, url_hash +from app.services.translation.service import service as translation_service + +logger = logging.getLogger("news.pipeline") + +TRANSLATE_BODY_MAX = 8000 # 单篇正文最大翻译字符 +SEM_PER_SOURCE = asyncio.Semaphore(2) # 同一源抓取并发 + + +# === 抓取 + 入库 === +async def fetch_one_source(source_id: int) -> None: + async with SEM_PER_SOURCE: + async with AsyncSessionLocal() as session: + src = ( + await session.execute(select(Source).where(Source.id == source_id)) + ).scalar_one_or_none() + if not src or not src.enabled: + logger.info("source %s disabled or missing", source_id) + return + + try: + fetcher = get_fetcher(src.kind.value, url=src.url, headers=src.headers_json) + items = await fetcher.fetch() + except Exception as e: + logger.exception("fetch failed for %s: %s", src.slug, e) + await _mark_failure(source_id, f"fetch: {type(e).__name__}: {e}") + return + + if not items: + await _mark_success(source_id, n_new=0) + return + + n_new = await _bulk_insert(src, items) + await _mark_success(source_id, n_new=n_new) + logger.info("source %s: %d new articles", src.slug, n_new) + + # 入库后,挑高优先级 / 没翻译的开始翻译 + await _translate_recent_for_source(source_id, max_n=20) + + +async def _mark_failure(source_id: int, status: str) -> None: + async with AsyncSessionLocal() as session: + src = ( + await session.execute(select(Source).where(Source.id == source_id)) + ).scalar_one_or_none() + if not src: + return + src.last_status = status + src.consecutive_failures += 1 + src.last_fetched_at = datetime.now(timezone.utc) + if src.consecutive_failures >= settings.fetch_fail_pause_threshold: + # 退避:把 interval 翻倍,封顶 720 分钟 + src.fetch_interval_min = min(720, src.fetch_interval_min * 2) + logger.warning( + "source %s paused, 
interval bumped to %dm", + src.slug, + src.fetch_interval_min, + ) + await session.commit() + + +async def _mark_success(source_id: int, n_new: int) -> None: + async with AsyncSessionLocal() as session: + src = ( + await session.execute(select(Source).where(Source.id == source_id)) + ).scalar_one_or_none() + if not src: + return + src.last_status = f"ok:new={n_new}" + src.consecutive_failures = 0 + src.last_fetched_at = datetime.now(timezone.utc) + await session.commit() + + +async def _bulk_insert(src: Source, items: list[FetchedItem]) -> int: + """用 PG ON CONFLICT DO NOTHING 去重;返回新插入行数。""" + if not items: + return 0 + rows = [] + for it in items: + if not it.title or not it.url: + continue + rows.append( + { + "source_id": src.id, + "url": it.url, + "url_hash": url_hash(it.url), + "guid": it.guid, + "title": it.title[:512], + "body_html": (it.body_html or "")[:65535], + "body_text": (it.body_text or "")[:65535], + "lang_src": it.lang or src.language_src, + "author": it.author, + "image_url": it.image_url, + "published_at": it.published_at, + "translation_status": "pending", + "translate_to": src.translate_to, + } + ) + if not rows: + return 0 + + async with AsyncSessionLocal() as session: + stmt = ( + pg_insert(Article) + .values(rows) + .on_conflict_do_nothing(index_elements=["url_hash"]) + .returning(Article.id) + ) + result = await session.execute(stmt) + inserted_ids = [r[0] for r in result.all()] + await session.commit() + return len(inserted_ids) + + +# === 翻译 === +async def _translate_recent_for_source(source_id: int, max_n: int = 20) -> None: + async with AsyncSessionLocal() as session: + rows = ( + await session.execute( + select(Article) + .where(Article.source_id == source_id, Article.translation_status == "pending") + .order_by(Article.published_at.desc().nullslast(), Article.id.desc()) + .limit(max_n) + ) + ).scalars() + article_ids = [a.id for a in rows] + for aid in article_ids: + await translate_article(aid) + + +async def 
translate_article(article_id: int) -> None: + async with AsyncSessionLocal() as session: + art = ( + await session.execute(select(Article).where(Article.id == article_id)) + ).scalar_one_or_none() + if not art: + return + if art.translation_status not in ("pending", "failed"): + return + title = art.title + body_text = (art.body_text or "")[:TRANSLATE_BODY_MAX] + lang_src = art.lang_src or "auto" + target = "zh" + article_id_ref = art.id + + if not body_text and not title: + return + + total_chars = 0 + try: + # title + tr_title = await translation_service.translate(title, source=lang_src, target=target) + total_chars += tr_title.chars + + # body 段落切分 + 重组 + chunks = _chunk_text(body_text, max_chars=settings.tencent_tmt_max_chars_per_req) + translated_chunks: list[str] = [] + for ch in chunks: + tr = await translation_service.translate(ch, source=lang_src, target=target) + total_chars += tr.chars + translated_chunks.append(tr.text) + tr_body = "\n\n".join(translated_chunks) + + engine_label = "tencent" + status = "ok" if (tr_title.text and tr_body) else "partial" + except Exception as e: + logger.exception("translate article %s failed: %s", article_id, e) + async with AsyncSessionLocal() as session: + art = ( + await session.execute(select(Article).where(Article.id == article_id)) + ).scalar_one_or_none() + if art: + art.translation_status = "failed" + await session.commit() + return + + # 写回 + async with AsyncSessionLocal() as session: + art = ( + await session.execute(select(Article).where(Article.id == article_id_ref)) + ).scalar_one_or_none() + if art: + art.title_zh = tr_title.text if tr_title.text else None + art.body_zh_text = tr_body or None + art.body_zh_html = _wrap_html(tr_body) if tr_body else None + art.translation_status = status + art.translation_engine = engine_label + art.translation_chars = total_chars + art.translated_at = datetime.now(timezone.utc) + await session.commit() + logger.info("article %s translated: %d chars, %s", article_id, 
total_chars, engine_label) + + +def _chunk_text(text: str, max_chars: int) -> list[str]: + if not text: + return [] + paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()] + chunks: list[str] = [] + cur = "" + for p in paragraphs: + if len(p) > max_chars: + # 单段过长:按句号切 + sentences = _split_long_para(p, max_chars) + for s in sentences: + if len(cur) + len(s) + 2 > max_chars: + if cur: + chunks.append(cur) + cur = s + else: + cur = (cur + "\n\n" + s).strip() if cur else s + else: + if len(cur) + len(p) + 2 > max_chars: + if cur: + chunks.append(cur) + cur = p + else: + cur = (cur + "\n\n" + p).strip() if cur else p + if cur: + chunks.append(cur) + return chunks + + +def _split_long_para(para: str, max_chars: int) -> list[str]: + parts: list[str] = [] + cur = "" + for ch in para: + cur += ch + if ch in ".!?。!?" and len(cur) >= max_chars // 2: + parts.append(cur.strip()) + cur = "" + if cur.strip(): + parts.append(cur.strip()) + if not parts: + return [para[:max_chars]] + return parts + + +def _wrap_html(text: str) -> str: + """把译文包成 HTML 段落。""" + from bs4 import BeautifulSoup + + parts = [f"

{p.strip()}

" for p in text.split("\n\n") if p.strip()] + return "\n".join(parts) if parts else "" + + +# === 全量跑(供测试 / 手动触发) === +async def run_once() -> None: + async with AsyncSessionLocal() as session: + rows = (await session.execute(select(Source).where(Source.enabled.is_(True)))).scalars() + sources = list(rows) + + logger.info("run_once: %d enabled sources", len(sources)) + tasks = [fetch_one_source(s.id) for s in sources] + await asyncio.gather(*tasks, return_exceptions=True) diff --git a/backend/pyproject.toml b/backend/pyproject.toml new file mode 100644 index 0000000..04473b2 --- /dev/null +++ b/backend/pyproject.toml @@ -0,0 +1,72 @@ +[project] +name = "news-aggregator" +version = "0.1.0" +description = "Private news aggregator with multi-source RSS, translation, web + Android clients" +requires-python = ">=3.12" +dependencies = [ + # web + "fastapi>=0.115.0", + "uvicorn[standard]>=0.32.0", + "pydantic>=2.9.0", + "pydantic-settings>=2.6.0", + "python-multipart>=0.0.12", + # db + "sqlalchemy[asyncio]>=2.0.36", + "asyncpg>=0.30.0", + "alembic>=1.14.0", + "psycopg2-binary>=2.9.10", # alembic sync driver + # cache / queue + "redis>=5.2.0", + # auth + "passlib[bcrypt]>=1.7.4", + "bcrypt==4.0.1", # 锁版本,passlib 与新版 bcrypt 不兼容 + "pyjwt>=2.10.0", + # fetch / parse + "feedparser>=6.0.11", + "httpx>=0.28.0", + "trafilatura>=2.0.0", + "beautifulsoup4>=4.12.3", + "lxml>=5.3.0", + "python-dateutil>=2.9.0", + # translation + "tencentcloud-sdk-python>=3.0.1200", + # scheduling + "apscheduler>=3.10.4", + # observability + "structlog>=24.4.0", + "orjson>=3.10.10", + # util + "pydantic-extra-types>=2.10.0", + "email-validator>=2.2.0", + "python-slugify>=8.0.4", +] + +[project.optional-dependencies] +dev = [ + "pytest>=8.3.0", + "pytest-asyncio>=0.24.0", + "ruff>=0.7.0", + "mypy>=1.13.0", +] + +[tool.ruff] +line-length = 110 +target-version = "py312" + +[tool.ruff.lint] +select = ["E", "F", "I", "B", "UP", "W"] +ignore = ["E501"] + +[tool.mypy] +python_version = "3.12" 
+ignore_missing_imports = true +strict_optional = true +warn_unused_ignores = true + +[build-system] +requires = ["setuptools>=68"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +where = ["."] +include = ["app*"] diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..da1bdb0 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,99 @@ +name: news-aggregator + +services: + postgres: + image: postgres:16-alpine + restart: unless-stopped + environment: + POSTGRES_USER: ${POSTGRES_USER} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} + POSTGRES_DB: ${POSTGRES_DB} + TZ: ${TZ} + volumes: + - pg_data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER} -d ${POSTGRES_DB}"] + interval: 10s + timeout: 5s + retries: 5 + # 不暴露到宿主机 + expose: + - "5432" + + redis: + image: redis:7-alpine + restart: unless-stopped + command: ["redis-server", "--requirepass", "${REDIS_PASSWORD}", "--maxmemory", "256mb", "--maxmemory-policy", "allkeys-lru"] + volumes: + - redis_data:/data + healthcheck: + test: ["CMD", "redis-cli", "-a", "${REDIS_PASSWORD}", "ping"] + interval: 10s + timeout: 3s + retries: 5 + expose: + - "6379" + + api: + build: + context: ./backend + dockerfile: Dockerfile + restart: unless-stopped + env_file: .env + depends_on: + postgres: + condition: service_healthy + redis: + condition: service_healthy + expose: + - "8000" + volumes: + - ./backend/app:/app/app + command: ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] + + worker: + build: + context: ./backend + dockerfile: Dockerfile + restart: unless-stopped + env_file: .env + depends_on: + postgres: + condition: service_healthy + redis: + condition: service_healthy + volumes: + - ./backend/app:/app/app + command: ["python", "-m", "app.workers"] + + caddy: + image: caddy:2-alpine + restart: unless-stopped + env_file: .env + ports: + - "80:80" + - "443:443" + volumes: + - ./Caddyfile:/etc/caddy/Caddyfile:ro + - 
caddy_data:/data + - caddy_config:/config + depends_on: + - api + + frontend: + build: + context: ./frontend + dockerfile: Dockerfile + args: + VITE_API_BASE: ${VITE_API_BASE:-/api/v1} + restart: unless-stopped + expose: + - "80" + depends_on: + - api + +volumes: + pg_data: + redis_data: + caddy_data: + caddy_config: diff --git a/docs/acceptance.md b/docs/acceptance.md new file mode 100644 index 0000000..3ad168e --- /dev/null +++ b/docs/acceptance.md @@ -0,0 +1,65 @@ +# MVP 验收清单(Phase 1) + +部署完成后,逐项验证;任何一项失败先看 `DEPLOY.md` 的 FAQ + `docker compose logs`。 + +## 0. 服务健康 + +- [ ] `docker compose ps` 所有服务 `running` +- [ ] `curl http://YOUR_IP/api/v1/healthz` → `{"status":"ok"}` +- [ ] `curl http://YOUR_IP/` → 返回 HTML(SPA 入口) + +## 1. 账号与登录 + +- [ ] `docker compose exec api python -m app.scripts.create_user --username owner --password XXXX` +- [ ] 浏览器打开首页 → 登录页 → 用 owner 登录成功 +- [ ] 侧边栏显示 "owner (owner)" +- [ ] 顶栏翻译配额显示 "翻译: 0 / 5,000,000 (0.0%)" + +## 2. 源管理 + +- [ ] `docker compose exec api python -m app.scripts.seed_sources` +- [ ] 进入 "源管理(Admin)" 页,看到 5 条源(Reuters/BBC/Al Jazeera/NHK/DW) +- [ ] "源健康" 页 5 个源都在 +- [ ] 某个源点 "立即抓取" → message 提示 "已加入抓取队列" +- [ ] 等 1~2 分钟,看 worker 日志:`docker compose logs -f worker | grep -E "fetch|articles"` + +## 3. 文章采集与展示 + +- [ ] "24h 列表" 页有文章(数量与抓取量相关,首次可能 10~50 条) +- [ ] 卡片显示:源标签 / 语种 / 发布时间 / 英文标题 / 中文标题 +- [ ] 顶栏配额数字 > 0(说明翻译已消耗字符) +- [ ] 点开文章详情: + - [ ] 原文 + 译文双卡片 + - [ ] 翻译状态为 "ok" + - [ ] 翻译引擎为 "tencent" + - [ ] 点 "原文链接 ↗" 跳到源站 +- [ ] 部分文章可能还是 "pending" / "partial",等下个 worker 周期 + +## 4. 收藏 + +- [ ] 文章详情点 "☆ 收藏" → 变 "★ 已收藏" +- [ ] 侧边栏 "收藏" 看到刚才那篇 +- [ ] 列表页对应卡片有 star 标记(后续可加视觉) + +## 5. 失败降级验证(可选) + +- [ ] 把腾讯云 secret 配错 → 重启 worker → 下次抓取会触发翻译失败 +- [ ] 文章状态变为 "failed",详情页有黄色提示 +- [ ] 改为正确 secret → 点 "重译" → 成功 + +## 6. 调度器验证 + +- [ ] `docker compose logs -f worker` → 应看到 "scheduler started with N jobs" +- [ ] 看到 "scheduled <slug> every <interval>"(每个启用的源一条) +- [ ] 等过 fetch_interval_min 分钟后,日志有新的 "source X: N new articles" + +## 7. 
配额监控(可选) + +- [ ] `curl -H "Authorization: Bearer $TOKEN" http://YOUR_IP/api/v1/me/usage` +- [ ] 看到 used_chars > 0 + +## 8. 验收完成 + +- [ ] 全 7 项通过 → Phase 1 验收 ✅ +- [ ] 任意失败 → 看 `docker compose logs <service>` + `DEPLOY.md` FAQ +- [ ] 性能 / 体验改进 → 进 Phase 2 diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..d39b2b6 --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,77 @@ +# 架构设计 + +> 对应方案 v0.1 的实现版本。 + +## 模块边界 + +| 模块 | 路径 | 职责 | +| --- | --- | --- | +| API | `backend/app/api/` | HTTP 路由,处理鉴权 / 入参 / 出参 | +| 业务 | `backend/app/services/` | 抓取、翻译、领域逻辑 | +| Worker | `backend/app/workers/` | 后台调度、pipeline | +| 数据 | `backend/app/models/` | SQLAlchemy ORM | +| 迁移 | `backend/alembic/` | 数据库 schema 版本 | +| 前端 | `frontend/src/` | Vue 3 + Naive UI | + +## 数据流 + +``` +Source (DB) + │ + ▼ +Scheduler (cron / interval) + │ + ▼ +RSSFetcher.fetch() ── HTTP GET ──► upstream RSS + │ + ▼ +FetchedItem list + │ + ▼ (url_hash UNIQUE 去重) +Article INSERT + │ + ▼ (translation_status='pending') +TranslationService.translate() + ├─ Redis cache hit → return + ├─ quota check + ├─ Tencent TMT (主) ──► 30 天 Redis 月度计数 + └─ Local NLLB (降级,需启用) + │ + ▼ +Article UPDATE (title_zh / body_zh_* / status) +``` + +## 关键设计决策 + +- **PostgreSQL**:UNIQUE 约束 + `ON CONFLICT DO NOTHING` 做去重,O(1) 写 +- **Redis 三用**:翻译缓存(30 天 TTL)+ 月度配额(INCRBY)+ 后续限流 +- **APScheduler 重建 jobs**:每天 00:30 从 DB 重新读,运行时新增源自动生效 +- **翻译分块**:按段落切,单段超过单请求上限时按句号再切,单请求 ≤ 4500 字符(`TENCENT_TMT_MAX_CHARS_PER_REQ` 默认值) +- **失败退避**:某源连续失败达到 `FETCH_FAIL_PAUSE_THRESHOLD`(默认 3)次后,每次失败 fetch_interval × 2(封顶 720 分钟);成功一次后失败计数清零(当前实现不会自动恢复 interval) +- **API Token 双轨**:网页用短期 JWT(默认 60min,`ACCESS_TOKEN_TTL_MIN`)+ refresh(14d);Android 用长期 API Token(可独立撤销) + +## 字段保留 + +`articles` 表里这些字段已建,MVP 全部 null,后续 enrichment 阶段直接写值不动表: +- `category` / `commentary` / `entities` / `sentiment` / `topic_id` / `bias` + +## 安全 + +- 密码 bcrypt(cost=12) +- JWT 走 HTTPS-only cookie(网页) / Bearer header(APP) +- 数据库/Redis 不暴露到宿主机 +- Caddy 做 TLS 终止 +- API 限流(MVP 暂未实现,后续加) + +## 不在 MVP + +- ❌ 全文搜索(可用 PG `to_tsvector`,MVP 先简单 ILIKE) +- ❌ PWA 离线缓存 +- ❌ Android 客户端 +- 
❌ 自动分类/点评/实体识别 +- ❌ 主题聚类 +- ❌ 跨源立场 +- ❌ Telegram 推送 +- ❌ i18n(只 zh) + +见 [`DEPLOY.md`](../DEPLOY.md) 跑起来,见 [`../README.md`](../README.md) 看全貌。 diff --git a/frontend/Dockerfile b/frontend/Dockerfile new file mode 100644 index 0000000..9dc61ce --- /dev/null +++ b/frontend/Dockerfile @@ -0,0 +1,19 @@ +# Build stage +FROM node:20-alpine AS build + +ARG VITE_API_BASE=/api/v1 +ENV VITE_API_BASE=$VITE_API_BASE + +WORKDIR /app +COPY package.json ./ +RUN npm install --no-audit --no-fund +COPY . . +RUN npm run build + +# Runtime: nginx static +FROM nginx:1.27-alpine + +COPY --from=build /app/dist /usr/share/nginx/html +COPY nginx.conf /etc/nginx/conf.d/default.conf + +EXPOSE 80 diff --git a/frontend/index.html b/frontend/index.html new file mode 100644 index 0000000..6e35980 --- /dev/null +++ b/frontend/index.html @@ -0,0 +1,13 @@ + + + + + + + Diary News + + +
+ + + diff --git a/frontend/nginx.conf b/frontend/nginx.conf new file mode 100644 index 0000000..d3d3409 --- /dev/null +++ b/frontend/nginx.conf @@ -0,0 +1,21 @@ +server { + listen 80; + server_name _; + root /usr/share/nginx/html; + index index.html; + + # SPA fallback + location / { + try_files $uri $uri/ /index.html; + } + + # cache hashed assets + location ~* \.(?:js|css|woff2?|svg|png|jpg|jpeg|gif|ico)$ { + expires 30d; + add_header Cache-Control "public, immutable"; + } + + # gzip + gzip on; + gzip_types text/plain text/css application/json application/javascript text/xml application/xml application/xml+rss text/javascript; +} diff --git a/frontend/package.json b/frontend/package.json new file mode 100644 index 0000000..1bd74ed --- /dev/null +++ b/frontend/package.json @@ -0,0 +1,26 @@ +{ + "name": "diary-news-web", + "private": true, + "version": "0.1.0", + "type": "module", + "scripts": { + "dev": "vite", + "build": "vue-tsc -b && vite build", + "preview": "vite preview" + }, + "dependencies": { + "axios": "^1.7.7", + "dayjs": "^1.11.13", + "naive-ui": "^2.40.1", + "pinia": "^2.2.6", + "vue": "^3.5.12", + "vue-router": "^4.4.5", + "vfonts": "^0.0.3" + }, + "devDependencies": { + "@vitejs/plugin-vue": "^5.1.4", + "typescript": "^5.6.3", + "vite": "^5.4.10", + "vue-tsc": "^2.1.10" + } +} diff --git a/frontend/public/favicon.svg b/frontend/public/favicon.svg new file mode 100644 index 0000000..4684b73 --- /dev/null +++ b/frontend/public/favicon.svg @@ -0,0 +1 @@ +N diff --git a/frontend/src/App.vue b/frontend/src/App.vue new file mode 100644 index 0000000..3f099be --- /dev/null +++ b/frontend/src/App.vue @@ -0,0 +1,19 @@ + + + diff --git a/frontend/src/api/articles.ts b/frontend/src/api/articles.ts new file mode 100644 index 0000000..617f44e --- /dev/null +++ b/frontend/src/api/articles.ts @@ -0,0 +1,111 @@ +import { http } from './client' + +export interface Source { + id: number + name: string + slug: string + kind: string + url: string + enabled: boolean + 
region?: string | null + language_src?: string | null + priority: number + fetch_interval_min: number + translate_to: string + last_fetched_at?: string | null + last_status?: string | null + consecutive_failures: number +} + +export interface ArticleListItem { + id: number + source: { id: number; name: string; slug: string; region?: string | null } + title: string + title_zh?: string | null + summary_zh?: string | null + lang_src?: string | null + translation_status: string + category?: string | null + published_at?: string | null + fetched_at: string + image_url?: string | null + is_starred: boolean +} + +export interface ArticleListResponse { + items: ArticleListItem[] + next_cursor: string | null + total: number | null +} + +export interface ArticleDetail extends ArticleListItem { + url: string + body_html?: string | null + body_text: string + body_zh_html?: string | null + body_zh_text?: string | null + author?: string | null + translation_engine?: string | null + translated_at?: string | null + commentary?: string | null + entities?: Record | null + sentiment?: number | null + duplicate_of?: number | null +} + +export const articlesApi = { + list(params: Record = {}) { + return http.get('/articles', { params }).then((r) => r.data) + }, + get(id: number) { + return http.get(`/articles/${id}`).then((r) => r.data) + }, +} + +export const sourcesApi = { + list() { + return http.get('/sources').then((r) => r.data) + }, +} + +export const meApi = { + me() { + return http.get('/me').then((r) => r.data) + }, + usage() { + return http.get('/me/usage').then((r) => r.data) + }, +} + +export const bookmarksApi = { + list() { + return http.get('/bookmarks').then((r) => r.data) + }, + add(article_id: number, note?: string) { + return http.post('/bookmarks', { article_id, note }).then((r) => r.data) + }, + remove(article_id: number) { + return http.delete(`/bookmarks/${article_id}`).then((r) => r.data) + }, +} + +export const adminApi = { + listSources() { + return 
http.get('/admin/sources').then((r) => r.data) + }, + createSource(body: any) { + return http.post('/admin/sources', body).then((r) => r.data) + }, + updateSource(id: number, body: any) { + return http.patch(`/admin/sources/${id}`, body).then((r) => r.data) + }, + deleteSource(id: number) { + return http.delete(`/admin/sources/${id}`).then((r) => r.data) + }, + refresh(id: number) { + return http.post(`/admin/refresh/${id}`).then((r) => r.data) + }, + health() { + return http.get('/admin/health').then((r) => r.data) + }, +} diff --git a/frontend/src/api/client.ts b/frontend/src/api/client.ts new file mode 100644 index 0000000..0eae0aa --- /dev/null +++ b/frontend/src/api/client.ts @@ -0,0 +1,37 @@ +import axios, { AxiosError, type AxiosInstance } from 'axios' +import { useAuthStore } from '@/stores/auth' + +const BASE = import.meta.env.VITE_API_BASE || '/api/v1' + +export const http: AxiosInstance = axios.create({ + baseURL: BASE, + timeout: 20000, +}) + +http.interceptors.request.use((cfg) => { + const auth = useAuthStore() + if (auth.accessToken) { + cfg.headers = cfg.headers ?? 
{} + cfg.headers.Authorization = `Bearer ${auth.accessToken}` + } + return cfg +}) + +http.interceptors.response.use( + (r) => r, + async (err: AxiosError) => { + const auth = useAuthStore() + const original: any = err.config + if (err.response?.status === 401 && !original?._retry && auth.refreshToken) { + original._retry = true + try { + await auth.refresh() + original.headers.Authorization = `Bearer ${auth.accessToken}` + return http(original) + } catch { + auth.logout() + } + } + return Promise.reject(err) + } +) diff --git a/frontend/src/components/AppLayout.vue b/frontend/src/components/AppLayout.vue new file mode 100644 index 0000000..3ad8a05 --- /dev/null +++ b/frontend/src/components/AppLayout.vue @@ -0,0 +1,82 @@ + + + diff --git a/frontend/src/main.ts b/frontend/src/main.ts new file mode 100644 index 0000000..501dee0 --- /dev/null +++ b/frontend/src/main.ts @@ -0,0 +1,10 @@ +import { createApp } from 'vue' +import { createPinia } from 'pinia' +import App from './App.vue' +import router from './router' +import './style.css' + +const app = createApp(App) +app.use(createPinia()) +app.use(router) +app.mount('#app') diff --git a/frontend/src/router.ts b/frontend/src/router.ts new file mode 100644 index 0000000..696dcc1 --- /dev/null +++ b/frontend/src/router.ts @@ -0,0 +1,39 @@ +import { createRouter, createWebHistory, type RouteRecordRaw } from 'vue-router' +import { useAuthStore } from '@/stores/auth' + +const routes: RouteRecordRaw[] = [ + { path: '/login', component: () => import('@/views/Login.vue'), meta: { layout: 'blank' } }, + { + path: '/', + component: () => import('@/components/AppLayout.vue'), + meta: { requiresAuth: true }, + children: [ + { path: '', component: () => import('@/views/Feed.vue') }, + { path: 'article/:id', component: () => import('@/views/ArticleDetail.vue') }, + { path: 'sources', component: () => import('@/views/Sources.vue') }, + { path: 'bookmarks', component: () => import('@/views/Bookmarks.vue') }, + { path: 'admin/sources', 
component: () => import('@/views/AdminSources.vue'), meta: { ownerOnly: true } }, + ], + }, +] + +const router = createRouter({ + history: createWebHistory(), + routes, +}) + +router.beforeEach((to) => { + const auth = useAuthStore() + auth.restore() + if (to.meta.requiresAuth && !auth.isLogged) { + return { path: '/login', query: { next: to.fullPath } } + } + if (to.meta.ownerOnly && !auth.isOwner) { + return { path: '/' } + } + if (to.path === '/login' && auth.isLogged) { + return { path: '/' } + } +}) + +export default router diff --git a/frontend/src/stores/auth.ts b/frontend/src/stores/auth.ts new file mode 100644 index 0000000..35de25c --- /dev/null +++ b/frontend/src/stores/auth.ts @@ -0,0 +1,61 @@ +import { defineStore } from 'pinia' +import { http } from '@/api/client' + +interface User { + id: number + username: string + role: 'owner' | 'member' + email?: string | null +} + +const ACCESS_KEY = 'dn.access' +const REFRESH_KEY = 'dn.refresh' +const USER_KEY = 'dn.user' + +export const useAuthStore = defineStore('auth', { + state: () => ({ + accessToken: '' as string, + refreshToken: '' as string, + user: null as User | null, + }), + getters: { + isLogged: (s) => !!s.accessToken, + isOwner: (s) => s.user?.role === 'owner', + }, + actions: { + async login(username: string, password: string) { + const { data } = await http.post('/auth/login', { username, password }) + this.accessToken = data.access_token + this.refreshToken = data.refresh_token + localStorage.setItem(ACCESS_KEY, this.accessToken) + localStorage.setItem(REFRESH_KEY, this.refreshToken) + await this.fetchMe() + }, + async refresh() { + const { data } = await http.post('/auth/refresh', { refresh_token: this.refreshToken }) + this.accessToken = data.access_token + this.refreshToken = data.refresh_token + localStorage.setItem(ACCESS_KEY, this.accessToken) + localStorage.setItem(REFRESH_KEY, this.refreshToken) + }, + async fetchMe() { + const { data } = await http.get('/me') + this.user = data + 
localStorage.setItem(USER_KEY, JSON.stringify(data)) + }, + restore() { + this.accessToken = localStorage.getItem(ACCESS_KEY) || '' + this.refreshToken = localStorage.getItem(REFRESH_KEY) || '' + const u = localStorage.getItem(USER_KEY) + if (u) this.user = JSON.parse(u) + }, + logout() { + this.accessToken = '' + this.refreshToken = '' + this.user = null + localStorage.removeItem(ACCESS_KEY) + localStorage.removeItem(REFRESH_KEY) + localStorage.removeItem(USER_KEY) + }, + }, +}) diff --git a/frontend/src/style.css b/frontend/src/style.css new file mode 100644 index 0000000..93d3316 --- /dev/null +++ b/frontend/src/style.css @@ -0,0 +1,29 @@ +:root { + --max-width: 1200px; + font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', + Arial, 'PingFang SC', 'Hiragino Sans GB', 'Microsoft YaHei', sans-serif; + line-height: 1.6; + color: #1f2328; + background: #fafbfc; +} + +* { box-sizing: border-box; } + +html, body, #app { + margin: 0; + padding: 0; + min-height: 100vh; +} + +a { color: inherit; text-decoration: none; } +a:hover { color: #2080f0; } + +img { max-width: 100%; } + +.n-card.article-card { + margin-bottom: 16px; + transition: box-shadow 0.15s; +} +.n-card.article-card:hover { + box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08); +} diff --git a/frontend/src/views/AdminSources.vue b/frontend/src/views/AdminSources.vue new file mode 100644 index 0000000..fd325d3 --- /dev/null +++ b/frontend/src/views/AdminSources.vue @@ -0,0 +1,143 @@ + + + diff --git a/frontend/src/views/ArticleDetail.vue b/frontend/src/views/ArticleDetail.vue new file mode 100644 index 0000000..73f0c18 --- /dev/null +++ b/frontend/src/views/ArticleDetail.vue @@ -0,0 +1,151 @@ + + + diff --git a/frontend/src/views/Bookmarks.vue b/frontend/src/views/Bookmarks.vue new file mode 100644 index 0000000..95c478a --- /dev/null +++ b/frontend/src/views/Bookmarks.vue @@ -0,0 +1,67 @@ + + + diff --git a/frontend/src/views/Feed.vue b/frontend/src/views/Feed.vue new file mode 
100644 index 0000000..55b322a --- /dev/null +++ b/frontend/src/views/Feed.vue @@ -0,0 +1,133 @@ + + + diff --git a/frontend/src/views/Login.vue b/frontend/src/views/Login.vue new file mode 100644 index 0000000..2301c39 --- /dev/null +++ b/frontend/src/views/Login.vue @@ -0,0 +1,59 @@ + + + diff --git a/frontend/src/views/Sources.vue b/frontend/src/views/Sources.vue new file mode 100644 index 0000000..2c8c1d2 --- /dev/null +++ b/frontend/src/views/Sources.vue @@ -0,0 +1,80 @@ + + + diff --git a/frontend/tsconfig.json b/frontend/tsconfig.json new file mode 100644 index 0000000..f2f2d75 --- /dev/null +++ b/frontend/tsconfig.json @@ -0,0 +1,22 @@ +{ + "compilerOptions": { + "target": "ES2020", + "useDefineForClassFields": true, + "module": "ESNext", + "lib": ["ES2020", "DOM", "DOM.Iterable"], + "skipLibCheck": true, + "moduleResolution": "bundler", + "allowImportingTsExtensions": true, + "resolveJsonModule": true, + "isolatedModules": true, + "noEmit": true, + "jsx": "preserve", + "strict": true, + "noUnusedLocals": false, + "noUnusedParameters": false, + "noFallthroughCasesInSwitch": true, + "paths": { "@/*": ["./src/*"] } + }, + "include": ["src/**/*.ts", "src/**/*.d.ts", "src/**/*.tsx", "src/**/*.vue"], + "references": [{ "path": "./tsconfig.node.json" }] +} diff --git a/frontend/tsconfig.node.json b/frontend/tsconfig.node.json new file mode 100644 index 0000000..97ede7e --- /dev/null +++ b/frontend/tsconfig.node.json @@ -0,0 +1,11 @@ +{ + "compilerOptions": { + "composite": true, + "skipLibCheck": true, + "module": "ESNext", + "moduleResolution": "bundler", + "allowSyntheticDefaultImports": true, + "strict": true + }, + "include": ["vite.config.ts"] +} diff --git a/frontend/vite.config.ts b/frontend/vite.config.ts new file mode 100644 index 0000000..7b43d70 --- /dev/null +++ b/frontend/vite.config.ts @@ -0,0 +1,29 @@ +import { defineConfig, loadEnv } from 'vite' +import vue from '@vitejs/plugin-vue' +import { fileURLToPath, URL } from 'node:url' + +export default 
defineConfig(({ mode }) => { + const env = loadEnv(mode, process.cwd(), '') + return { + plugins: [vue()], + resolve: { + alias: { + '@': fileURLToPath(new URL('./src', import.meta.url)), + }, + }, + server: { + host: '0.0.0.0', + port: 5173, + proxy: { + '/api': { + target: env.VITE_API_PROXY || 'http://localhost:8000', + changeOrigin: true, + }, + }, + }, + build: { + target: 'es2020', + outDir: 'dist', + }, + } +}) diff --git a/news-aggregator-plan.md b/news-aggregator-plan.md new file mode 100644 index 0000000..a2d4cc7 --- /dev/null +++ b/news-aggregator-plan.md @@ -0,0 +1,613 @@ +# 私人新闻汇总系统 · 方案设计 v0.1 + +> 适用环境:香港云服务器 / Ubuntu 24.04 LTS / Intel E5·Platinum / 30G SSD / IP 直访 +> 目标受众:自己 + 家人/小圈子(2~10 人) +> 设计原则:**轻量、可控、可扩展、不被反爬干掉** + +--- + +## 0. TL;DR(一页版) + +- **架构**:前后端分离 + 单一 API 网关,采集 → 入库 → 翻译 → API → 前后端展示。 +- **存储**:PostgreSQL 主库 + 本地文件(原文/图片)+ Redis 缓存。**30G SSD 是硬约束**,所以图片默认只存外链,正文做"近 30 天热保留 + 冷归档"两段式。 +- **采集**:凌晨分波次拉取,每源独立 cron 表达式;RSS 走 `feedparser`,非 RSS 走 `trafilatura`/`playwright`。 +- **翻译**:腾讯云为主(500 万字符配额),超额走本地 `LibreTranslate`/`NLLB`;`title/summary/正文` 分块,字符用滑动窗口月度计数。 +- **展示**:网页默认"过去 24h"瀑布流(原文 + 译文并列),Android(Kotlin · Jetpack Compose)走同套 API。 +- **预留**:`category` / `commentary` / `entities` / `sentiment` 字段先建好,模型后插。 + +--- + +## 1. 系统总览 + +### 1.1 业务目标(为什么做) + +1. **破除信息茧房**:同源 + 异源对照,可选集成 Ground News 立场标记。 +2. **抗审查 / 抗算法推荐**:原始列表流,不做兴趣推荐(至少 MVP 不做)。 +3. **可读可搜可归档**:私有领域做"个人情报库",不是又一个今日头条。 +4. **成本可控**:跑在一台 30G 香港 VPS 上,月费用 ≤ 50 HKD。 + +### 1.2 非目标(明确不做) + +- ❌ 不做内容创作/UGC +- ❌ 不做用户关注关系、社交 +- ❌ MVP 不做推荐/个性化(后续可选) +- ❌ 不做 iOS 端(用户明确) +- ❌ 不做爬虫对抗到极致(愿意被 ban IP 就 ban) + +### 1.3 用户与权限 + +| 角色 | 权限 | +| --- | --- | +| Owner(你) | 全部 + 源管理 + 翻译配额监控 + 用户管理 | +| Member(家人/朋友) | 只读 + 收藏 + 关键词订阅 | +| Guest(可后续加) | 只读 + 24h 滑窗 | + +鉴权用 JWT + HTTP-Only Cookie(网页)/ Bearer Token(APP),密码 bcrypt。 + +--- + +## 2. 
整体架构 + +### 2.1 分层(逻辑视图) + +``` +┌──────────────────────────────────────────────────────────┐ +│ 表现层(Presentation) │ +│ ├─ Web (Vue 3 / Vite, PWA, 自适应) │ +│ └─ Android (Kotlin / Jetpack Compose, Material 3) │ +└──────────────────┬───────────────────────────────────────┘ + │ HTTPS / JSON (统一 API) +┌──────────────────▼───────────────────────────────────────┐ +│ API 网关层 (FastAPI) │ +│ ├─ /v1/articles /v1/sources /v1/subscriptions │ +│ ├─ /v1/auth /v1/me /v1/search │ +│ └─ /v1/admin/* (源/翻译/任务) │ +└──────────────────┬───────────────────────────────────────┘ + │ +┌──────────────────▼───────────────────────────────────────┐ +│ 业务服务层 (Service) │ +│ ├─ article_service source_service │ +│ ├─ translation_service (配额/降级/缓存) │ +│ ├─ search_service subscription_service │ +│ └─ enrichment_service (分类/点评/实体)预留 │ +└──────────────────┬───────────────────────────────────────┘ + │ +┌──────────────────▼───────────────────────────────────────┐ +│ 数据访问层 (Repository / ORM) │ +│ └─ SQLAlchemy 2.0 (async) · Alembic 迁移 │ +└──────────────────┬───────────────────────────────────────┘ + │ +┌──────────────────▼───────────────────────────────────────┐ +│ 存储层 │ +│ ├─ PostgreSQL 16 (结构化 + 全文检索) │ +│ ├─ Redis 7 (缓存 / 队列 / 限流) │ +│ └─ 本地文件系统 (HTML 快照 / 图片,可选) │ +└──────────────────────────────────────────────────────────┘ + +┌──────────────────────────────────────────────────────────┐ +│ 后台层 (Background / Off-band) │ +│ ├─ Scheduler (APScheduler) → 触发各源采集 │ +│ ├─ Worker (asyncio task pool) → 解析/翻译/入库存 │ +│ └─ Watchdog (健康检查 + 失败重试 + 告警) │ +└──────────────────────────────────────────────────────────┘ +``` + +### 2.2 部署视图(单机,30G 强约束) + +``` +┌──────── 香港 VPS (30G SSD) ────────┐ +│ Docker Compose │ +│ ├─ caddy (反代 + HTTPS) │ +│ ├─ api (FastAPI) │ +│ ├─ worker (后台 worker) │ +│ ├─ scheduler (APScheduler) │ +│ ├─ postgres (主库) │ +│ ├─ redis (缓存) │ +│ └─ meilisearch (可选,全文搜索) │ +└────────────────────────────────────┘ +``` + +- 数据卷预估(30G 总盘): + - 系统 + Docker images ≈ 5G + - PostgreSQL 30 天热数据 ≈ 3G(压缩 + 文本截断 
8KB/篇上限) + - Redis 内存上限 256MB + - HTML 快照(可选,默认关闭)≈ 1G/30天 + - 留 10G 以上 buffer → ✅ 余量充足 + +### 2.3 数据流(单篇文章生命周期) + +``` +[源站] → RSS/HTTP → 抓取器(原文 HTML) + ↓ + 解析(title/body/url/published_at/source) + ↓ + 去重(url hash + title simhash) + ↓ + 入 articles 表(原文) + ↓ + 翻译任务入队 → Tencent API → 译文回写 + ↓ + (可选)分类 / 实体 / 摘要任务 + ↓ + API 可被消费 +``` + +--- + +## 3. 技术选型(明确推荐 + 备选) + +| 层 | 推荐 | 备选 | 选它的理由 | +| --- | --- | --- | --- | +| 后端框架 | **Python 3.12 + FastAPI** | Node.js Nest / Go Fiber | 爬虫/解析/ML 库生态最好;中文分词/jieba 顺手 | +| ORM | SQLAlchemy 2.0(async) + Alembic | Tortoise ORM | 生态最熟,迁移稳 | +| 数据库 | **PostgreSQL 16** | SQLite | 全文检索 / JSONB / FTS 都好;SQLite 不支持并发写 | +| 缓存/队列 | **Redis 7** | dramatiq 单机 | 后续好扩;BullMQ 思路熟悉 | +| 任务调度 | APScheduler(进程内) | Celery beat | 30G 单机不需要重型 Celery | +| RSS 解析 | feedparser | — | 工业标准 | +| HTML 抽取 | **trafilatura** | newspaper3k / readability | 多语种 + 准确率高 + 快 | +| 动态渲染 | playwright(按需) | — | 仅对 JS 站点启用 | +| 翻译 SDK | tencentcloud-sdk-python | — | 官方 | +| 本地翻译 | **NLLB-200-distilled-1.3B**(INT8) | LibreTranslate | 离线 fallback | +| 网页前端 | **Vue 3 + Vite + Pinia + Naive UI** | Nuxt / SvelteKit | 学习成本低,产物轻 | +| Android | **Kotlin + Compose + Hilt + Retrofit + Room** | — | 你定 Kotlin | +| 反代 | Caddy | Nginx | 自动 HTTPS,配置短 | +| 监控 | Uptime Kuma(可选容器) | — | 1 个 docker,UI 美 | + +> 30G 硬盘上不跑 ML 模型服务(太大)。本地翻译做成"按需调用小模型"或"调用本机 HTTP 接口",模型文件按需下载或不放服务器。 + +--- + +## 4. 
采集层详细设计 + +### 4.1 源的分类(决定采集器) + +| 类别 | 例子 | 采集器 | +| --- | --- | --- | +| RSS 完整 | Reuters / AP / BBC / Al Jazeera / NHK / DW / France24 | `feedparser` | +| RSS 部分 | NYT / Guardian(部分 RSS) | `feedparser` + 抓详情 | +| HTML 列表页 | Ground News / Bing News | `trafilatura` + 列表抽取 | +| Twitter/X | 暂不接(反爬代价高) | — | +| Telegram Channel | 用户后续可加 | Telethon 客户端 | + +### 4.2 源配置(数据库表) + +```sql +CREATE TABLE sources ( + id BIGSERIAL PRIMARY KEY, + name TEXT NOT NULL, -- 'Reuters World' + slug TEXT UNIQUE NOT NULL, -- 'reuters-world' + kind TEXT NOT NULL, -- 'rss' | 'html_list' | 'tg_channel' + url TEXT NOT NULL, -- RSS URL 或 列表 URL + detail_selector JSONB, -- 详情页抽取规则(非RSS) + fetch_interval_min INT NOT NULL DEFAULT 60, + fetch_cron TEXT, -- 可选,覆盖 interval,例 '15 2 * * *' + translate_to TEXT NOT NULL DEFAULT 'zh', -- 目标语言 + enabled BOOLEAN NOT NULL DEFAULT TRUE, + region TEXT, -- 'global' | 'eu' | 'cn' | ... + language_src TEXT, -- 源语种 'en' | 'auto' + priority INT DEFAULT 50, -- 1-100,影响翻译优先级 + headers_json JSONB, -- 自定义 UA/Cookie + last_fetched_at TIMESTAMPTZ, + last_status TEXT, -- 'ok' | 'fail:timeout' ... + created_at TIMESTAMPTZ DEFAULT now() +); +``` + +> **手工配合实现 RSS 源预定**:暴露 `/v1/admin/sources` 的 CRUD + 网页表单,你手工一条条加。MVP 不做 OPML 导入,但保留口子。 + +### 4.3 调度策略 + +- **错峰**:不一次性 wake-up 全部源。按源的 `priority` + `region` 哈希,凌晨分散到不同分钟。 +- **分层时间窗**: + - `priority ≥ 80`:每 30 分钟 + - `priority 50~79`:每 2 小时 + - `priority < 50`:每 6 小时 / 每日 +- **退避**:某源连续 3 次失败,自动把 `fetch_interval_min` × 2,封顶 720min;成功一次后恢复。 +- **统一超时**:单源 fetch ≤ 20s,parse ≤ 10s,失败即记日志,不入主流程。 + +### 4.4 去重 + +三层去重,严格度递减: + +1. **URL 规范化 hash**(`articles.url_hash` 上的 UNIQUE 约束,不是主键):去除 `utm_*`、hash fragment、尾斜杠。 +2. **title simhash**:相同事件不同 URL 合并(MVP 标 `duplicate_of` 字段)。 +3. **嵌入向量余弦**(后续):同主题聚类展示。 + +### 4.5 抓取器容错 + +- 限速:全局 QPS ≤ 4(礼貌),单源失败重试 2 次(对应 `.env` 的 `FETCH_MAX_RETRIES=2`),指数退避。 +- 代理:暂不用,先直连。香港出去对欧美/日本都通。 +- 反爬:合规 UA + 极简 Cookie;若某源 `403/429` 高发,接 `playwright` 兜底。 +- 法律:每篇保留来源链接 + 发布时间,不做全文二改,只翻译展示。 + +--- + +## 5. 
翻译层详细设计 + +### 5.1 翻译策略(总分总) + +| 字段 | 是否翻译 | 说明 | +| --- | --- | --- | +| title | ✅ 强制 | 优先级最高 | +| summary(若有) | ✅ 强制 | 摘要抽取后续接 | +| body(HTML→纯文本) | ✅ 强制 | 截断 8000 字/篇,超过分段 | +| 段落内嵌 HTML 标签 | ❌ 保留 | 翻译完按位置回插(用 `data-idx` 占位) | + +**分块规则**: +- 按段落切分(双换行) +- 单段 > 1500 字符 → 强制按句号再切 +- 单次 API 请求 body ≤ 5000 字符(腾讯 TMT 限制) +- 译文回写时按 `data-idx` 还原 DOM 结构 + +### 5.2 字符计量(关键:别超 500 万/月) + +```python +# 月度计数器(Redis) +translation:month:202606 = 124533 # 当月已用 + +# 公式 +total_chars = sum(len(seg) for seg in segments) +# len(str) 即 Unicode 码点数,与腾讯 TMT 按"字符数"计费的口径一致; +# 不要用 UTF-8 字节数折算:英文会被少算约一半,中文会被多算约 1.5 倍 + +# 流程 +pre_check(待翻译字符数) → if (已用+本次) < 475万(500万 × 95%) → 走腾讯 + else → 走本地 NLLB +``` + +- **保留 5% 缓冲**:到达 475 万字符,自动切本地。 +- **每月 1 日 00:00 HKT** 重置(用 cron + Redis SET)。 + +### 5.3 翻译缓存(白嫖配额) + +- 缓存键:`sha1(source_id + url_hash + lang_pair + text)` +- 命中直接返回,不计字符。 +- 命中率经验值:同源同事件重复抓 30~60%,月省 30%+ 字符。 + +### 5.4 失败降级 + +| 失败类型 | 处置 | +| --- | --- | +| 腾讯 TMT 429/5xx | 重试 2 次,仍失败 → 写 `translation_status='pending'`,后台重排 | +| 配额耗尽 | 切本地 NLLB(离线 INT8 模型) | +| 模型文件缺失 | 直接用原文 + 文末 `[本条未翻译]` 标记 | +| 段落超长截断 | 截断后用 `[...]` 占位,用户可点"展开"看原文 | + +### 5.5 译文表结构(节选) + +```sql +CREATE TABLE articles ( + id BIGSERIAL PRIMARY KEY, + source_id BIGINT REFERENCES sources(id) ON DELETE CASCADE, + url TEXT NOT NULL, + url_hash CHAR(40) NOT NULL UNIQUE, + title TEXT NOT NULL, + title_zh TEXT, + body_html TEXT, -- 抽取后保留结构 + body_text TEXT NOT NULL, -- 纯文本 + body_zh_html TEXT, + body_zh_text TEXT, + lang_src TEXT, -- 'en' | 'ja' | ... 
+ published_at TIMESTAMPTZ, + fetched_at TIMESTAMPTZ DEFAULT now(), + translated_at TIMESTAMPTZ, + translation_status TEXT DEFAULT 'pending', -- pending|ok|partial|failed|n/a + translation_engine TEXT, -- 'tencent' | 'nllb' | 'cache' + translation_chars INT DEFAULT 0, + category TEXT, -- 预留 + commentary TEXT, -- 预留 + entities JSONB, -- 预留 + sentiment REAL, -- 预留 -1..1 + duplicate_of BIGINT REFERENCES articles(id), + is_starred BOOLEAN DEFAULT FALSE, + created_at TIMESTAMPTZ DEFAULT now() +); +CREATE INDEX articles_published_at_idx ON articles (published_at DESC); +CREATE INDEX articles_source_id_idx ON articles (source_id); +CREATE INDEX articles_fts_idx ON articles + USING GIN (to_tsvector('simple', coalesce(title,'') || ' ' || coalesce(body_text,''))); +``` + +--- + +## 6. 数据层 + +### 6.1 PostgreSQL 必备扩展 + +- `pg_trgm`:title 相似度去重 +- `btree_gin` / `pg_stat_statements`:性能/诊断 +- `uuid-ossp`:不必须,主键用 BIGSERIAL + +### 6.2 核心表(全清单) + +``` +sources 采集源配置 +articles 文章主表(原文 + 译文) +article_media 文章图片/附件(默认只存 url) +users 用户 +subscriptions 关键词订阅 +bookmarks 收藏 +read_history 阅读历史(可后续分析) +api_tokens API token(给 Android 用) +audit_logs 关键操作审计 +``` + +### 6.3 备份 + +- `pg_dump` 每日凌晨 4 点 → 压缩到本地 `/var/backups/pg/` +- 保留 7 天滚动 +- 强烈建议再 push 到一个外部对象存储(腾讯云 COS/阿里云 OSS)做异地灾备 +- 30G 盘放不下 7 天全量 dump → 增量 + 每周一全量 + +### 6.4 冷热分层(防 30G 撑爆) + +- 热数据:`published_at > now() - 30 day` 全字段 +- 冷数据:`published_at <= now() - 30 day` 只保留 `title`/`title_zh`/`url`/`body_zh_text`(丢弃 `body_html` 原文) +- 90 天以上:进入"归档表" `articles_archive`,主表查询更轻 + +--- + +## 7. 
API 设计(RESTful + JSON) + +### 7.1 设计原则 + +- 资源用复数名词:`/v1/articles` +- 时间分页用 `?since=...&until=...&limit=...&cursor=...` +- 错误用 RFC 9457(取代 RFC 7807)`application/problem+json` +- 列表返回精简字段,详情返回全字段 +- 所有时间 ISO8601 + UTC(前端自行渲染本地) + +### 7.2 鉴权 + +- 登录:`POST /v1/auth/login` → 返回 access(默认 60min) + refresh(默认 14d),TTL 由 `ACCESS_TOKEN_TTL_MIN` / `REFRESH_TOKEN_TTL_DAY` 配置 +- APP:用长期 API Token(`api_tokens` 表),可撤销 +- Admin 接口:`/v1/admin/*` 强制 `role=owner` + +### 7.3 核心端点(MVP) + +| Method | Path | 说明 | +| --- | --- | --- | +| `GET` | `/v1/articles` | 列表,默认过去 24h。支持 `source` / `lang` / `q` / `cursor` | +| `GET` | `/v1/articles/{id}` | 详情(原文 + 译文 + 媒体 + 实体) | +| `GET` | `/v1/sources` | 源列表(已登录用户) | +| `GET` | `/v1/me` | 当前用户信息 + 翻译配额 | +| `POST` | `/v1/bookmarks` | 收藏 | +| `DELETE` | `/v1/bookmarks/{article_id}` | 取消收藏 | +| `POST` | `/v1/subscriptions` | 创建关键词订阅(正则/简单词) | +| `GET` | `/v1/search?q=...` | 全文检索 | +| `GET` | `/v1/stats/usage` | 翻译字符用量,管理端可见 | +| `POST` | `/v1/admin/sources` | 新增/更新源 | +| `POST` | `/v1/admin/refresh/{source_id}` | 手动触发某源抓取 | +| `POST` | `/v1/admin/translation/rerun/{article_id}` | 重译 | + +### 7.4 示例 + +```http +GET /v1/articles?since=2026-06-07T00:00:00Z&limit=50&source=reuters,bbc +Authorization: Bearer <access_token> + +200 OK +{ + "items": [ + { + "id": 1234, + "source": {"id": 1, "name": "Reuters", "region": "global"}, + "title": "Fed signals ...", + "title_zh": "美联储暗示 ...", + "published_at": "2026-06-07T08:32:00Z", + "lang_src": "en", + "translation_status": "ok", + "summary_zh": "...", + "category": "finance", + "has_commentary": false + } + ], + "next_cursor": "eyJ..." +} +``` + +--- + +## 8. 表现层 + +### 8.1 网页(Web) + +- **首屏**:过去 24h 卡片瀑布,按时间倒序。可切"按源" / "按地区"。 +- **详情页**:左原文 / 右译文并列,可关一侧;底部"分类 / 点评 / 实体 / 相关"预留位。 +- **暗模式**:跟随系统。 +- **PWA**:可"添加到主屏幕",离线缓存最近 50 篇。 +- **技术**:Vue 3 + Vite + Pinia + Vue Router + Naive UI;纯静态构建,首次加载 < 200KB gzip。 + +### 8.2 Android App + +- **架构**:Clean Architecture(Compose UI → ViewModel → UseCase → Repository → Retrofit/Room) +- **关键页面**: + 1. Feed(24h 列表,下拉刷新、无限滚动) + 2. 详情(原文/译文 Tab 切换,字号调节,长按收藏,生词本高亮) + 3. 
源管理(只读 + 启停) + 4. 订阅/收藏 + 5. 搜索 + 6. 设置(主题、语种、配额、API 地址) +- **关键库**:Hilt、Retrofit + OkHttp、Kotlinx Serialization、Room(本地缓存)、WorkManager(周期同步 + 推送通道预留)、DataStore(偏好) +- **目标**:APK < 10MB,minSdk 26。 + +--- + +## 9. 机器学习/智能功能(预留接口,MVP 不实现) + +| 功能 | 字段 | 触发 | 模型 | +| --- | --- | --- | --- | +| 自动分类 | `category` | 入库后 | zero-shot:MDeBERTa / bge 嵌入 + 简单聚类 | +| 一句话点评 | `commentary` | 入库后 | 小尺寸 instruct LLM,本地或 API | +| 实体识别 | `entities`(人/地/机) | 入库后 | GLiNER / spaCy | +| 情感 | `sentiment` | 入库后 | twitter-roberta | +| 摘要 | `summary_zh` | 入库后 | 抽取式(首段+关键句)优先 | +| 主题聚类 | `topic_id` | 每日跑 | DBSCAN / HDBSCAN | +| 跨源立场 | `bias_left/center/right` | 后续 | 对接 Ground News 数据 | + +所有这些做成 **enrichment pipeline**,在 worker 里顺序挂 hook,MVP 全部 `null`,字段都在。 + +--- + +## 10. 部署与运维 + +### 10.1 目录结构 + +``` +/srv/news/ + ├─ docker-compose.yml + ├─ .env # 密钥 + ├─ caddy/Caddyfile + ├─ api/ + ├─ worker/ + ├─ scheduler/ + ├─ migrations/ # alembic + └─ data/ + ├─ postgres/ + └─ backups/ +``` + +### 10.2 .env(示例,不要提交) + +``` +POSTGRES_PASSWORD=*** +REDIS_PASSWORD=*** +TENCENTCLOUD_SECRET_ID=*** +TENCENTCLOUD_SECRET_KEY=*** +TENCENTCLOUD_REGION=ap-hongkong +TENCENT_TMT_QUOTA_MONTH=5000000 +JWT_SECRET=*** +DOMAIN=news.example.com # 或裸 IP +``` + +### 10.3 启动 + +```bash +docker compose up -d +docker compose exec api alembic upgrade head +docker compose exec api python -m app.scripts.seed_sources +``` + +### 10.4 监控 + +- Uptime Kuma:Docker 一行,UI 看 API/Postgres/Redis 状态 +- API 自带 `/healthz`、`/metrics`(Prometheus 格式,可选) +- 翻译配额告警:每 6h 检查,> 80% 推送到 Telegram bot + +### 10.5 安全 + +- Caddy 自动 HTTPS(域名)或自签(裸 IP) +- Fail2ban 守护 SSH +- 数据库/Redis 不暴露 5432/6379 端口 +- API 限流:每用户 60 req/min(IP + user 双维度) +- 密码 bcrypt cost=12 + +--- + +## 11. 
开发路线图(我自己拍的时间,别太较真) + +### Phase 0 · 立项(1 天) +- 决策:技术栈 + 服务器初始化 +- 建仓 + CI 草稿 + +### Phase 1 · MVP(2~3 周) +- ✅ 采集器(RSS 5 个源,跑通) +- ✅ 入库 + 翻译(腾讯云,带缓存) +- ✅ 网页:列表 + 详情 +- ✅ 鉴权(单用户,自用) +- ✅ 凌晨调度跑通 + +### Phase 2 · 扩源 + 体验(2 周) +- 源扩到 20+,加 HTML 列表 +- 网页:筛选、搜索、PWA +- 配额监控 + 告警 +- 备份脚本 + +### Phase 3 · Android(2~3 周) +- APP 端到端 +- 推送通道(可选 FCM / WebPush → Telegram) +- 离线阅读 + +### Phase 4 · 智能增强(2 周+) +- 分类/摘要/实体 +- 主题聚类 + 周报邮件 +- 跨源立场 + +--- + +## 12. 补充功能(你可能没想到的,先列给你筛) + +### 12.1 内容维度 + +- **全文搜索** + 高亮(网页 + APP 双端) +- **关键词订阅**:命中即通知/邮件 +- **收藏 + 标签**:自建标签,可视化分类 +- **阅读历史 + 统计**:今日/本周阅读时长 +- **导出**:单篇 Markdown / 批量 CSV / 邮件简报 +- **每日精选邮件**(凌晨 7 点,前 24h 摘要) +- **实体时间线**:同一人物/机构多源跟踪 +- **反信息茧房视图**:同一事件并排多源报道 +- **媒体偏见可视化**(后续接 Ground News 数据) + +### 12.2 系统维度 + +- **RSS 源管理 UI**(代替手工 SQL) +- **翻译配额仪表盘**(用量趋势/剩余天数) +- **源健康看板**(抓取成功率、平均延迟) +- **手动重抓 / 重译** +- **内容删除/合规** 工具(被 DMCA 时一键下架) +- **多用户**:家人/朋友的子账号 + 独立收藏 +- **iOS PWA**:用网页 PWA 顶一下,先不写原生 +- **Telegram 推送机器人**:最简通知渠道 +- **2FA / Passkey**:管理员登录加一道锁 +- **API Token**:给 Android 用,可独立撤销 +- **审计日志**:谁看了什么(隐私 vs 共享的可调) +- **全文快照开关**:合规要求下可一键保留 HTML 留证 + +### 12.3 体验维度 + +- **暗模式**(网页 + APP) +- **字号/行距** 调节 +- **TTS 朗读**(浏览器 `SpeechSynthesis` 一行调用) +- **生词本 / 翻译对照高亮** +- **滑动操作**(APP 端快速收藏/已读) +- **桌面小组件**(APP) +- **侧边栏:今日摘要 + 配额 + 源状态** +- **快捷键**(网页 j/k 上下条,o 打开,s 收藏) + +### 12.4 还没想清楚(留个尾巴) + +- **法律边界**:翻译展示是否构成"公开展示"?如果是私人圈(2~10 人)风险低;但要避免做"开放注册" +- **内容版权**:只展示摘要 + 链接,正文翻译控制在合理比例(MVP 设 8KB 上限,实际只占原文 30~60%) +- **是否做付费墙破解**:❌ 不做,放弃相应源 + +--- + +## 13. 风险与对策 + +| 风险 | 概率 | 影响 | 对策 | +| --- | --- | --- | --- | +| 源站改版,抓取失效 | 高 | 中 | 解析器独立可热更新;失败率高自动暂停源 | +| 腾讯翻译额度爆 | 中 | 中 | 本地 NLLB 兜底;每日用量监控 | +| 香港 VPS 被墙/限速 | 中 | 高 | 多源冗余;备选新加坡/日本节点 | +| 30G 硬盘撑爆 | 低 | 高 | 冷热分层 + 外部备份 | +| 源站发律师函 | 低 | 高 | 摘要化展示 + 链接跳转原文 | +| 多人共用导致配额紧张 | 中 | 低 | 默认每日 10 万字符上限,可调 | + +--- + +## 14. 第一次跑起来你要做的事 + +1. 服务器初始化(Ubuntu 24 + Docker + Caddy) +2. 注册腾讯云账号,开通文本翻译 TMT,拿 Secret +3. 
选 5 个最常用的源(我建议:Reuters、BBC World、Al Jazeera、NHK World、DW),先把 RSS URL 备齐 +4. 我这边起仓写代码,3 周后跑 MVP +5. 跑稳后加 Android + 智能功能 + +--- + +**下一步**:你看完这个方案,挑几个点: +- ① 哪些功能要砍/加? +- ② 技术栈有没有要换的(比如你就是想用 NestJS 不用 FastAPI)? +- ③ MVP 的 5 个源定哪几个? +- ④ 想不想让我现在就开始 Phase 1 的代码? + +回我一句就行。