#!/usr/bin/env python3 # -*- coding: utf-8 -*- """diary-news 服务器健康检查 checklist。 可在本机跑(SSH 远端)或在服务器上直接跑(用 --local)。 走 docker compose 的 6 个服务:postgres / redis / api / worker / caddy / frontend, 外加主机层面的端口/磁盘/内存/日志。 依赖: pip install paramiko 用法(Windows PowerShell): $env:REMOTE_PASS = '你的root密码' python scripts/healthcheck.py python scripts/healthcheck.py --local # 在服务器上直接跑 python scripts/healthcheck.py --host 1.2.3.4 --port 22 --user news python scripts/healthcheck.py --only docker,disk # 只跑指定组 python scripts/healthcheck.py --json out.json # 导出结构化结果 环境变量(可覆盖默认值): REMOTE_HOST 207.57.129.228 REMOTE_PORT 19717 REMOTE_USER root REMOTE_PASS (SSH 必填; --local 不需要) COMPOSE_DIR /srv/news API_BASE_URL http://127.0.0.1:8000 # API 健康检查端点 """ from __future__ import annotations import argparse import base64 import json import os import re import sys import time from dataclasses import dataclass, field, asdict from typing import Callable, Optional # 可选依赖:只在远程模式下需要 try: import paramiko # type: ignore except ImportError: paramiko = None # --local 模式不强制 # ============== 配置 ============== DEFAULT_HOST = "207.57.129.228" DEFAULT_PORT = 19717 DEFAULT_USER = "root" DEFAULT_COMPOSE = "/srv/news" DEFAULT_API_BASE = "http://127.0.0.1/api/v1/healthz" # 走 Caddy 80 反代到 api:8000 SSH_TIMEOUT = 30 # docker-compose.yml 里声明的 6 个服务 EXPECTED_SERVICES = ["postgres", "redis", "api", "worker", "caddy", "frontend"] # 关键端口(默认只检对外服务的 80;其他按需加) KEY_PORTS = { "http": 80, # Caddy / Frontend 对外端口 } # ============== 数据结构 ============== @dataclass class Check: name: str group: str ok: bool summary: str detail: str = "" elapsed_ms: int = 0 severity: str = "info" # info / warn / error command: str = "" # 执行的命令(失败时方便复现) @dataclass class Report: target: str started_at: str finished_at: str = "" checks: list = field(default_factory=list) def add(self, c: Check, verbose: bool = False) -> None: self.checks.append(asdict(c)) # 控制台输出 icon = "✓" if c.ok else "✗" sev = "" if c.severity == "info" else f" [{c.severity.upper()}]" print(f" {icon}{sev} {c.name}: {c.summary} ({c.elapsed_ms}ms)") # 失败时:error 永远显示完整 detail + 命令;warn 默认前 12 行,--verbose 全显 if not c.ok: if c.command: print(f" $ {c.command}") if c.detail: if c.severity == "error" or verbose: for line in c.detail.splitlines() or ["(no detail)"]: print(f" {line}") else: lines = c.detail.splitlines() for line in lines[:12]: print(f" {line}") if len(lines) > 12: print(f" ... (共 {len(lines)} 行,用 --verbose 看完整)") def summary(self) -> tuple[int, int, int]: ok = sum(1 for c in self.checks if c["ok"]) bad = len(self.checks) - ok err = sum(1 for c in self.checks if not c["ok"] and c["severity"] == "error") return ok, bad, err # ============== 远程执行抽象 ============== class Remote: """统一封装: paramiko SSH 走远端, --local 直接在本机 shell。""" def __init__(self, local: bool, host: str = "", port: int = 22, user: str = "root", password: str = ""): self.local = local self.client: Optional[paramiko.SSHClient] = None if local: return if paramiko is None: print("ERROR: paramiko 未安装,远程模式需要 `pip install paramiko`", file=sys.stderr) sys.exit(2) pw = password or os.environ.get("REMOTE_PASS", "") if not pw: print("ERROR: 请先设置环境变量 REMOTE_PASS,或加 --password xxx", file=sys.stderr) sys.exit(2) c = paramiko.SSHClient() c.set_missing_host_key_policy(paramiko.AutoAddPolicy()) c.connect(host, port=port, username=user, password=pw, timeout=SSH_TIMEOUT, banner_timeout=SSH_TIMEOUT, auth_timeout=SSH_TIMEOUT, allow_agent=False, look_for_keys=False) self.client = c def run(self, cmd: str, timeout: int = 60) -> tuple[int, str, str]: """执行命令,返回 (rc, stdout, stderr)。""" if self.local: import subprocess try: p = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout) return p.returncode, p.stdout, p.stderr except subprocess.TimeoutExpired as e: return 124, e.stdout or "", f"timeout after {timeout}s" assert self.client is not None _si, so, se = self.client.exec_command(cmd, timeout=timeout, get_pty=True) out = so.read().decode(errors="replace") err = se.read().decode(errors="replace") rc = so.channel.recv_exit_status() return rc, out, err def close(self) -> None: if self.client is not None: self.client.close() # ============== 检查项 ============== def timed(fn: Callable) -> Callable: def wrapper(*args, **kwargs): t0 = time.time() c = fn(*args, **kwargs) c.elapsed_ms = int((time.time() - t0) * 1000) return c return wrapper @timed def check_compose_ps(remote: Remote, compose_dir: str) -> Check: """1.1 docker compose ps — 所有服务应 healthy / running。""" cmd = f"cd {compose_dir} && docker compose ps --format '{{{{.Service}}}}|{{{{.State}}}}|{{{{.Status}}}}'" rc, out, err = remote.run(cmd, timeout=30) lines = [l.strip() for l in out.splitlines() if l.strip()] running, unhealthy, missing = set(), set(), set(EXPECTED_SERVICES) detail_lines = [] for line in lines: parts = line.split("|") if len(parts) < 3: continue svc, state, status = parts[0], parts[1], parts[2] missing.discard(svc) detail_lines.append(f" {svc:10s} {state:12s} {status}") if state.lower() in ("running", "healthy") and "exit" not in status.lower(): running.add(svc) elif state.lower() in ("running",) and "(healthy)" in status.lower(): running.add(svc) else: unhealthy.add(svc) ok = not missing and not unhealthy and len(running) == len(EXPECTED_SERVICES) summary = ( f"{len(running)}/{len(EXPECTED_SERVICES)} running" if ok else f"missing={sorted(missing) or '-'} unhealthy={sorted(unhealthy) or '-'}" ) sev = "error" if missing else ("warn" if unhealthy else "info") return Check("docker compose ps", "docker", ok, summary, "\n".join(detail_lines), severity=sev) @timed def check_container_logs(remote: Remote, compose_dir: str) -> Check: """1.2 最近 worker / api 日志是否有 ERROR / Traceback。""" cmd = ( f"cd {compose_dir} && " "docker compose logs --tail=200 --no-color worker api 2>&1 | " "grep -E -i 'traceback|error|exception|critical' | head -20" ) rc, out, err = remote.run(cmd, timeout=30) out = out.strip() if not out: return Check("近 200 行 worker/api 日志无 ERROR", "docker", True, "clean", severity="info") count = len([l for l in out.splitlines() if l.strip()]) return Check("近 200 行 worker/api 日志无 ERROR", "docker", False, f"{count} 行可疑", out, severity="warn") @timed def check_disk(remote: Remote) -> Check: """1.3 磁盘空间 — 关键挂载点使用率。""" rc, out, err = remote.run("df -h --output=target,size,used,avail,pcent 2>/dev/null | grep -E '/$|/srv|/var$'") out = out.strip() high = [] for line in out.splitlines(): m = re.search(r"(\d+)%", line) if m and int(m.group(1)) >= 85: high.append(line.strip()) ok = not high summary = "ok" if ok else f"高占用: {'; '.join(high)}" return Check("磁盘空间", "docker", ok, summary, out, severity="warn" if not ok else "info") def _parse_size_to_mb(token: str) -> float: """把 '1.9Gi' / '806Mi' / '512Ki' / '1024' 转成 MB。""" m = re.match(r"^\s*(\d+(?:\.\d+)?)\s*([KMG]?i?B?)?\s*$", token) if not m: return 0.0 val = float(m.group(1)) unit = (m.group(2) or "").upper() if unit.startswith("GI") or unit == "G": return val * 1024 if unit.startswith("MI") or unit == "M": return val if unit.startswith("KI") or unit == "K": return val / 1024 # 无单位,默认 KiB (free -h 罕见) return val / 1024 @timed def check_memory(remote: Remote) -> Check: """1.4 内存 + Swap。""" rc, out, _ = remote.run("free -h | head -3") out = out.strip() high = False pct = 0.0 for line in out.splitlines(): if line.startswith("Mem"): parts = line.split() # ['Mem:', 'total', 'used', 'free', 'shared', 'buff/cache', 'available'] if len(parts) >= 7: total_mb = _parse_size_to_mb(parts[1]) used_mb = _parse_size_to_mb(parts[2]) if total_mb > 0: pct = used_mb / total_mb * 100 if pct > 90: high = True summary = "ok" if not high else f">90% used ({pct:.1f}%)" return Check("内存使用", "host", not high, summary, out, severity="warn" if high else "info") @timed def check_ports(remote: Remote) -> Check: """1.5 关键端口监听(默认只检 80)。 用 ss -tln 拿到 LISTEN 行的 LocalAddress 字段(第 4 列,包含 0.0.0.0:80、*:443、[::]:80 等)。 不用 -H(避免不同发行版 header 行差异); 不用 ss -l(避免加 unix socket 干扰)。 """ cmd = ( "ss -tln 2>/dev/null | " "awk 'tolower($1) ~ /listen/ {print $4}' | sort -u" ) rc, out, _ = remote.run(cmd) listening = set() for m in re.finditer(r":(\d+)$", out, re.MULTILINE): listening.add(int(m.group(1))) need = set(KEY_PORTS.values()) missing = sorted(need - listening) ok = not missing label = "/".join(str(p) for p in need) return Check(f"关键端口 {label} 监听", "network", ok, "ok" if ok else f"缺失 {missing}", f"监听中: {sorted(listening)}\n# raw ss output:\n{out.strip()}", command=cmd, severity="warn" if not ok else "info") @timed def check_docker_system(remote: Remote) -> Check: """1.6 docker system df — 卷 / 镜像 / 构建缓存占用。""" rc, out, _ = remote.run("docker system df 2>&1") out = out.strip() # 看 images / build cache 是否爆掉 bloated = False for line in out.splitlines(): if "GB" in line: m = re.search(r"(\d+\.\d+)\s*GB", line) if m and float(m.group(1)) > 5: bloated = True return Check("docker system df", "docker", not bloated, "ok" if not bloated else "有 >5GB 的大件", out, severity="warn" if bloated else "info") @timed def check_api_health(remote: Remote, api_base: str) -> Check: """1.7 API 健康端点。 api_base 接受两种形式: - 完整 URL(已含路径): 'http://127.0.0.1/api/v1/healthz' → 直接用 - 基础 URL: 'http://127.0.0.1:8000' → 自动拼 /api/v1/healthz """ base = api_base.rstrip("/") # 已经看起来是健康端点(以 /healthz 或 /health 结尾)就直接用 if base.endswith("/healthz") or base.endswith("/health"): url = base else: url = f"{base}/api/v1/healthz" cmd = ( f"curl -sS -m 5 -o /tmp/hc_body -w 'http=%{{http_code}} t=%{{time_total}}\\n' '{url}'; " f"echo '--- body ---'; head -c 400 /tmp/hc_body 2>/dev/null; echo" ) rc, out, _ = remote.run(cmd) m = re.search(r"http=(\d+)", out) code = int(m.group(1)) if m else 0 ok = 200 <= code < 400 summary = f"http={code}" + (" (✓ ok)" if ok else " (✗ failed)") return Check(f"API {url}", "app", ok, summary, out.strip(), command=cmd, severity="error" if not ok else "info") @timed def check_db_counts(remote: Remote, compose_dir: str) -> Check: """1.8 articles / sources 表行数(从 .env 读凭据)。""" cmd = ( f"cd {compose_dir} && " "set -a; . ./.env; set +a; " "docker compose exec -T postgres psql -U \"$POSTGRES_USER\" -d \"$POSTGRES_DB\" -t -A -c " "\"SELECT 'articles='||count(*) FROM articles;" "SELECT 'sources='||count(*) FROM sources;" "SELECT 'translated='||count(*) FROM articles WHERE title_zh IS NOT NULL;" "SELECT 'untranslated_24h='||count(*) FROM articles " " WHERE published_at > now() - interval '24 hour' AND title_zh IS NULL;\" 2>&1" ) rc, out, _ = remote.run(cmd, timeout=30) out = out.strip() untrans_m = re.search(r"untranslated_24h=(\d+)", out) untrans_24h = int(untrans_m.group(1)) if untrans_m else -1 ok = rc == 0 and untrans_24h <= 50 # 24h 内未翻译超过 50 算异常 sev = "warn" if (untrans_24h > 50 and untrans_24h <= 200) else ("error" if untrans_24h > 200 else "info") return Check("DB 行数 articles/sources", "app", ok, out.replace("\n", " | "), severity=sev) @timed def check_llm_workflow(remote: Remote, compose_dir: str) -> Check: """1.13 LLM 工作流落实度:5 个步骤的状态分布 + 24h 增量。 步骤(按 enrichment.py:294 顺序): 1. 翻译 translation_status (translation_loop) 2. 分类 classify_status (enrichment 第 1 步) 3. 排版 format_status (enrichment 第 2 步,生成 body_zh_formatted) 4. 插图 image_ai_status (enrichment 第 3 步,生成 image_ai_url) 5. 评论 commentary_status (enrichment 第 4 步,生成 commentary) 判据: - 翻译失败的行 ≥ 5% → warn(但已知有可能是源站没译文、源是中文等,不是 worker 锅) - 24h 增量中,翻译成功的文章里: LLM 全部 n/a → info(LLM 增强关闭 / 还没轮到这个 batch) LLM 全部 ok → ✓ 好 任一 failed 比例 ≥ 20% → warn(LLM 部分任务坏掉) 区分"n/a"(LLM 关了)和"pending"(排队中)和"ok/failed": - LLM 没配 / 关了 → 全 n/a,这是正常状态,info - LLM 开了但文章还没 enrich 完 → n/a + pending 共存,info """ # 一次拿 5 个状态的全局分布 + 24h 内翻译成功的文章里 4 个 LLM 状态的分布 sql = r""" SELECT 'tr_glob' AS k, translation_status AS st, count(*)::int AS n FROM articles GROUP BY translation_status UNION ALL SELECT 'cl_glob', classify_status, count(*)::int FROM articles GROUP BY classify_status UNION ALL SELECT 'fm_glob', format_status, count(*)::int FROM articles GROUP BY format_status UNION ALL SELECT 'im_glob', image_ai_status, count(*)::int FROM articles GROUP BY image_ai_status UNION ALL SELECT 'co_glob', commentary_status, count(*)::int FROM articles GROUP BY commentary_status UNION ALL -- 24h 内翻译成功(translation_status=ok)的文章里,4 个 LLM 状态分布 SELECT 'cl_24h', classify_status, count(*)::int FROM articles WHERE translation_status='ok' AND translated_at > now()-interval '24 hour' GROUP BY classify_status UNION ALL SELECT 'fm_24h', format_status, count(*)::int FROM articles WHERE translation_status='ok' AND translated_at > now()-interval '24 hour' GROUP BY format_status UNION ALL SELECT 'im_24h', image_ai_status, count(*)::int FROM articles WHERE translation_status='ok' AND translated_at > now()-interval '24 hour' GROUP BY image_ai_status UNION ALL SELECT 'co_24h', commentary_status, count(*)::int FROM articles WHERE translation_status='ok' AND translated_at > now()-interval '24 hour' GROUP BY commentary_status; """ cmd = ( f"cd {compose_dir} && " "set -a; . ./.env; set +a; " "docker compose exec -T postgres psql -U \"$POSTGRES_USER\" -d \"$POSTGRES_DB\" -t -A -F $'\\t' -c \"" + sql.replace(chr(34), chr(92) + chr(34)) + "\" 2>&1" ) rc, out, _ = remote.run(cmd, timeout=30) # 解析:tab 分隔,3 列 (k, st, n) glob: dict[str, dict[str, int]] = {} # glob['tr_glob'] = {'ok': 100, 'failed': 5, ...} for line in out.splitlines(): line = line.strip() if line.count("\t") < 2: continue k, st, n_s = line.split("\t", 2) try: n = int(n_s) except ValueError: continue glob.setdefault(k, {})[st] = n if not glob: return Check( "LLM 工作流落实度(翻译/分类/排版/插图/评论)", "app", False, "查询无结果(SQL 失败?)", detail=out[:600], command=cmd, severity="error", ) # === 1) 翻译全局健康 === tr = glob.get("tr_glob", {}) tr_total = sum(tr.values()) tr_failed = tr.get("failed", 0) + tr.get("partial", 0) tr_failed_pct = (tr_failed / tr_total * 100) if tr_total else 0.0 tr_ok = tr.get("ok", 0) # === 2) 24h 翻译成功的文章里 4 个 LLM 状态的落实度 === # 总样本 = cl_24h 的所有值之和(也等于其他 3 个的样本量) llm_24h_total = sum(glob.get("cl_24h", {}).values()) llm_summary: list[str] = [] llm_issues: list[str] = [] for prefix, name in [("cl_24h", "分类"), ("fm_24h", "排版"), ("im_24h", "插图"), ("co_24h", "评论")]: d = glob.get(prefix, {}) ok = d.get("ok", 0) failed = d.get("failed", 0) pending = d.get("pending", 0) na = d.get("n/a", 0) if llm_24h_total == 0: llm_summary.append(f"{name}: 无 24h 翻译样本") continue ok_pct = ok / llm_24h_total * 100 fail_pct = failed / llm_24h_total * 100 llm_summary.append( f"{name}: ok={ok} failed={failed} pending={pending} n/a={na} ({ok_pct:.0f}% ok)" ) if fail_pct >= 20: llm_issues.append(f"{name} 24h 失败率 {fail_pct:.0f}% (≥20%)") # === 3) 全局 LLM 状态分布(用于看整体)=== glob_parts: list[str] = [] for prefix, name in [("cl_glob", "分类"), ("fm_glob", "排版"), ("im_glob", "插图"), ("co_glob", "评论")]: d = glob.get(prefix, {}) if d: parts = ",".join(f"{k}={v}" for k, v in sorted(d.items(), key=lambda x: -x[1])[:3]) glob_parts.append(f"{name} {parts}") # === 4) 汇总判据 === issues: list[str] = [] if tr_failed_pct >= 20: issues.append(f"翻译失败率 {tr_failed_pct:.0f}% ≥20%") elif tr_failed_pct >= 5: issues.append(f"翻译失败率 {tr_failed_pct:.0f}% ≥5%") issues.extend(llm_issues) if llm_24h_total == 0: # 24h 内没翻译成功的文章,工作流谈不上"落实"不"落实",info 跳过 sev = "info" summary = f"24h 内无翻译成功样本(无法评估 LLM 工作流)" else: sev = "error" if any("≥20%" in i and "失败" in i for i in issues) else ( "warn" if issues else "info" ) summary = f"翻译 ok={tr_ok}/{tr_total} ({100 - tr_failed_pct:.0f}%) | " + " · ".join(llm_summary) if issues: summary += " · " + "; ".join(issues[:2]) detail_lines = [ f"翻译全局(全量): " + ", ".join(f"{k}={v}" for k, v in sorted(tr.items(), key=lambda x: -x[1])), f"翻译失败率: {tr_failed_pct:.1f}%", f"24h 已翻译文章样本: {llm_24h_total} 篇", ] + llm_summary + [ "", "全局 LLM 状态(全量,取 top3):", ] + [f" {p}" for p in glob_parts] if issues: detail_lines.append("") detail_lines.append("⚠ 问题: " + "; ".join(issues)) ok = not issues and llm_24h_total > 0 return Check( "LLM 工作流落实度(翻译/分类/排版/插图/评论)", "app", ok, summary, detail="\n".join(detail_lines), command="psql: 5 个 status 字段 × 全局/24h 分布", severity=sev, ) @timed def check_translation_sample(remote: Remote, compose_dir: str, sample_n: int = 3) -> Check: """1.9 抽查最近 24h 内已翻译的 N 篇文章(默认 3 篇),检查翻译质量。 抽样条件: published_at > now()-24h AND title_zh IS NOT NULL AND translation_status IN ('ok','partial') 判据(每篇): - title_zh 非空 - body_zh_text 非空 - title_zh != title (未翻译 fallback 的典型表现) - title_zh 长度 >= 2 整体判据: - 没候选: info (无样本,worker 还没产出) - 全部通过: ok - 通过 1 / N 篇: error (翻译管线几乎坏了) - 通过 2..N-1: warn (部分文章翻译坏掉) """ # 一次拉 sample_n 条,字段用 \t 分隔,转义好 psql 输出 sql = ( f"SELECT id, " f" coalesce(source_id::text,'?') AS src, " f" title, " f" title_zh, " f" coalesce(substring(body_zh_text, 1, 200), '') AS body_zh_preview, " f" translation_status, " f" translation_engine, " f" coalesce(to_char(translated_at, 'YYYY-MM-DD HH24:MI'), '-') AS tat, " f" coalesce(lang_src,'-') AS lang, " f" coalesce(char_length(title),0) AS tlen, " f" coalesce(char_length(title_zh),0) AS zlen, " f" coalesce(char_length(body_zh_text),0) AS blen " f"FROM articles " f"WHERE published_at > now() - interval '24 hour' " f" AND title_zh IS NOT NULL " f" AND translation_status IN ('ok','partial') " f"ORDER BY random() " f"LIMIT {sample_n};" ) # 头部一行,方便按列对齐 header = "id\tsrc\ttitle\ttitle_zh\tbody_zh_preview\tstatus\tengine\ttranslated_at\tlang\ttlen\tzlen\tblen" cmd = ( f"cd {compose_dir} && " "set -a; . ./.env; set +a; " f"echo '{header}'; " f"docker compose exec -T postgres psql -U \"$POSTGRES_USER\" -d \"$POSTGRES_DB\" -t -A -F $'\\t' -c \"{sql.replace(chr(34), chr(92)+chr(34))}\" 2>&1" ) rc, out, err = remote.run(cmd, timeout=30) # 解析输出:跳过 header 行(就是 echo 的那个),保留真实数据行 lines = [l for l in out.splitlines() if l.strip() and not l.startswith("id\t")] # 一些 psql 在 -t 模式下仍可能输出 NOTICE 之类 — 按制表符列数过滤 rows = [] for l in lines: if l.count("\t") >= 9: # 至少 10 列 rows.append(l.split("\t")) if not rows: # 候选为 0 = 24h 内没有已翻译文章(可能刚启动 / 数据少) return Check( f"翻译抽查({sample_n}篇/24h)", "app", True, f"无样本(24h 内暂无已翻译文章)", detail=f"# raw output:\n{out.strip()[:500]}", severity="info", command=cmd, ) # 逐篇判分 verdicts: list[tuple[bool, str]] = [] # (ok, 一行可读摘要) bad_detail: list[str] = [] for cols in rows: try: (aid, src, title, title_zh, body_zh_pv, status, engine, tat, lang, tlen, zlen, blen) = cols[:12] except ValueError: continue tlen_i, zlen_i, blen_i = int(tlen or 0), int(zlen or 0), int(blen or 0) # 判据 reasons: list[str] = [] if not title_zh.strip(): reasons.append("title_zh 空") if not body_zh_pv.strip(): reasons.append("body_zh_text 空") if title_zh.strip() and title.strip() and title_zh.strip() == title.strip(): reasons.append("title_zh == title(未翻译)") if zlen_i < 2: reasons.append(f"title_zh 长度={zlen_i}") is_ok = len(reasons) == 0 verdicts.append((is_ok, reasons)) # 详细行:可读的"原文标题 / 译文标题 / 长度 / 状态" t_disp = (title[:50] + "…") if len(title) > 50 else title z_disp = (title_zh[:50] + "…") if len(title_zh) > 50 else title_zh line = (f"#{aid} src={src} lang={lang} status={status} " f"len: 原 {tlen_i} → 译 {zlen_i} (body_zh {blen_i}) " f"engine={engine} at={tat}") if is_ok: line = "✓ " + line + f"\n 原: {t_disp}\n 译: {z_disp}" else: line = "✗ " + line + f"\n 原因: {'; '.join(reasons)}\n 原: {t_disp}\n 译: {z_disp}" bad_detail.append(line) passed = sum(1 for ok, _ in verdicts if ok) total = len(verdicts) if passed == total: sev, summary = "info", f"{passed}/{total} 通过" elif passed == 0: sev, summary = "error", f"0/{total} 通过 ⚠ 翻译管线可能挂了" else: sev = "warn" summary = f"{passed}/{total} 通过(部分文章翻译异常)" ok_flag = (passed == total) return Check( f"翻译抽查({sample_n}篇/24h)", "app", ok_flag, summary, detail="\n".join(bad_detail), command=cmd, severity=sev, ) @timed def check_redis(remote: Remote, compose_dir: str) -> Check: """1.9 Redis ping + 内存。""" cmd = ( f"cd {compose_dir} && " "set -a; . ./.env; set +a; " "docker compose exec -T redis redis-cli -a \"$REDIS_PASSWORD\" --no-auth-warning " "ping 2>&1; " "docker compose exec -T redis redis-cli -a \"$REDIS_PASSWORD\" --no-auth-warning " "info memory 2>&1 | grep -E 'used_memory_human|used_memory_peak_human|maxmemory_human'" ) rc, out, _ = remote.run(cmd, timeout=20) pong = "PONG" in out return Check("Redis", "app", pong, out.strip().replace("\n", " | "), severity="error" if not pong else "info") @timed def check_homepage(remote: Remote, api_base: str, auth_token: str = "") -> Check: """1.10 首页 SPA + Feed API + 移动端适配。 前端是 Vue SPA,首页 index.html 是空壳;真正要查的是: 1) / 200 + 包含 viewport meta + 引用了 JS bundle 2) /api/v1/articles?page=1&page_size=10 返回 {items,total,total_pages}, items[].title_zh 存在(翻译过的文章会展示) ← 此端点需 auth 3) 移动端: index.html 含 viewport,前端 style.css 含 @media (max-width: 768px) 401 视为"端点需要 token,服务正常" → info,不污染汇总。 """ # 1) 拉首页 HTML rc1, html, _ = remote.run("curl -sS -m 5 http://127.0.0.1/", timeout=10) has_viewport = "name=\"viewport\"" in html or "name='viewport'" in html has_app_div = 'id="app"' in html has_js = "main.ts" in html or "/src/main.ts" in html or "/assets/index-" in html has_lang_zh = 'lang="zh-CN"' in html or "lang='zh-CN'" in html # 2) 拉首页文章列表 API(需 auth) api_url = f"{api_base.rstrip('/').removesuffix('/api/v1/healthz')}/api/v1/articles?page=1&page_size=10" auth_header = "" if auth_token: # 用 base64 转义,避免 shell history / ps 里看见明文 tok_b64 = base64.b64encode(auth_token.encode("utf-8")).decode("ascii") auth_header = f" -H 'Authorization: Bearer $(echo {tok_b64} | base64 -d)'" rc2, body, _ = remote.run( "curl -sS -m 8 '" + api_url + "'" + auth_header + " -w '\\n---HTTP=%{http_code} TIME=%{time_total}---\\n' 2>&1", timeout=15, ) items: list = [] api_code = 0 total = 0 api_err = "" try: marker = "\n---HTTP=" if marker in body: json_part, status_part = body.rsplit(marker, 1) m = re.search(r"HTTP=(\d+)", status_part) api_code = int(m.group(1)) if m else 0 else: json_part = body data = json.loads(json_part) items = data.get("items") or [] total = int(data.get("total") or 0) except Exception as e: api_err = f"{type(e).__name__}: {e}" data = None # 3) 移动端断点 — 在服务端 grep 计数,避免 head 截断 css_href = "" m = re.search(r']+rel="stylesheet"[^>]+href="([^"]+)"', html) if m: css_href = m.group(1) mobile_768 = mobile_480 = 0 if css_href: cmd_css = ( "curl -sS -m 8 'http://127.0.0.1" + css_href + "' | " "grep -oc -E 'max-width:[[:space:]]*768px' || true; " "echo ---480---; " "curl -sS -m 8 'http://127.0.0.1" + css_href + "' | " "grep -oc -E 'max-width:[[:space:]]*480px' || true" ) rc3, css_out, _ = remote.run(cmd_css, timeout=15) # 解析"数\n---480---\n数" parts = re.split(r"---480---", css_out) try: mobile_768 = int((parts[0].strip().splitlines() or ["0"])[-1]) except Exception: pass try: mobile_480 = int((parts[1].strip().splitlines() or ["0"])[-1]) if len(parts) > 1 else 0 except Exception: pass # === 汇总 === issues: list[str] = [] if not has_viewport: issues.append("首页 HTML 缺 viewport meta(移动端不友好)") if not has_app_div: issues.append("首页 HTML 缺 #app 挂载点") if not has_js: issues.append("首页 HTML 没引 JS bundle") if not has_lang_zh: issues.append("首页 HTML lang 不是 zh-CN") # Feed API 状态:401 没带 token 时不算 error;带 token 还 401 算 error need_auth_msg = "" if api_code == 401 and not auth_token: need_auth_msg = "Feed API 401(端点需登录)— 用 --auth-user / --auth-pass 传 owner 凭据" elif api_code != 200: issues.append(f"Feed API 返回 {api_code} (非 200)") if api_err: issues.append(f"Feed API 解析失败: {api_err}") if data is not None and not items and api_code == 200: issues.append(f"Feed API 返回 items 为空 (total={total})") # 译文抽样 sample = [] for it in items[:3]: sample.append({ "id": it.get("id"), "title": (it.get("title") or "")[:60], "title_zh": (it.get("title_zh") or "")[:60], "status": it.get("translation_status"), "engine": it.get("translation_engine"), }) has_zh = sum(1 for it in items if it.get("title_zh")) summary_parts = [ f"html: {'✓' if has_viewport and has_app_div and has_js else '✗'}", f"feed: {len(items)}/{total} (有译文 {has_zh})" if api_code == 200 else f"feed: http={api_code}", f"mobile-css: {mobile_768}×768 + {mobile_480}×480" if css_href else "mobile-css: (无 CSS 链接)", ] summary = " · ".join(summary_parts) if need_auth_msg: summary += " · " + need_auth_msg elif issues: summary += " · " + "; ".join(issues[:2]) # 判定:HTML 元素都齐 + (有 token 拿到了数据 或 401 无 token 算 info) html_ok = has_viewport and has_app_div and has_js and has_lang_zh if need_auth_msg: # 没 token → 401 → 服务正常,降级 info ok = html_ok sev = "info" else: ok = html_ok and not issues sev = "error" if (api_code not in (0, 200) and not need_auth_msg) else ( "warn" if issues else "info" ) detail_lines = [ f"首页 HTML: viewport={has_viewport} #app={has_app_div} js={has_js} lang-zh={has_lang_zh}", f"Feed API: http={api_code} items={len(items)} total={total} 译过={has_zh}", ] if css_href: detail_lines.append(f"CSS: {css_href} mobile: 768px={mobile_768} 处, 480px={mobile_480} 处") if sample: detail_lines.append("首屏抽样:") for s in sample: detail_lines.append( f" #{s['id']} {s['title']!r} → {s['title_zh']!r} " f"[{s['status']}/{s['engine']}]" ) if need_auth_msg: detail_lines.append("提示: " + need_auth_msg) if issues: detail_lines.append("问题: " + "; ".join(issues)) return Check( "首页 SPA + Feed API + 移动端", "app", ok, summary, detail="\n".join(detail_lines), command=f"GET /; GET {api_url}; GET {css_href or '(no css)'}", severity=sev, ) @timed def check_article_detail(remote: Remote, api_base: str, auth_token: str = "") -> Check: """1.11 详情页:取一篇最新已翻译文章,GET /api/v1/articles/{id},看: - status=200 - 字段齐: title / title_zh / body_zh_text 或 body_zh_formatted - body_zh_formatted 含