1289 lines
53 KiB
Python
1289 lines
53 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""diary-news 服务器健康检查 checklist。
|
||
|
||
可在本机跑(SSH 远端)或在服务器上直接跑(用 --local)。
|
||
走 docker compose 的 6 个服务:postgres / redis / api / worker / caddy / frontend,
|
||
外加主机层面的端口/磁盘/内存/日志。
|
||
|
||
依赖:
|
||
pip install paramiko
|
||
|
||
用法(Windows PowerShell):
|
||
$env:REMOTE_PASS = '你的root密码'
|
||
python scripts/healthcheck.py
|
||
python scripts/healthcheck.py --local # 在服务器上直接跑
|
||
python scripts/healthcheck.py --host 1.2.3.4 --port 22 --user news
|
||
python scripts/healthcheck.py --only docker,disk # 只跑指定组
|
||
python scripts/healthcheck.py --json out.json # 导出结构化结果
|
||
|
||
环境变量(可覆盖默认值):
|
||
REMOTE_HOST 207.57.129.228
|
||
REMOTE_PORT 19717
|
||
REMOTE_USER root
|
||
REMOTE_PASS (SSH 必填; --local 不需要)
|
||
COMPOSE_DIR /srv/news
|
||
API_BASE_URL http://127.0.0.1/api/v1/healthz # API 健康检查端点(默认经 Caddy 80 反代到 api:8000;也可给基础 URL 如 http://127.0.0.1:8000)
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import base64
|
||
import json
|
||
import os
|
||
import re
|
||
import sys
|
||
import time
|
||
from dataclasses import dataclass, field, asdict
|
||
from typing import Callable, Optional
|
||
|
||
# 可选依赖:只在远程模式下需要
|
||
try:
|
||
import paramiko # type: ignore
|
||
except ImportError:
|
||
paramiko = None # --local 模式不强制
|
||
|
||
|
||
# ============== Configuration ==============
# Built-in defaults. The module docstring lists environment variables that can
# override them; the override logic itself is not part of this section.
DEFAULT_HOST = "207.57.129.228"
DEFAULT_PORT = 19717
DEFAULT_USER = "root"
DEFAULT_COMPOSE = "/srv/news"
DEFAULT_API_BASE = "http://127.0.0.1/api/v1/healthz"  # via Caddy on port 80, reverse-proxied to api:8000
SSH_TIMEOUT = 30  # passed as timeout / banner_timeout / auth_timeout to paramiko connect()

# The 6 services declared in docker-compose.yml
EXPECTED_SERVICES = ["postgres", "redis", "api", "worker", "caddy", "frontend"]

# Key ports (by default only the public-facing port 80 is checked; add more as needed)
KEY_PORTS = {
    "http": 80,  # public port of Caddy / frontend
}
|
||
|
||
|
||
# ============== 数据结构 ==============
|
||
@dataclass
class Check:
    """Result of one health-check item, as produced by the ``check_*`` functions."""

    name: str  # label printed on the console line
    group: str  # group tag, e.g. "docker" / "host" / "network" / "app"
    ok: bool  # True when the check passed
    summary: str  # one-line result text
    detail: str = ""  # multi-line detail, printed when the check fails
    elapsed_ms: int = 0  # run time in milliseconds; filled in by the ``timed`` decorator
    severity: str = "info"  # info / warn / error
    command: str = ""  # command that was executed (makes a failure easy to reproduce)
|
||
|
||
|
||
@dataclass
class Report:
    """Collects check results for one target and echoes each one to the console."""

    target: str
    started_at: str
    finished_at: str = ""
    checks: list = field(default_factory=list)

    def add(self, c: Check, verbose: bool = False) -> None:
        """Store ``c`` (as a plain dict) and print its console line.

        For a failed check the command and the detail text are printed too:
        ``error`` severity (or ``verbose=True``) prints the whole detail,
        anything else prints at most the first 12 lines.
        """
        self.checks.append(asdict(c))

        mark = "✓" if c.ok else "✗"
        tag = f" [{c.severity.upper()}]" if c.severity != "info" else ""
        print(f" {mark}{tag} {c.name}: {c.summary} ({c.elapsed_ms}ms)")

        # Passing checks get the single headline only.
        if c.ok:
            return
        if c.command:
            print(f" $ {c.command}")
        if not c.detail:
            return

        detail_rows = c.detail.splitlines()
        if c.severity == "error" or verbose:
            for row in detail_rows or ["(no detail)"]:
                print(f" {row}")
            return

        for row in detail_rows[:12]:
            print(f" {row}")
        if len(detail_rows) > 12:
            print(f" ... (共 {len(detail_rows)} 行,用 --verbose 看完整)")

    def summary(self) -> tuple[int, int, int]:
        """Return ``(passed, failed, failed_with_error_severity)`` over the stored checks."""
        passed = failed = errors = 0
        for entry in self.checks:
            if entry["ok"]:
                passed += 1
                continue
            failed += 1
            if entry["severity"] == "error":
                errors += 1
        return passed, failed, errors
|
||
|
||
|
||
# ============== 远程执行抽象 ==============
|
||
class Remote:
    """Uniform command runner: paramiko SSH for a remote host, local shell for ``--local``.

    ``run()`` always returns ``(rc, stdout, stderr)`` with both streams as ``str``.
    A timeout is reported as rc 124 (the value the original local path already
    used) instead of raising, in both local and SSH mode.
    """

    def __init__(self, local: bool, host: str = "", port: int = 22,
                 user: str = "root", password: str = ""):
        """Open the SSH connection unless ``local`` is true.

        The password is taken from ``password`` or, failing that, from the
        ``REMOTE_PASS`` environment variable. Exits the process with status 2
        when paramiko is missing or no password is available.
        """
        self.local = local
        self.client: Optional[paramiko.SSHClient] = None
        if local:
            return
        if paramiko is None:
            print("ERROR: paramiko 未安装,远程模式需要 `pip install paramiko`", file=sys.stderr)
            sys.exit(2)
        pw = password or os.environ.get("REMOTE_PASS", "")
        if not pw:
            print("ERROR: 请先设置环境变量 REMOTE_PASS,或加 --password xxx", file=sys.stderr)
            sys.exit(2)
        c = paramiko.SSHClient()
        # NOTE(security): AutoAddPolicy accepts any unknown host key, so the
        # server identity is not verified. Kept for backward compatibility;
        # consider loading known_hosts and rejecting unknown keys.
        c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        try:
            c.connect(host, port=port, username=user, password=pw,
                      timeout=SSH_TIMEOUT, banner_timeout=SSH_TIMEOUT, auth_timeout=SSH_TIMEOUT,
                      allow_agent=False, look_for_keys=False)
        except Exception:
            # Do not leak the half-open transport when the connection fails.
            c.close()
            raise
        self.client = c

    @staticmethod
    def _to_text(data) -> str:
        """Coerce captured output (``None`` / ``bytes`` / ``str``) to ``str``."""
        if data is None:
            return ""
        if isinstance(data, bytes):
            return data.decode(errors="replace")
        return data

    def run(self, cmd: str, timeout: int = 60) -> tuple[int, str, str]:
        """Run ``cmd`` through a shell and return ``(rc, stdout, stderr)``.

        On timeout the result is ``(124, <partial stdout or "">, "timeout after Ns")``.
        """
        if self.local:
            import subprocess
            try:
                # errors="replace" mirrors the lenient decoding of the SSH path.
                p = subprocess.run(cmd, shell=True, capture_output=True,
                                   text=True, errors="replace", timeout=timeout)
            except subprocess.TimeoutExpired as e:
                # TimeoutExpired.stdout is bytes even with text=True, so decode
                # it before handing it to callers that expect str.
                return 124, self._to_text(e.stdout), f"timeout after {timeout}s"
            return p.returncode, p.stdout, p.stderr

        assert self.client is not None
        import socket
        try:
            # get_pty=True is expected to merge stderr into stdout, so ``err``
            # is normally empty in SSH mode.
            _si, so, se = self.client.exec_command(cmd, timeout=timeout, get_pty=True)
            out = so.read().decode(errors="replace")
            err = se.read().decode(errors="replace")
            rc = so.channel.recv_exit_status()
        except socket.timeout:
            # Same contract as the local path: report, do not crash the run.
            return 124, "", f"timeout after {timeout}s"
        return rc, out, err

    def close(self) -> None:
        """Close the SSH connection, if one was opened."""
        if self.client is not None:
            self.client.close()
|
||
|
||
|
||
# ============== 检查项 ==============
|
||
def timed(fn: Callable) -> Callable:
    """Decorator: run ``fn`` and store its run time on the returned object.

    The wrapped function must return an object with a writable ``elapsed_ms``
    attribute (the ``check_*`` functions return ``Check``). The elapsed time is
    written as whole milliseconds.
    """
    import functools

    @functools.wraps(fn)  # keep the check's __name__ / __doc__ for introspection
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so a system clock adjustment cannot
        # produce a negative or inflated duration.
        t0 = time.perf_counter()
        c = fn(*args, **kwargs)
        c.elapsed_ms = int((time.perf_counter() - t0) * 1000)
        return c

    return wrapper
|
||
|
||
|
||
@timed
def check_compose_ps(remote: Remote, compose_dir: str) -> Check:
    """1.1 ``docker compose ps`` — every expected service must be up and not unhealthy.

    Each output line has the form ``service|state|status``. A service counts as
    running when its state is ``running``/``healthy`` and its status text
    contains neither ``exit`` nor ``unhealthy``. Severity is ``error`` when an
    expected service is missing, ``warn`` when one is unhealthy, else ``info``.
    """
    import shlex

    fmt = "{{.Service}}|{{.State}}|{{.Status}}"
    cmd = f"cd {shlex.quote(compose_dir)} && docker compose ps --format '{fmt}'"
    rc, out, err = remote.run(cmd, timeout=30)

    expected = set(EXPECTED_SERVICES)
    running, unhealthy, missing = set(), set(), set(expected)
    detail_lines = []
    for raw in out.splitlines():
        parts = raw.strip().split("|")
        if len(parts) < 3:
            continue  # not a formatted row (warning text, docker error, ...)
        svc, state, status = parts[0], parts[1], parts[2]
        missing.discard(svc)
        detail_lines.append(f" {svc:10s} {state:12s} {status}")
        state_l, status_l = state.lower(), status.lower()
        is_up = state_l in ("running", "healthy") and "exit" not in status_l
        # A container can be in state "running" while its healthcheck fails;
        # the status text then says "(unhealthy)".
        if is_up and "unhealthy" not in status_l:
            running.add(svc)
        else:
            unhealthy.add(svc)

    # Only expected services count towards N/M, so an extra service in the
    # compose project cannot make the check fail with an empty explanation.
    running_expected = running & expected
    ok = not missing and not unhealthy
    summary = (
        f"{len(running_expected)}/{len(EXPECTED_SERVICES)} running"
        if ok
        else f"missing={sorted(missing) or '-'} unhealthy={sorted(unhealthy) or '-'}"
    )
    sev = "error" if missing else ("warn" if unhealthy else "info")
    # With no parsable row, show whatever docker printed so the failure is explainable.
    detail = "\n".join(detail_lines) if detail_lines else (out.strip() or err.strip())
    return Check("docker compose ps", "docker", ok, summary, detail,
                 severity=sev, command=cmd)
|
||
|
||
|
||
@timed
def check_container_logs(remote: Remote, compose_dir: str) -> Check:
    """1.2 Scan the last 200 log lines of worker / api for error-like keywords.

    The grep is case-insensitive over traceback / error / exception / critical
    and keeps at most 20 hits. Any hit makes the check fail with ``warn``.
    """
    stages = [
        "docker compose logs --tail=200 --no-color worker api 2>&1",
        "grep -E -i 'traceback|error|exception|critical'",
        "head -20",
    ]
    cmd = f"cd {compose_dir} && " + " | ".join(stages)
    _rc, raw, _err = remote.run(cmd, timeout=30)

    title = "近 200 行 worker/api 日志无 ERROR"
    hits = raw.strip()
    if hits:
        suspicious = sum(1 for row in hits.splitlines() if row.strip())
        return Check(title, "docker", False, f"{suspicious} 行可疑", hits, severity="warn")
    return Check(title, "docker", True, "clean", severity="info")
|
||
|
||
|
||
@timed
def check_disk(remote: Remote) -> Check:
    """1.3 Disk space — usage of the key mount points ``/``, ``/var`` and ``/srv``.

    A mount point at or above 85 % usage makes the check fail with ``warn``.
    The mount point is matched on the first ``df`` column. The previous grep
    anchored ``/`` and ``/var`` at end-of-line, but every row ends with the
    percentage column, so those two were never inspected.
    """
    cmd = "df -h --output=target,size,used,avail,pcent 2>/dev/null"
    rc, raw, _err = remote.run(cmd)

    watched = []
    high = []
    for row in raw.splitlines():
        cols = row.split()
        if not cols:
            continue
        target = cols[0]
        if not (target in ("/", "/var", "/srv") or target.startswith("/srv/")):
            continue
        watched.append(row.strip())
        m = re.search(r"(\d+)%", row)
        if m and int(m.group(1)) >= 85:
            high.append(row.strip())

    if not watched:
        # No usable df output must not be reported as "disk is fine".
        return Check("磁盘空间", "docker", False, "无法获取磁盘使用率(df 无输出?)",
                     raw.strip(), severity="warn", command=cmd)

    ok = not high
    summary = "ok" if ok else f"高占用: {'; '.join(high)}"
    return Check("磁盘空间", "docker", ok, summary, "\n".join(watched),
                 severity="warn" if not ok else "info", command=cmd)
|
||
|
||
|
||
def _parse_size_to_mb(token: str) -> float:
|
||
"""把 '1.9Gi' / '806Mi' / '512Ki' / '1024' 转成 MB。"""
|
||
m = re.match(r"^\s*(\d+(?:\.\d+)?)\s*([KMG]?i?B?)?\s*$", token)
|
||
if not m:
|
||
return 0.0
|
||
val = float(m.group(1))
|
||
unit = (m.group(2) or "").upper()
|
||
if unit.startswith("GI") or unit == "G":
|
||
return val * 1024
|
||
if unit.startswith("MI") or unit == "M":
|
||
return val
|
||
if unit.startswith("KI") or unit == "K":
|
||
return val / 1024
|
||
# 无单位,默认 KiB (free -h 罕见)
|
||
return val / 1024
|
||
|
||
|
||
@timed
def check_memory(remote: Remote) -> Check:
    """1.4 Memory usage from ``free -h``.

    Reads the ``Mem`` row and fails with ``warn`` when used/total exceeds 90 %.
    Rows that cannot be parsed are skipped, which leaves the check passing.
    """
    _rc, raw, _ = remote.run("free -h | head -3")
    report = raw.strip()

    over_limit = False
    pct = 0.0
    for row in report.splitlines():
        if not row.startswith("Mem"):
            continue
        # ['Mem:', 'total', 'used', 'free', 'shared', 'buff/cache', 'available']
        cols = row.split()
        if len(cols) < 7:
            continue
        total_mb = _parse_size_to_mb(cols[1])
        used_mb = _parse_size_to_mb(cols[2])
        if not total_mb > 0:
            continue
        pct = used_mb / total_mb * 100
        over_limit = over_limit or pct > 90

    if over_limit:
        return Check("内存使用", "host", False, f">90% used ({pct:.1f}%)", report,
                     severity="warn")
    return Check("内存使用", "host", True, "ok", report, severity="info")
|
||
|
||
|
||
@timed
def check_ports(remote: Remote) -> Check:
    """1.5 Key ports are listening (only 80 by default, see ``KEY_PORTS``).

    Uses ``ss -tln`` and takes the LocalAddress field (4th column, e.g.
    ``0.0.0.0:80``, ``*:443``, ``[::]:80``) of every LISTEN row. ``-H`` is
    avoided because header handling differs between distributions, and plain
    ``ss -l`` is avoided because it would add unix sockets.
    """
    cmd = (
        "ss -tln 2>/dev/null | "
        "awk 'tolower($1) ~ /listen/ {print $4}' | sort -u"
    )
    rc, out, _ = remote.run(cmd)

    listening = set()
    # Split on any whitespace: output captured through a PTY is expected to
    # end lines with "\r\n", which a "$"-anchored regex would not match.
    for addr in out.split():
        _host, sep, port = addr.rpartition(":")
        if sep and port.isascii() and port.isdigit():
            listening.add(int(port))

    need = set(KEY_PORTS.values())
    missing = sorted(need - listening)
    ok = not missing
    label = "/".join(str(p) for p in sorted(need))  # sorted: stable check name
    return Check(f"关键端口 {label} 监听", "network", ok,
                 "ok" if ok else f"缺失 {missing}",
                 f"监听中: {sorted(listening)}\n# raw ss output:\n{out.strip()}",
                 command=cmd, severity="warn" if not ok else "info")
|
||
|
||
|
||
@timed
def check_docker_system(remote: Remote) -> Check:
    """1.6 ``docker system df`` — space used by images / volumes / build cache.

    Fails with ``warn`` when any reported size exceeds 5 GB, or when the
    command itself fails. Integer sizes ("12GB") and terabyte sizes are
    recognised as well as the decimal form ("5.3GB").
    """
    cmd = "docker system df 2>&1"
    rc, out, _ = remote.run(cmd)
    out = out.strip()

    if rc != 0:
        # A broken docker CLI / daemon must not be reported as "no bloat".
        return Check("docker system df", "docker", False, "docker system df 执行失败",
                     out, severity="warn", command=cmd)

    bloated = False
    for number, unit in re.findall(r"(\d+(?:\.\d+)?)\s*([GT])B", out):
        size_gb = float(number) * (1000 if unit == "T" else 1)
        if size_gb > 5:
            bloated = True
            break

    return Check("docker system df", "docker", not bloated,
                 "ok" if not bloated else "有 >5GB 的大件",
                 out, severity="warn" if bloated else "info", command=cmd)
|
||
|
||
|
||
@timed
def check_api_health(remote: Remote, api_base: str) -> Check:
    """1.7 API health endpoint.

    ``api_base`` may be given in two forms:
      - full URL including the path: 'http://127.0.0.1/api/v1/healthz' → used as is
      - base URL: 'http://127.0.0.1:8000' → '/api/v1/healthz' is appended

    Any HTTP status in [200, 400) passes; anything else is an ``error``.
    """
    import shlex

    base = api_base.rstrip("/")
    # Already looks like a health endpoint (ends with /healthz or /health): use directly.
    url = base if base.endswith(("/healthz", "/health")) else f"{base}/api/v1/healthz"
    cmd = (
        # Remove a body left over from an earlier run so a failed curl cannot
        # display a stale response.
        "rm -f /tmp/hc_body; "
        f"curl -sS -m 5 -o /tmp/hc_body -w 'http=%{{http_code}} t=%{{time_total}}\\n' {shlex.quote(url)}; "
        "echo '--- body ---'; head -c 400 /tmp/hc_body 2>/dev/null; echo"
    )
    rc, out, _ = remote.run(cmd)
    m = re.search(r"http=(\d+)", out)
    code = int(m.group(1)) if m else 0  # 0 = curl printed no status at all
    ok = 200 <= code < 400
    summary = f"http={code}" + (" (✓ ok)" if ok else " (✗ failed)")
    return Check(f"API {url}", "app", ok, summary, out.strip(),
                 command=cmd, severity="error" if not ok else "info")
|
||
|
||
|
||
@timed
def check_db_counts(remote: Remote, compose_dir: str) -> Check:
    """1.8 Row counts of articles / sources (credentials are read from ``.env``).

    More than 50 untranslated articles in the last 24 h fails the check:
    ``warn`` up to 200, ``error`` above. A failed query, or output without the
    ``untranslated_24h=`` marker, is reported as ``error`` instead of passing.
    """
    import shlex

    cmd = (
        f"cd {shlex.quote(compose_dir)} && "
        "set -a; . ./.env; set +a; "
        "docker compose exec -T postgres psql -U \"$POSTGRES_USER\" -d \"$POSTGRES_DB\" -t -A -c "
        "\"SELECT 'articles='||count(*) FROM articles;"
        "SELECT 'sources='||count(*) FROM sources;"
        "SELECT 'translated='||count(*) FROM articles WHERE title_zh IS NOT NULL;"
        "SELECT 'untranslated_24h='||count(*) FROM articles "
        " WHERE published_at > now() - interval '24 hour' AND title_zh IS NULL;\" 2>&1"
    )
    rc, out, _ = remote.run(cmd, timeout=30)
    out = out.strip()
    title = "DB 行数 articles/sources"
    # splitlines() also copes with "\r\n" line endings.
    summary = " | ".join(out.splitlines())

    untrans_m = re.search(r"untranslated_24h=(\d+)", out)
    if rc != 0 or untrans_m is None:
        return Check(title, "app", False, summary or "查询失败", detail=out,
                     command=cmd, severity="error")

    untrans_24h = int(untrans_m.group(1))
    if untrans_24h > 200:
        sev = "error"
    elif untrans_24h > 50:
        sev = "warn"
    else:
        sev = "info"
    return Check(title, "app", untrans_24h <= 50, summary, command=cmd, severity=sev)
|
||
|
||
|
||
@timed
def check_llm_workflow(remote: Remote, compose_dir: str) -> Check:
    """1.13 LLM workflow coverage: status distribution of 5 steps, globally and for the last 24 h.

    Steps (the original note refers to the order in enrichment.py):
      1. translation  translation_status
      2. classify     classify_status
      3. format       format_status      (produces body_zh_formatted)
      4. image        image_ai_status    (produces image_ai_url)
      5. commentary   commentary_status

    Criteria as implemented:
      - global translation failure rate (failed + partial) ≥ 5 % → warn, ≥ 20 % → error
      - among articles translated successfully in the last 24 h, a ``failed``
        share ≥ 20 % for any LLM step → error
        (NOTE(review): the earlier docstring called this case "warn" while the
        code produced "error"; the code's behaviour is kept — confirm intent)
      - no article translated in the last 24 h: the LLM steps cannot be judged,
        so only the global translation rate decides the result

    ``n/a`` and ``pending`` counts are reported but never raise an issue.
    """
    import shlex

    # One round trip: global distribution of the 5 statuses, plus the
    # distribution of the 4 LLM statuses among recently translated articles.
    sql = r"""
SELECT 'tr_glob' AS k, translation_status AS st, count(*)::int AS n
FROM articles GROUP BY translation_status
UNION ALL
SELECT 'cl_glob', classify_status, count(*)::int FROM articles GROUP BY classify_status
UNION ALL
SELECT 'fm_glob', format_status, count(*)::int FROM articles GROUP BY format_status
UNION ALL
SELECT 'im_glob', image_ai_status, count(*)::int FROM articles GROUP BY image_ai_status
UNION ALL
SELECT 'co_glob', commentary_status, count(*)::int FROM articles GROUP BY commentary_status
UNION ALL
-- 24h 内翻译成功(translation_status=ok)的文章里,4 个 LLM 状态分布
SELECT 'cl_24h', classify_status, count(*)::int FROM articles
WHERE translation_status='ok' AND translated_at > now()-interval '24 hour'
GROUP BY classify_status
UNION ALL
SELECT 'fm_24h', format_status, count(*)::int FROM articles
WHERE translation_status='ok' AND translated_at > now()-interval '24 hour'
GROUP BY format_status
UNION ALL
SELECT 'im_24h', image_ai_status, count(*)::int FROM articles
WHERE translation_status='ok' AND translated_at > now()-interval '24 hour'
GROUP BY image_ai_status
UNION ALL
SELECT 'co_24h', commentary_status, count(*)::int FROM articles
WHERE translation_status='ok' AND translated_at > now()-interval '24 hour'
GROUP BY commentary_status;
"""
    cmd = (
        f"cd {shlex.quote(compose_dir)} && "
        "set -a; . ./.env; set +a; "
        "docker compose exec -T postgres psql -U \"$POSTGRES_USER\" -d \"$POSTGRES_DB\" "
        # printf yields a real tab in any POSIX shell; $'\t' is a bashism that
        # breaks when --local runs the command through /bin/sh.
        "-t -A -F \"$(printf '\\t')\" -c \""
        + sql.replace('"', '\\"')
        + "\" 2>&1"
    )
    rc, out, _ = remote.run(cmd, timeout=30)
    title = "LLM 工作流落实度(翻译/分类/排版/插图/评论)"

    # Parse tab-separated rows of 3 columns (k, st, n);
    # e.g. glob['tr_glob'] = {'ok': 100, 'failed': 5, ...}
    glob: dict[str, dict[str, int]] = {}
    for line in out.splitlines():
        line = line.strip()
        if line.count("\t") < 2:
            continue
        k, st, n_s = line.split("\t", 2)
        try:
            n = int(n_s)
        except ValueError:
            continue
        glob.setdefault(k, {})[st] = n

    if not glob:
        return Check(
            title, "app", False,
            "查询无结果(SQL 失败?)",
            detail=out[:600],
            command=cmd,
            severity="error",
        )

    # === 1) global translation health ===
    tr = glob.get("tr_glob", {})
    tr_total = sum(tr.values())
    tr_failed = tr.get("failed", 0) + tr.get("partial", 0)
    tr_failed_pct = (tr_failed / tr_total * 100) if tr_total else 0.0
    tr_ok = tr.get("ok", 0)

    # === 2) the 4 LLM statuses among articles translated in the last 24 h ===
    # Sample size = sum over cl_24h (the other three cover the same rows).
    llm_24h_total = sum(glob.get("cl_24h", {}).values())
    llm_summary: list[str] = []
    llm_issues: list[str] = []
    for prefix, name in [("cl_24h", "分类"), ("fm_24h", "排版"),
                         ("im_24h", "插图"), ("co_24h", "评论")]:
        d = glob.get(prefix, {})
        n_ok = d.get("ok", 0)
        n_failed = d.get("failed", 0)
        n_pending = d.get("pending", 0)
        n_na = d.get("n/a", 0)
        if llm_24h_total == 0:
            llm_summary.append(f"{name}: 无 24h 翻译样本")
            continue
        ok_pct = n_ok / llm_24h_total * 100
        fail_pct = n_failed / llm_24h_total * 100
        llm_summary.append(
            f"{name}: ok={n_ok} failed={n_failed} pending={n_pending} n/a={n_na} ({ok_pct:.0f}% ok)"
        )
        if fail_pct >= 20:
            llm_issues.append(f"{name} 24h 失败率 {fail_pct:.0f}% (≥20%)")

    # === 3) global LLM status distribution (overall picture, top 3 per step) ===
    glob_parts: list[str] = []
    for prefix, name in [("cl_glob", "分类"), ("fm_glob", "排版"),
                         ("im_glob", "插图"), ("co_glob", "评论")]:
        d = glob.get(prefix, {})
        if d:
            top3 = sorted(d.items(), key=lambda kv: -kv[1])[:3]
            glob_parts.append(f"{name} " + ",".join(f"{key}={val}" for key, val in top3))

    # === 4) verdict ===
    # Severity is tracked with a flag rather than by searching the issue text.
    issues: list[str] = []
    has_error = False
    if tr_failed_pct >= 20:
        issues.append(f"翻译失败率 {tr_failed_pct:.0f}% ≥20%")
        has_error = True
    elif tr_failed_pct >= 5:
        issues.append(f"翻译失败率 {tr_failed_pct:.0f}% ≥5%")
    if llm_issues:
        has_error = True
    issues.extend(llm_issues)

    if llm_24h_total == 0:
        # Nothing translated in the last 24 h: the LLM steps cannot be judged.
        summary = f"24h 内无翻译成功样本(无法评估 LLM 工作流)"
    else:
        summary = (f"翻译 ok={tr_ok}/{tr_total} ({100 - tr_failed_pct:.0f}%) | "
                   + " · ".join(llm_summary))
    if issues:
        summary += " · " + "; ".join(issues[:2])
    sev = "error" if has_error else ("warn" if issues else "info")

    detail_lines = [
        f"翻译全局(全量): " + ", ".join(
            f"{key}={val}" for key, val in sorted(tr.items(), key=lambda kv: -kv[1])),
        f"翻译失败率: {tr_failed_pct:.1f}%",
        f"24h 已翻译文章样本: {llm_24h_total} 篇",
    ] + llm_summary + [
        "",
        "全局 LLM 状态(全量,取 top3):",
    ] + [f" {p}" for p in glob_parts]
    if issues:
        detail_lines.append("")
        detail_lines.append("⚠ 问题: " + "; ".join(issues))

    # ok and severity now always agree: an empty 24 h sample without issues
    # passes as info instead of being counted as a failure.
    ok = not issues
    return Check(
        title, "app", ok, summary,
        detail="\n".join(detail_lines),
        command="psql: 5 个 status 字段 × 全局/24h 分布",
        severity=sev,
    )
|
||
|
||
|
||
@timed
def check_translation_sample(remote: Remote, compose_dir: str, sample_n: int = 3) -> Check:
    """1.9 Spot-check N random articles translated in the last 24 h (default 3).

    Sampling condition: published_at > now()-24h AND title_zh IS NOT NULL
    AND translation_status IN ('ok','partial').

    Per-article criteria:
      - title_zh is not blank
      - body_zh_text is not blank
      - title_zh != title (an identical title is the usual untranslated fallback)
      - title_zh length >= 2

    Overall result:
      - query failed:            error
      - no candidates:           ok / info (no sample yet)
      - every article passes:    ok
      - no article passes:       error (translation pipeline probably down)
      - some articles pass:      warn
    """
    import shlex

    # Fetch sample_n rows in one query; fields are tab-separated in the output.
    sql = (
        f"SELECT id, "
        f" coalesce(source_id::text,'?') AS src, "
        f" title, "
        f" title_zh, "
        f" coalesce(substring(body_zh_text, 1, 200), '') AS body_zh_preview, "
        f" translation_status, "
        f" translation_engine, "
        f" coalesce(to_char(translated_at, 'YYYY-MM-DD HH24:MI'), '-') AS tat, "
        f" coalesce(lang_src,'-') AS lang, "
        f" coalesce(char_length(title),0) AS tlen, "
        f" coalesce(char_length(title_zh),0) AS zlen, "
        f" coalesce(char_length(body_zh_text),0) AS blen "
        f"FROM articles "
        f"WHERE published_at > now() - interval '24 hour' "
        f" AND title_zh IS NOT NULL "
        f" AND translation_status IN ('ok','partial') "
        f"ORDER BY random() "
        f"LIMIT {int(sample_n)};"
    )
    # Header row, handy when reading the raw output.
    header = "id\tsrc\ttitle\ttitle_zh\tbody_zh_preview\tstatus\tengine\ttranslated_at\tlang\ttlen\tzlen\tblen"
    cmd = (
        f"cd {shlex.quote(compose_dir)} && "
        "set -a; . ./.env; set +a; "
        f"echo '{header}'; "
        "docker compose exec -T postgres psql -U \"$POSTGRES_USER\" -d \"$POSTGRES_DB\" "
        # printf yields a real tab in any POSIX shell ($'\t' is bash-only).
        "-t -A -F \"$(printf '\\t')\" -c \""
        + sql.replace('"', '\\"')
        + "\" 2>&1"
    )
    rc, out, err = remote.run(cmd, timeout=30)
    title_label = f"翻译抽查({sample_n}篇/24h)"

    # Drop the echoed header and anything that is not a full 12-column row
    # (psql can still print NOTICE lines in -t mode).
    rows = []
    for raw_line in out.splitlines():
        if not raw_line.strip() or raw_line.startswith("id\t"):
            continue
        cols = raw_line.split("\t")
        if len(cols) >= 12:
            rows.append(cols)

    if not rows:
        if rc != 0:
            # A failed query must not look like "nothing to sample".
            return Check(
                title_label, "app", False,
                "查询失败(psql/docker 返回非 0)",
                detail=f"# raw output:\n{out.strip()[:500]}",
                severity="error",
                command=cmd,
            )
        # No candidates: nothing translated in the last 24 h (fresh start / little data).
        return Check(
            title_label, "app", True,
            f"无样本(24h 内暂无已翻译文章)",
            detail=f"# raw output:\n{out.strip()[:500]}",
            severity="info",
            command=cmd,
        )

    # Judge each article.
    verdicts: list[tuple[bool, list[str]]] = []
    bad_detail: list[str] = []
    for cols in rows:
        (aid, src, title, title_zh, body_zh_pv, status,
         engine, tat, lang, tlen, zlen, blen) = cols[:12]
        try:
            tlen_i, zlen_i, blen_i = int(tlen or 0), int(zlen or 0), int(blen or 0)
        except ValueError:
            # Columns are misaligned (e.g. a tab inside a title): skip the row
            # rather than crash the whole health check.
            continue

        reasons: list[str] = []
        if not title_zh.strip():
            reasons.append("title_zh 空")
        if not body_zh_pv.strip():
            reasons.append("body_zh_text 空")
        if title_zh.strip() and title.strip() and title_zh.strip() == title.strip():
            reasons.append("title_zh == title(未翻译)")
        if zlen_i < 2:
            reasons.append(f"title_zh 长度={zlen_i}")
        is_ok = not reasons
        verdicts.append((is_ok, reasons))

        # Readable detail: original title / translated title / lengths / status.
        t_disp = (title[:50] + "…") if len(title) > 50 else title
        z_disp = (title_zh[:50] + "…") if len(title_zh) > 50 else title_zh
        line = (f"#{aid} src={src} lang={lang} status={status} "
                f"len: 原 {tlen_i} → 译 {zlen_i} (body_zh {blen_i}) "
                f"engine={engine} at={tat}")
        if is_ok:
            line = "✓ " + line + f"\n 原: {t_disp}\n 译: {z_disp}"
        else:
            line = "✗ " + line + f"\n 原因: {'; '.join(reasons)}\n 原: {t_disp}\n 译: {z_disp}"
        bad_detail.append(line)

    if not verdicts:
        # Rows came back but none could be parsed; "0/0 passed" would mislead.
        return Check(
            title_label, "app", False,
            "样本行无法解析",
            detail=f"# raw output:\n{out.strip()[:500]}",
            severity="warn",
            command=cmd,
        )

    passed = sum(1 for passed_flag, _ in verdicts if passed_flag)
    total = len(verdicts)
    if passed == total:
        sev, summary = "info", f"{passed}/{total} 通过"
    elif passed == 0:
        sev, summary = "error", f"0/{total} 通过 ⚠ 翻译管线可能挂了"
    else:
        sev = "warn"
        summary = f"{passed}/{total} 通过(部分文章翻译异常)"

    return Check(
        title_label, "app", passed == total, summary,
        detail="\n".join(bad_detail),
        command=cmd, severity=sev,
    )
|
||
|
||
|
||
@timed
def check_redis(remote: Remote, compose_dir: str) -> Check:
    """1.9 Redis: PING plus the memory figures from ``INFO memory``.

    The password comes from ``REDIS_PASSWORD`` in ``.env``. The check passes
    when the combined output contains ``PONG``; otherwise it is an ``error``.
    """
    redis_cli = (
        "docker compose exec -T redis redis-cli -a \"$REDIS_PASSWORD\" --no-auth-warning "
    )
    memory_filter = "grep -E 'used_memory_human|used_memory_peak_human|maxmemory_human'"
    cmd = "".join([
        f"cd {compose_dir} && ",
        "set -a; . ./.env; set +a; ",
        redis_cli, "ping 2>&1; ",
        redis_cli, "info memory 2>&1 | ", memory_filter,
    ])
    _rc, raw, _ = remote.run(cmd, timeout=20)

    one_line = raw.strip().replace("\n", " | ")
    if "PONG" in raw:
        return Check("Redis", "app", True, one_line, severity="info")
    return Check("Redis", "app", False, one_line, severity="error")
|
||
|
||
|
||
@timed
def check_homepage(remote: Remote, api_base: str, auth_token: str = "") -> Check:
    """1.10 Homepage SPA + feed API + mobile adaptation.

    The frontend is a Vue SPA whose index.html is only a shell, so this checks:
      1) ``/`` contains a viewport meta tag, the ``#app`` mount point, a JS
         bundle reference and ``lang="zh-CN"``
      2) ``/api/v1/articles?page=1&page_size=10`` returns HTTP 200 with a
         non-empty ``items`` list (this endpoint requires auth)
      3) mobile: the first linked stylesheet is scanned for
         ``max-width: 768px`` / ``max-width: 480px`` breakpoints

    A 401 without a token means "endpoint needs a token, service is fine":
    the result is ``info`` and only the HTML part decides ``ok``.
    """
    import shlex

    def _last_int(text: str) -> int:
        """Return the integer on the last line of ``text``, or 0."""
        rows = text.strip().splitlines()
        try:
            return int(rows[-1]) if rows else 0
        except ValueError:
            return 0

    # 1) fetch the homepage HTML
    rc1, html, _ = remote.run("curl -sS -m 5 http://127.0.0.1/", timeout=10)
    has_viewport = "name=\"viewport\"" in html or "name='viewport'" in html
    has_app_div = 'id="app"' in html
    has_js = "main.ts" in html or "/assets/index-" in html
    has_lang_zh = 'lang="zh-CN"' in html or "lang='zh-CN'" in html

    # 2) fetch the article list API (needs auth)
    api_url = f"{api_base.rstrip('/').removesuffix('/api/v1/healthz')}/api/v1/articles?page=1&page_size=10"
    auth_header = ""
    if auth_token:
        # The token travels base64-encoded so arbitrary characters stay
        # shell-safe. The header MUST be double-quoted: inside single quotes
        # the shell does not expand $(...), and curl would send the literal text.
        # NOTE: base64 is an encoding, not protection of the secret.
        tok_b64 = base64.b64encode(auth_token.encode("utf-8")).decode("ascii")
        auth_header = f' -H "Authorization: Bearer $(echo {tok_b64} | base64 -d)"'
    rc2, body, _ = remote.run(
        "curl -sS -m 8 " + shlex.quote(api_url) + auth_header +
        " -w '\\n---HTTP=%{http_code} TIME=%{time_total}---\\n' 2>&1",
        timeout=15,
    )
    items: list = []
    api_code = 0
    total = 0
    api_err = ""
    data = None
    try:
        marker = "\n---HTTP="
        if marker in body:
            json_part, status_part = body.rsplit(marker, 1)
            m = re.search(r"HTTP=(\d+)", status_part)
            api_code = int(m.group(1)) if m else 0
        else:
            json_part = body
        data = json.loads(json_part)
        items = data.get("items") or []
        total = int(data.get("total") or 0)
    except (ValueError, TypeError, AttributeError) as e:
        # Not JSON, or JSON of an unexpected shape.
        api_err = f"{type(e).__name__}: {e}"
        data = None

    # 3) mobile breakpoints — counted on the server so nothing gets truncated
    css_href = ""
    m = re.search(r'<link[^>]+rel="stylesheet"[^>]+href="([^"]+)"', html)
    if m:
        css_href = m.group(1)
    mobile_768 = mobile_480 = 0
    if css_href:
        if css_href.startswith(("http://", "https://")):
            css_url = css_href
        else:
            css_url = "http://127.0.0.1" + (css_href if css_href.startswith("/") else "/" + css_href)
        quoted_css = shlex.quote(css_url)
        # `grep -o | wc -l` counts occurrences; `grep -c` counts matching
        # lines, which is at most 1 for a minified one-line stylesheet.
        cmd_css = (
            f"curl -sS -m 8 {quoted_css} | grep -o -E 'max-width:[[:space:]]*768px' | wc -l; "
            "echo ---480---; "
            f"curl -sS -m 8 {quoted_css} | grep -o -E 'max-width:[[:space:]]*480px' | wc -l"
        )
        rc3, css_out, _ = remote.run(cmd_css, timeout=15)
        # Output shape: "<count>\n---480---\n<count>"
        parts = re.split(r"---480---", css_out)
        mobile_768 = _last_int(parts[0])
        mobile_480 = _last_int(parts[1]) if len(parts) > 1 else 0

    # === verdict ===
    issues: list[str] = []
    if not has_viewport:
        issues.append("首页 HTML 缺 viewport meta(移动端不友好)")
    if not has_app_div:
        issues.append("首页 HTML 缺 #app 挂载点")
    if not has_js:
        issues.append("首页 HTML 没引 JS bundle")
    if not has_lang_zh:
        issues.append("首页 HTML lang 不是 zh-CN")

    # Feed API: a 401 without a token is not an error; a 401 with a token is.
    need_auth_msg = ""
    if api_code == 401 and not auth_token:
        need_auth_msg = "Feed API 401(端点需登录)— 用 --auth-user / --auth-pass 传 owner 凭据"
    elif api_code != 200:
        issues.append(f"Feed API 返回 {api_code} (非 200)")
    if api_err:
        issues.append(f"Feed API 解析失败: {api_err}")
    if data is not None and not items and api_code == 200:
        issues.append(f"Feed API 返回 items 为空 (total={total})")

    # sample of translated entries
    sample = []
    for it in items[:3]:
        sample.append({
            "id": it.get("id"),
            "title": (it.get("title") or "")[:60],
            "title_zh": (it.get("title_zh") or "")[:60],
            "status": it.get("translation_status"),
            "engine": it.get("translation_engine"),
        })
    has_zh = sum(1 for it in items if it.get("title_zh"))

    summary_parts = [
        f"html: {'✓' if has_viewport and has_app_div and has_js else '✗'}",
        f"feed: {len(items)}/{total} (有译文 {has_zh})" if api_code == 200
        else f"feed: http={api_code}",
        f"mobile-css: {mobile_768}×768 + {mobile_480}×480" if css_href
        else "mobile-css: (无 CSS 链接)",
    ]
    summary = " · ".join(summary_parts)
    if need_auth_msg:
        summary += " · " + need_auth_msg
    elif issues:
        summary += " · " + "; ".join(issues[:2])

    # Result: all HTML elements present + (data fetched with a token, or 401
    # without a token which is treated as info).
    html_ok = has_viewport and has_app_div and has_js and has_lang_zh
    if need_auth_msg:
        # No token → 401 → service is up; downgrade to info.
        ok = html_ok
        sev = "info"
    else:
        ok = html_ok and not issues
        # api_code 0 means curl produced no status; that is a warn via `issues`.
        sev = "error" if api_code not in (0, 200) else ("warn" if issues else "info")

    detail_lines = [
        f"首页 HTML: viewport={has_viewport} #app={has_app_div} js={has_js} lang-zh={has_lang_zh}",
        f"Feed API: http={api_code} items={len(items)} total={total} 译过={has_zh}",
    ]
    if css_href:
        detail_lines.append(f"CSS: {css_href} mobile: 768px={mobile_768} 处, 480px={mobile_480} 处")
    if sample:
        detail_lines.append("首屏抽样:")
        for s in sample:
            detail_lines.append(
                f" #{s['id']} {s['title']!r} → {s['title_zh']!r} "
                f"[{s['status']}/{s['engine']}]"
            )
    if need_auth_msg:
        detail_lines.append("提示: " + need_auth_msg)
    if issues:
        detail_lines.append("问题: " + "; ".join(issues))

    return Check(
        "首页 SPA + Feed API + 移动端", "app", ok, summary,
        detail="\n".join(detail_lines),
        command=f"GET /; GET {api_url}; GET {css_href or '(no css)'}",
        severity=sev,
    )
|
||
|
||
|
||
@timed
def check_article_detail(remote: Remote, api_base: str, auth_token: str = "") -> Check:
    """1.11 Article detail: take the first article of the list API and GET /api/v1/articles/{id}.

    Checks:
      - HTTP status is 200 and the body is a JSON object
      - title_zh is present and differs from title
      - at least one of body_zh_text / body_zh_formatted / body_zh_html is present
      - body_zh_formatted, when present, contains the ``article-body`` CSS container

    A 401 on the list call without a token means the endpoint needs auth: the
    result is an ``info`` hint to pass --auth-user / --auth-pass.
    """
    import shlex

    def _split_status(raw: str) -> tuple[str, int]:
        """Split curl output into (body, http_code); the code is 0 when absent."""
        marker = "\n---HTTP="
        if marker not in raw:
            return raw, 0
        payload, status_part = raw.rsplit(marker, 1)
        found = re.search(r"HTTP=(\d+)", status_part)
        return payload, int(found.group(1)) if found else 0

    base = api_base.rstrip("/").removesuffix("/api/v1/healthz")
    list_url = f"{base}/api/v1/articles?page=1&page_size=1"
    auth_h = ""
    if auth_token:
        # Double quotes are required: inside single quotes the shell would not
        # expand $(...), and curl would send the literal text as the token.
        tok_b64 = base64.b64encode(auth_token.encode("utf-8")).decode("ascii")
        auth_h = f' -H "Authorization: Bearer $(echo {tok_b64} | base64 -d)"'
    status_fmt = " -w '\\n---HTTP=%{http_code}---\\n' 2>&1"

    rc, list_body, _ = remote.run(
        "curl -sS -m 8 " + shlex.quote(list_url) + auth_h + status_fmt,
        timeout=10,
    )
    article_id = None
    list_code = 0
    if rc == 0 and list_body:
        json_part, list_code = _split_status(list_body)
        try:
            data = json.loads(json_part)
            if data.get("items"):
                article_id = data["items"][0]["id"]
        except (ValueError, AttributeError, KeyError, IndexError, TypeError):
            # Unusable list response: handled by the "no sample" branch below.
            article_id = None
    if list_code == 401 and not auth_token:
        return Check(
            "详情页 API + 译文 CSS", "app", True,
            "需 owner token(用 --auth-user / --auth-pass)",
            detail=f"# raw list response:\n{list_body[:300]}",
            command=f"GET {list_url} (no token)",
            severity="info",
        )
    if not article_id:
        return Check(
            "详情页 API + 译文 CSS", "app", False,
            f"无可用文章样本(列表 http={list_code}, items=0?)",
            detail=list_body[:500],
            command=list_url,
            severity="warn",
        )

    # fetch the detail
    detail_url = f"{base}/api/v1/articles/{article_id}"
    rc2, body2, _ = remote.run(
        "curl -sS -m 8 " + shlex.quote(detail_url) + auth_h + status_fmt,
        timeout=10,
    )
    article = {}
    parse_err = ""
    json_part, api_code = _split_status(body2)
    try:
        article = json.loads(json_part)
    except ValueError as e:
        parse_err = f"{type(e).__name__}: {e}"
    if not isinstance(article, dict):
        # Valid JSON but not an object: the field lookups below need a dict.
        parse_err = parse_err or f"unexpected JSON type {type(article).__name__}"
        article = {}

    if api_code != 200 or not article:
        return Check(
            f"详情页 API #{article_id} + 译文 CSS", "app", False,
            f"http={api_code} parse_err={parse_err or '-'}",
            detail=body2[:500],
            command=detail_url,
            severity="error",
        )

    # criteria
    title = article.get("title") or ""
    title_zh = article.get("title_zh") or ""
    body_zh_text = article.get("body_zh_text") or ""
    body_zh_formatted = article.get("body_zh_formatted") or ""
    body_zh_html = article.get("body_zh_html") or ""
    fmt_status = article.get("format_status") or "n/a"
    tr_status = article.get("translation_status") or "-"
    tr_engine = article.get("translation_engine") or "-"

    issues: list[str] = []
    if not title_zh:
        issues.append("缺 title_zh(无译文)")
    if not (body_zh_text or body_zh_formatted or body_zh_html):
        issues.append("缺 body_zh_text/formatted/html(译文全空)")
    if title_zh and title and title_zh.strip() == title.strip():
        issues.append("title_zh == title(未翻译)")

    has_css_container = (
        'class="article-body"' in body_zh_formatted
        or "class='article-body'" in body_zh_formatted
    )
    if has_css_container:
        css_info = "✓ 排版版带 .article-body 容器"
    elif body_zh_formatted:
        css_info = "✗ 排版版缺 .article-body 容器(译文没套 CSS)"
        issues.append("排版版 body_zh_formatted 缺 .article-body CSS 容器")
    else:
        css_info = "— 无排版版(用原始译文展示)"

    summary = (
        f"#{article_id} {tr_status}/{tr_engine} fmt={fmt_status} "
        f"译字 {len(title_zh)}/{len(body_zh_text)}; CSS {css_info}"
    )
    if issues:
        summary += " · " + "; ".join(issues[:2])

    detail_lines = [
        f"原标题: {title[:80]!r}",
        f"译标题: {title_zh[:80]!r}",
        f"body_zh_text 长度: {len(body_zh_text)}",
        f"body_zh_formatted 长度: {len(body_zh_formatted)} status={fmt_status}",
        f"body_zh_html 长度: {len(body_zh_html)}",
        f"CSS 容器(.article-body): {'有' if has_css_container else '无'}",
    ]
    # First 300 characters of body_zh_formatted ("<" may have been escaped).
    if body_zh_formatted:
        detail_lines.append(f"body_zh_formatted 前 300: {body_zh_formatted[:300]!r}")

    # api_code is known to be 200 here, so only the issues decide the result.
    return Check(
        f"详情页 API #{article_id} + 译文 CSS", "app", not issues, summary,
        detail="\n".join(detail_lines),
        command=detail_url,
        severity="warn" if issues else "info",
    )
|
||
|
||
|
||
@timed
def check_agnes_llm(remote: Remote, compose_dir: str) -> Check:
    """1.12 Agnes LLM health: issue one real chat/completions request.

    - Reads AGNES_API_KEY / AGNES_BASE_URL / AGNES_CHAT_MODEL from ``.env``
      inside ``compose_dir``.
    - No key (or a ``your_...`` placeholder): returns a passing ``info``
      check, since the LLM enhancement module is optional.
    - Key present: sends a minimal request (``max_tokens=8``, short prompt)
      and expects HTTP 200 plus a non-empty ``choices[0].message.content``.

    The key and the JSON payload are base64-encoded before being embedded in
    the shell command so their content cannot break shell quoting; both are
    decoded again by the shell that runs the command.
    NOTE: the decoded key is still handed to ``curl`` as a command-line
    argument, so it is visible in that host's process list while curl runs.

    Args:
        remote: Command runner; ``remote.run(cmd, timeout=...)`` must return
            a ``(rc, stdout, stderr)`` triple.
        compose_dir: Directory that contains the ``.env`` file.

    Returns:
        A ``Check`` in group ``app``: ``info`` when skipped or healthy,
        ``error`` when the HTTP status is not 200, ``warn`` when the status
        is 200 but no reply text could be extracted.
    """
    # 1) Read the three AGNES_* settings; the last occurrence of a key wins.
    _, env_out, _ = remote.run(
        f"cd {compose_dir} 2>/dev/null && "
        "grep -E '^(AGNES_API_KEY|AGNES_BASE_URL|AGNES_CHAT_MODEL)=' .env 2>/dev/null"
    )
    settings = {}
    for line in env_out.splitlines():
        m = re.match(r"^(AGNES_API_KEY|AGNES_BASE_URL|AGNES_CHAT_MODEL)=(.+)$", line)
        if m:
            settings[m.group(1)] = m.group(2).strip().strip('"').strip("'")
    api_key = settings.get("AGNES_API_KEY", "")

    if not api_key or api_key.startswith("your_"):
        return Check(
            "Agnes LLM 联通", "app", True,
            "未配 AGNES_API_KEY(LLM 增强模块关闭),跳过",
            detail=env_out.strip()[:300],
            severity="info",
        )

    base_url = settings.get("AGNES_BASE_URL") or "https://apihub.agnes-ai.com/v1"
    model = settings.get("AGNES_CHAT_MODEL") or "agnes-2.0-flash"
    chat_url = f"{base_url.rstrip('/')}/chat/completions"

    # 2) base64-encode key + payload so arbitrary characters survive the shell.
    key_b64 = base64.b64encode(api_key.encode("utf-8")).decode("ascii")
    payload_obj = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a ping bot. Reply with a single word."},
            {"role": "user", "content": "ping"},
        ],
        "max_tokens": 8,
        "temperature": 0,
    }
    payload_b64 = base64.b64encode(
        json.dumps(payload_obj, ensure_ascii=False).encode("utf-8")
    ).decode("ascii")

    # The response body goes to stdout and the -w status line is printed after
    # it on its own line.  No temp file is used, so nothing stale can be read
    # back and the full (untruncated) JSON is available for parsing.
    cmd = (
        f"KEY_B64={key_b64}; "
        f"PAYLOAD_B64={payload_b64}; "
        'BODY=$(echo "$PAYLOAD_B64" | base64 -d); '
        "curl -sS -m 25 -w '\\nhttp=%{http_code} t=%{time_total}\\n' "
        '-H "Authorization: Bearer $(echo $KEY_B64 | base64 -d)" '
        "-H 'Content-Type: application/json' "
        f"-d \"$BODY\" '{chat_url}'"
    )
    _, out, _ = remote.run(cmd, timeout=40)

    # Take the LAST status line so a body that happens to contain "http=..."
    # cannot be mistaken for curl's own report.
    status_matches = list(re.finditer(r"(?m)^http=(\d+)\s+t=([\d.]+)", out))
    m = status_matches[-1] if status_matches else None
    code = int(m.group(1)) if m else 0
    elapsed = float(m.group(2)) if m else 0.0
    body_str = (out[: m.start()] if m else out).strip()
    raw = f"http={code} t={elapsed:.3f}\n{body_str}"

    if code != 200:
        return Check(
            "Agnes LLM chat 调用", "app", False,
            f"http={code} t={elapsed:.1f}s",
            detail=raw[:600],
            command=f"POST {chat_url} (auth via base64-decoded key, not echoed)",
            severity="error",
        )

    # Extract the reply text; any unexpected response shape counts as "no reply".
    try:
        resp = json.loads(body_str)
        message = (resp.get("choices") or [{}])[0].get("message") or {}
        text = message.get("content") or ""
    except (ValueError, AttributeError, IndexError, KeyError, TypeError):
        text = ""
    if not isinstance(text, str):
        # Some APIs return structured content; keep the summary printable.
        text = str(text)

    ok = bool(text)
    summary = f"http={code} t={elapsed:.1f}s model={model} reply={text[:30]!r}"
    return Check(
        "Agnes LLM chat 调用", "app", ok, summary,
        detail=f"# model: {model}\n# base_url: {base_url}\n# raw:\n{raw[:800]}",
        command=f"POST {chat_url}",
        severity="info" if ok else "warn",
    )
|
||
|
||
@timed
def check_caddy(remote: Remote) -> Check:
    """1.10 Caddy reverse proxy: GET / on local port 80 must answer 2xx or 3xx.

    Returns an ``app`` check whose summary is curl's stripped output; a
    missing or out-of-range status code yields severity ``error``.
    """
    probe = "curl -sS -m 5 -o /dev/null -w 'http=%{http_code} t=%{time_total}\\n' http://127.0.0.1/"
    _, stdout, _ = remote.run(probe)

    # No parsable status line (e.g. curl failed) is treated as code 0.
    found = re.search(r"http=(\d+)", stdout)
    status = 0 if found is None else int(found.group(1))

    healthy = 200 <= status < 400
    return Check(
        "Caddy http://127.0.0.1/",
        "app",
        healthy,
        stdout.strip(),
        severity="info" if healthy else "error",
    )
|
||
|
||
@timed
def check_frontend(remote: Remote) -> Check:
    """1.11 Frontend: port 80 ``/`` should serve the HTML index page.

    Runs two curl calls in one shell command: the first prints status code,
    timing and content type; the second prints the first three lines of the
    page.  The check passes when the status is 2xx/3xx and the combined
    output mentions ``html`` (case-insensitive).
    """
    status_probe = (
        "curl -sS -m 5 -o /dev/null -w 'http=%{http_code} t=%{time_total} ct=%{content_type}\\n' http://127.0.0.1/; "
    )
    page_head_probe = "curl -sS -m 5 http://127.0.0.1/ | head -3"
    _, stdout, _ = remote.run(status_probe + page_head_probe)

    found = re.search(r"http=(\d+)", stdout)
    status = 0 if found is None else int(found.group(1))

    lowered = stdout.lower()
    looks_like_html = "html" in lowered or "<!doctype" in lowered
    healthy = 200 <= status < 400 and looks_like_html

    # Summary is the first output line (curl's -w report); detail is everything.
    first_line = stdout.splitlines()[0] if stdout else ""
    return Check(
        "Frontend 首页",
        "app",
        healthy,
        first_line,
        stdout,
        severity="info" if healthy else "warn",
    )
|
||
|
||
@timed
def check_tls_cert(remote: Remote) -> Check:
    """1.12 HTTPS certificate: only checked when DOMAIN is set in ``.env``.

    Reads ``DOMAIN`` from ``.env`` in the module-level ``COMPOSE_DIR``.  When
    it is empty the check is skipped (passing, ``info``).  Otherwise the
    certificate served on ``<domain>:443`` is fetched with ``openssl`` and
    its ``notAfter`` date is evaluated:

    - no certificate obtained  -> failing, ``warn``
    - certificate expired      -> failing, ``error``
    - fewer than 14 days left  -> failing, ``warn``
    - otherwise                -> passing, ``info``

    If the ``notAfter`` text cannot be parsed, the date is reported verbatim
    and the check passes, so an unexpected openssl output format does not
    produce a false alarm.
    """
    import shlex
    import ssl

    warn_days = 14  # below this many remaining days the certificate is flagged

    # Read DOMAIN from .env; nothing configured means IP-only mode.
    _, env_out, _ = remote.run(
        f"cd {COMPOSE_DIR} 2>/dev/null && "
        "grep -E '^DOMAIN=' .env 2>/dev/null | head -1"
    )
    domain = ""
    for line in env_out.splitlines():
        m = re.match(r"^DOMAIN=(.+)$", line.strip())
        if m:
            domain = m.group(1).strip().strip('"').strip("'")
            break
    if not domain:
        return Check("HTTPS 证书(域名)", "app", True,
                     "未配 DOMAIN,跳过(走 IP 模式)", severity="info")

    name = f"HTTPS 证书 {domain}"
    # Quote the value taken from .env so it cannot be interpreted by the shell.
    server_name = shlex.quote(domain)
    endpoint = shlex.quote(f"{domain}:443")
    cmd = (
        f"echo | openssl s_client -servername {server_name} -connect {endpoint} 2>/dev/null "
        "| openssl x509 -noout -dates 2>&1"
    )
    _, out, _ = remote.run(cmd, timeout=15)
    m = re.search(r"notAfter=(.+)", out)
    if not m:
        return Check(name, "app", False,
                     "无法获取证书(可能 443 未开)", out, severity="warn")

    not_after = m.group(1).strip()
    try:
        # Parses the "Jan  5 09:34:43 2018 GMT" form printed by openssl.
        seconds_left = ssl.cert_time_to_seconds(not_after) - time.time()
    except ValueError:
        return Check(name, "app", True, f"notAfter={not_after}", severity="info")

    if seconds_left <= 0:
        return Check(name, "app", False,
                     f"notAfter={not_after} (证书已过期)", out, severity="error")

    days_left = int(seconds_left // 86400)
    summary = f"notAfter={not_after} (剩余 {days_left} 天)"
    if days_left < warn_days:
        return Check(name, "app", False, summary, out, severity="warn")
    return Check(name, "app", True, summary, severity="info")
|
||
|
||
@timed
def check_docker_logs_size(remote: Remote, compose_dir: str) -> Check:
    """1.13 Container log backlog: flag json-file logs larger than 200 MB.

    Lists the five largest ``*-json.log`` files under
    ``/var/lib/docker/containers`` with ``du -sh`` and parses the
    human-readable sizes, including fractional ones such as ``1.5G`` or
    ``9.8M``.  Any file above the limit makes the check fail with severity
    ``warn``.

    Args:
        remote: Command runner returning ``(rc, stdout, stderr)``.
        compose_dir: Compose project directory the command first ``cd``s into.

    Returns:
        A ``Check`` in group ``docker``; ``detail`` carries the raw output.
    """
    limit_mb = 200
    # Multipliers converting a `du -h` unit suffix to MiB ("" / "B" = bytes).
    unit_to_mb = {
        "": 1 / (1024 * 1024),
        "B": 1 / (1024 * 1024),
        "K": 1 / 1024,
        "M": 1,
        "G": 1024,
        "T": 1024 * 1024,
    }

    cmd = (
        f"cd {compose_dir} && "
        "docker compose logs --no-color --tail=0 2>&1 >/dev/null; "
        "du -sh /var/lib/docker/containers/*/*-json.log 2>/dev/null | sort -h | tail -5"
    )
    _, out, _ = remote.run(cmd, timeout=20)

    big = []
    for line in out.splitlines():
        entry = line.strip()
        # Size token: integer or decimal (either "." or "," as separator),
        # an optional unit, then whitespace before the path.
        m = re.match(r"(\d+(?:[.,]\d+)?)([BKMGT]?)\s+", entry)
        if not m:
            continue
        size_mb = float(m.group(1).replace(",", ".")) * unit_to_mb[m.group(2)]
        if size_mb > limit_mb:
            big.append(entry)

    return Check("容器日志大小", "docker", not big,
                 "ok" if not big else f"大日志: {'; '.join(big)}",
                 out, severity="warn" if big else "info")
|
||
|
||
# ============== 主流程 ==============
|
||
# Check registry: group name -> ordered list of (label, runner) pairs.
# Each runner takes the Remote and returns a Check.  The lambdas read the
# module globals COMPOSE_DIR / API_BASE / SAMPLE_N / AUTH_TOKEN at call time,
# so main() must assign them before any runner is invoked.
GROUPS: dict[str, list[tuple[str, Callable[["Remote"], "Check"]]]] = {
    "docker": [
        ("docker compose ps", lambda r: check_compose_ps(r, COMPOSE_DIR)),
        ("近 200 行 worker/api 日志", lambda r: check_container_logs(r, COMPOSE_DIR)),
        ("docker system df", lambda r: check_docker_system(r)),
        ("容器日志大小", lambda r: check_docker_logs_size(r, COMPOSE_DIR)),
    ],
    "host": [
        ("磁盘空间", lambda r: check_disk(r)),
        ("内存使用", lambda r: check_memory(r)),
    ],
    "network": [
        ("关键端口监听", lambda r: check_ports(r)),
    ],
    "app": [
        ("API 健康", lambda r: check_api_health(r, API_BASE)),
        ("Redis ping", lambda r: check_redis(r, COMPOSE_DIR)),
        ("DB 行数", lambda r: check_db_counts(r, COMPOSE_DIR)),
        ("LLM 工作流落实度", lambda r: check_llm_workflow(r, COMPOSE_DIR)),
        ("翻译抽查", lambda r: check_translation_sample(r, COMPOSE_DIR, SAMPLE_N)),
        ("Caddy 反代", lambda r: check_caddy(r)),
        ("Frontend 首页", lambda r: check_frontend(r)),
        ("首页 SPA + Feed API", lambda r: check_homepage(r, API_BASE, AUTH_TOKEN)),
        ("详情页 + 译文 CSS", lambda r: check_article_detail(r, API_BASE, AUTH_TOKEN)),
        ("Agnes LLM 调用", lambda r: check_agnes_llm(r, COMPOSE_DIR)),
        ("HTTPS 证书", lambda r: check_tls_cert(r)),
    ],
}
|
||
|
||
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser; defaults come from env vars or DEFAULT_*."""
    ap = argparse.ArgumentParser(
        description="diary-news 服务器健康检查",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="示例:\n"
               " python healthcheck.py # 跑全部\n"
               " python healthcheck.py --only docker,app # 只跑 docker 和 app 组\n"
               " python healthcheck.py --local --compose-dir . # 服务器本地跑\n"
               " python healthcheck.py --json report.json # 导出结构化报告\n",
    )
    ap.add_argument("--local", action="store_true", help="在服务器本地跑,不走 SSH")
    ap.add_argument("--host", default=os.environ.get("REMOTE_HOST", DEFAULT_HOST))
    ap.add_argument("--port", type=int, default=int(os.environ.get("REMOTE_PORT", DEFAULT_PORT)))
    ap.add_argument("--user", default=os.environ.get("REMOTE_USER", DEFAULT_USER))
    ap.add_argument("--password", default=os.environ.get("REMOTE_PASS", ""))
    ap.add_argument("--compose-dir", default=os.environ.get("COMPOSE_DIR", DEFAULT_COMPOSE))
    ap.add_argument("--api-base", default=os.environ.get("API_BASE_URL", DEFAULT_API_BASE))
    ap.add_argument("--only", help="逗号分隔的组名: docker,host,network,app")
    ap.add_argument("--skip", help="逗号分隔的组名,跳过")
    ap.add_argument("--json", dest="json_out", help="把结果写到 JSON 文件")
    ap.add_argument("--quiet", action="store_true", help="只输出汇总")
    ap.add_argument("--verbose", "-v", action="store_true",
                    help="显示失败项的完整原始输出(默认 warn 截断 12 行)")
    ap.add_argument("--sample", type=int, default=3,
                    help="翻译抽查的文章数(默认 3 篇,24h 内已翻译的随机样本)")
    ap.add_argument("--auth-user", default=os.environ.get("OWNER_USER", "owner"),
                    help="owner 用户名(用于获取 JWT token,调 /api/v1/auth/login)")
    ap.add_argument("--auth-pass", default=os.environ.get("OWNER_PASS", ""),
                    help="owner 密码(env: OWNER_PASS)。如不传,API 端点会降级为 info(不污染汇总)")
    ap.add_argument("--skip-auth", action="store_true",
                    help="明确跳过 auth token,等价于不传 --auth-pass")
    return ap


def _split_group_names(raw: Optional[str]) -> set[str]:
    """Split a comma-separated option value into a set of non-empty, stripped names."""
    return {part.strip() for part in (raw or "").split(",")} - {""}


def _login_owner(remote: "Remote", api_base: str, username: str, password: str) -> tuple[str, str]:
    """Log in through ``/api/v1/auth/login`` and return ``(token, http_code)``.

    ``token`` is ``""`` when the login failed or the response could not be
    parsed; ``http_code`` is ``"?"`` when curl printed no status line.

    The JSON body is built with ``json.dumps`` and shipped base64-encoded, so
    quotes, ``%`` or backslashes in the credentials cannot corrupt the request
    or the shell command.  It is piped to curl on stdin (``--data-binary @-``)
    so the decoded password is not passed as a curl argument, and the response
    is read from stdout so no token file is left in /tmp.
    """
    payload_b64 = base64.b64encode(
        json.dumps({"username": username, "password": password}).encode("utf-8")
    ).decode("ascii")

    # api_base may point at the health endpoint; strip that suffix to get the root.
    base = api_base.rstrip("/")
    health_suffix = "/api/v1/healthz"
    if base.endswith(health_suffix):
        base = base[: -len(health_suffix)]
    login_url = f"{base}/api/v1/auth/login"

    login_cmd = (
        f"PAYLOAD_B64={payload_b64}; "
        "printf '%s' \"$PAYLOAD_B64\" | base64 -d | "
        "curl -sS -m 8 -w '\\nhttp=%{http_code}\\n' "
        "-H 'Content-Type: application/json' --data-binary @- "
        f"'{login_url}'"
    )
    _, out, _ = remote.run(login_cmd, timeout=15)

    # curl prints the body first and the -w status line last; use the last match.
    status_matches = list(re.finditer(r"(?m)^http=(\d+)", out))
    if not status_matches:
        return "", "?"
    m = status_matches[-1]
    code = m.group(1)
    if code != "200":
        return "", code

    try:
        resp = json.loads(out[: m.start()])
        token = resp.get("access_token") or resp.get("accessToken") or resp.get("token") or ""
    except (ValueError, AttributeError) as e:
        print(f" ⚠ auth: 解析响应失败 {e}")
        token = ""
    return token, code


def main() -> int:
    """Run the selected check groups and return the process exit code.

    Returns:
        0 when everything passed, 1 when only warn-level checks failed,
        2 when at least one error-level check failed.
    """
    # The runners registered in GROUPS read these module globals at call time,
    # so all four are assigned unconditionally before any check runs.
    global COMPOSE_DIR, API_BASE, SAMPLE_N, AUTH_TOKEN

    args = _build_arg_parser().parse_args()

    COMPOSE_DIR = args.compose_dir
    API_BASE = args.api_base
    SAMPLE_N = max(1, min(args.sample, 20))  # clamp to 1..20 to guard against huge values
    AUTH_TOKEN = ""

    only = _split_group_names(args.only)
    skip = _split_group_names(args.skip)

    target = "local" if args.local else f"{args.user}@{args.host}:{args.port}"
    print("==== diary-news 健康检查 ====")
    print(f"目标: {target}")
    print(f"目录: {COMPOSE_DIR}")
    print(f"时间: {time.strftime('%Y-%m-%d %H:%M:%S %Z')}")
    print()

    # Unknown names used to be ignored silently, which can make a typo look
    # like "all checks passed"; they are still ignored, but now reported.
    unknown = sorted((only | skip) - set(GROUPS))
    if unknown:
        print(f" ⚠ 未知的组名(已忽略): {', '.join(unknown)};可用: {', '.join(GROUPS)}")

    remote = Remote(local=args.local, host=args.host, port=args.port,
                    user=args.user, password=args.password)
    report = Report(target=target, started_at=time.strftime("%Y-%m-%dT%H:%M:%S%z"))

    def record(check: "Check") -> None:
        # --quiet stores the result without the per-check output of report.add().
        if args.quiet:
            report.checks.append(asdict(check))
        else:
            report.add(check, verbose=args.verbose)

    try:
        # ===== optional owner login (inside try so the connection is always closed) =====
        if not args.skip_auth and args.auth_pass:
            AUTH_TOKEN, code_str = _login_owner(remote, API_BASE, args.auth_user, args.auth_pass)
            if AUTH_TOKEN:
                print(f" ✓ auth: 已登录 owner='{args.auth_user}', token 长度 {len(AUTH_TOKEN)}")
            else:
                print(f" ⚠ auth: 登录失败 http={code_str}, API 检查项将无 token(降级 info)")
        else:
            print(" · auth: 未传 --auth-pass(API 检查项将降级为 info 提示)")

        for group, fns in GROUPS.items():
            if only and group not in only:
                continue
            if skip and group in skip:
                continue
            print(f"--- [{group}] ---")
            for name, fn in fns:
                try:
                    record(fn(remote))
                except Exception as e:  # boundary: one crashing check must not abort the run
                    record(Check(name, group, False, f"异常: {e}",
                                 detail=f"type={type(e).__name__}\n{type(e).__doc__ or ''}",
                                 severity="error"))
            print()
    finally:
        remote.close()

    report.finished_at = time.strftime("%Y-%m-%dT%H:%M:%S%z")
    ok, bad, err = report.summary()
    print("==== 汇总 ====")
    print(f" 合计 {len(report.checks)} 项 · 通过 {ok} · 失败 {bad} · 严重错误 {err}")
    if err > 0:
        print(f" ✗ 存在 {err} 个 error 级问题,建议立即排查")
        code = 2
    elif bad > 0:
        print(f" ⚠ 存在 {bad} 个 warn 级问题,建议看一下")
        code = 1
    else:
        print(" ✓ 全部通过")
        code = 0

    if args.json_out:
        with open(args.json_out, "w", encoding="utf-8") as f:
            json.dump(asdict(report), f, ensure_ascii=False, indent=2)
        print(f" 报告已写入: {args.json_out}")

    return code
|
||
|
||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)