"""检查去重逻辑 + 启动 HTTP 实测。""" import os, paramiko, json PW = os.environ["REMOTE_PASS"] c = paramiko.SSHClient() c.set_missing_host_key_policy(paramiko.AutoAddPolicy()) c.connect("207.57.129.228", port=19717, username="root", password=PW, timeout=15, allow_agent=False, look_for_keys=False) def run(cmd, t=15): si, so, se = c.exec_command(cmd, timeout=t) out = so.read().decode("utf-8", "replace") err = se.read().decode("utf-8", "replace") rc = so.channel.recv_exit_status() if out: print(out, end="") return out # ========== 1. 启动 HTTP 看看 ========== print("=" * 60) print("1. HTTP 实测") print("=" * 60) # 首页 (Caddy 转发到 frontend) out = run("curl -sS -o /tmp/idx.html -w 'status=%{http_code} size=%{size_download} type=%{content_type}\\n' http://207.57.129.228/") print(f"\n[GET /]") print(f" -> {out.strip()}") out = run("head -c 200 /tmp/idx.html") print(f" body[0:200]: {out}") # /api/v1/healthz out = run("curl -sS -w '\\nstatus=%{http_code}\\n' http://207.57.129.228/api/v1/healthz") print(f"\n[GET /api/v1/healthz]") print(f" -> {out.strip()}") # /api/v1/articles (没 token 应该 401) out = run("curl -sS -w '\\nstatus=%{http_code}\\n' http://207.57.129.228/api/v1/articles?limit=3") print(f"\n[GET /api/v1/articles 无 token]") print(f" -> {out.strip()[:300]}") # 登录 out = run("curl -sS -X POST http://207.57.129.228/api/v1/auth/login -H 'Content-Type: application/json' -d '{\"username\":\"owner\",\"password\":\"Owner2026!\"}'") data = json.loads(out) token = data.get("access_token", "") print(f"\n[POST /api/v1/auth/login]") print(f" -> token: {token[:40]}...") # /api/v1/articles 带 token out = run("curl -sS -w '\nstatus=%{http_code}\n' -H 'Authorization: Bearer " + token + "' 'http://207.57.129.228/api/v1/articles?limit=2'") print(f"\n[GET /api/v1/articles?limit=2 带 token]") print(f" -> {out.strip()[:500]}") # 测静态资源(favicon) out = run("curl -sS -o /dev/null -w 'status=%{http_code} type=%{content_type}\\n' http://207.57.129.228/favicon.svg") print(f"\n[GET /favicon.svg]") print(f" -> {out.strip()}") # ========== 2. 去重审计 ========== print("\n" + "=" * 60) print("2. 去重审计") print("=" * 60) # a) 同一 url_hash 重复数(应该是 0,UNIQUE 约束) out = run("docker exec news-aggregator-postgres-1 psql -U news -d news -tA -c \"SELECT count(*) - count(DISTINCT url_hash) FROM articles;\"") print(f"\n[a) 重复 url_hash 数量(应为 0): {out.strip()}") # b) 同一 url 重复数(可能 url_hash 已经 normalize 过,检查原始 url) out = run("docker exec news-aggregator-postgres-1 psql -U news -d news -tA -c \"SELECT url, count(*) FROM articles GROUP BY url HAVING count(*) > 1 LIMIT 5;\"") print(f"\n[b) 重复 URL(可能含 utm_* 差异):") print(f" {out if out.strip() else ' (无)'}") # c) 同源 / 同标题 / 同一天的,看是不是转载 print("\n[c] 标题相似度去重检查(前 50 字符完全相同):") out = run("docker exec news-aggregator-postgres-1 psql -U news -d news -tA -c \"SELECT LEFT(title, 60), count(*), array_agg(DISTINCT source_id) FROM articles GROUP BY LEFT(title, 60) HAVING count(*) > 1 ORDER BY count(*) DESC LIMIT 5;\"") print(f" {out if out.strip() else ' (无)'}") # d) duplicate_of 字段使用情况 out = run("docker exec news-aggregator-postgres-1 psql -U news -d news -tA -c \"SELECT count(*) FROM articles WHERE duplicate_of IS NOT NULL;\"") print(f"\n[d) duplicate_of 非空的 article 数: {out.strip()}") # e) 抓取日志:reuters 失败时是不是会反复重试 out = run("docker exec news-aggregator-postgres-1 psql -U news -d news -tA -c \"SELECT slug, last_status, consecutive_failures, fetch_interval_min FROM sources ORDER BY id;\"") print(f"\n[e) 源状态(reuters 失败后 interval 翻倍,看是不是还在重试):") print(out) c.close()