Files
diary-news/scripts/_http_check.py

82 lines
3.8 KiB
Python
Raw Normal View History

"""检查去重逻辑 + 启动 HTTP 实测。"""
import os, paramiko, json
# The remote root password is taken from the environment; a missing
# REMOTE_PASS raises KeyError here, before any connection is attempted.
PW = os.environ["REMOTE_PASS"]

c = paramiko.SSHClient()
# NOTE(review): AutoAddPolicy accepts any unknown host key without
# verification. Acceptable for an ad-hoc check; confirm before reusing this
# anywhere that host identity matters.
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
# Password-only authentication: the SSH agent and on-disk keys are disabled.
c.connect(
    "207.57.129.228",
    port=19717,
    username="root",
    password=PW,
    timeout=15,
    allow_agent=False,
    look_for_keys=False,
)
def run(cmd, t=15):
    """Run *cmd* on the remote host through the module-level SSH client ``c``.

    The command's stdout is echoed locally as-is and also returned, so callers
    can both display and parse it. Anything the command wrote to stderr, and a
    non-zero exit status, are reported on the local stderr instead of being
    read and silently dropped.

    Args:
        cmd: Shell command line handed to ``c.exec_command``.
        t: Value passed as ``timeout`` to ``c.exec_command``.

    Returns:
        The command's stdout decoded as UTF-8, with undecodable bytes replaced.
    """
    import sys  # local import: only needed for the diagnostics below

    _stdin, stdout, stderr = c.exec_command(cmd, timeout=t)
    out = stdout.read().decode("utf-8", "replace")
    err = stderr.read().decode("utf-8", "replace")
    rc = stdout.channel.recv_exit_status()
    if out:
        print(out, end="")
    if err:
        print(err, end="", file=sys.stderr)
    if rc != 0:
        # The command text is deliberately not echoed: it may carry credentials.
        print(f"[run] remote command exited with status {rc}", file=sys.stderr)
    return out
# ========== 1. Exercise the HTTP endpoints ==========
# Every request is issued with curl *on the remote host* via run().
BASE_URL = "http://207.57.129.228"
# curl --write-out option that prints the HTTP status on its own line after
# the response body. The remote shell receives a literal backslash-n, which
# curl expands to a newline.
STATUS_SUFFIX = "-w '\\nstatus=%{http_code}\\n'"

print("=" * 60)
print("1. HTTP 实测")
print("=" * 60)

# Home page (Caddy forwards to the frontend)
out = run(
    "curl -sS -o /tmp/idx.html"
    " -w 'status=%{http_code} size=%{size_download} type=%{content_type}\\n' "
    + BASE_URL + "/"
)
print("\n[GET /]")
print(f" -> {out.strip()}")
out = run("head -c 200 /tmp/idx.html")
print(f" body[0:200]: {out}")

# /api/v1/healthz
out = run("curl -sS " + STATUS_SUFFIX + " " + BASE_URL + "/api/v1/healthz")
print("\n[GET /api/v1/healthz]")
print(f" -> {out.strip()}")

# /api/v1/articles without a token (expected to be rejected with 401).
# The URL is single-quoted so the remote shell never treats "?" as a glob.
out = run("curl -sS " + STATUS_SUFFIX + " '" + BASE_URL + "/api/v1/articles?limit=3'")
print("\n[GET /api/v1/articles 无 token]")
print(f" -> {out.strip()[:300]}")

# Login
# NOTE(review): the login credentials are hard-coded in this command; consider
# sourcing them from the environment like REMOTE_PASS.
out = run(
    "curl -sS -X POST " + BASE_URL + "/api/v1/auth/login"
    " -H 'Content-Type: application/json'"
    " -d '{\"username\":\"owner\",\"password\":\"Owner2026!\"}'"
)
try:
    data = json.loads(out)
except json.JSONDecodeError as exc:
    # An empty or non-JSON body (curl failure, proxy error page, ...) must not
    # abort the rest of the checks; continue with an empty token instead.
    print(f"\n[POST /api/v1/auth/login] response is not valid JSON: {exc}")
    data = {}
token = data.get("access_token") if isinstance(data, dict) else None
if not isinstance(token, str):
    # Missing, null or non-string token: fall back to the empty string so the
    # slicing and string concatenation below stay valid.
    token = ""
print("\n[POST /api/v1/auth/login]")
print(f" -> token: {token[:40]}...")

# /api/v1/articles with the bearer token
# NOTE(review): the token is spliced into a single-quoted shell argument; a
# token containing a single quote would break the command.
out = run(
    "curl -sS " + STATUS_SUFFIX
    + " -H 'Authorization: Bearer " + token + "'"
    + " '" + BASE_URL + "/api/v1/articles?limit=2'"
)
print("\n[GET /api/v1/articles?limit=2 带 token]")
print(f" -> {out.strip()[:500]}")

# Static asset check (favicon)
out = run(
    "curl -sS -o /dev/null -w 'status=%{http_code} type=%{content_type}\\n' "
    + BASE_URL + "/favicon.svg"
)
print("\n[GET /favicon.svg]")
print(f" -> {out.strip()}")
# ========== 2. De-duplication audit ==========
# Common prefix for running one SQL statement with psql inside the Postgres
# container; -tA yields tuples-only, unaligned output.
PSQL_PREFIX = "docker exec news-aggregator-postgres-1 psql -U news -d news -tA -c "
# Number of leading title characters compared in check (c). One constant
# drives both the SQL and the printed label so they cannot drift apart.
TITLE_PREFIX_LEN = 60


def _psql(sql):
    """Run one SQL statement via psql on the remote host and return its stdout.

    *sql* is wrapped in double quotes for the remote shell, so it must not
    itself contain double quotes or shell-expandable characters such as ``$``.
    """
    return run(PSQL_PREFIX + '"' + sql + '"')


print("\n" + "=" * 60)
print("2. 去重审计")
print("=" * 60)

# a) Rows sharing a url_hash (should be 0 given the UNIQUE constraint)
out = _psql("SELECT count(*) - count(DISTINCT url_hash) FROM articles;")
print(f"\n[a) 重复 url_hash 数量(应为 0): {out.strip()}")

# b) Rows sharing the raw url (url_hash may already be normalized, so the
#    original url column is checked separately)
out = _psql("SELECT url, count(*) FROM articles GROUP BY url HAVING count(*) > 1 LIMIT 5;")
print("\n[b) 重复 URL(可能含 utm_* 差异):")
print(f" {out if out.strip() else ' (无)'}")

# c) Articles whose titles share the same leading characters, listed with the
#    distinct sources they came from (possible reprints)
print(f"\n[c] 标题相似度去重检查(前 {TITLE_PREFIX_LEN} 字符完全相同):")
out = _psql(
    f"SELECT LEFT(title, {TITLE_PREFIX_LEN}), count(*), array_agg(DISTINCT source_id)"
    f" FROM articles GROUP BY LEFT(title, {TITLE_PREFIX_LEN})"
    " HAVING count(*) > 1 ORDER BY count(*) DESC LIMIT 5;"
)
print(f" {out if out.strip() else ' (无)'}")

# d) How many articles have the duplicate_of column populated
out = _psql("SELECT count(*) FROM articles WHERE duplicate_of IS NOT NULL;")
print(f"\n[d) duplicate_of 非空的 article 数: {out.strip()}")

# e) Per-source fetch state, to see whether a failing source (reuters) keeps
#    being retried
out = _psql("SELECT slug, last_status, consecutive_failures, fetch_interval_min FROM sources ORDER BY id;")
print("\n[e) 源状态(reuters 失败后 interval 翻倍,看是不是还在重试):")
print(out)

c.close()