"""Check the de-duplication logic and run live HTTP tests."""

import json
import os
import shlex
import sys

import paramiko

# The SSH password is taken from the environment; a missing REMOTE_PASS
# raises KeyError right here, before any connection is attempted.
PW = os.environ["REMOTE_PASS"]

# Single SSH client shared by the rest of the script.
c = paramiko.SSHClient()
# NOTE(review): AutoAddPolicy accepts host keys that are not already known
# instead of rejecting them - confirm this is acceptable for this host.
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
c.connect(
    "207.57.129.228",
    port=19717,
    username="root",
    password=PW,
    timeout=15,
    # Password-only login: skip the SSH agent and on-disk key discovery.
    allow_agent=False,
    look_for_keys=False,
)


def run(cmd: str, t: int = 15) -> str:
    """Run ``cmd`` on the remote host through the module-level SSH client ``c``.

    The command's stdout is echoed unchanged to this process's stdout. Anything
    the command wrote to stderr, and a non-zero exit status, are reported on
    this process's stderr so that a failing remote command is not silently
    mistaken for "no output".

    Args:
        cmd: Shell command line handed to ``exec_command``.
        t: Timeout in seconds handed to ``exec_command``.

    Returns:
        The command's stdout decoded as UTF-8 (undecodable bytes replaced).
    """
    _stdin, stdout, stderr = c.exec_command(cmd, timeout=t)
    out = stdout.read().decode("utf-8", "replace")
    err = stderr.read().decode("utf-8", "replace")
    rc = stdout.channel.recv_exit_status()

    if out:
        print(out, end="")
    if err.strip():
        print(f"[stderr] {err.rstrip()}", file=sys.stderr)
    if rc != 0:
        # The command line itself is deliberately not echoed: callers embed
        # credentials and bearer tokens in it.
        print(f"[exit status {rc}]", file=sys.stderr)
    return out


# ========== 1. Live HTTP checks ==========
# Every request below is issued with curl on the remote host via run().
BASE_URL = "http://207.57.129.228"
# curl --write-out template: append the HTTP status on its own line.
STATUS_SUFFIX = r"\nstatus=%{http_code}\n"


def _sh(*args):
    """Join ``args`` into one shell command line, quoting each argument."""
    return " ".join(shlex.quote(arg) for arg in args)


print("=" * 60)
print("1. HTTP 实测")
print("=" * 60)

# Home page (Caddy forwards it to the frontend): store the body in a temp
# file and report status, size and content type, then show the first bytes.
out = run(_sh(
    "curl", "-sS", "-o", "/tmp/idx.html",
    "-w", r"status=%{http_code} size=%{size_download} type=%{content_type}\n",
    BASE_URL + "/",
))
print("\n[GET /]")
print(f" -> {out.strip()}")
out = run("head -c 200 /tmp/idx.html")
print(f" body[0:200]: {out}")

# /api/v1/healthz
out = run(_sh("curl", "-sS", "-w", STATUS_SUFFIX, BASE_URL + "/api/v1/healthz"))
print("\n[GET /api/v1/healthz]")
print(f" -> {out.strip()}")

# /api/v1/articles without a token (expected to answer 401).
# The URL contains "?", a shell glob character, so it must be quoted.
out = run(_sh("curl", "-sS", "-w", STATUS_SUFFIX, BASE_URL + "/api/v1/articles?limit=3"))
print("\n[GET /api/v1/articles 无 token]")
print(f" -> {out.strip()[:300]}")

# Login.
# NOTE(review): these application credentials are hard-coded in the script,
# unlike the SSH password which is read from the environment - consider
# moving them to environment variables as well.
login_body = json.dumps(
    {"username": "owner", "password": "Owner2026!"}, separators=(",", ":")
)
out = run(_sh(
    "curl", "-sS", "-X", "POST", BASE_URL + "/api/v1/auth/login",
    "-H", "Content-Type: application/json",
    "-d", login_body,
))
# A failed login may return an error page, an empty body or a JSON value that
# is not an object; none of those should abort the remaining checks.
try:
    data = json.loads(out)
except ValueError:  # json.JSONDecodeError is a ValueError subclass
    data = None
token = data.get("access_token") if isinstance(data, dict) else None
if not isinstance(token, str):
    token = ""
print("\n[POST /api/v1/auth/login]")
if token:
    print(f" -> token: {token[:40]}...")
else:
    print(f" -> login failed, no access_token in response: {out.strip()[:300]}")

# /api/v1/articles with the bearer token. The token comes from the server, so
# it is shell-quoted (via _sh) instead of being pasted into the command line.
if token:
    out = run(_sh(
        "curl", "-sS", "-w", STATUS_SUFFIX,
        "-H", "Authorization: Bearer " + token,
        BASE_URL + "/api/v1/articles?limit=2",
    ))
    print("\n[GET /api/v1/articles?limit=2 带 token]")
    print(f" -> {out.strip()[:500]}")
else:
    print("\n[GET /api/v1/articles?limit=2 带 token]")
    print(" -> skipped: no token available")

# Static asset check (favicon): only status and content type are of interest.
out = run(_sh(
    "curl", "-sS", "-o", "/dev/null",
    "-w", r"status=%{http_code} type=%{content_type}\n",
    BASE_URL + "/favicon.svg",
))
print("\n[GET /favicon.svg]")
print(f" -> {out.strip()}")


# ========== 2. De-duplication audit ==========
# Length of the title prefix used to spot near-identical titles. The SQL and
# the printed heading both derive from this value so they cannot disagree.
TITLE_PREFIX_LEN = 60


def _psql(sql):
    """Run ``sql`` with psql inside the Postgres container; return its stdout.

    ``-tA`` asks psql for tuples only, unaligned, which keeps the output easy
    to print. The statement is shell-quoted so it reaches psql verbatim.
    """
    return run(
        "docker exec news-aggregator-postgres-1 psql -U news -d news -tA -c "
        + shlex.quote(sql)
    )


try:
    print("\n" + "=" * 60)
    print("2. 去重审计")
    print("=" * 60)

    # a) Rows sharing a url_hash (expected 0; the original author notes a
    #    UNIQUE constraint on that column).
    out = _psql("SELECT count(*) - count(DISTINCT url_hash) FROM articles;")
    print(f"\n[a) 重复 url_hash 数量(应为 0): {out.strip()}")

    # b) Rows sharing the raw url (url_hash may already be computed from a
    #    normalized URL, so the original url column is checked separately).
    out = _psql(
        "SELECT url, count(*) FROM articles "
        "GROUP BY url HAVING count(*) > 1 LIMIT 5;"
    )
    print(f"\n[b) 重复 URL(可能含 utm_* 差异):")
    print(f" {out if out.strip() else ' (无)'}")

    # c) Articles whose title prefix is identical, with the sources they came
    #    from - a hint that the same story was republished.
    print(f"\n[c] 标题相似度去重检查(前 {TITLE_PREFIX_LEN} 字符完全相同):")
    out = _psql(
        f"SELECT LEFT(title, {TITLE_PREFIX_LEN}), count(*), "
        "array_agg(DISTINCT source_id) FROM articles "
        f"GROUP BY LEFT(title, {TITLE_PREFIX_LEN}) "
        "HAVING count(*) > 1 ORDER BY count(*) DESC LIMIT 5;"
    )
    print(f" {out if out.strip() else ' (无)'}")

    # d) How many articles have the duplicate_of column populated.
    out = _psql("SELECT count(*) FROM articles WHERE duplicate_of IS NOT NULL;")
    print(f"\n[d) duplicate_of 非空的 article 数: {out.strip()}")

    # e) Per-source fetch state: shows whether a failing source (reuters) is
    #    still being retried and what its current interval is.
    out = _psql(
        "SELECT slug, last_status, consecutive_failures, fetch_interval_min "
        "FROM sources ORDER BY id;"
    )
    print(f"\n[e) 源状态(reuters 失败后 interval 翻倍,看是不是还在重试):")
    print(out)
finally:
    # Close the SSH connection even if one of the audit commands raised.
    c.close()