scripts/_http_check.py

"""检查去重逻辑 + 启动 HTTP 实测。"""
import json
import os
import shlex
import sys

import paramiko
# SSH password is taken from the REMOTE_PASS environment variable; a missing
# variable raises KeyError before any connection is attempted.
PW = os.environ["REMOTE_PASS"]

# Module-level SSH client used by every remote command in this script.
c = paramiko.SSHClient()
# NOTE(review): AutoAddPolicy accepts unknown host keys without verification —
# confirm this is acceptable for the target host.
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
# Password-only login: the SSH agent and on-disk key lookup are both disabled.
c.connect(
    "207.57.129.228",
    port=19717,
    username="root",
    password=PW,
    timeout=15,
    allow_agent=False,
    look_for_keys=False,
)
def run(cmd, t=15):
    """Execute *cmd* on the remote host and return its decoded stdout.

    Stdout is echoed to the local stdout as before. Stderr output and a
    non-zero exit status are reported on the local stderr instead of being
    discarded, so a failing remote command is visible to the operator.

    Args:
        cmd: Shell command line to execute over the shared SSH client ``c``.
        t: Timeout in seconds passed to ``exec_command``.

    Returns:
        The command's stdout decoded as UTF-8, with undecodable bytes replaced.
    """
    _stdin, stdout, stderr = c.exec_command(cmd, timeout=t)
    out = stdout.read().decode("utf-8", "replace")
    err = stderr.read().decode("utf-8", "replace")
    # Read both streams first, then collect the exit status of the command.
    rc = stdout.channel.recv_exit_status()
    if out:
        print(out, end="")
    if err:
        print(err, end="" if err.endswith("\n") else "\n", file=sys.stderr)
    if rc != 0:
        # The command text is deliberately not echoed: it may embed credentials.
        print(f"[run] remote command exited with status {rc}", file=sys.stderr)
    return out

# ========== 1. Live HTTP checks ==========
# Every request below is issued with curl on the remote host through run().
print("=" * 60)
print("1. HTTP 实测")
print("=" * 60)

# Landing page (Caddy forwards to the frontend): the body is saved to a temp
# file, curl reports status / size / content type, then the first 200 bytes
# of the saved body are shown.
out = run("curl -sS -o /tmp/idx.html -w 'status=%{http_code} size=%{size_download} type=%{content_type}\\n' http://207.57.129.228/")
print("\n[GET /]")
print(f"  -> {out.strip()}")
out = run("head -c 200 /tmp/idx.html")
print(f"  body[0:200]: {out}")

# /api/v1/healthz
out = run("curl -sS -w '\\nstatus=%{http_code}\\n' http://207.57.129.228/api/v1/healthz")
print("\n[GET /api/v1/healthz]")
print(f"  -> {out.strip()}")

# /api/v1/articles without a token (expected to answer 401).
# The URL is quoted so the remote shell never treats '?' as a glob character.
out = run("curl -sS -w '\\nstatus=%{http_code}\\n' 'http://207.57.129.228/api/v1/articles?limit=3'")
print("\n[GET /api/v1/articles 无 token]")
print(f"  -> {out.strip()[:300]}")

# Login.
# NOTE(review): the credentials are hard-coded in this command — consider
# moving them to environment variables, as is already done for REMOTE_PASS.
out = run("curl -sS -X POST http://207.57.129.228/api/v1/auth/login -H 'Content-Type: application/json' -d '{\"username\":\"owner\",\"password\":\"Owner2026!\"}'")
token = ""
try:
    data = json.loads(out)
except json.JSONDecodeError:
    # An empty or non-JSON body (curl failure, proxy error page) must not
    # abort the remaining checks; continue with an empty token instead.
    print("  !! login response is not valid JSON", file=sys.stderr)
else:
    if isinstance(data, dict):
        # Tolerate a missing or null access_token and coerce to str so the
        # slicing below cannot fail.
        token = str(data.get("access_token") or "")
print("\n[POST /api/v1/auth/login]")
print(f"  -> token: {token[:40]}...")

# /api/v1/articles with the bearer token.
# The header value is shell-quoted because the token is server-provided text.
auth_header = shlex.quote("Authorization: Bearer " + token)
out = run("curl -sS -w '\\nstatus=%{http_code}\\n' -H " + auth_header + " 'http://207.57.129.228/api/v1/articles?limit=2'")
print("\n[GET /api/v1/articles?limit=2 带 token]")
print(f"  -> {out.strip()[:500]}")

# Static asset check (favicon)
out = run("curl -sS -o /dev/null -w 'status=%{http_code} type=%{content_type}\\n' http://207.57.129.228/favicon.svg")
print("\n[GET /favicon.svg]")
print(f"  -> {out.strip()}")

# ========== 2. Deduplication audit ==========
print("\n" + "=" * 60)
print("2. 去重审计")
print("=" * 60)

# Length of the title prefix compared in check (c). It feeds both the SQL and
# the printed heading so the two cannot drift apart.
TITLE_PREFIX_LEN = 60


def _psql(sql):
    """Run *sql* with psql inside the Postgres container and return its stdout.

    The statement is shell-quoted, so it may contain any characters.
    """
    return run("docker exec news-aggregator-postgres-1 psql -U news -d news -tA -c " + shlex.quote(sql))


# a) Rows sharing a url_hash (expected 0 because of the UNIQUE constraint)
out = _psql("SELECT count(*) - count(DISTINCT url_hash) FROM articles;")
print(f"\n[a) 重复 url_hash 数量(应为 0): {out.strip()}")

# b) Rows sharing the raw url (url_hash may already be normalised, so the
#    original url column is checked separately)
out = _psql("SELECT url, count(*) FROM articles GROUP BY url HAVING count(*) > 1 LIMIT 5;")
print("\n[b) 重复 URL(可能含 utm_* 差异):")
print(f"  {out if out.strip() else '  (无)'}")

# c) Articles whose title prefix is identical, to spot possible reposts
print(f"\n[c] 标题相似度去重检查(前 {TITLE_PREFIX_LEN} 字符完全相同):")
out = _psql(
    f"SELECT LEFT(title, {TITLE_PREFIX_LEN}), count(*), array_agg(DISTINCT source_id) "
    f"FROM articles GROUP BY LEFT(title, {TITLE_PREFIX_LEN}) "
    "HAVING count(*) > 1 ORDER BY count(*) DESC LIMIT 5;"
)
print(f"  {out if out.strip() else '  (无)'}")

# d) How many articles have the duplicate_of column populated
out = _psql("SELECT count(*) FROM articles WHERE duplicate_of IS NOT NULL;")
print(f"\n[d) duplicate_of 非空的 article 数: {out.strip()}")

# e) Per-source fetch state, to see whether a failing source (e.g. reuters)
#    keeps being retried
out = _psql("SELECT slug, last_status, consecutive_failures, fetch_interval_min FROM sources ORDER BY id;")
print("\n[e) 源状态(reuters 失败后 interval 翻倍,看是不是还在重试):")
print(out)
c.close()
fix: articles.py get_article 链式 await coroutine 报错(.first()) 2026-06-08 00:19:03 +08:00			`"""检查去重逻辑 + 启动 HTTP 实测。"""`
			`import os, paramiko, json`
			`PW = os.environ["REMOTE_PASS"]`
			`c = paramiko.SSHClient()`
			`c.set_missing_host_key_policy(paramiko.AutoAddPolicy())`
			`c.connect("207.57.129.228", port=19717, username="root", password=PW, timeout=15, allow_agent=False, look_for_keys=False)`
			`def run(cmd, t=15):`
			`si, so, se = c.exec_command(cmd, timeout=t)`
			`out = so.read().decode("utf-8", "replace")`
			`err = se.read().decode("utf-8", "replace")`
			`rc = so.channel.recv_exit_status()`
			`if out: print(out, end="")`
			`return out`

			`# ========== 1. 启动 HTTP 看看 ==========`
			`print("=" * 60)`
			`print("1. HTTP 实测")`
			`print("=" * 60)`

			`# 首页 (Caddy 转发到 frontend)`
			`out = run("curl -sS -o /tmp/idx.html -w 'status=%{http_code} size=%{size_download} type=%{content_type}\\n' http://207.57.129.228/")`
			`print(f"\n[GET /]")`
			`print(f" -> {out.strip()}")`
			`out = run("head -c 200 /tmp/idx.html")`
			`print(f" body[0:200]: {out}")`

			`# /api/v1/healthz`
			`out = run("curl -sS -w '\\nstatus=%{http_code}\\n' http://207.57.129.228/api/v1/healthz")`
			`print(f"\n[GET /api/v1/healthz]")`
			`print(f" -> {out.strip()}")`

			`# /api/v1/articles (没 token 应该 401)`
			`out = run("curl -sS -w '\\nstatus=%{http_code}\\n' http://207.57.129.228/api/v1/articles?limit=3")`
			`print(f"\n[GET /api/v1/articles 无 token]")`
			`print(f" -> {out.strip()[:300]}")`

			`# 登录`
			`out = run("curl -sS -X POST http://207.57.129.228/api/v1/auth/login -H 'Content-Type: application/json' -d '{\"username\":\"owner\",\"password\":\"Owner2026!\"}'")`
			`data = json.loads(out)`
			`token = data.get("access_token", "")`
			`print(f"\n[POST /api/v1/auth/login]")`
			`print(f" -> token: {token[:40]}...")`

			`# /api/v1/articles 带 token`
			`out = run("curl -sS -w '\nstatus=%{http_code}\n' -H 'Authorization: Bearer " + token + "' 'http://207.57.129.228/api/v1/articles?limit=2'")`
			`print(f"\n[GET /api/v1/articles?limit=2 带 token]")`
			`print(f" -> {out.strip()[:500]}")`

			`# 测静态资源(favicon)`
			`out = run("curl -sS -o /dev/null -w 'status=%{http_code} type=%{content_type}\\n' http://207.57.129.228/favicon.svg")`
			`print(f"\n[GET /favicon.svg]")`
			`print(f" -> {out.strip()}")`

			`# ========== 2. 去重审计 ==========`
			`print("\n" + "=" * 60)`
			`print("2. 去重审计")`
			`print("=" * 60)`

			`# a) 同一 url_hash 重复数(应该是 0,UNIQUE 约束)`
			`out = run("docker exec news-aggregator-postgres-1 psql -U news -d news -tA -c \"SELECT count(*) - count(DISTINCT url_hash) FROM articles;\"")`
			`print(f"\n[a) 重复 url_hash 数量(应为 0): {out.strip()}")`

			`# b) 同一 url 重复数(可能 url_hash 已经 normalize 过,检查原始 url)`
			`out = run("docker exec news-aggregator-postgres-1 psql -U news -d news -tA -c \"SELECT url, count(*) FROM articles GROUP BY url HAVING count(*) > 1 LIMIT 5;\"")`
			`print(f"\n[b) 重复 URL(可能含 utm_* 差异):")`
			`print(f" {out if out.strip() else ' (无)'}")`

			`# c) 同源 / 同标题 / 同一天的,看是不是转载`
			`print("\n[c] 标题相似度去重检查(前 50 字符完全相同):")`
			`out = run("docker exec news-aggregator-postgres-1 psql -U news -d news -tA -c \"SELECT LEFT(title, 60), count(*), array_agg(DISTINCT source_id) FROM articles GROUP BY LEFT(title, 60) HAVING count(*) > 1 ORDER BY count(*) DESC LIMIT 5;\"")`
			`print(f" {out if out.strip() else ' (无)'}")`

			`# d) duplicate_of 字段使用情况`
			`out = run("docker exec news-aggregator-postgres-1 psql -U news -d news -tA -c \"SELECT count(*) FROM articles WHERE duplicate_of IS NOT NULL;\"")`
			`print(f"\n[d) duplicate_of 非空的 article 数: {out.strip()}")`

			`# e) 抓取日志:reuters 失败时是不是会反复重试`
			`out = run("docker exec news-aggregator-postgres-1 psql -U news -d news -tA -c \"SELECT slug, last_status, consecutive_failures, fetch_interval_min FROM sources ORDER BY id;\"")`
			`print(f"\n[e) 源状态(reuters 失败后 interval 翻倍,看是不是还在重试):")`
			`print(out)`
			`c.close()`