56 lines
2.7 KiB
Python
56 lines
2.7 KiB
Python
|
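"""Force a full re-fetch of every article.

1. Back up the existing articles (209 at the time of writing) to /tmp/articles_backup.sql
2. DELETE FROM articles
3. Trigger run_once so the worker re-fetches (trafilatura extracts the full text)
4. Wait 30 seconds, then inspect the new data
"""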
|
||
|
|
import os, paramiko, json, time
|
||
|
|
# Remote password is taken from the environment; a missing REMOTE_PASS
# raises KeyError right here, before any connection is attempted.
PW = os.environ["REMOTE_PASS"]

# Module-level SSH client shared by the rest of the script.
c = paramiko.SSHClient()
# NOTE(review): AutoAddPolicy trusts host keys it has never seen before.
# Confirm that is acceptable for a root login on this host.
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
# Password-only authentication: the SSH agent and on-disk keys are disabled.
c.connect(
    hostname="207.57.129.228",
    port=19717,
    username="root",
    password=PW,
    timeout=15,
    allow_agent=False,
    look_for_keys=False,
)
|
||
|
|
def run(cmd, t=60, check=False):
    """Run ``cmd`` on the remote host through the module-level SSH client ``c``.

    Stdout is echoed to local stdout. Stderr is echoed to local stderr, minus
    any line that contains "Warning". A non-zero exit status is always
    reported on stderr so that failures are no longer silent.

    Args:
        cmd: Shell command line to execute remotely.
        t: Timeout in seconds handed to ``exec_command``.
        check: When true, raise ``RuntimeError`` on a non-zero exit status.

    Returns:
        The command's stdout decoded as UTF-8 (undecodable bytes replaced).

    Raises:
        RuntimeError: ``check`` is true and the command exited non-zero.
    """
    import sys  # local import: the file's import line does not provide sys

    _, so, se = c.exec_command(cmd, timeout=t)
    out = so.read().decode("utf-8", "replace")
    err = se.read().decode("utf-8", "replace")
    # Both streams are drained before the exit status is requested, in the
    # same order as before.
    rc = so.channel.recv_exit_status()

    if out:
        print(out, end="")

    # Filter warnings line by line: one warning must not hide real errors
    # that arrive in the same stderr stream.
    real_err = "".join(
        line for line in err.splitlines(keepends=True) if "Warning" not in line
    )
    if real_err:
        print(real_err, end="", file=sys.stderr)

    if rc != 0:
        print(f"[run] exit status {rc}: {cmd}", file=sys.stderr)
        if check:
            raise RuntimeError(f"remote command failed with exit status {rc}: {cmd}")
    return out
|
||
|
|
|
||
|
|
# Command prefix shared by every psql invocation below.
PSQL = "docker exec news-aggregator-postgres-1 psql -U news -d news"

try:
    # 1) Back up the table. The trailing `test -s ... && echo BACKUP_OK` only
    #    runs when pg_dump succeeded and left a non-empty file behind.
    print("=== 1. 备份 209 篇文章到 /tmp/articles_backup.sql ===")
    marker = run(
        "docker exec news-aggregator-postgres-1 pg_dump -U news -d news -t articles --data-only --column-inserts > /tmp/articles_backup.sql"
        " && test -s /tmp/articles_backup.sql && echo BACKUP_OK"
    )
    if "BACKUP_OK" not in marker:
        # Never delete anything without a usable backup on disk.
        raise SystemExit("备份失败或备份文件为空,已中止,未执行 DELETE")
    out = run("ls -la /tmp/articles_backup.sql | awk '{print $5, $9}'")
    print(f"  备份文件: {out.strip()}")
    out = run(f'{PSQL} -tA -c "SELECT count(*) FROM articles;"')
    print(f"  当前文章数: {out.strip()}")

    # 2) Delete every row. run() already echoes the command output, so it is
    #    not printed a second time here.
    print("\n=== 2. DELETE 所有文章 ===")
    run(f'{PSQL} -c "DELETE FROM articles;"')
    out = run(f'{PSQL} -tA -c "SELECT count(*) FROM articles;"')
    print(f"  删后文章数: {out.strip()}")

    # 3) Trigger run_once inside the worker container; only the last 10 lines
    #    of its combined output are shown.
    print("\n=== 3. 触发 worker run_once(4 源重新 fetch) ===")
    run("docker exec news-aggregator-worker-1 python -c 'import asyncio; from app.workers.pipeline import run_once; asyncio.run(run_once())' 2>&1 | tail -10", t=120)

    # 4) Wait 30 seconds, then summarise what has been stored.
    print("\n=== 4. 30 秒后看新数据 ===")
    time.sleep(30)
    run(f'{PSQL} -c "SELECT count(*) AS total, count(CASE WHEN length(body_text) > 1000 THEN 1 END) AS long_body, avg(length(body_text))::int AS avg_len FROM articles;"')

    # 5) List body lengths of the first 10 rows to judge whether full-text
    #    extraction took effect.
    print("\n=== 5. 看 RSS 摘要 vs trafilatura 全文 ===")
    run(f'{PSQL} -c "SELECT id, source_id, LEFT(title, 50) AS title, length(body_text) AS body_len FROM articles ORDER BY id LIMIT 10;"')

    # 6) Distribution of translation_status values.
    print("\n=== 6. 翻译状态 ===")
    run(f'{PSQL} -c "SELECT translation_status, count(*) FROM articles GROUP BY 1 ORDER BY 1;"')
finally:
    # Close the SSH connection even when a step above aborts.
    c.close()
|