Files
diary-news/scripts/_force_refetch.py

56 lines
2.7 KiB
Python
Raw Normal View History

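"""Force a full-text re-fetch of all articles.

1. Back up the 209 articles to /tmp/articles_backup.sql
2. DELETE FROM articles
3. Trigger the run_once worker to re-fetch (trafilatura extracts the full text)
4. Wait 30 seconds, then inspect the new data (counts, body lengths,
   translation status)
"""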
import os, paramiko, json, time
# The remote password is read from the environment; a missing REMOTE_PASS
# raises KeyError here, before any connection is attempted.
PW = os.environ["REMOTE_PASS"]

# Module-level SSH client shared by every remote command in this script.
# NOTE(review): AutoAddPolicy accepts any host key that is not already known,
# and this session authenticates as root with a password — confirm that this
# trust model is acceptable for the target host.
c = paramiko.SSHClient()
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
c.connect(
    "207.57.129.228",
    port=19717,
    username="root",
    password=PW,
    timeout=15,
    # Password-only authentication: no SSH agent, no key files on disk.
    allow_agent=False,
    look_for_keys=False,
)
def run(cmd, t=60):
    """Run *cmd* on the remote host through the module-level SSH client ``c``.

    The command's stdout is echoed to local stdout. Its stderr is echoed to
    local stderr unless it contains the substring "Warning". A non-zero remote
    exit status is reported on local stderr rather than silently discarded.

    Args:
        cmd: Shell command line to execute on the remote host.
        t: Timeout value forwarded to ``exec_command``.

    Returns:
        The command's stdout decoded as UTF-8 (undecodable bytes replaced);
        an empty string when the command printed nothing.
    """
    # Local import: the file's import line does not bring ``sys`` into scope.
    import sys

    _, stdout, stderr = c.exec_command(cmd, timeout=t)
    out = stdout.read().decode("utf-8", "replace")
    err = stderr.read().decode("utf-8", "replace")
    rc = stdout.channel.recv_exit_status()

    if out:
        print(out, end="")
    # Any stderr payload containing "Warning" is suppressed as a whole.
    if err and "Warning" not in err:
        print(err, end="", file=sys.stderr)
    # Surface failures: callers only receive stdout, so without this a failed
    # remote command would be indistinguishable from one that printed nothing.
    if rc != 0:
        print(f"[run] exit status {rc}: {cmd}", file=sys.stderr)
    return out
# Remote paths and command prefixes shared by the steps below.
_BACKUP_PATH = "/tmp/articles_backup.sql"
_BACKUP_OK = "BACKUP_OK"
_PSQL = "docker exec news-aggregator-postgres-1 psql -U news -d news"


def _psql(sql, bare=False):
    """Run one SQL statement with psql in the postgres container; return stdout.

    With ``bare=True`` the ``-tA`` flags are added to the psql invocation,
    which the count queries below use before stripping and printing the value.
    """
    flags = "-tA " if bare else ""
    return run(f'{_PSQL} {flags}-c "{sql}"')


# run() already echoes each command's stdout, so the steps below only add
# labelled summaries instead of printing the same output a second time.
try:
    # 1) Back up the articles table. The success marker is echoed only when
    #    pg_dump exits cleanly AND the dump file is non-empty.
    print("=== 1. 备份 209 篇文章到 /tmp/articles_backup.sql ===")
    out = run(
        "docker exec news-aggregator-postgres-1 pg_dump -U news -d news -t articles"
        f" --data-only --column-inserts > {_BACKUP_PATH}"
        f" && test -s {_BACKUP_PATH} && echo {_BACKUP_OK}"
    )
    if _BACKUP_OK not in out:
        # Never reach the destructive DELETE without a verified backup.
        raise SystemExit("备份失败或备份文件为空,已中止(未删除任何数据)")
    out = run("ls -la /tmp/articles_backup.sql | awk '{print $5, $9}'")
    print(f" 备份文件: {out.strip()}")
    out = _psql("SELECT count(*) FROM articles;", bare=True)
    print(f" 当前文章数: {out.strip()}")

    # 2) Delete every article.
    print("\n=== 2. DELETE 所有文章 ===")
    _psql("DELETE FROM articles;")
    out = _psql("SELECT count(*) FROM articles;", bare=True)
    print(f" 删后文章数: {out.strip()}")

    # 3) Trigger run_once inside the worker container; only the last 10 lines
    #    of its combined output are shown.
    print("\n=== 3. 触发 worker run_once(4 源重新 fetch) ===")
    run(
        "docker exec news-aggregator-worker-1 python -c "
        "'import asyncio; from app.workers.pipeline import run_once; asyncio.run(run_once())'"
        " 2>&1 | tail -10",
        t=120,
    )

    # 4) Wait 30 seconds, then look at what has been stored.
    print("\n=== 4. 30 秒后看新数据 ===")
    time.sleep(30)
    _psql(
        "SELECT count(*) AS total,"
        " count(CASE WHEN length(body_text) > 1000 THEN 1 END) AS long_body,"
        " avg(length(body_text))::int AS avg_len FROM articles;"
    )

    # 5) Body lengths of the first rows, to judge whether full text (rather
    #    than an RSS summary) was stored.
    print("\n=== 5. 看 RSS 摘要 vs trafilatura 全文 ===")
    _psql(
        "SELECT id, source_id, LEFT(title, 50) AS title,"
        " length(body_text) AS body_len FROM articles ORDER BY id LIMIT 10;"
    )

    # 6) Distribution of translation_status values.
    print("\n=== 6. 翻译状态 ===")
    _psql("SELECT translation_status, count(*) FROM articles GROUP BY 1 ORDER BY 1;")
finally:
    # Close the SSH connection even when a step raises or the script aborts.
    c.close()