Files
diary-news/scripts/_check_body.py

53 lines
2.6 KiB
Python
Raw Normal View History

"""查 Ronaldo 那篇文章的 body 字段。"""
import os, paramiko
# The SSH password is read from the environment; a missing REMOTE_PASS
# raises KeyError here, before any connection is attempted.
PW = os.environ["REMOTE_PASS"]

# Module-level SSH client shared by run() and closed at the end of the script.
c = paramiko.SSHClient()
# NOTE(review): AutoAddPolicy presumably accepts host keys that are not
# already known — confirm that is acceptable for this host.
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
# Password login only: agent and key-file lookup are both switched off.
c.connect(
    "207.57.129.228",
    port=19717,
    username="root",
    password=PW,
    timeout=15,
    allow_agent=False,
    look_for_keys=False,
)
def run(cmd, t=15):
    """Run ``cmd`` on the remote host through the shared SSH client ``c``.

    Remote stdout is echoed to local stdout.  Remote stderr and a non-zero
    exit status used to be read and then thrown away, which made a failing
    command look like an empty result; both are now reported on local stderr.

    Args:
        cmd: Command line handed to ``c.exec_command``.
        t: Value forwarded as ``exec_command``'s ``timeout`` argument.

    Returns:
        The command's stdout decoded as UTF-8, undecodable bytes replaced.
    """
    import sys  # local import: only needed for the diagnostics below

    _stdin, stdout, stderr = c.exec_command(cmd, timeout=t)
    out = stdout.read().decode("utf-8", "replace")
    err = stderr.read().decode("utf-8", "replace")
    # Queried only after both streams have been read, as in the original.
    rc = stdout.channel.recv_exit_status()
    if out:
        print(out, end="")
    if err:
        print(err, end="", file=sys.stderr)
    if rc != 0:
        print("[remote exit status %d]" % rc, file=sys.stderr)
    return out
# Every query below is sent through psql inside the
# news-aggregator-postgres-1 container; only the quoted SQL differs.
_PSQL = "docker exec news-aggregator-postgres-1 psql -U news -d news -c "

# 1) Inspect the body fields of article 175177: lengths of the HTML, text
#    and zh-text bodies plus language / translation status / URL.
print("--- 文章 body 字段(可能是空)---")
run(
    _PSQL
    + '"SELECT id, title, length(body_html) as html_len, '
    'length(body_text) as text_len, length(body_zh_text) as zh_len, '
    'lang_src, translation_status, url '
    'FROM articles WHERE id = 175177;"'
)

# 2) + 3) Body-length distribution of the most recently fetched articles for
#    source 3 (aljazeera) and source 2 (BBC, possibly the richest content).
# NOTE(review): the headings say 3 articles ("3 篇") but the query uses
# LIMIT 5 — confirm which one is intended.
_RECENT_SQL = (
    '"SELECT id, LEFT(title, 50) title, length(body_html) html, '
    'length(body_text) txt, length(body_zh_text) zh '
    'FROM articles WHERE source_id = {source_id} '
    'ORDER BY fetched_at DESC LIMIT 5;"'
)
for heading, source_id in (
    ("\n--- 抽 3 篇 aljazeera 看 body 长度分布 ---", 3),
    ("\n--- 抽 3 篇 BBC 看 body ---", 2),
):
    print(heading)
    run(_PSQL + _RECENT_SQL.format(source_id=source_id))
# 4) Fetch the Al Jazeera RSS feed from inside the worker container to see
#    whether its entries actually carry any content.
print("\n--- 拉 Al Jazeera RSS 原始内容看 ---")
# Probe script executed by the Python interpreter in the worker container.
# It has to be valid Python by itself, so the indentation inside this
# literal is significant (it was flattened, which the remote interpreter
# would reject with an IndentationError).
script = b'''
import asyncio, feedparser, httpx
async def main():
    async with httpx.AsyncClient(follow_redirects=True, timeout=15) as c:
        r = await c.get("https://www.aljazeera.com/xml/rss/all.xml")
    f = feedparser.parse(r.text)
    for e in f.entries[:3]:
        print("---")
        print("title:", e.title)
        print("link:", e.link)
        print("has content:", bool(e.get("content")))
        if e.get("content"):
            print("content[0] keys:", list(e["content"][0].keys()))
            print("content[0].value[:200]:", (e["content"][0].get("value") or "")[:200])
        print("has summary:", bool(e.get("summary")))
        if e.get("summary"):
            print("summary[:200]:", e["summary"][:200])
asyncio.run(main())
'''
import base64
# Base64 keeps the script free of quoting problems inside the single-quoted
# `sh -c '...'` command; it is decoded into /app/_t.py in the container.
b64 = base64.b64encode(script).decode()
try:
    run("docker exec news-aggregator-worker-1 sh -c 'echo " + b64 + " | base64 -d > /app/_t.py'")
    # stderr is merged into stdout remotely and only the last 40 lines are kept.
    run("docker exec -w /app news-aggregator-worker-1 python /app/_t.py 2>&1 | tail -40", t=30)
finally:
    try:
        # Do not leave the temporary probe script behind in the container.
        run("docker exec news-aggregator-worker-1 rm -f /app/_t.py")
    finally:
        # Always release the SSH connection, even if a step above failed.
        c.close()