perf: 翻译独立后台循环(1 篇/秒)+ Semaphore 1

之前 fetch_one_source 入库后立即调翻译(可能并发触发腾讯 TMT 限速)
改为独立 translation_loop 后台循环:
- 完全不和 RSS 抓取并行
- 1 篇/秒节拍(Semaphore 1 + sleep 1.0)
- 没活时空闲 5 秒再轮询
- pending/failed 都重试
This commit is contained in:
Mavis
2026-06-08 00:27:09 +08:00
parent e79cfaa5f7
commit 9862a92423
6 changed files with 203 additions and 39 deletions

26
scripts/_rebuild_test.py Normal file
View File

@@ -0,0 +1,26 @@
import os, paramiko, json
# SSH password is taken from the REMOTE_PASS environment variable; a missing
# variable raises KeyError before any connection is attempted.
PW = os.environ["REMOTE_PASS"]

# Module-level SSH client shared by the rest of the script.
c = paramiko.SSHClient()
# NOTE(review): AutoAddPolicy accepts host keys that are not already known --
# confirm this is acceptable for the target host.
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
c.connect(
    "207.57.129.228",
    port=19717,
    username="root",
    password=PW,
    timeout=15,
    # Password authentication only: no SSH agent, no local key discovery.
    allow_agent=False,
    look_for_keys=False,
)
def run(cmd, t=15):
    """Execute *cmd* on the remote host through the module-level client ``c``.

    Remote stdout is echoed to local stdout and returned. Remote stderr and a
    non-zero exit status are reported on local stderr instead of being
    silently discarded.

    Args:
        cmd: Shell command line to execute remotely.
        t: Timeout in seconds passed to ``exec_command``.

    Returns:
        The decoded remote stdout (UTF-8, undecodable bytes replaced).
    """
    import sys

    _stdin, stdout, stderr = c.exec_command(cmd, timeout=t)
    out = stdout.read().decode("utf-8", "replace")
    err = stderr.read().decode("utf-8", "replace")
    rc = stdout.channel.recv_exit_status()
    if out:
        print(out, end="")
    if err:
        print(err, end="", file=sys.stderr)
    if rc != 0:
        # The command text is deliberately not echoed: callers may embed
        # credentials or tokens in it.
        print(f"[remote command exited with status {rc}]", file=sys.stderr)
    return out
import shlex
import time

# NOTE(review): the API credentials used to be hard-coded in the login command.
# They can now be overridden with API_USER / API_PASS; the literal fallbacks
# keep the previous behaviour but should be rotated and removed from the repo.
API_USER = os.environ.get("API_USER", "owner")
API_PASS = os.environ.get("API_PASS", "Owner2026!")

try:
    # Pull the latest code and rebuild only the api service.
    run("cd /srv/news && sudo -u news git pull --rebase 2>&1 | tail -3")
    run("cd /srv/news && docker compose up -d --force-recreate --no-deps --build api 2>&1 | tail -5", t=120)
    # Presumably gives the recreated container time to start serving -- confirm.
    time.sleep(6)

    # Log in, then fetch one article's detail.
    login_body = json.dumps({"username": API_USER, "password": API_PASS})
    out = run(
        "curl -s -X POST http://localhost/api/v1/auth/login "
        "-H 'Content-Type: application/json' -d " + shlex.quote(login_body)
    )
    try:
        token = json.loads(out)["access_token"]
    except (ValueError, KeyError, TypeError) as exc:
        # Fail with the response excerpt instead of an opaque traceback.
        raise SystemExit(f"login failed, unexpected response: {out[:200]!r}") from exc

    # shlex.quote keeps the header a single shell word whatever the token holds.
    out = run(
        "curl -s -w '\nstatus=%{http_code}\n' -H "
        + shlex.quote("Authorization: Bearer " + token)
        + " http://localhost/api/v1/articles/175177"
    )
    print("\n--- 详情响应 ---")
    print(out[:1000])
finally:
    # Always release the SSH connection, even when a step above fails.
    c.close()

35
scripts/_trafilatura.py Normal file
View File

@@ -0,0 +1,35 @@
import os, paramiko, base64
# SSH password is taken from the REMOTE_PASS environment variable; a missing
# variable raises KeyError before any connection is attempted.
PW = os.environ["REMOTE_PASS"]

# Module-level SSH client shared by the rest of the script.
c = paramiko.SSHClient()
# NOTE(review): AutoAddPolicy accepts host keys that are not already known --
# confirm this is acceptable for the target host.
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
c.connect(
    "207.57.129.228",
    port=19717,
    username="root",
    password=PW,
    timeout=15,
    # Password authentication only: no SSH agent, no local key discovery.
    allow_agent=False,
    look_for_keys=False,
)
def run(cmd, t=30):
    """Execute *cmd* on the remote host through the module-level client ``c``.

    Remote stdout is echoed to local stdout and returned. Remote stderr and a
    non-zero exit status are reported on local stderr instead of being
    silently discarded.

    Args:
        cmd: Shell command line to execute remotely.
        t: Timeout in seconds passed to ``exec_command``.

    Returns:
        The decoded remote stdout (UTF-8, undecodable bytes replaced).
    """
    import sys

    _stdin, stdout, stderr = c.exec_command(cmd, timeout=t)
    out = stdout.read().decode("utf-8", "replace")
    err = stderr.read().decode("utf-8", "replace")
    rc = stdout.channel.recv_exit_status()
    if out:
        print(out, end="")
    if err:
        print(err, end="", file=sys.stderr)
    if rc != 0:
        # The command text is deliberately not echoed: it can be very long
        # (e.g. an inlined base64 payload).
        print(f"[remote command exited with status {rc}]", file=sys.stderr)
    return out
# Try trafilatura against an Al Jazeera article to see how much full text it
# extracts. The probe below is written into the worker container and run there.
# Its indentation is restored here: without it the remote interpreter rejects
# the script with an IndentationError.
script = '''
import asyncio, httpx, trafilatura
async def main():
    url = "https://www.aljazeera.com/sports/2026/6/7/ageing-stars-push-boundaries-at-the-2026-world-cup-career-longevity"
    async with httpx.AsyncClient(follow_redirects=True, timeout=20) as c:
        r = await c.get(url, headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36"})
    print("status:", r.status_code, "len:", len(r.text))
    extracted = trafilatura.extract(r.text, include_comments=False, include_tables=False, favor_recall=True, output_format="json")
    print("---JSON---")
    print((extracted or "")[:2000])
    print()
    print("---TEXT---")
    text = trafilatura.extract(r.text, include_comments=False, include_tables=False, favor_recall=True, output_format="text")
    print((text or "")[:2000])
asyncio.run(main())
'''
# Base64 keeps the payload free of characters that would need shell quoting.
b64 = base64.b64encode(script.encode()).decode()
try:
    run("docker exec news-aggregator-worker-1 sh -c 'echo " + b64 + " | base64 -d > /app/_tr.py'")
    run("docker exec -w /app news-aggregator-worker-1 python /app/_tr.py 2>&1 | tail -50", t=60)
finally:
    # Always release the SSH connection, even when a remote step fails.
    c.close()