fix: 翻译失败/降级文本不再写 cache(避免 30 天污染)
之前 service.translate 写 cache 无条件,导致:
- 第一次翻译失败时,'[翻译失败: ...]' 占位符被写进 cache
- 30 天内相同文本的请求(新文章 title 与老文章 title 相同时)全部返回占位符
- 触发 200+ 文章 title_zh 字段被永久污染
修法:仅在 engine ∈ {tencent, nllb, cache} 且文本不含错误标记时,才写 cache。
This commit is contained in:
39
scripts/_check_after.py
Normal file
39
scripts/_check_after.py
Normal file
@@ -0,0 +1,39 @@
|
||||
"""DELETE 后看新数据(30 秒后)。"""
|
||||
import os, paramiko
|
||||
PW = os.environ["REMOTE_PASS"]
|
||||
c = paramiko.SSHClient()
|
||||
c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
c.connect("207.57.129.228", port=19717, username="root", password=PW, timeout=15, allow_agent=False, look_for_keys=False)
|
||||
def run(cmd, t=30):
|
||||
si, so, se = c.exec_command(cmd, timeout=t)
|
||||
out = so.read().decode("utf-8", "replace")
|
||||
err = se.read().decode("utf-8", "replace")
|
||||
rc = so.channel.recv_exit_status()
|
||||
if out: print(out, end="")
|
||||
return out
|
||||
|
||||
# 后台启 run_once
|
||||
si, so, se = c.exec_command("nohup docker exec news-aggregator-worker-1 python -c 'import asyncio; from app.workers.pipeline import run_once; asyncio.run(run_once())' > /tmp/run_once.log 2>&1 & echo $!", timeout=10)
|
||||
pid = so.read().decode().strip()
|
||||
print(f"run_once started, PID={pid}")
|
||||
|
||||
# 等 90 秒(全文抓取慢)
|
||||
import time
|
||||
time.sleep(90)
|
||||
|
||||
# 看新数据
|
||||
print("\n--- 文章统计 ---")
|
||||
run("docker exec news-aggregator-postgres-1 psql -U news -d news -c \"SELECT count(*) AS total, count(CASE WHEN length(body_text) > 1000 THEN 1 END) AS long_body, avg(length(body_text))::int AS avg_len, max(length(body_text)) AS max_len FROM articles;\"")
|
||||
|
||||
# 看 RSS 摘要 vs 全文(body_text > 1000 = trafilatura 工作了)
|
||||
print("\n--- body_text 长度分布 ---")
|
||||
run("docker exec news-aggregator-postgres-1 psql -U news -d news -c \"SELECT CASE WHEN length(body_text) < 200 THEN '<200' WHEN length(body_text) < 1000 THEN '200-1k' ELSE '>1k' END AS bucket, count(*) FROM articles GROUP BY 1 ORDER BY 1;\"")
|
||||
|
||||
# 看翻译状态
|
||||
print("\n--- 翻译状态 ---")
|
||||
run("docker exec news-aggregator-postgres-1 psql -U news -d news -c \"SELECT translation_status, count(*) FROM articles GROUP BY 1 ORDER BY 1;\"")
|
||||
|
||||
# 看前 5 篇文章 body 长度 + 来源
|
||||
print("\n--- 前 5 篇 ---")
|
||||
run("docker exec news-aggregator-postgres-1 psql -U news -d news -c \"SELECT id, source_id, LEFT(title, 50) AS title, length(body_text) AS body_len FROM articles ORDER BY id LIMIT 5;\"")
|
||||
c.close()
|
||||
Reference in New Issue
Block a user