diff --git a/backend/app/scripts/seed_sources.py b/backend/app/scripts/seed_sources.py
index 2062d51..90cf82b 100644
--- a/backend/app/scripts/seed_sources.py
+++ b/backend/app/scripts/seed_sources.py
@@ -64,7 +64,7 @@ SEEDS = [
         "kind": "rss",
         "url": "https://www3.nhk.or.jp/rss/news/cat0.xml",
         "region": "asia",
-        "language_src": "en",
+        "language_src": "ja",
         "priority": 70,
         "fetch_interval_min": 60,
         "translate_to": "zh",
diff --git a/backend/app/workers/pipeline.py b/backend/app/workers/pipeline.py
index e650485..07a2462 100644
--- a/backend/app/workers/pipeline.py
+++ b/backend/app/workers/pipeline.py
@@ -155,7 +155,14 @@ async def translate_article(article_id: int) -> None:
             return
         title = art.title
         body_text = (art.body_text or "")[:TRANSLATE_BODY_MAX]
-        lang_src = art.lang_src or "auto"
+        # lang_src precedence: article.lang_src > source.language_src > "auto"
+        # (ingestion already prefers the language reported by feedparser; this
+        # is one more fallback on top of that)
+        # NOTE(review): assumes art.source is already loaded at this point; a
+        # lazy relationship load may fail on an async session - TODO confirm.
+        if not art.lang_src and art.source and art.source.language_src:
+            lang_src = art.source.language_src
+        else:
+            lang_src = art.lang_src or "auto"
         target = "zh"
         article_id_ref = art.id
 
diff --git a/scripts/_check_nhk.py b/scripts/_check_nhk.py
new file mode 100644
index 0000000..cc50323
--- /dev/null
+++ b/scripts/_check_nhk.py
@@ -0,0 +1,76 @@
+"""Inspect the NHK source config and the lang_src values of ingested articles.
+
+Connects to the deployment host over SSH (password read from the REMOTE_PASS
+environment variable) and runs diagnostic commands inside the postgres and
+worker containers, echoing their output.
+"""
+import base64
+import os
+import sys
+
+import paramiko
+
+PW = os.environ["REMOTE_PASS"]
+c = paramiko.SSHClient()
+# NOTE(review): AutoAddPolicy trusts any unknown host key, which leaves the
+# session open to man-in-the-middle attacks; pin the host key if this script
+# is kept around.
+c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+
+
+def run(cmd, t=15):
+    """Run cmd on the remote host, echo its output and return its stdout.
+
+    stderr and a non-zero exit status are reported on local stderr so that a
+    failing command is not mistaken for one that simply printed nothing.
+    """
+    _stdin, so, se = c.exec_command(cmd, timeout=t)
+    out = so.read().decode("utf-8", "replace")
+    err = se.read().decode("utf-8", "replace")
+    rc = so.channel.recv_exit_status()
+    if out:
+        print(out, end="")
+    if err:
+        print(err, end="", file=sys.stderr)
+    if rc != 0:
+        print(f"[exit status {rc}] {cmd}", file=sys.stderr)
+    return out
+
+
+try:
+    c.connect("207.57.129.228", port=19717, username="root", password=PW, timeout=15, allow_agent=False, look_for_keys=False)
+
+    # 1) NHK source configuration
+    print("--- 1. NHK 源配置 ---")
+    run("docker exec news-aggregator-postgres-1 psql -U news -d news -c \"SELECT id, slug, language_src FROM sources WHERE slug = 'nhk-world';\"")
+
+    # 2) lang_src distribution of the NHK articles already ingested
+    print("\n--- 2. 已入库 NHK 文章 lang_src 分布 ---")
+    run("docker exec news-aggregator-postgres-1 psql -U news -d news -c \"SELECT lang_src, count(*) FROM articles WHERE source_id = (SELECT id FROM sources WHERE slug = 'nhk-world') GROUP BY 1;\"")
+
+    # 3) Language field that the NHK RSS feed itself reports
+    print("\n--- 3. NHK RSS 实际 language 字段 ---")
+    # The probe is base64-encoded locally and decoded inside the worker
+    # container into /app/_t.py, so the script body needs no shell quoting.
+    script = (
+        "import feedparser, httpx, asyncio\n"
+        "async def main():\n"
+        "    f = feedparser.parse('https://www3.nhk.or.jp/rss/news/cat0.xml')\n"
+        "    print('feed.feed.language:', f.feed.get('language'))\n"
+        "    if f.entries:\n"
+        "        e = f.entries[0]\n"
+        "        print('entry.language:', e.get('language'))\n"
+        "        print('title:', e.title)\n"
+        "asyncio.run(main())\n"
+    )
+    b64 = base64.b64encode(script.encode()).decode()
+    run("docker exec news-aggregator-worker-1 sh -c 'echo " + b64 + " | base64 -d > /app/_t.py'")
+    run("docker exec -w /app news-aggregator-worker-1 python /app/_t.py 2>&1 | tail -10", t=20)
+
+    # 4) Show the lang_src / translate( lines of the deployed translate_article
+    # source; run() already echoes the output, so it is not printed again.
+    print("\n--- 4. service.translate 实际调用时 source 参数是什么? ---")
+    run("docker exec news-aggregator-worker-1 python -c 'import app.workers.pipeline; import inspect; print(inspect.getsource(app.workers.pipeline.translate_article))' 2>&1 | grep -E 'lang_src|translate\\(' | head -10")
+finally:
+    # Close the SSH connection even when one of the steps raises.
+    c.close()