feat(ingest): API Push 短新闻数据层

- alembic 0008:articles 加 is_short_news/external_id/source_ref/content_hash
  (UNIQUE);sources.kind 加 'api_push';api_tokens 加 purpose + source_id
- SourceKind.API_PUSH enum;Article/ApiToken model 加新字段
- enrichment_article 短新闻跳过 format/image;
  enrichment_loop SQL 加 is_short_news 路径(并入'可 enrich' 条件)
- 入库侧由 commit 2(ingest 接口)负责:写 body_zh_text=body_text,
  format/image/commentary_meituan_status='n/a',
  classify/commentary_status='pending'(带 tags 时 classify='ok')

迁移无爆炸半径:articles.url 保持 NOT NULL,短新闻用合成的 api-push:// URL 占位
This commit is contained in:
xiaji
2026-06-14 15:51:22 +08:00
parent f690f1f108
commit 3091f291b2
5 changed files with 194 additions and 23 deletions

View File

@@ -35,7 +35,7 @@ import asyncio
import logging
from typing import Any, Mapping
from sqlalchemy import select
from sqlalchemy import and_, or_, select
from app.database import AsyncSessionLocal
from app.models.article import Article
@@ -419,6 +419,8 @@ async def enrich_article(article_id: int) -> dict[str, str]:
if not art:
logger.warning("enrich_article: id=%s not found", article_id)
return {}
# 短新闻(API Push):无 translation,但仍需 enrich(classify + commentary)
# 入库时已把 body_text 复制到 body_zh_text,所以这里可以走统一判断
if not (art.title_zh or art.body_zh_text):
logger.info("enrich_article: id=%s no translation yet, skip", article_id)
return {}
@@ -451,6 +453,11 @@ async def enrich_article(article_id: int) -> dict[str, str]:
if not art:
return {}
# === 短新闻(API Push):跳过 format 和 image(短文不需要排版,用户明确不要配图)===
# 短新闻入表时 format_status / image_ai_status 已置 'n/a',这里再 ensure 一次
# 防止未来 ingest 路径忘了设 status。
is_short = bool(art.is_short_news)
# === 1) classify(黑名单 gate,优先执行)===
blocklist = _merge_blocklist(setting, art.source if art.source_id else None)
try:
@@ -476,23 +483,31 @@ async def enrich_article(article_id: int) -> dict[str, str]:
results["classify"] = f"failed:{type(e).__name__}"
# classify 失败也继续(format/image/commentary 还能跑)
# === 2) format ===
try:
await _enrich_format(art, setting, client)
results["format"] = "ok"
except Exception as e:
logger.exception("enrich format failed for article %s: %s", article_id, e)
art.format_status = "failed"
results["format"] = f"failed:{type(e).__name__}"
# === 2) format(短新闻跳过)===
if is_short:
art.format_status = "n/a"
results["format"] = "skipped"
else:
try:
await _enrich_format(art, setting, client)
results["format"] = "ok"
except Exception as e:
logger.exception("enrich format failed for article %s: %s", article_id, e)
art.format_status = "failed"
results["format"] = f"failed:{type(e).__name__}"
# === 3) image ===
try:
await _enrich_image(art, setting, client)
results["image"] = "ok"
except Exception as e:
logger.exception("enrich image failed for article %s: %s", article_id, e)
art.image_ai_status = "failed"
results["image"] = f"failed:{type(e).__name__}"
# === 3) image(短新闻跳过)===
if is_short:
art.image_ai_status = "n/a"
results["image"] = "skipped"
else:
try:
await _enrich_image(art, setting, client)
results["image"] = "ok"
except Exception as e:
logger.exception("enrich image failed for article %s: %s", article_id, e)
art.image_ai_status = "failed"
results["image"] = f"failed:{type(e).__name__}"
# === 4 + 5) commentary_angel + commentary_meituan 并行 ===
# 关键:每个 provider 独立的 try/except,任一失败不影响另一个
@@ -535,6 +550,8 @@ ENRICHMENT_BATCH_SIZE = 8 # 每轮并发拉取候选,然后顺序处理(LLM 客
async def enrichment_loop() -> None:
"""扫描已翻译但尚未完成 enrich 的文章(translation_status=ok 且任一 *_status 不是 'ok',含 NULL/n/a/pending/failed)。
短新闻(API Push)例外:translation_status='n/a',但 is_short_news=True 也要被捞出来。
跟 translation_loop 一样常驻。
"""
logger.info("enrichment_loop started")
@@ -543,16 +560,30 @@ async def enrichment_loop() -> None:
while True:
try:
async with AsyncSessionLocal() as session:
# 精准定位待 enrich 的文章:已翻译 + 任一 LLM 状态 ∈ {n/a, pending, failed}
# 精准定位待 enrich 的文章:
# - 长新闻:translation_status='ok' + title_zh 非空 + 任一 LLM 状态 != 'ok'
# - 短新闻:is_short_news=True + body_zh_text 非空(入库时已从 body_text 复制)
# + 任一 LLM 状态 != 'ok'
# (不能用 order_by id ASC + 内存过滤:已 enrich 的文章 id 可能更小,会占满 limit,
# 让 enrichment_loop 永远看不到后面大 id 的 n/a 文章 — 真实踩过的坑)
rows = (
await session.execute(
select(Article)
.where(
Article.translation_status == "ok",
Article.title_zh.is_not(None),
# 任一 LLM 状态不是 ok(包括 NULL)
# === "可 enrich" 条件 ===
or_(
# 长新闻
and_(
Article.translation_status == "ok",
Article.title_zh.is_not(None),
),
# 短新闻(API Push)
and_(
Article.is_short_news.is_(True),
Article.body_zh_text.is_not(None),
),
),
# === "未 enrich" 条件:任一 LLM 状态不是 ok ===
(
(Article.classify_status.is_(None))
| (Article.classify_status != "ok")