feat(ingest): API Push 短新闻数据层
- alembic 0008:articles 加 is_short_news/external_id/source_ref/content_hash (UNIQUE);sources.kind 加 'api_push';api_tokens 加 purpose + source_id
- SourceKind.API_PUSH enum;Article/ApiToken model 加新字段
- enrichment_article 短新闻跳过 format/image; enrichment_loop SQL 加 is_short_news 路径(并入'可 enrich' 条件)
- 入库侧由 commit 2(ingest 接口)负责:写 body_zh_text=body_text, format/image/commentary_meituan_status='n/a', classify/commentary_status='pending'(带 tags 时 classify='ok')

无迁移爆炸半径:articles.url 保持 NOT NULL,短新闻合成 api-push:// 占位
This commit is contained in:
@@ -35,7 +35,7 @@ import asyncio
|
||||
import logging
|
||||
from typing import Any, Mapping
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy import and_, or_, select
|
||||
|
||||
from app.database import AsyncSessionLocal
|
||||
from app.models.article import Article
|
||||
@@ -419,6 +419,8 @@ async def enrich_article(article_id: int) -> dict[str, str]:
|
||||
if not art:
|
||||
logger.warning("enrich_article: id=%s not found", article_id)
|
||||
return {}
|
||||
# 短新闻(API Push):无 translation,但仍需 enrich(classify + commentary)
|
||||
# 入库时已把 body_text 复制到 body_zh_text,所以这里可以走统一判断
|
||||
if not (art.title_zh or art.body_zh_text):
|
||||
logger.info("enrich_article: id=%s no translation yet, skip", article_id)
|
||||
return {}
|
||||
@@ -451,6 +453,11 @@ async def enrich_article(article_id: int) -> dict[str, str]:
|
||||
if not art:
|
||||
return {}
|
||||
|
||||
# === 短新闻(API Push):跳过 format 和 image(短文不需要排版,用户明确不要配图)===
|
||||
# 短新闻入表时 format_status / image_ai_status 已置 'n/a',这里再 ensure 一次
|
||||
# 防止未来 ingest 路径忘了设 status。
|
||||
is_short = bool(art.is_short_news)
|
||||
|
||||
# === 1) classify(黑名单 gate,优先执行)===
|
||||
blocklist = _merge_blocklist(setting, art.source if art.source_id else None)
|
||||
try:
|
||||
@@ -476,23 +483,31 @@ async def enrich_article(article_id: int) -> dict[str, str]:
|
||||
results["classify"] = f"failed:{type(e).__name__}"
|
||||
# classify 失败也继续(format/image/commentary 还能跑)
|
||||
|
||||
# === 2) format ===
|
||||
try:
|
||||
await _enrich_format(art, setting, client)
|
||||
results["format"] = "ok"
|
||||
except Exception as e:
|
||||
logger.exception("enrich format failed for article %s: %s", article_id, e)
|
||||
art.format_status = "failed"
|
||||
results["format"] = f"failed:{type(e).__name__}"
|
||||
# === 2) format(短新闻跳过)===
|
||||
if is_short:
|
||||
art.format_status = "n/a"
|
||||
results["format"] = "skipped"
|
||||
else:
|
||||
try:
|
||||
await _enrich_format(art, setting, client)
|
||||
results["format"] = "ok"
|
||||
except Exception as e:
|
||||
logger.exception("enrich format failed for article %s: %s", article_id, e)
|
||||
art.format_status = "failed"
|
||||
results["format"] = f"failed:{type(e).__name__}"
|
||||
|
||||
# === 3) image ===
|
||||
try:
|
||||
await _enrich_image(art, setting, client)
|
||||
results["image"] = "ok"
|
||||
except Exception as e:
|
||||
logger.exception("enrich image failed for article %s: %s", article_id, e)
|
||||
art.image_ai_status = "failed"
|
||||
results["image"] = f"failed:{type(e).__name__}"
|
||||
# === 3) image(短新闻跳过)===
|
||||
if is_short:
|
||||
art.image_ai_status = "n/a"
|
||||
results["image"] = "skipped"
|
||||
else:
|
||||
try:
|
||||
await _enrich_image(art, setting, client)
|
||||
results["image"] = "ok"
|
||||
except Exception as e:
|
||||
logger.exception("enrich image failed for article %s: %s", article_id, e)
|
||||
art.image_ai_status = "failed"
|
||||
results["image"] = f"failed:{type(e).__name__}"
|
||||
|
||||
# === 4 + 5) commentary_angel + commentary_meituan 并行 ===
|
||||
# 关键:每个 provider 独立的 try/except,任一失败不影响另一个
|
||||
@@ -535,6 +550,8 @@ ENRICHMENT_BATCH_SIZE = 8 # 每轮并发拉取候选,然后顺序处理(LLM 客
|
||||
async def enrichment_loop() -> None:
|
||||
"""扫描已翻译但未 enrich 的文章(任一 *_status 为 pending/n/a 且 translation_status=ok)。
|
||||
|
||||
短新闻(API Push)例外:translation_status='n/a',但 is_short_news=True 也要被捞出来。
|
||||
|
||||
跟 translation_loop 一样常驻。
|
||||
"""
|
||||
logger.info("enrichment_loop started")
|
||||
@@ -543,16 +560,30 @@ async def enrichment_loop() -> None:
|
||||
while True:
|
||||
try:
|
||||
async with AsyncSessionLocal() as session:
|
||||
# 精准定位待 enrich 的文章:已翻译 + 任一 LLM 状态 ∈ {n/a, pending, failed}
|
||||
# 精准定位待 enrich 的文章:
|
||||
# - 长新闻:translation_status='ok' + title_zh 非空 + 任一 LLM 状态 != 'ok'
|
||||
# - 短新闻:is_short_news=True + body_zh_text 非空(入库时已从 body_text 复制)
|
||||
# + 任一 LLM 状态 != 'ok'
|
||||
# (不能用 order_by id ASC + 内存过滤:已 enrich 的文章 id 可能更小,会占满 limit,
|
||||
# 让 enrichment_loop 永远看不到后面大 id 的 n/a 文章 — 真实踩过的坑)
|
||||
rows = (
|
||||
await session.execute(
|
||||
select(Article)
|
||||
.where(
|
||||
Article.translation_status == "ok",
|
||||
Article.title_zh.is_not(None),
|
||||
# 任一 LLM 状态不是 ok(包括 NULL)
|
||||
# === "可 enrich" 条件 ===
|
||||
or_(
|
||||
# 长新闻
|
||||
and_(
|
||||
Article.translation_status == "ok",
|
||||
Article.title_zh.is_not(None),
|
||||
),
|
||||
# 短新闻(API Push)
|
||||
and_(
|
||||
Article.is_short_news.is_(True),
|
||||
Article.body_zh_text.is_not(None),
|
||||
),
|
||||
),
|
||||
# === "未 enrich" 条件:任一 LLM 状态不是 ok ===
|
||||
(
|
||||
(Article.classify_status.is_(None))
|
||||
| (Article.classify_status != "ok")
|
||||
|
||||
Reference in New Issue
Block a user