backend/app/services/fetchers/api_push.py

"""API Push 短新闻 — normalize 工具。

不走 fetcher 抽象(那是"周期拉取"语义),API Push 是"被动接收"。
提供两个纯函数,供 ingest 路由调用:

- compute_content_hash(external_id, title, body) -> str
- normalize_payload(payload: dict) -> dict(供入库时使用)

设计要点:
- external_id 存在时,作为主幂等 key(L1)
- external_id 缺失时,title+body[:500] 作为兜底指纹(L2)
- url 可选;缺失时合成 api-push://{source_slug}/{hash[:16]} 占位
- 字段长度校验集中在路由里(返回 400),这里只做归一化
"""
from __future__ import annotations

import hashlib
from datetime import datetime, timezone
from typing import Any
from zoneinfo import ZoneInfo

from app.config import settings


def compute_content_hash(
    *,
    external_id: str | None,
    title: str,
    body: str,
) -> str:
    """三层去重核心 key。

    - external_id 存在:`sha1("ext:" + external_id)` —— 调用方幂等保证,最强
    - external_id 缺失:`sha1(title.strip() + "|" + body[:500])` —— 兜底,防尾部噪声

    注:body 取原始字符串的前 500 字符,不做 strip。
    因为不同长度的 body(200字 vs 2000字)前 500 字符一定相等,这是设计意图 —
    仅靠"前 N 字符"判断重复,避免被尾部噪声(URL尾巴/HTML 注释)误判。
    """
    if external_id:
        raw = f"ext:{external_id.strip()}"
    else:
        raw = f"{title.strip()}|{body[:500]}"
    return hashlib.sha1(raw.encode("utf-8")).hexdigest()


def synthesize_url(source_slug: str, content_hash: str) -> str:
    """Build a placeholder URL for a short-news item pushed without one.

    The original note states that ``articles.url`` is NOT NULL, so a value
    must be synthesized. The result is ``api-push://<source_slug>/<prefix>``
    where ``<prefix>`` is the first 16 characters of ``content_hash``.
    """
    digest_prefix = content_hash[:16]
    return f"api-push://{source_slug}/{digest_prefix}"


def normalize_published_at(value: Any) -> datetime:
    """Normalize an incoming ``published_at`` to a timezone-aware UTC datetime.

    Accepted inputs:

    - ``None`` or any unsupported type: falls back to the current time.
    - ``datetime``: converted to UTC.
    - ``str``: parsed with ``datetime.fromisoformat`` (a ``Z`` suffix is
      rewritten to ``+00:00`` first), then converted to UTC. An unparsable
      string falls back to the current time.

    Timezone inference for naive values (2026-06-15 fix): Chinese sources
    pushing through ingest usually send naive strings whose meaning is
    Asia/Shanghai wall-clock time. Treating them as UTC shifted
    ``published_at`` by 8 hours and made items appear to be in the future.
    Naive values are therefore interpreted in ``settings.tz`` (documented as
    defaulting to Asia/Shanghai) and then converted to UTC. Aware values are
    simply converted to UTC.

    Values so close to ``datetime.min``/``datetime.max`` that the UTC
    conversion overflows are treated like unparsable input and fall back to
    the current time instead of raising.

    Args:
        value: Raw ``published_at`` from the ingest payload.

    Returns:
        An aware ``datetime`` whose ``tzinfo`` is ``timezone.utc``.
    """
    server_tz = ZoneInfo(settings.tz)

    def _now_utc() -> datetime:
        # Same instant as now-in-server-tz converted to UTC, without the
        # intermediate conversion.
        return datetime.now(timezone.utc)

    if isinstance(value, str):
        try:
            # fromisoformat only accepts a trailing 'Z' from Python 3.11 on;
            # rewriting it keeps the parse working regardless.
            value = datetime.fromisoformat(value.replace("Z", "+00:00"))
        except ValueError:
            # Best-effort fallback. NOTE(review): the route-layer (pydantic)
            # validation is expected to have rejected bad input already;
            # confirm against the ingest route.
            return _now_utc()

    if not isinstance(value, datetime):
        return _now_utc()

    if value.tzinfo is None:
        # Naive: interpret as server-tz wall-clock time before converting.
        value = value.replace(tzinfo=server_tz)
    try:
        return value.astimezone(timezone.utc)
    except (OverflowError, ValueError):
        # Boundary datetimes (e.g. year 1 with a positive offset) cannot be
        # represented after shifting to UTC; use the same fallback as
        # unparsable input rather than crashing the ingest request.
        return _now_utc()


def build_initial_status(*, has_tags: bool) -> dict[str, str]:
    """Return the initial enrichment status fields for a pushed short-news item.

    - ``has_tags`` truthy: ``classify_status`` is ``'ok'`` (the supplied tags
      serve as the classification, so no LLM call is spent).
    - ``has_tags`` falsy: ``classify_status`` is ``'pending'`` (per the
      original note, the enrichment loop runs classification later).
    - Translation, formatting, and AI image steps are marked ``'n/a'``; both
      commentary steps start as ``'pending'``.

    Args:
        has_tags: Whether the payload already carries tags.

    Returns:
        A new dict mapping each ``*_status`` field name to its initial value.
    """
    if has_tags:
        classify_state = "ok"
    else:
        classify_state = "pending"

    # Skipped steps: translation (content is natively Chinese), formatting
    # (not needed for short text), AI illustration (explicitly not wanted).
    status = dict.fromkeys(
        ("translation_status", "format_status", "image_ai_status"),
        "n/a",
    )
    status["classify_status"] = classify_state
    # Commentary runs for both providers.
    status["commentary_status"] = "pending"
    status["commentary_meituan_status"] = "pending"
    return status
-												feat(ingest): API Push 短新闻接口层

- POST /api/v1/ingest:鉴权(X-Ingest-Token) + 限速(每 token 2 篇/秒,
  Redis 滑动桶,INGEST_RATE_PER_SEC 可调) + 三层去重(L1 external_id /
  L2 content_hash / L3 DB UNIQUE 兜底,均带 reason)
- 写入字段:is_short_news=True、translation/format/image_ai_status='n/a'、
  classify_status=(有 tags?'ok':'pending')、commentary_{angel,meituan}_status='pending'、
  body_zh_text=body_text(走统一路径,前端/prompt 不用改)
- services/fetchers/api_push.py:compute_content_hash + synthesize_url +
  normalize_published_at + build_initial_status 纯函数
- schemas/ingest.py:IngestPayload(title 1-200/body 1-5000/tags 去重去空) +
  IngestResponse(article_id/content_hash/status/reason/matched_external_id)
- admin.py:POST/GET/DELETE /admin/sources/{id}/ingest-tokens — owner 生成
  (raw_token 仅一次性返回)、列出、撤销
- schemas/article.py:ArticleListItem 加 is_short_news/source_ref;
  ArticleDetail 加 is_short_news/source_ref/external_id
- main.py:挂 ingest router;config.py + .env.example:ingest_rate_per_sec 默认 2

短新闻由 commit 1 enrichment_loop 自动接管 classify + 双 provider commentary,
跳过 format/image。

											
										
										
											2026-06-14 16:04:45 +08:00
+								"""API Push 短新闻 — normalize 工具。
 								不走 fetcher 抽象(那是"周期拉取"语义),API Push 是"被动接收"。
 								提供两个纯函数,供 ingest 路由调用:
 								- compute_content_hash(external_id, title, body) -> str
 								- normalize_payload(payload: dict) -> dict(供入库时使用)
 								设计要点:
 								- external_id 存在时,作为主幂等 key(L1)
 								- external_id 缺失时,title+body[:500] 作为兜底指纹(L2)
 								- url 可选;缺失时合成 api-push://{source_slug}/{hash[:16]} 占位
 								- 字段长度校验集中在路由里(返回 400),这里只做归一化
 								"""
 								from __future__ import annotations
 								import hashlib
 								from datetime import datetime, timezone
 								from typing import Any
-												fix(api_push): naive published_at 改当 Asia/Shanghai,不再当 UTC

症状: 财联社/微信等中国源推 ingest 时,payload published_at 通常不带 tz
(naive 字符串),含义是北京时间的墙上时间。
后端之前用 .replace(tzinfo=timezone.utc) 强行当 UTC,导致入库后:
  '2026-06-15 19:58:42' (naive,真意=北京 19:58) →
  PG 存 2026-06-15 19:58:42+00 = 渲染成 2026-06-16 03:58:42+08

前端 dayjs(...).fromNow() 算到当前 +08 时间 20:55 → 显示 '7 小时内'(未来时间)

根因: 中国源给的 naive 字符串实际就是 Asia/Shanghai,不该被当 UTC

修法:
- 新增 zoneinfo + settings.tz 依赖
- naive → settings.tz(默认 Asia/Shanghai),再 astimezone(UTC) 入库
- aware → astimezone(UTC) 归一

注意: 只修未来的新数据;已入库的错位数据需要 backfill(见 PR 后讨论)

											
										
										
											2026-06-15 21:01:45 +08:00
+								from zoneinfo import ZoneInfo
 								from app.config import settings
-												feat(ingest): API Push 短新闻接口层

- POST /api/v1/ingest:鉴权(X-Ingest-Token) + 限速(每 token 2 篇/秒,
  Redis 滑动桶,INGEST_RATE_PER_SEC 可调) + 三层去重(L1 external_id /
  L2 content_hash / L3 DB UNIQUE 兜底,均带 reason)
- 写入字段:is_short_news=True、translation/format/image_ai_status='n/a'、
  classify_status=(有 tags?'ok':'pending')、commentary_{angel,meituan}_status='pending'、
  body_zh_text=body_text(走统一路径,前端/prompt 不用改)
- services/fetchers/api_push.py:compute_content_hash + synthesize_url +
  normalize_published_at + build_initial_status 纯函数
- schemas/ingest.py:IngestPayload(title 1-200/body 1-5000/tags 去重去空) +
  IngestResponse(article_id/content_hash/status/reason/matched_external_id)
- admin.py:POST/GET/DELETE /admin/sources/{id}/ingest-tokens — owner 生成
  (raw_token 仅一次性返回)、列出、撤销
- schemas/article.py:ArticleListItem 加 is_short_news/source_ref;
  ArticleDetail 加 is_short_news/source_ref/external_id
- main.py:挂 ingest router;config.py + .env.example:ingest_rate_per_sec 默认 2

短新闻由 commit 1 enrichment_loop 自动接管 classify + 双 provider commentary,
跳过 format/image。

											
										
										
											2026-06-14 16:04:45 +08:00
 								def compute_content_hash(
 								    *,
 								    external_id: str | None,
 								    title: str,
 								    body: str,
 								) -> str:
 								    """三层去重核心 key。
 								    - external_id 存在:`sha1("ext:" + external_id)` —— 调用方幂等保证,最强
 								    - external_id 缺失:`sha1(title.strip() + "|" + body[:500])` —— 兜底,防尾部噪声
 								    注:body 取原始字符串的前 500 字符,不做 strip。
 								    因为不同长度的 body(200字 vs 2000字)前 500 字符一定相等,这是设计意图 —
 								    仅靠"前 N 字符"判断重复,避免被尾部噪声(URL尾巴/HTML 注释)误判。
 								    """
 								    if external_id:
 								        raw = f"ext:{external_id.strip()}"
 								    else:
 								        raw = f"{title.strip()}|{body[:500]}"
 								    return hashlib.sha1(raw.encode("utf-8")).hexdigest()
 								def synthesize_url(source_slug: str, content_hash: str) -> str:
 								    """短新闻 url 占位(articles.url NOT NULL,需要合成)。"""
 								    return f"api-push://{source_slug}/{content_hash[:16]}"
 								def normalize_published_at(value: Any) -> datetime:
-												fix(api_push): naive published_at 改当 Asia/Shanghai,不再当 UTC

症状: 财联社/微信等中国源推 ingest 时,payload published_at 通常不带 tz
(naive 字符串),含义是北京时间的墙上时间。
后端之前用 .replace(tzinfo=timezone.utc) 强行当 UTC,导致入库后:
  '2026-06-15 19:58:42' (naive,真意=北京 19:58) →
  PG 存 2026-06-15 19:58:42+00 = 渲染成 2026-06-16 03:58:42+08

前端 dayjs(...).fromNow() 算到当前 +08 时间 20:55 → 显示 '7 小时内'(未来时间)

根因: 中国源给的 naive 字符串实际就是 Asia/Shanghai,不该被当 UTC

修法:
- 新增 zoneinfo + settings.tz 依赖
- naive → settings.tz(默认 Asia/Shanghai),再 astimezone(UTC) 入库
- aware → astimezone(UTC) 归一

注意: 只修未来的新数据;已入库的错位数据需要 backfill(见 PR 后讨论)

											
										
										
											2026-06-15 21:01:45 +08:00
+								    """published_at 兜底:无值 → now(本地时区)。
 								    ⚠️ 关于 naive datetime 的时区推断(2026-06-15 fix):
 								    - 财联社/微信/微博 这类**中国源**通过 ingest 推送时,通常传 naive 字符串
 								      (不带 tz 后缀),默认就是 Asia/Shanghai 的"墙上时间"
 								    - 之前的实现把 naive 强加 UTC,导致所有"中国源"新闻 published_at 错位 8 小时
 								      (财联社 19:58 被存为 19:58 UTC = 03:58 +08,前端显示"7 小时内"未来时间)
 								    - 修法:naive 当服务器 settings.tz(默认 Asia/Shanghai)处理,转 UTC 入库
 								    - aware 原样转 UTC 归一
 								    入库统一存 UTC(aware),渲染时按调用方 tz(默认 +08)显示。
 								    """
 								    server_tz = ZoneInfo(settings.tz)
 								    def _to_utc(dt: datetime) -> datetime:
 								        if dt.tzinfo is None:
 								            # naive → 服务器 tz(默认 Asia/Shanghai),再转 UTC
 								            dt = dt.replace(tzinfo=server_tz)
 								        return dt.astimezone(timezone.utc)
-												feat(ingest): API Push 短新闻接口层

- POST /api/v1/ingest:鉴权(X-Ingest-Token) + 限速(每 token 2 篇/秒,
  Redis 滑动桶,INGEST_RATE_PER_SEC 可调) + 三层去重(L1 external_id /
  L2 content_hash / L3 DB UNIQUE 兜底,均带 reason)
- 写入字段:is_short_news=True、translation/format/image_ai_status='n/a'、
  classify_status=(有 tags?'ok':'pending')、commentary_{angel,meituan}_status='pending'、
  body_zh_text=body_text(走统一路径,前端/prompt 不用改)
- services/fetchers/api_push.py:compute_content_hash + synthesize_url +
  normalize_published_at + build_initial_status 纯函数
- schemas/ingest.py:IngestPayload(title 1-200/body 1-5000/tags 去重去空) +
  IngestResponse(article_id/content_hash/status/reason/matched_external_id)
- admin.py:POST/GET/DELETE /admin/sources/{id}/ingest-tokens — owner 生成
  (raw_token 仅一次性返回)、列出、撤销
- schemas/article.py:ArticleListItem 加 is_short_news/source_ref;
  ArticleDetail 加 is_short_news/source_ref/external_id
- main.py:挂 ingest router;config.py + .env.example:ingest_rate_per_sec 默认 2

短新闻由 commit 1 enrichment_loop 自动接管 classify + 双 provider commentary,
跳过 format/image。

											
										
										
											2026-06-14 16:04:45 +08:00
+								    if value is None:
-												fix(api_push): naive published_at 改当 Asia/Shanghai,不再当 UTC

症状: 财联社/微信等中国源推 ingest 时,payload published_at 通常不带 tz
(naive 字符串),含义是北京时间的墙上时间。
后端之前用 .replace(tzinfo=timezone.utc) 强行当 UTC,导致入库后:
  '2026-06-15 19:58:42' (naive,真意=北京 19:58) →
  PG 存 2026-06-15 19:58:42+00 = 渲染成 2026-06-16 03:58:42+08

前端 dayjs(...).fromNow() 算到当前 +08 时间 20:55 → 显示 '7 小时内'(未来时间)

根因: 中国源给的 naive 字符串实际就是 Asia/Shanghai,不该被当 UTC

修法:
- 新增 zoneinfo + settings.tz 依赖
- naive → settings.tz(默认 Asia/Shanghai),再 astimezone(UTC) 入库
- aware → astimezone(UTC) 归一

注意: 只修未来的新数据;已入库的错位数据需要 backfill(见 PR 后讨论)

											
										
										
											2026-06-15 21:01:45 +08:00
+								        return datetime.now(server_tz).astimezone(timezone.utc)
-												feat(ingest): API Push 短新闻接口层

- POST /api/v1/ingest:鉴权(X-Ingest-Token) + 限速(每 token 2 篇/秒,
  Redis 滑动桶,INGEST_RATE_PER_SEC 可调) + 三层去重(L1 external_id /
  L2 content_hash / L3 DB UNIQUE 兜底,均带 reason)
- 写入字段:is_short_news=True、translation/format/image_ai_status='n/a'、
  classify_status=(有 tags?'ok':'pending')、commentary_{angel,meituan}_status='pending'、
  body_zh_text=body_text(走统一路径,前端/prompt 不用改)
- services/fetchers/api_push.py:compute_content_hash + synthesize_url +
  normalize_published_at + build_initial_status 纯函数
- schemas/ingest.py:IngestPayload(title 1-200/body 1-5000/tags 去重去空) +
  IngestResponse(article_id/content_hash/status/reason/matched_external_id)
- admin.py:POST/GET/DELETE /admin/sources/{id}/ingest-tokens — owner 生成
  (raw_token 仅一次性返回)、列出、撤销
- schemas/article.py:ArticleListItem 加 is_short_news/source_ref;
  ArticleDetail 加 is_short_news/source_ref/external_id
- main.py:挂 ingest router;config.py + .env.example:ingest_rate_per_sec 默认 2

短新闻由 commit 1 enrichment_loop 自动接管 classify + 双 provider commentary,
跳过 format/image。

											
										
										
											2026-06-14 16:04:45 +08:00
+								    if isinstance(value, datetime):
-												fix(api_push): naive published_at 改当 Asia/Shanghai,不再当 UTC

症状: 财联社/微信等中国源推 ingest 时,payload published_at 通常不带 tz
(naive 字符串),含义是北京时间的墙上时间。
后端之前用 .replace(tzinfo=timezone.utc) 强行当 UTC,导致入库后:
  '2026-06-15 19:58:42' (naive,真意=北京 19:58) →
  PG 存 2026-06-15 19:58:42+00 = 渲染成 2026-06-16 03:58:42+08

前端 dayjs(...).fromNow() 算到当前 +08 时间 20:55 → 显示 '7 小时内'(未来时间)

根因: 中国源给的 naive 字符串实际就是 Asia/Shanghai,不该被当 UTC

修法:
- 新增 zoneinfo + settings.tz 依赖
- naive → settings.tz(默认 Asia/Shanghai),再 astimezone(UTC) 入库
- aware → astimezone(UTC) 归一

注意: 只修未来的新数据;已入库的错位数据需要 backfill(见 PR 后讨论)

											
										
										
											2026-06-15 21:01:45 +08:00
+								        return _to_utc(value)
-												feat(ingest): API Push 短新闻接口层

- POST /api/v1/ingest:鉴权(X-Ingest-Token) + 限速(每 token 2 篇/秒,
  Redis 滑动桶,INGEST_RATE_PER_SEC 可调) + 三层去重(L1 external_id /
  L2 content_hash / L3 DB UNIQUE 兜底,均带 reason)
- 写入字段:is_short_news=True、translation/format/image_ai_status='n/a'、
  classify_status=(有 tags?'ok':'pending')、commentary_{angel,meituan}_status='pending'、
  body_zh_text=body_text(走统一路径,前端/prompt 不用改)
- services/fetchers/api_push.py:compute_content_hash + synthesize_url +
  normalize_published_at + build_initial_status 纯函数
- schemas/ingest.py:IngestPayload(title 1-200/body 1-5000/tags 去重去空) +
  IngestResponse(article_id/content_hash/status/reason/matched_external_id)
- admin.py:POST/GET/DELETE /admin/sources/{id}/ingest-tokens — owner 生成
  (raw_token 仅一次性返回)、列出、撤销
- schemas/article.py:ArticleListItem 加 is_short_news/source_ref;
  ArticleDetail 加 is_short_news/source_ref/external_id
- main.py:挂 ingest router;config.py + .env.example:ingest_rate_per_sec 默认 2

短新闻由 commit 1 enrichment_loop 自动接管 classify + 双 provider commentary,
跳过 format/image。

											
										
										
											2026-06-14 16:04:45 +08:00
+								    if isinstance(value, str):
 								        try:
 								            # fromisoformat 在 3.11+ 支持 'Z' 后缀;3.12 没问题
 								            dt = datetime.fromisoformat(value.replace("Z", "+00:00"))
-												fix(api_push): naive published_at 改当 Asia/Shanghai,不再当 UTC

症状: 财联社/微信等中国源推 ingest 时,payload published_at 通常不带 tz
(naive 字符串),含义是北京时间的墙上时间。
后端之前用 .replace(tzinfo=timezone.utc) 强行当 UTC,导致入库后:
  '2026-06-15 19:58:42' (naive,真意=北京 19:58) →
  PG 存 2026-06-15 19:58:42+00 = 渲染成 2026-06-16 03:58:42+08

前端 dayjs(...).fromNow() 算到当前 +08 时间 20:55 → 显示 '7 小时内'(未来时间)

根因: 中国源给的 naive 字符串实际就是 Asia/Shanghai,不该被当 UTC

修法:
- 新增 zoneinfo + settings.tz 依赖
- naive → settings.tz(默认 Asia/Shanghai),再 astimezone(UTC) 入库
- aware → astimezone(UTC) 归一

注意: 只修未来的新数据;已入库的错位数据需要 backfill(见 PR 后讨论)

											
										
										
											2026-06-15 21:01:45 +08:00
+								            return _to_utc(dt)
-												feat(ingest): API Push 短新闻接口层

- POST /api/v1/ingest:鉴权(X-Ingest-Token) + 限速(每 token 2 篇/秒,
  Redis 滑动桶,INGEST_RATE_PER_SEC 可调) + 三层去重(L1 external_id /
  L2 content_hash / L3 DB UNIQUE 兜底,均带 reason)
- 写入字段:is_short_news=True、translation/format/image_ai_status='n/a'、
  classify_status=(有 tags?'ok':'pending')、commentary_{angel,meituan}_status='pending'、
  body_zh_text=body_text(走统一路径,前端/prompt 不用改)
- services/fetchers/api_push.py:compute_content_hash + synthesize_url +
  normalize_published_at + build_initial_status 纯函数
- schemas/ingest.py:IngestPayload(title 1-200/body 1-5000/tags 去重去空) +
  IngestResponse(article_id/content_hash/status/reason/matched_external_id)
- admin.py:POST/GET/DELETE /admin/sources/{id}/ingest-tokens — owner 生成
  (raw_token 仅一次性返回)、列出、撤销
- schemas/article.py:ArticleListItem 加 is_short_news/source_ref;
  ArticleDetail 加 is_short_news/source_ref/external_id
- main.py:挂 ingest router;config.py + .env.example:ingest_rate_per_sec 默认 2

短新闻由 commit 1 enrichment_loop 自动接管 classify + 双 provider commentary,
跳过 format/image。

											
										
										
											2026-06-14 16:04:45 +08:00
+								        except ValueError:
 								            # 解析失败兜底为 now;路由层校验会先于 normalize 跑,pydantic 应该已经报 400 了
-												fix(api_push): naive published_at 改当 Asia/Shanghai,不再当 UTC

症状: 财联社/微信等中国源推 ingest 时,payload published_at 通常不带 tz
(naive 字符串),含义是北京时间的墙上时间。
后端之前用 .replace(tzinfo=timezone.utc) 强行当 UTC,导致入库后:
  '2026-06-15 19:58:42' (naive,真意=北京 19:58) →
  PG 存 2026-06-15 19:58:42+00 = 渲染成 2026-06-16 03:58:42+08

前端 dayjs(...).fromNow() 算到当前 +08 时间 20:55 → 显示 '7 小时内'(未来时间)

根因: 中国源给的 naive 字符串实际就是 Asia/Shanghai,不该被当 UTC

修法:
- 新增 zoneinfo + settings.tz 依赖
- naive → settings.tz(默认 Asia/Shanghai),再 astimezone(UTC) 入库
- aware → astimezone(UTC) 归一

注意: 只修未来的新数据;已入库的错位数据需要 backfill(见 PR 后讨论)

											
										
										
											2026-06-15 21:01:45 +08:00
+								            return datetime.now(server_tz).astimezone(timezone.utc)
 								    return datetime.now(server_tz).astimezone(timezone.utc)
-												feat(ingest): API Push 短新闻接口层

- POST /api/v1/ingest:鉴权(X-Ingest-Token) + 限速(每 token 2 篇/秒,
  Redis 滑动桶,INGEST_RATE_PER_SEC 可调) + 三层去重(L1 external_id /
  L2 content_hash / L3 DB UNIQUE 兜底,均带 reason)
- 写入字段:is_short_news=True、translation/format/image_ai_status='n/a'、
  classify_status=(有 tags?'ok':'pending')、commentary_{angel,meituan}_status='pending'、
  body_zh_text=body_text(走统一路径,前端/prompt 不用改)
- services/fetchers/api_push.py:compute_content_hash + synthesize_url +
  normalize_published_at + build_initial_status 纯函数
- schemas/ingest.py:IngestPayload(title 1-200/body 1-5000/tags 去重去空) +
  IngestResponse(article_id/content_hash/status/reason/matched_external_id)
- admin.py:POST/GET/DELETE /admin/sources/{id}/ingest-tokens — owner 生成
  (raw_token 仅一次性返回)、列出、撤销
- schemas/article.py:ArticleListItem 加 is_short_news/source_ref;
  ArticleDetail 加 is_short_news/source_ref/external_id
- main.py:挂 ingest router;config.py + .env.example:ingest_rate_per_sec 默认 2

短新闻由 commit 1 enrichment_loop 自动接管 classify + 双 provider commentary,
跳过 format/image。

											
										
										
											2026-06-14 16:04:45 +08:00
 								def build_initial_status(*, has_tags: bool) -> dict[str, str]:
 								    """返回 enrich 状态字段的初始值。
 								    - has_tags=True → classify_status='ok'(直接用 tags 当分类,不浪费 LLM 调用)
 								    - has_tags=False → classify_status='pending'(enrichment_loop 会跑 classify)
 								    - 其他:*_status='n/a' 或 'pending',具体见 commit 1 enrichment_article 的跳过逻辑
 								    """
 								    return {
 								        "translation_status": "n/a",  # 跳过翻译(中文原生)
 								        "format_status": "n/a",  # 跳过排版(短文不需要)
 								        "image_ai_status": "n/a",  # 跳过插图(用户明确不要)
 								        "classify_status": "ok" if has_tags else "pending",
 								        "commentary_status": "pending",  # 双 provider 评论都跑
 								        "commentary_meituan_status": "pending",
 								    }