2026-06-14 16:04:45 +08:00
|
|
|
|
"""API Push 短新闻 — normalize 工具。
|
|
|
|
|
|
|
|
|
|
|
|
不走 fetcher 抽象(那是"周期拉取"语义),API Push 是"被动接收"。
|
|
|
|
|
|
提供两个纯函数,供 ingest 路由调用:
|
|
|
|
|
|
|
|
|
|
|
|
- compute_content_hash(external_id, title, body) -> str
|
|
|
|
|
|
- normalize_payload(payload: dict) -> dict(供入库时使用)
|
|
|
|
|
|
|
|
|
|
|
|
设计要点:
|
|
|
|
|
|
- external_id 存在时,作为主幂等 key(L1)
|
|
|
|
|
|
- external_id 缺失时,title+body[:500] 作为兜底指纹(L2)
|
|
|
|
|
|
- url 可选;缺失时合成 api-push://{source_slug}/{hash[:16]} 占位
|
|
|
|
|
|
- 字段长度校验集中在路由里(返回 400),这里只做归一化
|
|
|
|
|
|
"""
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
|
|
import hashlib
|
|
|
|
|
|
from datetime import datetime, timezone
|
|
|
|
|
|
from typing import Any
|
2026-06-15 21:01:45 +08:00
|
|
|
|
from zoneinfo import ZoneInfo
|
|
|
|
|
|
|
|
|
|
|
|
from app.config import settings
|
2026-06-14 16:04:45 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def compute_content_hash(
|
|
|
|
|
|
*,
|
|
|
|
|
|
external_id: str | None,
|
|
|
|
|
|
title: str,
|
|
|
|
|
|
body: str,
|
|
|
|
|
|
) -> str:
|
|
|
|
|
|
"""三层去重核心 key。
|
|
|
|
|
|
|
|
|
|
|
|
- external_id 存在:`sha1("ext:" + external_id)` —— 调用方幂等保证,最强
|
|
|
|
|
|
- external_id 缺失:`sha1(title.strip() + "|" + body[:500])` —— 兜底,防尾部噪声
|
|
|
|
|
|
|
|
|
|
|
|
注:body 取原始字符串的前 500 字符,不做 strip。
|
|
|
|
|
|
因为不同长度的 body(200字 vs 2000字)前 500 字符一定相等,这是设计意图 —
|
|
|
|
|
|
仅靠"前 N 字符"判断重复,避免被尾部噪声(URL尾巴/HTML 注释)误判。
|
|
|
|
|
|
"""
|
|
|
|
|
|
if external_id:
|
|
|
|
|
|
raw = f"ext:{external_id.strip()}"
|
|
|
|
|
|
else:
|
|
|
|
|
|
raw = f"{title.strip()}|{body[:500]}"
|
|
|
|
|
|
return hashlib.sha1(raw.encode("utf-8")).hexdigest()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def synthesize_url(source_slug: str, content_hash: str) -> str:
    """Build a placeholder URL for a pushed item that arrived without one.

    The result has the form ``api-push://{source_slug}/{prefix}``, where
    ``prefix`` is the first 16 characters of ``content_hash``.
    """
    hash_prefix = content_hash[:16]
    return "api-push://{}/{}".format(source_slug, hash_prefix)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def normalize_published_at(value: Any) -> datetime:
    """Coerce ``published_at`` into a timezone-aware UTC datetime.

    Handling by input:

    - ``datetime``: a naive value is interpreted in the zone named by
      ``settings.tz`` and then converted to UTC; an aware value is converted
      to UTC directly.
    - ``str``: parsed with ``datetime.fromisoformat`` after replacing ``"Z"``
      with ``"+00:00"``, then treated like a ``datetime`` above. If parsing
      raises ``ValueError`` the current time is returned instead.
    - ``None`` or any other type: the current time is returned.

    Background carried over from the original notes (2026-06-15 fix): naive
    timestamps used to be stamped as UTC, which shifted items from sources
    that send local wall-clock times by the zone offset. Naive values are
    therefore attached to the configured server zone before conversion.

    Returns:
        An aware ``datetime`` whose tzinfo is ``timezone.utc``.
    """
    local_zone = ZoneInfo(settings.tz)

    def _current_utc() -> datetime:
        # Fallback timestamp: "now", expressed in UTC.
        return datetime.now(local_zone).astimezone(timezone.utc)

    def _as_utc(moment: datetime) -> datetime:
        # Naive values carry no zone; read them as server-zone wall time.
        if moment.tzinfo is None:
            moment = moment.replace(tzinfo=local_zone)
        return moment.astimezone(timezone.utc)

    if isinstance(value, datetime):
        return _as_utc(value)

    if isinstance(value, str):
        try:
            # Map a "Z" suffix to an explicit offset before parsing.
            parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
            return _as_utc(parsed)
        except ValueError:
            # Best-effort: an unparsable string falls back to "now". The
            # original note expects route-level validation to have rejected
            # such input already.
            return _current_utc()

    # None and unsupported types both fall back to the current time.
    return _current_utc()
|
2026-06-14 16:04:45 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_initial_status(*, has_tags: bool) -> dict[str, str]:
    """Return the initial enrichment status fields for a pushed item.

    - ``translation_status``, ``format_status`` and ``image_ai_status`` are
      always ``"n/a"``.
    - ``classify_status`` is ``"ok"`` when ``has_tags`` is truthy (the tags
      stand in for classification) and ``"pending"`` otherwise.
    - ``commentary_status`` and ``commentary_meituan_status`` are always
      ``"pending"``.

    Args:
        has_tags: Whether the item arrived with tags.

    Returns:
        A new dict mapping each status field name to its initial value.
    """
    if has_tags:
        classify_state = "ok"
    else:
        classify_state = "pending"

    statuses: dict[str, str] = {}
    # Steps marked "n/a" per the original notes: translation (content is
    # natively Chinese), formatting (short text), AI illustration (not wanted).
    statuses["translation_status"] = "n/a"
    statuses["format_status"] = "n/a"
    statuses["image_ai_status"] = "n/a"
    statuses["classify_status"] = classify_state
    # Both commentary fields start out pending.
    statuses["commentary_status"] = "pending"
    statuses["commentary_meituan_status"] = "pending"
    return statuses
|