feat(llm): classify 前置 + 黑名单 drop 删文章 + 排版用 .diary-para
- enrichment._enrich_classify 前置,返回 (drop, categories)
- 注入 {blocklist} 占位符到 prompt(全局 + per-source 合并)
- drop=True → 整篇 DELETE,后 3 步直接 skip
- 兜底:即使 LLM 没正确返回 drop 字段,本地也匹配一次
- enrichment._enrich_format 排版段落 class 名固定为 diary-para
- CSS 仍内联到 style,前端 .diary-para 兜底
- enrichment._merge_blocklist: 全局 + per-source 合并去重保序
- schemas/llm.LlmSettingOut/Update 暴露 blocklist_tags
- DEFAULT_PROMPTS.classify_prompt 加 {blocklist} + drop 字段说明
This commit is contained in:
@@ -18,6 +18,8 @@ class LlmSettingOut(BaseModel):
|
|||||||
image_model: str = "agnes-image-2.1-flash"
|
image_model: str = "agnes-image-2.1-flash"
|
||||||
interval_sec: float = 2.0
|
interval_sec: float = 2.0
|
||||||
enabled: bool = True
|
enabled: bool = True
|
||||||
|
# 全局屏蔽分类标签;与 sources.blocklist_tags 合并后注入 classify prompt
|
||||||
|
blocklist_tags: list[str] = []
|
||||||
updated_at: datetime | None = None
|
updated_at: datetime | None = None
|
||||||
|
|
||||||
|
|
||||||
@@ -33,6 +35,7 @@ class LlmSettingUpdate(BaseModel):
|
|||||||
image_model: str | None = Field(default=None, min_length=1, max_length=64)
|
image_model: str | None = Field(default=None, min_length=1, max_length=64)
|
||||||
interval_sec: float | None = Field(default=None, ge=0.0, le=60.0)
|
interval_sec: float | None = Field(default=None, ge=0.0, le=60.0)
|
||||||
enabled: bool | None = None
|
enabled: bool | None = None
|
||||||
|
blocklist_tags: list[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
# === 默认提示词(模板,用户可改)===
|
# === 默认提示词(模板,用户可改)===
|
||||||
@@ -50,7 +53,10 @@ DEFAULT_PROMPTS = {
|
|||||||
"classify_prompt": (
|
"classify_prompt": (
|
||||||
"你是新闻分类助手。请阅读以下新闻,返回 2-5 个最相关的分类标签(多标签)。\n"
|
"你是新闻分类助手。请阅读以下新闻,返回 2-5 个最相关的分类标签(多标签)。\n"
|
||||||
"可选标签(可自由组合,不限于此): 时政 / 经济 / 科技 / 军事 / 社会 / 国际 / 体育 / 文化 / 环境 / 健康 / 金融 / 能源 / 气候\n"
|
"可选标签(可自由组合,不限于此): 时政 / 经济 / 科技 / 军事 / 社会 / 国际 / 体育 / 文化 / 环境 / 健康 / 金融 / 能源 / 气候\n"
|
||||||
"严格要求:只返回 JSON,形如 {\"categories\": [\"时政\", \"国际\", \"经济\"]},不要其他内容。\n\n"
|
"黑名单分类(若新闻属于或主要围绕这些领域,务必将 drop 设为 true): {blocklist}\n"
|
||||||
|
"严格要求:只返回 JSON,形如 {\"categories\": [\"时政\", \"国际\", \"经济\"], \"drop\": false},"
|
||||||
|
"若新闻属于或主要围绕黑名单中的任何分类,将 drop 设为 true 并把该分类放入 categories。"
|
||||||
|
"不要其他内容。\n\n"
|
||||||
"标题:{title}\n摘要:{summary}\n正文(节选):{body}\n"
|
"标题:{title}\n摘要:{summary}\n正文(节选):{body}\n"
|
||||||
),
|
),
|
||||||
"commentary_prompt": (
|
"commentary_prompt": (
|
||||||
|
|||||||
@@ -1,8 +1,8 @@
|
|||||||
"""LLM 智能增强服务(翻译后调)。
|
"""LLM 智能增强服务(翻译后调)。
|
||||||
|
|
||||||
4 个独立任务:
|
4 个独立任务(按顺序):
|
||||||
1. format — 排版译文(写入 body_zh_formatted)
|
1. classify — 分类 + 黑名单 gate(命中则删文章,后 3 步跳过)
|
||||||
2. classify — 分类(写入 category,多标签)
|
2. format — 排版译文(写入 body_zh_formatted,容器用 .article-body + 段落 .diary-para)
|
||||||
3. image — 生成插图(写入 image_ai_url,prompt 用正文第一段)
|
3. image — 生成插图(写入 image_ai_url,prompt 用正文第一段)
|
||||||
4. commentary — 写点评(写入 commentary)
|
4. commentary — 写点评(写入 commentary)
|
||||||
|
|
||||||
@@ -11,10 +11,15 @@
|
|||||||
- 字号: 17px
|
- 字号: 17px
|
||||||
- 行高: 1.7
|
- 行高: 1.7
|
||||||
- 颜色: #3e3e3e
|
- 颜色: #3e3e3e
|
||||||
- 段落: margin-bottom 1.5em(自动空一行)
|
- 段落: margin-bottom 1.5em(自动空一行);class 名固定为 diary-para
|
||||||
|
|
||||||
|
黑名单机制:
|
||||||
|
- classify 任务合并 llm_settings.blocklist_tags(全局) + source.blocklist_tags(per-source)
|
||||||
|
- 注入到 prompt 的 {blocklist} 占位符,LLM 返回 {"drop": true, "categories": [...]} 则删文章
|
||||||
|
- 合并去重后为空 → classify 任务只产出 categories 不产出 drop
|
||||||
|
|
||||||
设计:
|
设计:
|
||||||
- 任务入口: enrich_article(article_id, settings_row)
|
- 任务入口: enrich_article(article_id)
|
||||||
- 任务间互不影响:每个任务独立 try/except + 写 status
|
- 任务间互不影响:每个任务独立 try/except + 写 status
|
||||||
- 全部任务共走 LlmClient 的全局限速
|
- 全部任务共走 LlmClient 的全局限速
|
||||||
- 若设置 enabled=False,只跳过(不调 LLM)
|
- 若设置 enabled=False,只跳过(不调 LLM)
|
||||||
@@ -31,6 +36,7 @@ from sqlalchemy import select
|
|||||||
from app.database import AsyncSessionLocal
|
from app.database import AsyncSessionLocal
|
||||||
from app.models.article import Article
|
from app.models.article import Article
|
||||||
from app.models.llm_setting import LlmSetting
|
from app.models.llm_setting import LlmSetting
|
||||||
|
from app.models.source import Source
|
||||||
from app.schemas.llm import get_default_prompts
|
from app.schemas.llm import get_default_prompts
|
||||||
from app.services.llm.client import LlmClient
|
from app.services.llm.client import LlmClient
|
||||||
|
|
||||||
@@ -48,6 +54,9 @@ ARTICLE_BODY_LINE_HEIGHT = "1.7"
|
|||||||
ARTICLE_BODY_COLOR = "#3e3e3e"
|
ARTICLE_BODY_COLOR = "#3e3e3e"
|
||||||
ARTICLE_BODY_P_MARGIN_BOTTOM = "1.5em"
|
ARTICLE_BODY_P_MARGIN_BOTTOM = "1.5em"
|
||||||
|
|
||||||
|
# === 排版段落 class 名(项目级固定,前端 .diary-para 兜底)===
|
||||||
|
DIARY_PARA_CLASS = "diary-para"
|
||||||
|
|
||||||
# === 插图默认尺寸(适中,不再用 1024x768)===
|
# === 插图默认尺寸(适中,不再用 1024x768)===
|
||||||
# 写死到 enrichment 里,行为稳定;setting.image_size 仍可由用户在 UI 改,
|
# 写死到 enrichment 里,行为稳定;setting.image_size 仍可由用户在 UI 改,
|
||||||
# 但默认行为不依赖它,避免意外被改成很大。
|
# 但默认行为不依赖它,避免意外被改成很大。
|
||||||
@@ -107,8 +116,13 @@ async def _enrich_format(article: Article, setting: LlmSetting, client: LlmClien
|
|||||||
temperature=0.3,
|
temperature=0.3,
|
||||||
max_tokens=2000,
|
max_tokens=2000,
|
||||||
)
|
)
|
||||||
# 极简 HTML 包裹:按段切 + <p>,整体包到带固定 CSS 的 <div> 里
|
# 段落 class 名固定为 diary-para(项目级固定,前端 .diary-para 兜底);
|
||||||
parts = [f"<p>{p.strip()}</p>" for p in text.split("\n\n") if p.strip()]
|
# CSS 仍内联到 style,保证分享/导出场景不丢。
|
||||||
|
parts = [
|
||||||
|
f'<p class="{DIARY_PARA_CLASS}">{p.strip()}</p>'
|
||||||
|
for p in text.split("\n\n")
|
||||||
|
if p.strip()
|
||||||
|
]
|
||||||
if not parts:
|
if not parts:
|
||||||
article.body_zh_formatted = None
|
article.body_zh_formatted = None
|
||||||
else:
|
else:
|
||||||
@@ -129,30 +143,67 @@ def _wrap_article_body(inner_html: str) -> str:
|
|||||||
)
|
)
|
||||||
# 段落样式也内联,保证 v-html 渲染时一定生效
|
# 段落样式也内联,保证 v-html 渲染时一定生效
|
||||||
p_style = f"margin:0 0 {ARTICLE_BODY_P_MARGIN_BOTTOM} 0;"
|
p_style = f"margin:0 0 {ARTICLE_BODY_P_MARGIN_BOTTOM} 0;"
|
||||||
inner_with_p_style = inner_html.replace("<p>", f'<p style="{p_style}">')
|
# 内层 HTML 的 <p class="diary-para" ...> 形式;把 style 插到 class 后面
|
||||||
|
inner_with_p_style = inner_html.replace(
|
||||||
|
f'<p class="{DIARY_PARA_CLASS}">',
|
||||||
|
f'<p class="{DIARY_PARA_CLASS}" style="{p_style}">',
|
||||||
|
)
|
||||||
return f'<div class="article-body" style="{inline_style}">{inner_with_p_style}</div>'
|
return f'<div class="article-body" style="{inline_style}">{inner_with_p_style}</div>'
|
||||||
|
|
||||||
|
|
||||||
# === 单任务:classify ===
|
# === 单任务:classify (含黑名单 drop gate) ===
|
||||||
async def _enrich_classify(
    article: Article,
    setting: LlmSetting,
    client: LlmClient,
    blocklist: list[str],
) -> tuple[bool, list[str]]:
    """Classify the article and evaluate the blocklist drop gate.

    Args:
        article: Article to classify; its title/summary/body fields are read
            (truncated) to build the prompt. The article is not modified here.
        setting: LLM settings row; ``classify_prompt`` overrides the default
            prompt template when set.
        client: LLM client used for the JSON classification call.
        blocklist: Merged (global + per-source) blocked category tags.

    Returns:
        ``(drop, categories)``:
        - ``drop`` is True when the article should be deleted because it hits
          the blocklist. It is always False when the blocklist is empty, so a
          stray ``drop`` from the model can never delete an article on its own.
        - ``categories`` is the cleaned multi-label list (capped at
          ``DEFAULT_IMAGE_MAX_TAGS``) that the caller writes to
          ``article.category``.
    """
    template = setting.classify_prompt or get_default_prompts()["classify_prompt"]
    # Older user-edited prompts may only contain {title}/{summary} and lack
    # {body} / {blocklist}; _safe_format is the fallback for that case.
    vars_ = {
        "title": (article.title_zh or article.title)[:200],
        "summary": (article.summary_zh or "")[:400],
        "body": (article.body_zh_text or "")[:1500],
        "blocklist": "、".join(blocklist) if blocklist else "(无)",
    }
    prompt = _safe_format(template, vars_)
    result = await client.classify_json(
        system="你是新闻分类助手,只返回 JSON。",
        user=prompt,
    )

    cats_raw = result.get("categories") or result.get("tags") or []
    cats: list[str] = []
    if isinstance(cats_raw, list):
        cats = [str(c).strip() for c in cats_raw[:DEFAULT_IMAGE_MAX_TAGS] if str(c).strip()]

    # Normalise the blocklist once; blank entries never count as a hit.
    bl_set = {b.strip() for b in blocklist if b and b.strip()}
    if not bl_set:
        # No effective blocklist: classification only, never drop.
        return False, cats

    # The model may emit the flag as a JSON string ("false"/"true") instead of
    # a boolean. bool("false") is True, which would delete the article, so
    # string values are parsed explicitly.
    raw_drop = result.get("drop")
    if isinstance(raw_drop, str):
        drop_flag = raw_drop.strip().lower() in {"true", "1", "yes"}
    else:
        drop_flag = bool(raw_drop)

    if not drop_flag:
        # Local fallback: even if the model did not return a usable drop
        # field, drop when any returned category is itself blocklisted.
        drop_flag = any(c in bl_set for c in cats)
    return drop_flag, cats
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_blocklist(setting: LlmSetting, source: Source | None) -> list[str]:
    """Merge the global and per-source blocklists into one ordered list.

    Tags are stripped of surrounding whitespace; empty/None entries are
    ignored. Duplicates are removed while keeping first-seen order, with the
    global tags (``setting.blocklist_tags``) ahead of the per-source tags
    (``source.blocklist_tags``). ``source`` may be None, in which case only
    the global tags are used.
    """
    tag_lists: list[list[str]] = [setting.blocklist_tags or []]
    if source is not None:
        tag_lists.append(source.blocklist_tags or [])

    # A dict keeps insertion order, so it serves as an ordered set here.
    merged: dict[str, None] = {}
    for tags in tag_lists:
        for raw in tags:
            tag = (raw or "").strip()
            if tag:
                merged.setdefault(tag, None)
    return list(merged)
|
||||||
|
|
||||||
|
|
||||||
# === 单任务:image ===
|
# === 单任务:image ===
|
||||||
@@ -205,6 +256,10 @@ async def _enrich_commentary(article: Article, setting: LlmSetting, client: LlmC
|
|||||||
async def enrich_article(article_id: int) -> dict[str, str]:
|
async def enrich_article(article_id: int) -> dict[str, str]:
|
||||||
"""对单篇文章做 4 项 LLM 增强。
|
"""对单篇文章做 4 项 LLM 增强。
|
||||||
|
|
||||||
|
顺序:classify(黑名单 gate) → format → image → commentary
|
||||||
|
- classify 命中 blocklist → 整篇文章 DELETE,后续任务直接 return
|
||||||
|
- 任一任务失败,只标 status 不影响其他任务
|
||||||
|
|
||||||
返回 {task: status} 字典(用于日志)。
|
返回 {task: status} 字典(用于日志)。
|
||||||
"""
|
"""
|
||||||
async with AsyncSessionLocal() as session:
|
async with AsyncSessionLocal() as session:
|
||||||
@@ -240,30 +295,54 @@ async def enrich_article(article_id: int) -> dict[str, str]:
|
|||||||
if not art:
|
if not art:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
# 4 个任务(互不影响);format / classify / commentary 是 chat,image 是 image
|
# === 1) classify(黑名单 gate,优先执行)===
|
||||||
# 串行执行(已经过 client 内部 Semaphore),但每个 try/except 独立
|
blocklist = _merge_blocklist(setting, art.source if art.source_id else None)
|
||||||
tasks: list[tuple[str, Any]] = [
|
try:
|
||||||
("format", _enrich_format(art, setting, client)),
|
drop, cats = await _enrich_classify(art, setting, client, blocklist)
|
||||||
("classify", _enrich_classify(art, setting, client)),
|
art.classify_status = "ok"
|
||||||
("image", _enrich_image(art, setting, client)),
|
if cats:
|
||||||
("commentary", _enrich_commentary(art, setting, client)),
|
art.category = ",".join(cats)[:64] or None
|
||||||
]
|
if drop:
|
||||||
for name, coro in tasks:
|
# 命中 blocklist → 删文章,后续 3 步全跳
|
||||||
try:
|
logger.info(
|
||||||
await coro
|
"enrich_article id=%s dropped (blocklist hit, cats=%s, blocklist=%s)",
|
||||||
results[name] = "ok"
|
article_id, cats, blocklist,
|
||||||
except Exception as e:
|
)
|
||||||
logger.exception("enrich %s failed for article %s: %s", name, article_id, e)
|
await session.delete(art)
|
||||||
results[name] = f"failed:{type(e).__name__}"
|
await session.commit()
|
||||||
# 标 status
|
return {"classify": "dropped", "format": "skipped", "image": "skipped", "commentary": "skipped"}
|
||||||
if name == "format":
|
except Exception as e:
|
||||||
art.format_status = "failed"
|
logger.exception("enrich classify failed for article %s: %s", article_id, e)
|
||||||
elif name == "classify":
|
art.classify_status = "failed"
|
||||||
art.classify_status = "failed"
|
results["classify"] = f"failed:{type(e).__name__}"
|
||||||
elif name == "image":
|
# classify 失败也继续(format/image/commentary 还能跑)
|
||||||
art.image_ai_status = "failed"
|
|
||||||
elif name == "commentary":
|
# === 2) format ===
|
||||||
art.commentary_status = "failed"
|
try:
|
||||||
|
await _enrich_format(art, setting, client)
|
||||||
|
results["format"] = "ok"
|
||||||
|
except Exception as e:
|
||||||
|
logger.exception("enrich format failed for article %s: %s", article_id, e)
|
||||||
|
art.format_status = "failed"
|
||||||
|
results["format"] = f"failed:{type(e).__name__}"
|
||||||
|
|
||||||
|
# === 3) image ===
|
||||||
|
try:
|
||||||
|
await _enrich_image(art, setting, client)
|
||||||
|
results["image"] = "ok"
|
||||||
|
except Exception as e:
|
||||||
|
logger.exception("enrich image failed for article %s: %s", article_id, e)
|
||||||
|
art.image_ai_status = "failed"
|
||||||
|
results["image"] = f"failed:{type(e).__name__}"
|
||||||
|
|
||||||
|
# === 4) commentary ===
|
||||||
|
try:
|
||||||
|
await _enrich_commentary(art, setting, client)
|
||||||
|
results["commentary"] = "ok"
|
||||||
|
except Exception as e:
|
||||||
|
logger.exception("enrich commentary failed for article %s: %s", article_id, e)
|
||||||
|
art.commentary_status = "failed"
|
||||||
|
results["commentary"] = f"failed:{type(e).__name__}"
|
||||||
|
|
||||||
await session.commit()
|
await session.commit()
|
||||||
logger.info("enrich_article id=%s: %s", article_id, results)
|
logger.info("enrich_article id=%s: %s", article_id, results)
|
||||||
|
|||||||
Reference in New Issue
Block a user