From 76e95908e83743e4274ec23b44fa45791163b9ed Mon Sep 17 00:00:00 2001 From: Mavis Date: Tue, 9 Jun 2026 15:14:53 +0800 Subject: [PATCH] =?UTF-8?q?fix(llm):=20=5Fsafe=5Fformat=20=E9=98=B2=20Valu?= =?UTF-8?q?eError,=E6=A8=A1=E6=9D=BF=E9=87=8C=E7=A4=BA=E4=BE=8B=20JSON=20?= =?UTF-8?q?=E4=B9=9F=E8=83=BD=E6=AD=A3=E5=B8=B8=20format?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bug: classify_prompt 默认值里含示例 JSON {\\"categories\\": [...]},str.format 看到花括号就试图解析为 placeholder/format spec,遇到 \\" 时抛: ValueError: Invalid format specifier ' [\\"时政\\"]' for object of type 'str' 修复: - 引入 placeholder_re 提取所有合法 {varname} 占位符,stash 成 sentinel - 剩余的 { / } 一律 escape 成 {{ / }},str.format 自然还原 - 用户显式写的 {{ / }}(标准转义)单独 stash,不被重复 escape - 极端情况(KeyError/IndexError/ValueError)兜底:按原文返回,记录 warn 8 个本地单测全过(含示例 JSON 模板 / 老 prompt 缺变量 / 用户显式 {{ 场景) --- backend/app/services/llm/enrichment.py | 44 ++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/backend/app/services/llm/enrichment.py b/backend/app/services/llm/enrichment.py index d27d7e1..00192ac 100644 --- a/backend/app/services/llm/enrichment.py +++ b/backend/app/services/llm/enrichment.py @@ -77,11 +77,49 @@ def _safe_format(template: str, vars_: Mapping[str, Any]) -> str: 用途:数据库里用户已存的 prompt 模板可能是旧版的(只支持部分占位符), 新代码传了更多变量也不应崩。 + + 防御: + - 模板里出现的非占位符 `{` / `}`(比如示例 JSON `{"k": "v"}`)会被先 escape 成 `{{` / `}}`, + 避免 str.format 误解析为占位符/格式说明符而抛 ValueError。 + - 用户显式写的 `{{` / `}}`(标准 str.format 转义语法)会被原样保留,不被重复 escape。 """ + import re + + placeholder_re = re.compile(r"\{([A-Za-z_][A-Za-z0-9_.\[\]]*)\}") + sentinels: list[str] = [] + sentinel_map: dict[str, str] = {} + user_escape: list[str] = [] + + def _stash(m: re.Match) -> str: + name = m.group(1) + s = f"\x00PH{len(sentinels)}\x00" + sentinels.append(name) + sentinel_map[s] = name + return s + + def _stash_brace(s: str) -> str: + sentinel = f"\x00UE{len(user_escape)}\x00" + user_escape.append(s) + return sentinel + + # 1) 先 stash 用户显式 {{ / }} + staged = template.replace("{{", _stash_brace("{{")).replace("}}", _stash_brace("}}")) + # 2) stash 合法占位符 + staged = placeholder_re.sub(_stash, staged) + # 3) escape 剩下的单个 { / }(示例 JSON 等字面量) + escaped = staged.replace("{", "{{").replace("}", "}}") + # 4) 还原占位符 + final = escaped + for s, name in sentinel_map.items(): + final = final.replace(s, "{" + name + "}") + # 5) 还原用户显式 {{ / }} + for i, raw in enumerate(user_escape): + final = final.replace(f"\x00UE{i}\x00", raw) + try: - return template.format_map(_SafeDict(vars_)) - except (KeyError, IndexError) as e: - # 极端情况(比如 {} 这种非法占位符)兜底 + return final.format_map(_SafeDict(vars_)) + except (KeyError, IndexError, ValueError) as e: + # 极端情况兜底:按原文返回 logger.warning("_safe_format 解析失败,按原文返回: %s", e) return template