Files
diary-news/backend/app/services/search.py
mavis 85c05c19a7 refactor(search): 只展示 keyword 续接词,去掉 titles 段
产品决定:搜索建议只展示 ts_stat 高频词续接(如'美'→美国/美军/美国政府),
不要真实文章 id 提示(用户认为这种'文章#566871'是噪音,没连续性)。

改动:
- SearchSuggestionsResponse 去 title,只剩 query + keywords
- SearchService 只查 search_keywords,fallback 路径也只针对 keywords
- Feed.vue: 删掉 suggestTitles 状态 + SuggestTitleOption 类型联合,
  renderSuggestion 简化成 '词' 标签 + 词文本 + 右侧 weight 数字
- 0011 迁移: 删 search_title_suggestions 表 + 3 索引 + trigger + 函数
  (trigger 在每篇文章 INSERT/UPDATE 都会跑,删了能省掉无用性能损耗)
- 删除: app/models/search_title_suggestion.py + backfill_search_suggestions.py
  替换成: app/scripts/refresh_search_keywords.py(只跑一次词频刷新)
2026-06-15 19:37:40 +08:00

95 lines
3.1 KiB
Python

"""搜索建议服务:纯 keyword 续接词(高频词)。
- search_keywords(prefix_keys @> ARRAY[''], ORDER BY weight DESC)
- fallback: 表空时回退实时 ts_stat(冷启动 / worker 没刷新过)
"""
from __future__ import annotations
import logging
from sqlalchemy import desc, select
from sqlalchemy.ext.asyncio import AsyncSession
from app.models.search_keyword import SearchKeyword
logger = logging.getLogger("news.search")
class SearchService:
"""搜索建议 service — 仅返回 keyword 续接词。
设计:输入 prefix,返回 { query, keywords[] }。
- keywords 是 ts_stat 聚合后的高频词(从 articles.title_zh + body_zh_text + commentary 算)
- 用 GIN 数组索引 prefix_keys @> ARRAY['前缀'],亚毫秒
- 表空时回退到实时 ts_stat 聚合(慢但能用)
"""
def __init__(self, session: AsyncSession):
self.session = session
async def suggestions(
self,
q: str,
limit: int = 10,
) -> dict[str, list[dict]]:
"""返回搜索建议(仅 keywords)。
Args:
q: 前缀(1-20 字符)
limit: 最多返回多少(默认 10,最大 20)
Returns:
{"query": q, "keywords": [{"word", "weight", "source"}, ...]}
"""
q = q.strip()
if not q:
return {"query": q, "keywords": []}
# 1) 查 search_keywords(GIN 数组包含,亚毫秒)
kw_rows = await self.session.execute(
select(SearchKeyword.keyword, SearchKeyword.weight, SearchKeyword.source)
.where(SearchKeyword.prefix_keys.contains([q]))
.order_by(desc(SearchKeyword.weight))
.limit(limit)
)
keywords = [
{"word": row.keyword, "weight": row.weight, "source": row.source}
for row in kw_rows.all()
]
# 2) 冷启动 fallback:表空时回退到实时 ts_stat 聚合
if not keywords:
keywords = await self._fallback_keywords(q, limit)
return {"query": q, "keywords": keywords}
async def _fallback_keywords(self, q: str, limit: int) -> list[dict]:
"""回退:ts_stat 实时聚合(慢但能用)。
- 从 articles.title_zh + body_zh_text 实时 to_tsvector(chinese_zh)
- 适用:search_keywords 表空 + worker 没刷新过
- ts_stat(text) 单参 — 第二参 weights mask 不能传 'a'(zhparser 不标 A 权重会 0 行)
"""
from sqlalchemy import text
sql = text(
"""
SELECT word, nentry::int AS weight
FROM ts_stat(
$$SELECT to_tsvector('chinese_zh',
coalesce(title_zh, '') || ' ' || coalesce(body_zh_text, '')
)
FROM articles
WHERE title_zh IS NOT NULL OR body_zh_text IS NOT NULL
LIMIT 500$$
) AS s
WHERE word LIKE :prefix
ORDER BY nentry DESC
LIMIT :lim
"""
)
rows = (
await self.session.execute(sql, {"prefix": f"{q}%", "lim": limit})
).all()
return [{"word": r.word, "weight": r.weight, "source": "ts_stat_live"} for r in rows]