产品决定:搜索建议只展示 ts_stat 高频词续接(如'美'→美国/美军/美国政府), 不要真实文章 id 提示(用户认为这种'文章#566871'是噪音,没连续性)。 改动: - SearchSuggestionsResponse 去 title,只剩 query + keywords - SearchService 只查 search_keywords,fallback 路径也只针对 keywords - Feed.vue: 删掉 suggestTitles 状态 + SuggestTitleOption 类型联合, renderSuggestion 简化成 '词' 标签 + 词文本 + 右侧 weight 数字 - 0011 迁移: 删 search_title_suggestions 表 + 3 索引 + trigger + 函数 (trigger 在每篇文章 INSERT/UPDATE 都会跑,删了能省掉无用性能损耗) - 删除: app/models/search_title_suggestion.py + backfill_search_suggestions.py 替换成: app/scripts/refresh_search_keywords.py(只跑一次词频刷新)
95 lines
3.1 KiB
Python
95 lines
3.1 KiB
Python
"""搜索建议服务:纯 keyword 续接词(高频词)。
|
|
|
|
- search_keywords(prefix_keys @> ARRAY['美'], ORDER BY weight DESC)
|
|
- fallback: 表空时回退实时 ts_stat(冷启动 / worker 没刷新过)
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
|
|
from sqlalchemy import desc, select
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from app.models.search_keyword import SearchKeyword
|
|
|
|
logger = logging.getLogger("news.search")
|
|
|
|
|
|
class SearchService:
|
|
"""搜索建议 service — 仅返回 keyword 续接词。
|
|
|
|
设计:输入 prefix,返回 { query, keywords[] }。
|
|
- keywords 是 ts_stat 聚合后的高频词(从 articles.title_zh + body_zh_text + commentary 算)
|
|
- 用 GIN 数组索引 prefix_keys @> ARRAY['前缀'],亚毫秒
|
|
- 表空时回退到实时 ts_stat 聚合(慢但能用)
|
|
"""
|
|
|
|
def __init__(self, session: AsyncSession):
|
|
self.session = session
|
|
|
|
async def suggestions(
|
|
self,
|
|
q: str,
|
|
limit: int = 10,
|
|
) -> dict[str, list[dict]]:
|
|
"""返回搜索建议(仅 keywords)。
|
|
|
|
Args:
|
|
q: 前缀(1-20 字符)
|
|
limit: 最多返回多少(默认 10,最大 20)
|
|
|
|
Returns:
|
|
{"query": q, "keywords": [{"word", "weight", "source"}, ...]}
|
|
"""
|
|
q = q.strip()
|
|
if not q:
|
|
return {"query": q, "keywords": []}
|
|
|
|
# 1) 查 search_keywords(GIN 数组包含,亚毫秒)
|
|
kw_rows = await self.session.execute(
|
|
select(SearchKeyword.keyword, SearchKeyword.weight, SearchKeyword.source)
|
|
.where(SearchKeyword.prefix_keys.contains([q]))
|
|
.order_by(desc(SearchKeyword.weight))
|
|
.limit(limit)
|
|
)
|
|
keywords = [
|
|
{"word": row.keyword, "weight": row.weight, "source": row.source}
|
|
for row in kw_rows.all()
|
|
]
|
|
|
|
# 2) 冷启动 fallback:表空时回退到实时 ts_stat 聚合
|
|
if not keywords:
|
|
keywords = await self._fallback_keywords(q, limit)
|
|
|
|
return {"query": q, "keywords": keywords}
|
|
|
|
async def _fallback_keywords(self, q: str, limit: int) -> list[dict]:
|
|
"""回退:ts_stat 实时聚合(慢但能用)。
|
|
|
|
- 从 articles.title_zh + body_zh_text 实时 to_tsvector(chinese_zh)
|
|
- 适用:search_keywords 表空 + worker 没刷新过
|
|
- ts_stat(text) 单参 — 第二参 weights mask 不能传 'a'(zhparser 不标 A 权重会 0 行)
|
|
"""
|
|
from sqlalchemy import text
|
|
|
|
sql = text(
|
|
"""
|
|
SELECT word, nentry::int AS weight
|
|
FROM ts_stat(
|
|
$$SELECT to_tsvector('chinese_zh',
|
|
coalesce(title_zh, '') || ' ' || coalesce(body_zh_text, '')
|
|
)
|
|
FROM articles
|
|
WHERE title_zh IS NOT NULL OR body_zh_text IS NOT NULL
|
|
LIMIT 500$$
|
|
) AS s
|
|
WHERE word LIKE :prefix
|
|
ORDER BY nentry DESC
|
|
LIMIT :lim
|
|
"""
|
|
)
|
|
rows = (
|
|
await self.session.execute(sql, {"prefix": f"{q}%", "lim": limit})
|
|
).all()
|
|
return [{"word": r.word, "weight": r.weight, "source": "ts_stat_live"} for r in rows]
|