refactor(search): 只展示 keyword 续接词,去掉 titles 段

产品决定:搜索建议只展示 ts_stat 高频词续接(如'美'→美国/美军/美国政府),
不要真实文章 id 提示(用户认为这种'文章#566871'是噪音,没连续性)。

改动:
- SearchSuggestionsResponse 去掉 titles 字段,只剩 query + keywords
- SearchService 只查 search_keywords,fallback 路径也只针对 keywords
- Feed.vue: 删掉 suggestTitles 状态 + SuggestTitleOption 类型联合,
  renderSuggestion 简化成 '词' 标签 + 词文本 + 右侧 weight 数字
- 0011 迁移: 删 search_title_suggestions 表 + 3 索引 + trigger + 函数
  (该 trigger 在每篇文章 INSERT/UPDATE 时都会执行,删掉后可省去这部分无谓的性能开销)
- 删除: app/models/search_title_suggestion.py + backfill_search_suggestions.py
  替换成: app/scripts/refresh_search_keywords.py(只跑一次词频刷新)
This commit is contained in:
mavis
2026-06-15 19:37:40 +08:00
parent db4fd8699b
commit 85c05c19a7
10 changed files with 277 additions and 366 deletions

View File

@@ -1,31 +1,27 @@
"""搜索建议服务:混合 A(高频词)+ B(真实标题) + 冷启动 fallback
"""搜索建议服务:纯 keyword 续接词(高频词)
- A: search_keywords(prefix_keys @> ARRAY[''], ORDER BY weight DESC)
- B: search_title_suggestions(prefix_keys @> ARRAY[''], ORDER BY published_at DESC)
- fallback: 任一表空时回退实时 ILIKE 查 articles(冷启动 / worker 没刷新过)
- search_keywords(prefix_keys @> ARRAY[''], ORDER BY weight DESC)
- fallback: 表空时回退实时 ts_stat(冷启动 / worker 没刷新过)
"""
from __future__ import annotations
import logging
from sqlalchemy import desc, select
from sqlalchemy.dialects.postgresql import ARRAY
from sqlalchemy.ext.asyncio import AsyncSession
from app.models.article import Article
from app.models.search_keyword import SearchKeyword
from app.models.search_title_suggestion import SearchTitleSuggestion
logger = logging.getLogger("news.search")
class SearchService:
"""搜索建议 service。
"""搜索建议 service — 仅返回 keyword 续接词
设计:输入 prefix,返回 { titles, keywords } 两组候选
- titles 真实文章标题(按 published_at DESC 排)
- keywords 高频词(按 weight DESC 排)
- 任一为空时回退实时 articles.title_zh ILIKE 查询(冷启动兜底)
设计:输入 prefix,返回 { query, keywords[] }。
- keywords 是 ts_stat 聚合后的高频词(从 articles.title_zh + body_zh_text + commentary 算)
- 用 GIN 数组索引 prefix_keys @> ARRAY['前缀'],亚毫秒
- 空时回退实时 ts_stat 聚合(慢但能用)
"""
def __init__(self, session: AsyncSession):
@@ -36,42 +32,20 @@ class SearchService:
q: str,
limit: int = 10,
) -> dict[str, list[dict]]:
"""返回搜索建议。
"""返回搜索建议(仅 keywords)
Args:
q: 前缀(1-20 字符)
limit: 每组最多返回多少(默认 10,最大 20)
limit: 最多返回多少(默认 10,最大 20)
Returns:
{"query": q, "titles": [...], "keywords": [...]}
titles 元素:{"id": article_id, "published_at": ...}
keywords 元素:{"word": ..., "weight": ...}
{"query": q, "keywords": [{"word", "weight", "source"}, ...]}
"""
q = q.strip()
if not q:
return {"query": q, "titles": [], "keywords": []}
return {"query": q, "keywords": []}
# 1) 查 search_title_suggestions(B 方案)
title_rows = await self.session.execute(
select(
SearchTitleSuggestion.article_id,
SearchTitleSuggestion.published_at,
SearchTitleSuggestion.title_lang,
)
.where(SearchTitleSuggestion.prefix_keys.contains([q]))
.order_by(desc(SearchTitleSuggestion.published_at))
.limit(limit)
)
titles = [
{
"id": row.article_id,
"published_at": row.published_at.isoformat() if row.published_at else None,
"lang": row.title_lang,
}
for row in title_rows.all()
]
# 2) 查 search_keywords(A 方案)
# 1) 查 search_keywords(GIN 数组包含,亚毫秒)
kw_rows = await self.session.execute(
select(SearchKeyword.keyword, SearchKeyword.weight, SearchKeyword.source)
.where(SearchKeyword.prefix_keys.contains([q]))
@@ -83,49 +57,11 @@ class SearchService:
for row in kw_rows.all()
]
# 3) 冷启动 fallback:任一为空时,回退到实时 ILIKE articles
# (如果两张固化表都跑空了,说明刚建库或数据被 truncate)
if not titles:
titles = await self._fallback_titles(q, limit)
# 2) 冷启动 fallback:空时回退到实时 ts_stat 聚合
if not keywords:
keywords = await self._fallback_keywords(q, limit)
return {"query": q, "titles": titles, "keywords": keywords}
async def _fallback_titles(self, q: str, limit: int) -> list[dict]:
    """Fallback: query ``articles`` live for titles that start with ``q``.

    Slow compared with the precomputed suggestion table, but usable when
    that table returned nothing (cold start).

    - A row matches when ``title_zh`` OR ``title`` starts with ``q``
      (case-insensitive); there is no priority between the two columns.
    - Only articles published within the last 7 days and not marked as a
      duplicate (``duplicate_of IS NULL``) are considered.
    - ``q`` is user input, so the LIKE metacharacters ``%``, ``_`` and the
      escape character itself are escaped and therefore matched literally.

    Args:
        q: Prefix to match.
        limit: Maximum number of rows to return.

    Returns:
        A list of ``{"id", "published_at", "lang"}`` dicts ordered by
        ``published_at`` descending. ``published_at`` is an ISO-8601 string
        or ``None``. ``lang`` is ``"zh"`` whenever the row has a non-empty
        ``title_zh`` (regardless of which column matched), else ``"src"``.
    """
    from datetime import datetime, timedelta, timezone

    from sqlalchemy import or_

    # Naive UTC cutoff; presumably published_at is stored as naive UTC —
    # TODO confirm against the Article model.
    since = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(days=7)

    # Escape the escape character first, then the LIKE wildcards, so that
    # input such as "100%" or "a_b" is treated as literal text.
    escaped = q.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    like = f"{escaped}%"

    stmt = (
        select(Article.id, Article.published_at, Article.title_zh, Article.title)
        .where(
            Article.published_at >= since,
            Article.duplicate_of.is_(None),
            or_(
                Article.title_zh.ilike(like, escape="\\"),
                Article.title.ilike(like, escape="\\"),
            ),
        )
        .order_by(desc(Article.published_at))
        .limit(limit)
    )
    rows = (await self.session.execute(stmt)).all()
    return [
        {
            "id": row.id,
            "published_at": row.published_at.isoformat() if row.published_at else None,
            "lang": "zh" if row.title_zh else "src",
        }
        for row in rows
    ]
return {"query": q, "keywords": keywords}
async def _fallback_keywords(self, q: str, limit: int) -> list[dict]:
"""回退:ts_stat 实时聚合(慢但能用)。