refactor(search): 只展示 keyword 续接词,去掉 titles 段
产品决定:搜索建议只展示 ts_stat 高频词续接(如 '美' → 美国 / 美军 / 美国政府),不要真实文章 id 提示(用户认为 '文章#566871' 这种提示是噪音,没连续性)。

改动:
- SearchSuggestionsResponse 去掉 titles,只剩 query + keywords
- SearchService 只查 search_keywords,fallback 路径也只针对 keywords
- Feed.vue:删掉 suggestTitles 状态 + SuggestTitleOption 类型联合,renderSuggestion 简化成 '词' 标签 + 词文本 + 右侧 weight 数字
- 0011 迁移:删 search_title_suggestions 表 + 3 索引 + trigger + 函数(trigger 在每篇文章 INSERT/UPDATE 都会跑,删了能省掉无用性能损耗)
- 删除:app/models/search_title_suggestion.py + backfill_search_suggestions.py
  替换成:app/scripts/refresh_search_keywords.py(只跑一次词频刷新)
This commit is contained in:
@@ -1,31 +1,27 @@
|
||||
"""搜索建议服务:混合 A(高频词)+ B(真实标题) + 冷启动 fallback。
|
||||
"""搜索建议服务:纯 keyword 续接词(高频词)。
|
||||
|
||||
- A: search_keywords(prefix_keys @> ARRAY['美'], ORDER BY weight DESC)
|
||||
- B: search_title_suggestions(prefix_keys @> ARRAY['美'], ORDER BY published_at DESC)
|
||||
- fallback: 任一表空时回退实时 ILIKE 查 articles(冷启动 / worker 没刷新过)
|
||||
- search_keywords(prefix_keys @> ARRAY['美'], ORDER BY weight DESC)
|
||||
- fallback: 表空时回退实时 ts_stat(冷启动 / worker 没刷新过)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from sqlalchemy import desc, select
|
||||
from sqlalchemy.dialects.postgresql import ARRAY
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.models.article import Article
|
||||
from app.models.search_keyword import SearchKeyword
|
||||
from app.models.search_title_suggestion import SearchTitleSuggestion
|
||||
|
||||
logger = logging.getLogger("news.search")
|
||||
|
||||
|
||||
class SearchService:
|
||||
"""搜索建议 service。
|
||||
"""搜索建议 service — 仅返回 keyword 续接词。
|
||||
|
||||
设计:输入 prefix,返回 { titles, keywords } 两组候选。
|
||||
- titles 真实文章标题(按 published_at DESC 排)
|
||||
- keywords 高频词(按 weight DESC 排)
|
||||
- 任一为空时回退实时 articles.title_zh ILIKE 查询(冷启动兜底)
|
||||
设计:输入 prefix,返回 { query, keywords[] }。
|
||||
- keywords 是 ts_stat 聚合后的高频词(从 articles.title_zh + body_zh_text + commentary 算)
|
||||
- 用 GIN 数组索引 prefix_keys @> ARRAY['前缀'],亚毫秒
|
||||
- 表空时回退到实时 ts_stat 聚合(慢但能用)
|
||||
"""
|
||||
|
||||
def __init__(self, session: AsyncSession):
|
||||
@@ -36,42 +32,20 @@ class SearchService:
|
||||
q: str,
|
||||
limit: int = 10,
|
||||
) -> dict[str, list[dict]]:
|
||||
"""返回搜索建议。
|
||||
"""返回搜索建议(仅 keywords)。
|
||||
|
||||
Args:
|
||||
q: 前缀(1-20 字符)
|
||||
limit: 每组最多返回多少(默认 10,最大 20)
|
||||
limit: 最多返回多少(默认 10,最大 20)
|
||||
|
||||
Returns:
|
||||
{"query": q, "titles": [...], "keywords": [...]}
|
||||
titles 元素:{"id": article_id, "published_at": ...}
|
||||
keywords 元素:{"word": ..., "weight": ...}
|
||||
{"query": q, "keywords": [{"word": ..., "weight": ..., "source": ...}, ...]}
|
||||
"""
|
||||
q = q.strip()
|
||||
if not q:
|
||||
return {"query": q, "titles": [], "keywords": []}
|
||||
return {"query": q, "keywords": []}
|
||||
|
||||
# 1) 查 search_title_suggestions(B 方案)
|
||||
title_rows = await self.session.execute(
|
||||
select(
|
||||
SearchTitleSuggestion.article_id,
|
||||
SearchTitleSuggestion.published_at,
|
||||
SearchTitleSuggestion.title_lang,
|
||||
)
|
||||
.where(SearchTitleSuggestion.prefix_keys.contains([q]))
|
||||
.order_by(desc(SearchTitleSuggestion.published_at))
|
||||
.limit(limit)
|
||||
)
|
||||
titles = [
|
||||
{
|
||||
"id": row.article_id,
|
||||
"published_at": row.published_at.isoformat() if row.published_at else None,
|
||||
"lang": row.title_lang,
|
||||
}
|
||||
for row in title_rows.all()
|
||||
]
|
||||
|
||||
# 2) 查 search_keywords(A 方案)
|
||||
# 1) 查 search_keywords(GIN 数组包含,亚毫秒)
|
||||
kw_rows = await self.session.execute(
|
||||
select(SearchKeyword.keyword, SearchKeyword.weight, SearchKeyword.source)
|
||||
.where(SearchKeyword.prefix_keys.contains([q]))
|
||||
@@ -83,49 +57,11 @@ class SearchService:
|
||||
for row in kw_rows.all()
|
||||
]
|
||||
|
||||
# 3) 冷启动 fallback:任一为空时,回退到实时 ILIKE articles
|
||||
# (如果两张固化表都跑空了,说明刚建库或数据被 truncate)
|
||||
if not titles:
|
||||
titles = await self._fallback_titles(q, limit)
|
||||
# 2) 冷启动 fallback:表空时回退到实时 ts_stat 聚合
|
||||
if not keywords:
|
||||
keywords = await self._fallback_keywords(q, limit)
|
||||
|
||||
return {"query": q, "titles": titles, "keywords": keywords}
|
||||
|
||||
async def _fallback_titles(self, q: str, limit: int) -> list[dict]:
|
||||
"""回退:实时查 articles.title_zh / title(走 B-tree 索引,慢但能用)。
|
||||
|
||||
- 优先 title_zh LIKE(翻译后),没有再 LIKE title(短新闻)
|
||||
- 限制 7 天内的文章,避免返回太老的(冷启动场景下用户预期)
|
||||
"""
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
from sqlalchemy import or_
|
||||
|
||||
since = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(days=7)
|
||||
like = f"{q}%"
|
||||
stmt = (
|
||||
select(Article.id, Article.published_at, Article.title_zh, Article.title)
|
||||
.where(
|
||||
Article.published_at >= since,
|
||||
Article.duplicate_of.is_(None),
|
||||
or_(
|
||||
Article.title_zh.ilike(like),
|
||||
Article.title.ilike(like),
|
||||
),
|
||||
)
|
||||
.order_by(desc(Article.published_at))
|
||||
.limit(limit)
|
||||
)
|
||||
rows = (await self.session.execute(stmt)).all()
|
||||
return [
|
||||
{
|
||||
"id": row.id,
|
||||
"published_at": row.published_at.isoformat() if row.published_at else None,
|
||||
"lang": "zh" if row.title_zh else "src",
|
||||
}
|
||||
for row in rows
|
||||
]
|
||||
return {"query": q, "keywords": keywords}
|
||||
|
||||
async def _fallback_keywords(self, q: str, limit: int) -> list[dict]:
|
||||
"""回退:ts_stat 实时聚合(慢但能用)。
|
||||
|
||||
Reference in New Issue
Block a user