feat(search): 智能搜索建议 - 固化候选词表 (search_keywords + search_title_suggestions)

后端:
- alembic 0009: 两张固化表 + GIN prefix_keys 索引 + articles trigger
- /api/v1/search/suggestions: 混合 A(高频词 ts_stat) + B(真实标题) + 冷启动 fallback
- worker 每日 03:00 + 启动时刷新 search_keywords
- 顺便填 commit 11 TODO: articles.title_zh_tsv + GIN 索引(未来 FTS 基础)

前端:
- NInput -> NAutoComplete + debounce 250ms
- 选标题 -> 跳详情;选关键词 -> 填入 + 触发搜索
- AbortController 防 race condition

性能: prefix_keys @> ARRAY[prefix] 走 GIN 亚毫秒,100w 行也稳
2026-06-15 18:26:35 +08:00
parent b674fb4b22
commit c3aa0f0cb6
13 changed files with 1028 additions and 7 deletions
--- a/backend/app/services/search.py
+++ b/backend/app/services/search.py
@@ -0,0 +1,160 @@
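+"""Search-suggestion service: hybrid of A (high-frequency words) + B (real titles) + a cold-start fallback.
+
+- A: search_keywords (prefix_keys @> ARRAY['美'], ORDER BY weight DESC)
+- B: search_title_suggestions (prefix_keys @> ARRAY['美'], ORDER BY published_at DESC)
+- fallback: when a lookup returns no rows (cold start / worker has not refreshed yet),
+  that group is computed live from articles (ILIKE on titles, ts_stat for keywords)
+"""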
+from __future__ import annotations
+
+import logging
+
+from sqlalchemy import desc, select
+from sqlalchemy.dialects.postgresql import ARRAY
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.models.article import Article
+from app.models.search_keyword import SearchKeyword
+from app.models.search_title_suggestion import SearchTitleSuggestion
+
+logger = logging.getLogger("news.search")
+
+
+class SearchService:
+    """Search-suggestion service backed by two materialized candidate tables.
+
+    Given a prefix, ``suggestions`` returns two candidate groups:
+
+    - ``titles``: real article titles, newest ``published_at`` first
+      (read from ``SearchTitleSuggestion``).
+    - ``keywords``: high-frequency words, highest ``weight`` first
+      (read from ``SearchKeyword``).
+
+    When the lookup for a group returns no rows (e.g. cold start, before the
+    tables have been populated), that group is recomputed live from
+    ``articles`` instead.
+    """
+
+    # Hard cap on the per-group result count, as documented on suggestions().
+    _MAX_LIMIT = 20
+
+    # Escape character for LIKE/ILIKE patterns built from user input.
+    # Must stay in sync with the literal ``ESCAPE '/'`` in _fallback_keywords.
+    _LIKE_ESCAPE = "/"
+
+    def __init__(self, session: AsyncSession):
+        """Bind the service to the async DB session used for every query."""
+        self.session = session
+
+    @classmethod
+    def _prefix_pattern(cls, q: str) -> str:
+        """Build a LIKE pattern matching strings that literally start with *q*.
+
+        ``%``, ``_`` and the escape character itself are escaped so that user
+        input can never act as a wildcard. The pattern must be used together
+        with an ``ESCAPE '/'`` clause.
+        """
+        esc = cls._LIKE_ESCAPE
+        literal = (
+            q.replace(esc, esc + esc)  # escape the escape char first
+            .replace("%", esc + "%")
+            .replace("_", esc + "_")
+        )
+        return f"{literal}%"
+
+    async def suggestions(
+        self,
+        q: str,
+        limit: int = 10,
+    ) -> dict[str, list[dict]]:
+        """Return search suggestions for the prefix *q*.
+
+        Args:
+            q: Prefix typed by the user; surrounding whitespace is stripped.
+            limit: Maximum number of entries per group (default 10). Values
+                above 20 are capped at 20; values <= 0 yield empty groups.
+
+        Returns:
+            ``{"query": q, "titles": [...], "keywords": [...]}`` where
+            each ``titles`` element is
+            ``{"id": article_id, "published_at": iso-string-or-None, "lang": ...}``
+            and each ``keywords`` element is
+            ``{"word": ..., "weight": ..., "source": ...}``.
+        """
+        q = q.strip()
+        if not q or limit <= 0:
+            # Nothing to look up; avoid sending a zero/negative LIMIT to the DB.
+            return {"query": q, "titles": [], "keywords": []}
+        limit = min(limit, self._MAX_LIMIT)
+
+        # 1) Materialized titles (plan B): array containment on prefix_keys.
+        title_rows = await self.session.execute(
+            select(
+                SearchTitleSuggestion.article_id,
+                SearchTitleSuggestion.published_at,
+                SearchTitleSuggestion.title_lang,
+            )
+            .where(SearchTitleSuggestion.prefix_keys.contains([q]))
+            .order_by(desc(SearchTitleSuggestion.published_at))
+            .limit(limit)
+        )
+        titles = [
+            {
+                "id": row.article_id,
+                "published_at": row.published_at.isoformat() if row.published_at else None,
+                "lang": row.title_lang,
+            }
+            for row in title_rows.all()
+        ]
+
+        # 2) Materialized keywords (plan A): same containment test, by weight.
+        kw_rows = await self.session.execute(
+            select(SearchKeyword.keyword, SearchKeyword.weight, SearchKeyword.source)
+            .where(SearchKeyword.prefix_keys.contains([q]))
+            .order_by(desc(SearchKeyword.weight))
+            .limit(limit)
+        )
+        keywords = [
+            {"word": row.keyword, "weight": row.weight, "source": row.source}
+            for row in kw_rows.all()
+        ]
+
+        # 3) Fallback: a group that came back empty is recomputed live.
+        # NOTE(review): this fires on *any* miss, not only when the tables are
+        # empty, so every non-matching prefix pays for the live queries
+        # (including the full ts_stat aggregation). Confirm this is intended.
+        if not titles:
+            titles = await self._fallback_titles(q, limit)
+        if not keywords:
+            keywords = await self._fallback_keywords(q, limit)
+
+        return {"query": q, "titles": titles, "keywords": keywords}
+
+    async def _fallback_titles(self, q: str, limit: int) -> list[dict]:
+        """Fallback: case-insensitive prefix match against live ``articles``.
+
+        Matches either ``title_zh`` or ``title``, skips rows marked as
+        duplicates (``duplicate_of`` set), and only considers articles
+        published within the last 7 days. Slower than the materialized
+        lookup, but works before the suggestion table is populated.
+
+        Args:
+            q: Already-stripped, non-empty prefix.
+            limit: Maximum number of rows to return.
+
+        Returns:
+            A list of ``{"id", "published_at", "lang"}`` dicts, newest first;
+            ``lang`` is ``"zh"`` when the row has a ``title_zh``, else ``"src"``.
+        """
+        from datetime import datetime, timedelta, timezone
+
+        from sqlalchemy import or_
+
+        # Naive UTC timestamp; presumably articles.published_at is stored as
+        # naive UTC -- TODO confirm against the model.
+        since = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(days=7)
+        # Escaped so that '%' / '_' typed by the user match literally.
+        pattern = self._prefix_pattern(q)
+        stmt = (
+            select(Article.id, Article.published_at, Article.title_zh, Article.title)
+            .where(
+                Article.published_at >= since,
+                Article.duplicate_of.is_(None),
+                or_(
+                    Article.title_zh.ilike(pattern, escape=self._LIKE_ESCAPE),
+                    Article.title.ilike(pattern, escape=self._LIKE_ESCAPE),
+                ),
+            )
+            .order_by(desc(Article.published_at))
+            .limit(limit)
+        )
+        rows = (await self.session.execute(stmt)).all()
+        return [
+            {
+                "id": row.id,
+                "published_at": row.published_at.isoformat() if row.published_at else None,
+                "lang": "zh" if row.title_zh else "src",
+            }
+            for row in rows
+        ]
+
+    async def _fallback_keywords(self, q: str, limit: int) -> list[dict]:
+        """Fallback: aggregate word frequencies live with ``ts_stat``.
+
+        Builds a ``simple``-config tsvector from ``title_zh`` + ``body_zh_text``
+        of every article that has either, and returns the words starting with
+        *q*, most frequent first. This scans the whole table, so it is slow.
+
+        Args:
+            q: Already-stripped, non-empty prefix.
+            limit: Maximum number of rows to return.
+
+        Returns:
+            A list of ``{"word", "weight", "source"}`` dicts where ``weight``
+            is the word's ``nentry`` count and ``source`` is ``"ts_stat_live"``.
+        """
+        from sqlalchemy import text
+
+        # ts_stat() takes the document query as a *text* argument, so the
+        # inner SELECT is passed as a dollar-quoted string rather than as a
+        # subquery expression.
+        sql = text(
+            """
+            SELECT word, nentry::int AS weight
+            FROM ts_stat(
+                $stat$
+                SELECT to_tsvector(
+                    'simple',
+                    coalesce(title_zh, '') || ' ' || coalesce(body_zh_text, '')
+                )
+                FROM articles
+                WHERE title_zh IS NOT NULL OR body_zh_text IS NOT NULL
+                $stat$
+            )
+            WHERE word LIKE :prefix ESCAPE '/'
+            ORDER BY nentry DESC
+            LIMIT :lim
+            """
+        )
+        # The 'simple' configuration lower-cases tokens, so the prefix must be
+        # lower-cased too or upper-case input could never match.
+        params = {"prefix": self._prefix_pattern(q.lower()), "lim": limit}
+        rows = (await self.session.execute(sql, params)).all()
+        return [{"word": r.word, "weight": r.weight, "source": "ts_stat_live"} for r in rows]