refactor(search): 只展示 keyword 续接词,去掉 titles 段

产品决定:搜索建议只展示 ts_stat 高频词续接(如 '美' → 美国/美军/美国政府),不再展示真实文章 id 提示(用户认为 '文章#566871' 这类提示是噪音,没有连续性)。

改动:
- SearchSuggestionsResponse 去掉 titles 字段,只剩 query + keywords
- SearchService 只查 search_keywords,fallback 路径也只针对 keywords
- Feed.vue:删掉 suggestTitles 状态 + SuggestTitleOption 类型联合,renderSuggestion 简化成 '词' 标签 + 词文本 + 右侧 weight 数字
- 0011 迁移:删 search_title_suggestions 表 + 3 个索引 + trigger + 函数(trigger 在每篇文章 INSERT/UPDATE 时都会跑,删掉能省去无用的性能损耗)
- 删除:app/models/search_title_suggestion.py + backfill_search_suggestions.py
- 替换成:app/scripts/refresh_search_keywords.py(只跑一次词频刷新)
2026-06-15 19:37:40 +08:00
parent db4fd8699b
commit 85c05c19a7
10 changed files with 277 additions and 366 deletions
--- a/backend/app/api/search.py
+++ b/backend/app/api/search.py
@@ -1,13 +1,12 @@
-"""/api/v1/search/* — 搜索建议(autocomplete)。
+"""/api/v1/search/* — 搜索建议(autocomplete,纯 keyword 续接词)。

 - GET /api/v1/search/suggestions?q=prefix
-  返回:{"query", "titles": [...], "keywords": [...]}
-  - titles: 真实文章标题(按 published_at DESC),B 方案
-  - keywords: 高频词(按 weight DESC),A 方案
-  - 冷启动:任一表空时自动 fallback 到实时 ILIKE / ts_stat
+  返回:{"query", "keywords": [...]}
+  - keywords: 词频续接词(按 weight DESC),输入"美国"→ ["美国", "美国政府", "美国签证", ...]
+  - 冷启动:search_keywords 表空时自动 fallback 到实时 ts_stat
 - 鉴权:跟 articles 一致(需要登录)

-性能:两个查询都走 GIN 数组索引(prefix_keys @> ARRAY['美']),亚毫秒。
+性能:prefix_keys @> ARRAY['美'] 走 GIN 数组索引,亚毫秒。
 """
 from __future__ import annotations

@@ -17,11 +16,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from app.core.deps import get_current_user
 from app.database import get_session
 from app.models.user import User
-from app.schemas.search import (
-    SearchKeywordItem,
-    SearchSuggestionsResponse,
-    SearchTitleSuggestionItem,
-)
+from app.schemas.search import SearchKeywordItem, SearchSuggestionsResponse
 from app.services.search import SearchService

 router = APIRouter(prefix="/search", tags=["search"])
@@ -30,18 +25,18 @@ router = APIRouter(prefix="/search", tags=["search"])
@router.get("/suggestions", response_model=SearchSuggestionsResponse)
 async def get_suggestions(
    q: str = Query(..., min_length=1, max_length=20, description="搜索前缀"),
-    limit: int = Query(10, ge=1, le=20, description="每组最多返回多少"),
+    limit: int = Query(10, ge=1, le=20, description="最多返回多少"),
    _user: User = Depends(get_current_user),  # 需要登录,跟 articles 一致
    session: AsyncSession = Depends(get_session),
 ):
-    """搜索建议:输入 prefix,返回真实标题 + 高频词两组候选。
+    """搜索建议:输入 prefix,返回高频词续接。

-    用法:前端搜索框 onChange 时调用,debounce 200ms。
+    用法:前端搜索框 onChange 时调用,debounce 250ms。
+    选词 → 自动填入 q + 触发搜索。
    """
    svc = SearchService(session)
    raw = await svc.suggestions(q=q, limit=limit)
    return SearchSuggestionsResponse(
        query=raw["query"],
-        titles=[SearchTitleSuggestionItem(**t) for t in raw["titles"]],
        keywords=[SearchKeywordItem(**k) for k in raw["keywords"]],
    )
--- a/backend/app/models/__init__.py
+++ b/backend/app/models/__init__.py
@@ -8,7 +8,6 @@ from app.models.article_read import ArticleRead  # noqa: F401
 from app.models.bookmark import Bookmark  # noqa: F401
 from app.models.llm_setting import LlmSetting  # noqa: F401
 from app.models.search_keyword import SearchKeyword  # noqa: F401
-from app.models.search_title_suggestion import SearchTitleSuggestion  # noqa: F401
 from app.models.source import Source, SourceKind  # noqa: F401
 from app.models.subscription import Subscription  # noqa: F401
 from app.models.user import User, UserRole  # noqa: F401
@@ -20,7 +19,6 @@ __all__ = [
    "Bookmark",
    "LlmSetting",
    "SearchKeyword",
-    "SearchTitleSuggestion",
    "Source",
    "SourceKind",
    "Subscription",
--- a/backend/app/models/search_title_suggestion.py
+++ b/backend/app/models/search_title_suggestion.py
@@ -1,43 +0,0 @@
-"""搜索建议 - 真实文章标题片段表(articles 写入 trigger 自动维护)。
-
- 数据源:articles.title_zh(优先)/ articles.title(短新闻回退)
- 用途:/api/v1/search/suggestions 返回"真实文章标题"建议(B 方案)
- 维护:PG trigger(articles INSERT/UPDATE OF title_zh/title/published_at 触发)
- 查询:prefix_keys @> ARRAY['美'] 走 GIN 索引,按 published_at DESC 排序
-"""
-from __future__ import annotations
-
-from datetime import datetime
-
-from sqlalchemy import BigInteger, DateTime, ForeignKey, String, func
-from sqlalchemy.dialects.postgresql import ARRAY, TEXT
-from sqlalchemy.orm import Mapped, mapped_column
-
-from app.database import Base
-
-
-class SearchTitleSuggestion(Base):
-    __tablename__ = "search_title_suggestions"
-
-    id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
-
-    article_id: Mapped[int] = mapped_column(
-        BigInteger,
-        ForeignKey("articles.id", ondelete="CASCADE"),
-        nullable=False,
-    )
-
-    # 该条用的是哪边的文本:'zh' (title_zh) / 'src' (title 短新闻回退)
-    title_lang: Mapped[str] = mapped_column(String(8), nullable=False, default="zh")
-
-    # 预计算前缀数组(从第 1 字到全词)
-    prefix_keys: Mapped[list[str]] = mapped_column(ARRAY(TEXT), nullable=False)
-
-    published_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
-
-    created_at: Mapped[datetime] = mapped_column(
-        DateTime(timezone=True), server_default=func.now(), nullable=False
-    )
-
-    def __repr__(self) -> str:
-        return f"<SearchTitleSuggestion article_id={self.article_id} lang={self.title_lang}>"
--- a/backend/app/schemas/search.py
+++ b/backend/app/schemas/search.py
@@ -1,24 +1,15 @@
-"""搜索建议 schema。"""
+"""搜索建议 schema(纯 keyword 续接词)。"""
 from __future__ import annotations

-from datetime import datetime
-
 from pydantic import BaseModel


-class SearchTitleSuggestionItem(BaseModel):
-    id: int  # article_id
-    published_at: datetime | None = None
-    lang: str  # 'zh' / 'src'
-
-
 class SearchKeywordItem(BaseModel):
    word: str
    weight: int
-    source: str  # 'ts_stat' / 'title_extract' / 'manual' / 'ts_stat_live'
+    source: str  # 'ts_stat' / 'ts_stat_live'


 class SearchSuggestionsResponse(BaseModel):
    query: str
-    titles: list[SearchTitleSuggestionItem] = []
    keywords: list[SearchKeywordItem] = []
--- a/backend/app/scripts/backfill_search_suggestions.py
+++ b/backend/app/scripts/backfill_search_suggestions.py
@@ -1,156 +0,0 @@
-"""回灌 search_title_suggestions 表。
-
- 迁移 0009 给 articles 加了 trigger,新写入的会自动维护
- 但迁移前已有的 articles 没经过 trigger,需要这个脚本一次性回填
- 同时可以手动跑一次 refresh_search_keywords()(可选,worker 也会跑)
-
-用法:
-  cd backend
-  python -m app.scripts.backfill_search_suggestions
-  # 或 docker:
-  docker compose exec api python -m app.scripts.backfill_search_suggestions
-
-设计:
- 用 batch INSERT,避免逐行 trigger 重复触发(虽然 trigger 已经在迁移里创建,
-  重复执行对已存在的条目会先 DELETE 再 INSERT,等价于刷新,无害)
- 进度条:每 1000 篇打一行
- 失败:有 article 字段异常不会阻塞其他
-"""
-from __future__ import annotations
-
-import asyncio
-import logging
-import sys
-from datetime import datetime, timezone
-
-from sqlalchemy import select, text
-from sqlalchemy.dialects.postgresql import insert as pg_insert
-from sqlalchemy.ext.asyncio import AsyncSession
-
-from app.database import AsyncSessionLocal
-from app.models.article import Article
-from app.models.search_title_suggestion import SearchTitleSuggestion
-
-logger = logging.getLogger("news.backfill_search")
-logging.basicConfig(
-    level="INFO",
-    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
-)
-
-
-MAX_TITLE_LEN = 50  # 跟迁移里的 trigger 一致
-BATCH_SIZE = 500
-
-
-def _build_prefix_keys(text_value: str) -> list[str]:
-    """从 '美联储宣布...' 生成 ['美','美联储','美联储宣',...,'美联储宣布...']"""
-    text_value = (text_value or "")[:MAX_TITLE_LEN]
-    if not text_value:
-        return []
-    return [text_value[:n] for n in range(1, len(text_value) + 1)]
-
-
-async def _process_article_batch(
-    session: AsyncSession,
-    articles: list[Article],
-) -> int:
-    """处理一批 articles,UPSERT 到 search_title_suggestions。
-
-    返回成功插入/更新的条数。
-    """
-    rows = []
-    for art in articles:
-        if art.title_zh and len(art.title_zh.strip()) > 0:
-            src_text = art.title_zh.strip()[:MAX_TITLE_LEN]
-            lang = "zh"
-        elif art.title and len(art.title.strip()) > 0:
-            src_text = art.title.strip()[:MAX_TITLE_LEN]
-            lang = "src"
-        else:
-            continue
-
-        rows.append(
-            {
-                "article_id": art.id,
-                "title_lang": lang,
-                "prefix_keys": _build_prefix_keys(src_text),
-                "published_at": art.published_at,
-            }
-        )
-
-    if not rows:
-        return 0
-
-    # 用 PG 原生 ON CONFLICT 实现 UPSERT(基于 article_id 唯一约束)
-    # 注意:表没建 unique on article_id,所以先 DELETE 再 INSERT
-    # 性能:批量 DELETE 在 article_id 上没索引,可能慢;临时加索引:
-    #   CREATE INDEX IF NOT EXISTS tmp_idx ON search_title_suggestions(article_id);
-    # 简化:每个 batch 内逐条 DELETE 再 INSERT(慢但稳)
-    #   替代方案:直接 TRUNCATE + 全量重灌(回填场景下更简单)
-    for r in rows:
-        await session.execute(
-            text("DELETE FROM search_title_suggestions WHERE article_id = :aid"),
-            {"aid": r["article_id"]},
-        )
-    # bulk insert
-    await session.execute(pg_insert(SearchTitleSuggestion), rows)
-    await session.commit()
-    return len(rows)
-
-
-async def backfill() -> None:
-    """主流程:分批拉 articles,回灌 search_title_suggestions。"""
-    started = datetime.now(timezone.utc)
-    async with AsyncSessionLocal() as session:
-        # 总数
-        total = (await session.execute(select(Article.id))).all()
-        total_count = len(total)
-        logger.info("backfill start: %d articles to process", total_count)
-
-        processed = 0
-        last_id = 0
-        while True:
-            rows = (
-                await session.execute(
-                    select(Article)
-                    .where(Article.id > last_id)
-                    .order_by(Article.id)
-                    .limit(BATCH_SIZE)
-                )
-            ).scalars().all()
-            if not rows:
-                break
-            n = await _process_article_batch(session, list(rows))
-            processed += n
-            last_id = rows[-1].id
-            logger.info(
-                "progress: %d / %d (%.1f%%)",
-                processed, total_count,
-                processed / total_count * 100 if total_count else 0,
-            )
-
-    elapsed = (datetime.now(timezone.utc) - started).total_seconds()
-    logger.info("backfill done: %d rows in %.1fs", processed, elapsed)
-
-    # 顺便触发一次 search_keywords 刷新(让词频表也有数据)
-    logger.info("triggering refresh_search_keywords()...")
-    async with AsyncSessionLocal() as session:
-        try:
-            await session.execute(text("SELECT refresh_search_keywords()"))
-            await session.commit()
-            logger.info("refresh_search_keywords() done")
-        except Exception as e:
-            logger.exception("refresh_search_keywords failed: %s (worker 03:00 会再跑)", e)
-
-
-def main() -> int:
-    try:
-        asyncio.run(backfill())
-    except KeyboardInterrupt:
-        logger.warning("interrupted")
-        return 1
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
--- a/backend/app/scripts/refresh_search_keywords.py
+++ b/backend/app/scripts/refresh_search_keywords.py
@@ -0,0 +1,48 @@
+"""刷新 search_keywords(立即跑一次,不依赖 worker 03:00 调度)。
+
+历史:
+- 最初版本是回灌 search_title_suggestions(articles trigger 维护的真实标题)
+- 0011 迁移删了 search_title_suggestions(产品决定只展示 keyword 续接词)
+- 现在脚本只做一件事:立即跑一次 refresh_search_keywords()
+
+用法:
+  docker compose exec api python -m app.scripts.refresh_search_keywords
+  # 预期: search_keywords refreshed
+
+性能:ts_stat 1545 篇文章全量聚合 ~88s(每天 worker 03:00 会自动跑一次,通常不需要手动)
+"""
+from __future__ import annotations
+
+import asyncio
+import logging
+import sys
+
+from sqlalchemy import text
+
+from app.database import AsyncSessionLocal
+
+logger = logging.getLogger("news.refresh_keywords")
+logging.basicConfig(
+    level="INFO",
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+)
+
+
+async def refresh() -> None:
+    async with AsyncSessionLocal() as s:
+        await s.execute(text("SELECT refresh_search_keywords()"))
+        await s.commit()
+    logger.info("search_keywords refreshed")
+
+
+def main() -> int:
+    try:
+        asyncio.run(refresh())
+    except KeyboardInterrupt:
+        logger.warning("interrupted")
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/backend/app/services/search.py
+++ b/backend/app/services/search.py
@@ -1,31 +1,27 @@
-"""搜索建议服务:混合 A(高频词)+ B(真实标题) + 冷启动 fallback。
+"""搜索建议服务:纯 keyword 续接词(高频词)。

- A: search_keywords(prefix_keys @> ARRAY['美'], ORDER BY weight DESC)
- B: search_title_suggestions(prefix_keys @> ARRAY['美'], ORDER BY published_at DESC)
- fallback: 任一表空时回退实时 ILIKE 查 articles(冷启动 / worker 没刷新过)
+- search_keywords(prefix_keys @> ARRAY['美'], ORDER BY weight DESC)
+- fallback: 表空时回退实时 ts_stat(冷启动 / worker 没刷新过)
 """
 from __future__ import annotations

 import logging

 from sqlalchemy import desc, select
-from sqlalchemy.dialects.postgresql import ARRAY
 from sqlalchemy.ext.asyncio import AsyncSession

-from app.models.article import Article
 from app.models.search_keyword import SearchKeyword
-from app.models.search_title_suggestion import SearchTitleSuggestion

 logger = logging.getLogger("news.search")


 class SearchService:
-    """搜索建议 service。
+    """搜索建议 service — 仅返回 keyword 续接词。

-    设计:输入 prefix,返回 { titles, keywords } 两组候选。
-    - titles 真实文章标题(按 published_at DESC 排)
-    - keywords 高频词(按 weight DESC 排)
-    - 任一为空时回退实时 articles.title_zh ILIKE 查询(冷启动兜底)
+    设计:输入 prefix,返回 { query, keywords[] }。
+    - keywords 是 ts_stat 聚合后的高频词(从 articles.title_zh + body_zh_text + commentary 算)
+    - 用 GIN 数组索引 prefix_keys @> ARRAY['前缀'],亚毫秒
+    - 表空时回退到实时 ts_stat 聚合(慢但能用)
    """

    def __init__(self, session: AsyncSession):
@@ -36,42 +32,20 @@ class SearchService:
        q: str,
        limit: int = 10,
    ) -> dict[str, list[dict]]:
-        """返回搜索建议。
+        """返回搜索建议(仅 keywords)。

        Args:
            q: 前缀(1-20 字符)
-            limit: 每组最多返回多少(默认 10,最大 20)
+            limit: 最多返回多少(默认 10,最大 20)

        Returns:
-            {"query": q, "titles": [...], "keywords": [...]}
-            titles 元素:{"id": article_id, "published_at": ...}
-            keywords 元素:{"word": ..., "weight": ...}
+            {"query": q, "keywords": [{"word", "weight", "source"}, ...]}
        """
        q = q.strip()
        if not q:
-            return {"query": q, "titles": [], "keywords": []}
+            return {"query": q, "keywords": []}

-        # 1) 查 search_title_suggestions(B 方案)
-        title_rows = await self.session.execute(
-            select(
-                SearchTitleSuggestion.article_id,
-                SearchTitleSuggestion.published_at,
-                SearchTitleSuggestion.title_lang,
-            )
-            .where(SearchTitleSuggestion.prefix_keys.contains([q]))
-            .order_by(desc(SearchTitleSuggestion.published_at))
-            .limit(limit)
-        )
-        titles = [
-            {
-                "id": row.article_id,
-                "published_at": row.published_at.isoformat() if row.published_at else None,
-                "lang": row.title_lang,
-            }
-            for row in title_rows.all()
-        ]
-
-        # 2) 查 search_keywords(A 方案)
+        # 1) 查 search_keywords(GIN 数组包含,亚毫秒)
        kw_rows = await self.session.execute(
            select(SearchKeyword.keyword, SearchKeyword.weight, SearchKeyword.source)
            .where(SearchKeyword.prefix_keys.contains([q]))
@@ -83,49 +57,11 @@ class SearchService:
            for row in kw_rows.all()
        ]

-        # 3) 冷启动 fallback:任一为空时,回退到实时 ILIKE articles
-        #    (如果两张固化表都跑空了,说明刚建库或数据被 truncate)
-        if not titles:
-            titles = await self._fallback_titles(q, limit)
+        # 2) 冷启动 fallback:表空时回退到实时 ts_stat 聚合
        if not keywords:
            keywords = await self._fallback_keywords(q, limit)

-        return {"query": q, "titles": titles, "keywords": keywords}
-
-    async def _fallback_titles(self, q: str, limit: int) -> list[dict]:
-        """回退:实时查 articles.title_zh / title(走 B-tree 索引,慢但能用)。
-
-        - 优先 title_zh LIKE(翻译后),没有再 LIKE title(短新闻)
-        - 限制 7 天内的文章,避免返回太老的(冷启动场景下用户预期)
-        """
-        from datetime import datetime, timedelta, timezone
-
-        from sqlalchemy import or_
-
-        since = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(days=7)
-        like = f"{q}%"
-        stmt = (
-            select(Article.id, Article.published_at, Article.title_zh, Article.title)
-            .where(
-                Article.published_at >= since,
-                Article.duplicate_of.is_(None),
-                or_(
-                    Article.title_zh.ilike(like),
-                    Article.title.ilike(like),
-                ),
-            )
-            .order_by(desc(Article.published_at))
-            .limit(limit)
-        )
-        rows = (await self.session.execute(stmt)).all()
-        return [
-            {
-                "id": row.id,
-                "published_at": row.published_at.isoformat() if row.published_at else None,
-                "lang": "zh" if row.title_zh else "src",
-            }
-            for row in rows
-        ]
+        return {"query": q, "keywords": keywords}

    async def _fallback_keywords(self, q: str, limit: int) -> list[dict]:
        """回退:ts_stat 实时聚合(慢但能用)。