feat(search): 智能搜索建议 - 固化候选词表 (search_keywords + search_title_suggestions)

后端:
- alembic 0009: 两张固化表 + GIN prefix_keys 索引 + articles trigger
- /api/v1/search/suggestions: 混合 A(高频词 ts_stat) + B(真实标题) + 冷启动 fallback
- worker 每日 03:00 + 启动时刷新 search_keywords
- 顺便填 commit 11 TODO: articles.title_zh_tsv + GIN 索引(未来 FTS 基础)

前端:
- NInput -> NAutoComplete + debounce 250ms
- 选标题 -> 跳详情;选关键词 -> 填入 + 触发搜索
- AbortController 防 race condition

性能: prefix_keys @> ARRAY[prefix] 走 GIN 亚毫秒,100w 行也稳
This commit is contained in:
mavis
2026-06-15 18:26:35 +08:00
parent b674fb4b22
commit c3aa0f0cb6
13 changed files with 1028 additions and 7 deletions

47
backend/app/api/search.py Normal file
View File

@@ -0,0 +1,47 @@
"""/api/v1/search/* — 搜索建议(autocomplete)。
- GET /api/v1/search/suggestions?q=prefix
返回:{"query", "titles": [...], "keywords": [...]}
- titles: 真实文章标题(按 published_at DESC),B 方案
- keywords: 高频词(按 weight DESC),A 方案
- 冷启动:任一组候选查询结果为空时自动 fallback 到实时查询(标题走 ILIKE,关键词走 ts_stat)
- 鉴权:跟 articles 一致(需要登录)
性能:两个查询都走 GIN 数组索引(prefix_keys @> ARRAY[prefix]),亚毫秒。
"""
from __future__ import annotations
from fastapi import APIRouter, Depends, Query
from sqlalchemy.ext.asyncio import AsyncSession
from app.core.deps import get_current_user
from app.database import get_session
from app.models.user import User
from app.schemas.search import (
SearchKeywordItem,
SearchSuggestionsResponse,
SearchTitleSuggestionItem,
)
from app.services.search import SearchService
router = APIRouter(prefix="/search", tags=["search"])


@router.get("/suggestions", response_model=SearchSuggestionsResponse)
async def get_suggestions(
    q: str = Query(..., min_length=1, max_length=20, description="搜索前缀"),
    limit: int = Query(10, ge=1, le=20, description="每组最多返回多少"),
    _user: User = Depends(get_current_user),  # login required, same as the articles API
    session: AsyncSession = Depends(get_session),
):
    """Return autocomplete candidates for the prefix ``q``.

    Two groups are returned: article-title suggestions and high-frequency
    keywords, each capped at ``limit`` entries. Intended to be called by the
    frontend search box as the user types (debounced on the client side).
    """
    payload = await SearchService(session).suggestions(q=q, limit=limit)

    # Convert the plain dicts produced by the service into response schemas.
    title_items = [SearchTitleSuggestionItem(**entry) for entry in payload["titles"]]
    keyword_items = [SearchKeywordItem(**entry) for entry in payload["keywords"]]

    return SearchSuggestionsResponse(
        query=payload["query"],
        titles=title_items,
        keywords=keyword_items,
    )

View File

@@ -16,7 +16,7 @@ from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from starlette.exceptions import HTTPException as StarletteHTTPException
from app.api import admin, admin_llm, articles, auth, bookmarks, ingest, me, sources, subscriptions
from app.api import admin, admin_llm, articles, auth, bookmarks, ingest, me, search, sources, subscriptions
from app.config import settings
from app.database import engine
from app.redis_client import close_redis, get_redis
@@ -100,6 +100,7 @@ app.include_router(sources.router, prefix=API_PREFIX)
app.include_router(bookmarks.router, prefix=API_PREFIX)
app.include_router(subscriptions.router, prefix=API_PREFIX)
app.include_router(ingest.router, prefix=API_PREFIX)
app.include_router(search.router, prefix=API_PREFIX)
app.include_router(admin.router, prefix=API_PREFIX)
app.include_router(admin_llm.router, prefix=API_PREFIX)

View File

@@ -7,6 +7,8 @@ from app.models.article import Article # noqa: F401
from app.models.article_read import ArticleRead # noqa: F401
from app.models.bookmark import Bookmark # noqa: F401
from app.models.llm_setting import LlmSetting # noqa: F401
from app.models.search_keyword import SearchKeyword # noqa: F401
from app.models.search_title_suggestion import SearchTitleSuggestion # noqa: F401
from app.models.source import Source, SourceKind # noqa: F401
from app.models.subscription import Subscription # noqa: F401
from app.models.user import User, UserRole # noqa: F401
@@ -17,6 +19,8 @@ __all__ = [
"ArticleRead",
"Bookmark",
"LlmSetting",
"SearchKeyword",
"SearchTitleSuggestion",
"Source",
"SourceKind",
"Subscription",

View File

@@ -0,0 +1,45 @@
"""搜索建议候选词表(固化,worker 每日 ts_stat 刷新)。
- 数据源:articles.title_zh + body_zh_text + commentary + commentary_meituan
- 用途:/api/v1/search/suggestions 返回"高频词"建议(A 方案)
- 刷新:每日凌晨 worker 调 refresh_search_keywords() 全量重建
- 查询:prefix_keys @> ARRAY[prefix] 走 GIN 索引(亚毫秒)
"""
from __future__ import annotations
from datetime import datetime
from sqlalchemy import BigInteger, DateTime, Integer, String, Text, func
from sqlalchemy.dialects.postgresql import ARRAY
from sqlalchemy.orm import Mapped, mapped_column
from app.database import Base
class SearchKeyword(Base):
    """ORM mapping for the ``search_keywords`` table (one candidate keyword per row)."""

    __tablename__ = "search_keywords"

    id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
    keyword: Mapped[str] = mapped_column(Text, nullable=False)
    # Where the keyword came from: ts_stat / title_extract / manual.
    source: Mapped[str] = mapped_column(String(32), nullable=False)
    # Term frequency or article count; used as the ranking weight.
    weight: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    # Precomputed array of prefixes of the keyword, matched with ``@>``.
    # NOTE(review): presumably every leading substring is stored; the code that
    # fills this column lives in the migration and is not visible here.
    prefix_keys: Mapped[list[str]] = mapped_column(ARRAY(Text), nullable=False)
    last_seen_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        server_default=func.now(),
    )
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        server_default=func.now(),
    )

    # Intentionally empty: per the original notes, the database-level UNIQUE
    # constraint is created by the Alembic 0009 migration, not declared here.
    __table_args__ = ()

    def __repr__(self) -> str:
        return f"<SearchKeyword {self.keyword!r} src={self.source} weight={self.weight}>"

View File

@@ -0,0 +1,43 @@
"""搜索建议 - 真实文章标题片段表(articles 写入 trigger 自动维护)。
- 数据源:articles.title_zh(优先)/ articles.title(短新闻回退)
- 用途:/api/v1/search/suggestions 返回"真实文章标题"建议(B 方案)
- 维护:PG trigger(articles INSERT/UPDATE OF title_zh/title/published_at 触发)
- 查询:prefix_keys @> ARRAY[prefix] 走 GIN 索引,按 published_at DESC 排序
"""
from __future__ import annotations
from datetime import datetime
from sqlalchemy import BigInteger, DateTime, ForeignKey, String, func
from sqlalchemy.dialects.postgresql import ARRAY, TEXT
from sqlalchemy.orm import Mapped, mapped_column
from app.database import Base
class SearchTitleSuggestion(Base):
    """ORM mapping for ``search_title_suggestions`` (title prefixes per article)."""

    __tablename__ = "search_title_suggestions"

    id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
    # Owning article; the row is removed when the article is deleted (CASCADE).
    article_id: Mapped[int] = mapped_column(
        BigInteger,
        ForeignKey("articles.id", ondelete="CASCADE"),
        nullable=False,
    )
    # Which text the prefixes were built from:
    # 'zh' (title_zh) or 'src' (title, the fallback for short news items).
    title_lang: Mapped[str] = mapped_column(String(8), nullable=False, default="zh")
    # Precomputed prefix array, from the first character up to the full text.
    prefix_keys: Mapped[list[str]] = mapped_column(ARRAY(TEXT), nullable=False)
    published_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        server_default=func.now(),
    )

    def __repr__(self) -> str:
        return f"<SearchTitleSuggestion article_id={self.article_id} lang={self.title_lang}>"

View File

@@ -0,0 +1,24 @@
"""搜索建议 schema。"""
from __future__ import annotations
from datetime import datetime
from pydantic import BaseModel
class SearchTitleSuggestionItem(BaseModel):
    """One article-title suggestion.

    NOTE(review): the item carries no title text; presumably the client
    resolves the article by ``id`` — confirm against the frontend.
    """

    # Article id (not the id of the suggestion row).
    id: int
    published_at: datetime | None = None
    # 'zh' / 'src'
    lang: str
class SearchKeywordItem(BaseModel):
    """One keyword suggestion with its ranking weight."""

    word: str
    weight: int
    # 'ts_stat' / 'title_extract' / 'manual' / 'ts_stat_live'
    source: str
class SearchSuggestionsResponse(BaseModel):
    """Response body of the suggestions endpoint: the query plus two candidate groups."""

    query: str
    # Article-title candidates; empty when nothing matched.
    titles: list[SearchTitleSuggestionItem] = []
    # Keyword candidates; empty when nothing matched.
    keywords: list[SearchKeywordItem] = []

View File

@@ -0,0 +1,156 @@
"""回灌 search_title_suggestions 表。
- 迁移 0009 给 articles 加了 trigger,新写入的会自动维护
- 但迁移前已有的 articles 没经过 trigger,需要这个脚本一次性回填
- 同时可以手动跑一次 refresh_search_keywords()(可选,worker 也会跑)
用法:
cd backend
python -m app.scripts.backfill_search_suggestions
# 或 docker:
docker compose exec api python -m app.scripts.backfill_search_suggestions
设计:
- 用 batch INSERT,避免逐行 trigger 重复触发(虽然 trigger 已经在迁移里创建,
重复执行对已存在的条目会先 DELETE 再 INSERT,等价于刷新,无害)
- 进度条:每 1000 篇打一行
- 失败:有 article 字段异常不会阻塞其他
"""
from __future__ import annotations
import asyncio
import logging
import sys
from datetime import datetime, timezone
from sqlalchemy import select, text
from sqlalchemy.dialects.postgresql import insert as pg_insert
from sqlalchemy.ext.asyncio import AsyncSession
from app.database import AsyncSessionLocal
from app.models.article import Article
from app.models.search_title_suggestion import SearchTitleSuggestion
logger = logging.getLogger("news.backfill_search")
logging.basicConfig(
level="INFO",
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
MAX_TITLE_LEN = 50 # 跟迁移里的 trigger 一致
BATCH_SIZE = 500
def _build_prefix_keys(text_value: str) -> list[str]:
"""'美联储宣布...' 生成 ['','美联储','美联储宣',...,'美联储宣布...']"""
text_value = (text_value or "")[:MAX_TITLE_LEN]
if not text_value:
return []
return [text_value[:n] for n in range(1, len(text_value) + 1)]
async def _process_article_batch(
    session: AsyncSession,
    articles: list[Article],
) -> int:
    """Rebuild the ``search_title_suggestions`` rows for one batch of articles.

    For each article the Chinese title (``title_zh``) is preferred; the source
    title (``title``) is the fallback. Articles with neither are skipped.
    Existing rows for the affected articles are deleted and fresh rows are
    inserted, then the transaction is committed.

    Args:
        session: Open async session; it is committed by this function.
        articles: Articles to (re)index.

    Returns:
        Number of suggestion rows written (0 when no article had a usable title).
    """
    # Local import: ``delete`` is not among this module's top-level imports.
    from sqlalchemy import delete

    rows = []
    for art in articles:
        title_zh = (art.title_zh or "").strip()
        title_src = (art.title or "").strip()
        if title_zh:
            src_text, lang = title_zh, "zh"
        elif title_src:
            src_text, lang = title_src, "src"
        else:
            continue  # no usable title, nothing to index
        rows.append(
            {
                "article_id": art.id,
                "title_lang": lang,
                # _build_prefix_keys truncates to MAX_TITLE_LEN itself.
                "prefix_keys": _build_prefix_keys(src_text),
                "published_at": art.published_at,
            }
        )
    if not rows:
        return 0

    # This is delete-then-insert, not ON CONFLICT: per the original notes the
    # table has no unique constraint on article_id. A single IN-list DELETE
    # replaces one round-trip per row.
    article_ids = [row["article_id"] for row in rows]
    await session.execute(
        delete(SearchTitleSuggestion)
        .where(SearchTitleSuggestion.article_id.in_(article_ids))
        # No SearchTitleSuggestion objects are loaded in this session, so
        # there is nothing to synchronize.
        .execution_options(synchronize_session=False)
    )
    # Bulk insert of all rows for the batch.
    await session.execute(pg_insert(SearchTitleSuggestion), rows)
    await session.commit()
    return len(rows)
async def backfill() -> None:
    """Backfill ``search_title_suggestions`` for all articles, then refresh keywords.

    Articles are read in ascending id order, ``BATCH_SIZE`` at a time (keyset
    pagination on ``id``), and handed to ``_process_article_batch``. Afterwards
    the PostgreSQL function ``refresh_search_keywords()`` is invoked once; a
    failure there is logged and does not fail the backfill.
    """
    # Local import: ``func`` is not among this module's top-level imports.
    from sqlalchemy import func

    started = datetime.now(timezone.utc)
    async with AsyncSessionLocal() as session:
        # Let the database count instead of loading every id into memory.
        total_count = (
            await session.execute(select(func.count()).select_from(Article))
        ).scalar_one()
        logger.info("backfill start: %d articles to process", total_count)

        processed = 0
        last_id = 0
        while True:
            batch = (
                await session.execute(
                    select(Article)
                    .where(Article.id > last_id)
                    .order_by(Article.id)
                    .limit(BATCH_SIZE)
                )
            ).scalars().all()
            if not batch:
                break

            # Read the pagination cursor BEFORE the batch is processed: the
            # helper commits, and if the session expires objects on commit,
            # touching ``.id`` afterwards would trigger implicit IO, which
            # AsyncSession does not allow.
            last_id = batch[-1].id

            processed += await _process_article_batch(session, list(batch))
            # ``processed`` counts rows written, so articles without a title
            # do not advance the percentage.
            logger.info(
                "progress: %d / %d (%.1f%%)",
                processed, total_count,
                processed / total_count * 100 if total_count else 0,
            )

    elapsed = (datetime.now(timezone.utc) - started).total_seconds()
    logger.info("backfill done: %d rows in %.1fs", processed, elapsed)

    # Also refresh search_keywords once so the keyword table has data too.
    logger.info("triggering refresh_search_keywords()...")
    async with AsyncSessionLocal() as session:
        try:
            await session.execute(text("SELECT refresh_search_keywords()"))
            await session.commit()
            logger.info("refresh_search_keywords() done")
        except Exception as e:
            # Best effort: the message notes the 03:00 worker job will retry.
            logger.exception("refresh_search_keywords failed: %s (worker 03:00 会再跑)", e)
def main() -> int:
    """Run the backfill and return a process exit code.

    Returns 0 on completion and 1 when interrupted with Ctrl-C.
    """
    exit_code = 0
    try:
        asyncio.run(backfill())
    except KeyboardInterrupt:
        logger.warning("interrupted")
        exit_code = 1
    return exit_code


if __name__ == "__main__":
    sys.exit(main())

View File

@@ -0,0 +1,160 @@
"""搜索建议服务:混合 A(高频词)+ B(真实标题) + 冷启动 fallback。
- A: search_keywords(prefix_keys @> ARRAY[prefix], ORDER BY weight DESC)
- B: search_title_suggestions(prefix_keys @> ARRAY[prefix], ORDER BY published_at DESC)
- fallback: 任一组结果为空时回退实时查询(标题走 ILIKE 查 articles,关键词走 ts_stat;冷启动 / worker 没刷新过)
"""
from __future__ import annotations
import logging
from sqlalchemy import desc, select
from sqlalchemy.dialects.postgresql import ARRAY
from sqlalchemy.ext.asyncio import AsyncSession
from app.models.article import Article
from app.models.search_keyword import SearchKeyword
from app.models.search_title_suggestion import SearchTitleSuggestion
logger = logging.getLogger("news.search")
class SearchService:
    """Search-suggestion service.

    Given a prefix, produces two candidate groups:
    - titles: article-title suggestions, newest first
    - keywords: high-frequency words, highest weight first
    When a group has no match in its precomputed table, a live query against
    ``articles`` is used for that group instead (cold-start safety net).
    """

    def __init__(self, session: AsyncSession):
        self.session = session

    @staticmethod
    def _escape_like(value: str) -> str:
        """Escape LIKE/ILIKE metacharacters so *value* matches literally.

        Backslash is used as the escape character, so it is doubled first;
        ``%`` and ``_`` are then prefixed with a backslash.
        """
        return value.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

    async def suggestions(
        self,
        q: str,
        limit: int = 10,
    ) -> dict[str, list[dict]]:
        """Return search suggestions for a prefix.

        Args:
            q: Prefix typed by the user; surrounding whitespace is stripped.
            limit: Maximum number of entries per group.

        Returns:
            ``{"query": q, "titles": [...], "keywords": [...]}`` where each
            title is ``{"id", "published_at", "lang"}`` (``published_at`` is an
            ISO-8601 string or ``None``) and each keyword is
            ``{"word", "weight", "source"}``. A blank prefix yields empty groups.
        """
        q = q.strip()
        if not q:
            return {"query": q, "titles": [], "keywords": []}

        # 1) Precomputed title prefixes.
        title_rows = await self.session.execute(
            select(
                SearchTitleSuggestion.article_id,
                SearchTitleSuggestion.published_at,
                SearchTitleSuggestion.title_lang,
            )
            .where(SearchTitleSuggestion.prefix_keys.contains([q]))
            # published_at is nullable and PostgreSQL sorts NULLs first under
            # DESC; push undated rows to the end instead.
            .order_by(SearchTitleSuggestion.published_at.desc().nulls_last())
            .limit(limit)
        )
        titles = [
            {
                "id": row.article_id,
                "published_at": row.published_at.isoformat() if row.published_at else None,
                "lang": row.title_lang,
            }
            for row in title_rows.all()
        ]

        # 2) Precomputed keyword prefixes.
        kw_rows = await self.session.execute(
            select(SearchKeyword.keyword, SearchKeyword.weight, SearchKeyword.source)
            .where(SearchKeyword.prefix_keys.contains([q]))
            .order_by(desc(SearchKeyword.weight))
            .limit(limit)
        )
        keywords = [
            {"word": row.keyword, "weight": row.weight, "source": row.source}
            for row in kw_rows.all()
        ]

        # 3) Fallback: an empty group is re-queried live against articles.
        # NOTE(review): this fires whenever the prefix has no stored match,
        # not only when a table is empty, so the keyword fallback can run a
        # full aggregation on an ordinary request.
        if not titles:
            titles = await self._fallback_titles(q, limit)
        if not keywords:
            keywords = await self._fallback_keywords(q, limit)
        return {"query": q, "titles": titles, "keywords": keywords}

    async def _fallback_titles(self, q: str, limit: int) -> list[dict]:
        """Live fallback: prefix-match ``articles.title_zh`` / ``title``.

        Only non-duplicate articles published within the last 7 days are
        considered, newest first. The prefix is matched case-insensitively and
        literally (wildcards in ``q`` are escaped).
        """
        from datetime import datetime, timedelta, timezone

        from sqlalchemy import or_

        # NOTE(review): the bound is made timezone-naive before the comparison;
        # presumably this matches how articles.published_at is stored — confirm
        # against the Article model.
        since = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(days=7)
        pattern = f"{self._escape_like(q)}%"
        stmt = (
            select(Article.id, Article.published_at, Article.title_zh, Article.title)
            .where(
                Article.published_at >= since,
                Article.duplicate_of.is_(None),
                or_(
                    Article.title_zh.ilike(pattern, escape="\\"),
                    Article.title.ilike(pattern, escape="\\"),
                ),
            )
            .order_by(desc(Article.published_at))
            .limit(limit)
        )
        rows = (await self.session.execute(stmt)).all()
        return [
            {
                "id": row.id,
                "published_at": row.published_at.isoformat() if row.published_at else None,
                "lang": "zh" if row.title_zh else "src",
            }
            for row in rows
        ]

    async def _fallback_keywords(self, q: str, limit: int) -> list[dict]:
        """Live fallback: aggregate word frequencies with ``ts_stat``.

        ``ts_stat`` takes the text of a SQL query that yields one tsvector
        column, so the inner SELECT is passed as a string literal with its
        quotes doubled. Words come from ``title_zh`` and ``body_zh_text`` of all
        articles, so this scans the whole table and is slow.
        """
        from sqlalchemy import text

        sql = text(
            """
            SELECT word, nentry::int AS weight
            FROM ts_stat(
                'SELECT to_tsvector(
                     ''simple'',
                     coalesce(title_zh, '''') || '' '' || coalesce(body_zh_text, '''')
                 )
                 FROM articles
                 WHERE title_zh IS NOT NULL OR body_zh_text IS NOT NULL'
            )
            WHERE word LIKE :prefix
            ORDER BY nentry DESC
            LIMIT :lim
            """
        )
        # The 'simple' configuration lower-cases tokens, so lower-case the
        # prefix as well. Backslash is PostgreSQL's default LIKE escape
        # character, so the escaped wildcards match literally.
        prefix = f"{self._escape_like(q.lower())}%"
        rows = (
            await self.session.execute(sql, {"prefix": prefix, "lim": limit})
        ).all()
        return [{"word": r.word, "weight": r.weight, "source": "ts_stat_live"} for r in rows]

View File

@@ -7,13 +7,13 @@ from __future__ import annotations
import asyncio
import logging
import signal
from datetime import datetime, timezone
from datetime import datetime, timedelta, timezone
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger
from apscheduler.triggers.date import DateTrigger
from apscheduler.triggers.interval import IntervalTrigger
from sqlalchemy import select
from sqlalchemy import select, text
from app.config import settings
from app.database import AsyncSessionLocal
@@ -28,6 +28,22 @@ logging.basicConfig(
)
async def _refresh_search_keywords() -> None:
    """Refresh the ``search_keywords`` table by calling its PostgreSQL function.

    Runs ``SELECT refresh_search_keywords()`` in a fresh session and commits.
    Per the original notes, the function is created by migration 0009 and does
    a full truncate-and-insert rebuild. Any failure is logged and swallowed so
    that the worker keeps running.
    """
    try:
        async with AsyncSessionLocal() as s:
            await s.execute(text("SELECT refresh_search_keywords()"))
            await s.commit()
    except Exception as e:
        logger.exception("search_keywords refresh failed: %s", e)
    else:
        logger.info("search_keywords refreshed")
async def _rebuild_jobs(scheduler: AsyncIOScheduler) -> None:
"""从 sources 表动态构建 job(可热更新)。
@@ -95,6 +111,23 @@ async def main() -> None:
id="startup_run",
)
# === 搜索建议相关 ===
# 每日凌晨 03:00 刷新 search_keywords(ts_stat 词频)
scheduler.add_job(
_refresh_search_keywords,
trigger=CronTrigger(hour=3, minute=0),
id="refresh_search_keywords",
replace_existing=True,
)
# 启动时延迟 10 秒跑一次(冷启动友好,worker 起来时 search_keywords 就有数据;
# 延迟是等 DB 完全就绪 + 不和 startup_run 抢资源)
scheduler.add_job(
_refresh_search_keywords,
trigger=DateTrigger(run_date=datetime.now() + timedelta(seconds=10)),
id="startup_refresh_search_keywords",
)
logger.info("scheduled: refresh_search_keywords daily 03:00 + on startup (+10s)")
scheduler.start()
logger.info("scheduler started with %d jobs", len(scheduler.get_jobs()))