"""搜索建议服务:混合 A(高频词)+ B(真实标题) + 冷启动 fallback。 - A: search_keywords(prefix_keys @> ARRAY['美'], ORDER BY weight DESC) - B: search_title_suggestions(prefix_keys @> ARRAY['美'], ORDER BY published_at DESC) - fallback: 任一表空时回退实时 ILIKE 查 articles(冷启动 / worker 没刷新过) """ from __future__ import annotations import logging from sqlalchemy import desc, select from sqlalchemy.dialects.postgresql import ARRAY from sqlalchemy.ext.asyncio import AsyncSession from app.models.article import Article from app.models.search_keyword import SearchKeyword from app.models.search_title_suggestion import SearchTitleSuggestion logger = logging.getLogger("news.search") class SearchService: """搜索建议 service。 设计:输入 prefix,返回 { titles, keywords } 两组候选。 - titles 真实文章标题(按 published_at DESC 排) - keywords 高频词(按 weight DESC 排) - 任一为空时回退实时 articles.title_zh ILIKE 查询(冷启动兜底) """ def __init__(self, session: AsyncSession): self.session = session async def suggestions( self, q: str, limit: int = 10, ) -> dict[str, list[dict]]: """返回搜索建议。 Args: q: 前缀(1-20 字符) limit: 每组最多返回多少(默认 10,最大 20) Returns: {"query": q, "titles": [...], "keywords": [...]} titles 元素:{"id": article_id, "published_at": ...} keywords 元素:{"word": ..., "weight": ...} """ q = q.strip() if not q: return {"query": q, "titles": [], "keywords": []} # 1) 查 search_title_suggestions(B 方案) title_rows = await self.session.execute( select( SearchTitleSuggestion.article_id, SearchTitleSuggestion.published_at, SearchTitleSuggestion.title_lang, ) .where(SearchTitleSuggestion.prefix_keys.contains([q])) .order_by(desc(SearchTitleSuggestion.published_at)) .limit(limit) ) titles = [ { "id": row.article_id, "published_at": row.published_at.isoformat() if row.published_at else None, "lang": row.title_lang, } for row in title_rows.all() ] # 2) 查 search_keywords(A 方案) kw_rows = await self.session.execute( select(SearchKeyword.keyword, SearchKeyword.weight, SearchKeyword.source) .where(SearchKeyword.prefix_keys.contains([q])) .order_by(desc(SearchKeyword.weight)) .limit(limit) ) keywords = [ {"word": row.keyword, "weight": row.weight, "source": row.source} for row in kw_rows.all() ] # 3) 冷启动 fallback:任一为空时,回退到实时 ILIKE articles # (如果两张固化表都跑空了,说明刚建库或数据被 truncate) if not titles: titles = await self._fallback_titles(q, limit) if not keywords: keywords = await self._fallback_keywords(q, limit) return {"query": q, "titles": titles, "keywords": keywords} async def _fallback_titles(self, q: str, limit: int) -> list[dict]: """回退:实时查 articles.title_zh / title(走 B-tree 索引,慢但能用)。 - 优先 title_zh LIKE(翻译后),没有再 LIKE title(短新闻) - 限制 7 天内的文章,避免返回太老的(冷启动场景下用户预期) """ from datetime import datetime, timedelta, timezone from sqlalchemy import or_ since = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(days=7) like = f"{q}%" stmt = ( select(Article.id, Article.published_at, Article.title_zh, Article.title) .where( Article.published_at >= since, Article.duplicate_of.is_(None), or_( Article.title_zh.ilike(like), Article.title.ilike(like), ), ) .order_by(desc(Article.published_at)) .limit(limit) ) rows = (await self.session.execute(stmt)).all() return [ { "id": row.id, "published_at": row.published_at.isoformat() if row.published_at else None, "lang": "zh" if row.title_zh else "src", } for row in rows ] async def _fallback_keywords(self, q: str, limit: int) -> list[dict]: """回退:ts_stat 实时聚合(慢但能用)。 - 从 articles.title_zh + body_zh_text 实时 to_tsvector(chinese_zh) - 适用:search_keywords 表空 + worker 没刷新过 - ts_stat(text) 单参 — 第二参 weights mask 不能传 'a'(zhparser 不标 A 权重会 0 行) """ from sqlalchemy import text sql = text( """ SELECT word, nentry::int AS weight FROM ts_stat( $$SELECT to_tsvector('chinese_zh', coalesce(title_zh, '') || ' ' || coalesce(body_zh_text, '') ) FROM articles WHERE title_zh IS NOT NULL OR body_zh_text IS NOT NULL LIMIT 500$$ ) AS s WHERE word LIKE :prefix ORDER BY nentry DESC LIMIT :lim """ ) rows = ( await self.session.execute(sql, {"prefix": f"{q}%", "lim": limit}) ).all() return [{"word": r.word, "weight": r.weight, "source": "ts_stat_live"} for r in rows]