# diary-news/backend/app/services/search.py

"""搜索建议服务:混合 A(高频词)+ B(真实标题) + 冷启动 fallback。

- A: search_keywords(prefix_keys @> ARRAY['美'], ORDER BY weight DESC)
- B: search_title_suggestions(prefix_keys @> ARRAY['美'], ORDER BY published_at DESC)
- fallback: 任一表空时回退实时 ILIKE 查 articles(冷启动 / worker 没刷新过)
"""
from __future__ import annotations

import logging

from sqlalchemy import desc, select
from sqlalchemy.dialects.postgresql import ARRAY
from sqlalchemy.ext.asyncio import AsyncSession

from app.models.article import Article
from app.models.search_keyword import SearchKeyword
from app.models.search_title_suggestion import SearchTitleSuggestion

logger = logging.getLogger("news.search")


class SearchService:
    """Search-suggestion service.

    Given a prefix, produces two candidate groups:

    - ``titles``: article references, newest ``published_at`` first.
    - ``keywords``: high-frequency words, highest ``weight`` first.

    Each group is read from its precomputed table first. A group that comes
    back empty is refilled by a live query against ``articles`` (cold-start
    safety net, e.g. the refresh worker has not populated the tables yet).
    """

    # Upper bound on rows per group, as documented on ``suggestions``.
    _MAX_LIMIT = 20
    # Escape character for LIKE / ILIKE patterns built from user input.
    # The raw SQL in ``_fallback_keywords`` hard-codes the same character.
    _LIKE_ESCAPE = "/"

    def __init__(self, session: AsyncSession):
        self.session = session

    @classmethod
    def _escape_like(cls, value: str) -> str:
        """Escape LIKE metacharacters in *value* so it matches literally.

        The escape character itself is doubled first, then ``%`` and ``_``
        are prefixed with it. The result is only meaningful when the query
        declares the same escape character (``ESCAPE '/'``).
        """
        esc = cls._LIKE_ESCAPE
        return (
            value.replace(esc, esc + esc)
            .replace("%", esc + "%")
            .replace("_", esc + "_")
        )

    async def suggestions(
        self,
        q: str,
        limit: int = 10,
    ) -> dict[str, list[dict]]:
        """Return search suggestions for the prefix *q*.

        Args:
            q: Prefix typed by the user (expected 1-20 characters).
                Surrounding whitespace is stripped before use.
            limit: Maximum rows per group (default 10). Clamped to the
                range 1..20 so an out-of-range value never reaches SQL.

        Returns:
            ``{"query": q, "titles": [...], "keywords": [...]}`` where

            - each title is ``{"id", "published_at", "lang"}``;
              ``published_at`` is an ISO-8601 string or ``None``;
            - each keyword is ``{"word", "weight", "source"}``.

            A blank *q* short-circuits to empty lists without touching the
            database.
        """
        q = q.strip()
        if not q:
            return {"query": q, "titles": [], "keywords": []}

        # Enforce the documented bounds; a negative LIMIT is a SQL error.
        limit = max(1, min(limit, self._MAX_LIMIT))

        # 1) Precomputed title suggestions (plan B): array containment on
        #    prefix_keys, newest first.
        title_rows = await self.session.execute(
            select(
                SearchTitleSuggestion.article_id,
                SearchTitleSuggestion.published_at,
                SearchTitleSuggestion.title_lang,
            )
            .where(SearchTitleSuggestion.prefix_keys.contains([q]))
            .order_by(desc(SearchTitleSuggestion.published_at))
            .limit(limit)
        )
        titles = [
            {
                "id": row.article_id,
                "published_at": row.published_at.isoformat() if row.published_at else None,
                "lang": row.title_lang,
            }
            for row in title_rows.all()
        ]

        # 2) Precomputed high-frequency keywords (plan A), heaviest first.
        kw_rows = await self.session.execute(
            select(SearchKeyword.keyword, SearchKeyword.weight, SearchKeyword.source)
            .where(SearchKeyword.prefix_keys.contains([q]))
            .order_by(desc(SearchKeyword.weight))
            .limit(limit)
        )
        keywords = [
            {"word": row.keyword, "weight": row.weight, "source": row.source}
            for row in kw_rows.all()
        ]

        # 3) Cold-start fallback: each group that came back empty is
        #    refilled independently by a live query on articles.
        #    NOTE(review): an empty group also occurs when the prefix simply
        #    has no match, so the slower live queries run in that case too.
        if not titles:
            titles = await self._fallback_titles(q, limit)
        if not keywords:
            keywords = await self._fallback_keywords(q, limit)

        return {"query": q, "titles": titles, "keywords": keywords}

    async def _fallback_titles(self, q: str, limit: int) -> list[dict]:
        """Fallback: live prefix match on ``articles.title_zh`` / ``title``.

        - Matches rows whose ``title_zh`` OR ``title`` starts with *q*
          (case-insensitive); *q* is matched literally, ``%`` and ``_`` in
          it are escaped.
        - Only non-duplicate articles published within the last 7 days are
          considered, newest first.
        - ``lang`` is ``"zh"`` when the row has a ``title_zh``, else ``"src"``.

        NOTE(review): the original note says this query is served by a
        B-tree index; confirm an index usable by ILIKE actually exists.
        """
        from datetime import datetime, timedelta, timezone

        from sqlalchemy import or_

        # Naive UTC cutoff — assumes published_at is stored as naive UTC;
        # TODO confirm against the Article model.
        since = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(days=7)
        pattern = f"{self._escape_like(q)}%"
        esc = self._LIKE_ESCAPE
        stmt = (
            select(Article.id, Article.published_at, Article.title_zh, Article.title)
            .where(
                Article.published_at >= since,
                Article.duplicate_of.is_(None),
                or_(
                    Article.title_zh.ilike(pattern, escape=esc),
                    Article.title.ilike(pattern, escape=esc),
                ),
            )
            .order_by(desc(Article.published_at))
            .limit(limit)
        )
        rows = (await self.session.execute(stmt)).all()
        return [
            {
                "id": row.id,
                "published_at": row.published_at.isoformat() if row.published_at else None,
                "lang": "zh" if row.title_zh else "src",
            }
            for row in rows
        ]

    async def _fallback_keywords(self, q: str, limit: int) -> list[dict]:
        """Fallback: live ``ts_stat`` aggregation over ``articles``.

        - Builds ``to_tsvector('chinese_zh', title_zh || ' ' || body_zh_text)``
          for up to 500 articles and counts word occurrences.
        - Intended for the case where the precomputed keyword table is
          empty because the worker has not refreshed it yet.
        - ``ts_stat`` is called with a single argument on purpose: per the
          original author, passing the weights mask ``'a'`` yields 0 rows
          because zhparser output carries no A-weight labels.
        - *q* is matched literally as a prefix; ``%`` and ``_`` are escaped.

        NOTE(review): the inner ``LIMIT 500`` has no ``ORDER BY``, so which
        500 articles are sampled is up to the database. The match here is
        case-sensitive ``LIKE`` whereas the title fallback uses ``ILIKE`` —
        confirm both are intended.
        """
        from sqlalchemy import text

        # ESCAPE '/' below must stay in sync with _LIKE_ESCAPE.
        sql = text(
            """
            SELECT word, nentry::int AS weight
            FROM ts_stat(
                $$SELECT to_tsvector('chinese_zh',
                    coalesce(title_zh, '') || ' ' || coalesce(body_zh_text, '')
                )
                FROM articles
                WHERE title_zh IS NOT NULL OR body_zh_text IS NOT NULL
                LIMIT 500$$
            ) AS s
            WHERE word LIKE :prefix ESCAPE '/'
            ORDER BY nentry DESC
            LIMIT :lim
            """
        )
        rows = (
            await self.session.execute(
                sql, {"prefix": f"{self._escape_like(q)}%", "lim": limit}
            )
        ).all()
        return [{"word": r.word, "weight": r.weight, "source": "ts_stat_live"} for r in rows]