# diary-news/backend/alembic/versions/0009_search_suggestions.py

"""搜索建议固化表 + 触发器

设计:
- search_keywords:        ts_stat 词频表,worker 每日凌晨刷新(全量重建)
- search_title_suggestions: 文章标题片段表,articles trigger 实时维护(增删改 title_zh 时)
- 两表都有 prefix_keys text[] + GIN 索引,查询走 prefix_keys @> ARRAY['前缀']
  (比 LIKE '%xxx%' 快 10-100x,100w 行也是亚毫秒)

数据源:
- search_title_suggestions:  articles.title_zh
  - 包括翻译后的中文标题 + 短新闻的原文(短新闻 title_zh 为空时回退 title)
  - 触发器只维护这一张表(写入频繁,实时)
- search_keywords:
  - ts_stat 词频(从 articles.title_zh + body_zh_text + commentary + commentary_meituan 算)
  - 每日 worker 全量刷新(词频聚合慢,不适合每篇文章 trigger)

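Also lays the full-text-search groundwork mentioned in commit 11:
- articles.title_zh_tsv: tsvector column + GIN index (so a future query can use FTS)
- kept in sync automatically as a stored generated column (no trigger involved)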

Revision ID: 0009
Revises: 0008
Create Date: 2026-06-15
"""
from __future__ import annotations

from typing import Sequence, Union

import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects.postgresql import ARRAY, TSVECTOR


revision: str = "0009"
down_revision: Union[str, None] = "0008"
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Create the search-suggestion schema objects.

    Steps, in order:

    1. Add ``articles.title_zh_tsv`` (stored generated tsvector) plus a GIN index.
    2. Create ``search_keywords`` and its indexes.
    3. Create ``search_title_suggestions`` and its indexes.
    4. Install the ``rebuild_title_suggestions()`` trigger function and the
       ``trg_articles_rebuild_title_suggestions`` trigger on ``articles``.
    5. (No extra object) deleted articles are cleaned up by the FK's
       ``ON DELETE CASCADE``.
    6. Install ``refresh_search_keywords()``, meant to be called by a worker via
       ``SELECT refresh_search_keywords();``.
    """
    # === 1) articles: tsvector column + GIN index (full-text search groundwork) ===
    # Uses the built-in 'simple' text search configuration, so no zhparser
    # extension has to be installed. Nothing queries this column yet; a future
    # search endpoint can switch to to_tsquery against it.
    # NOTE(review): the stock parser does not do Chinese word segmentation;
    # unspaced CJK runs presumably end up as single tokens. Verify this is
    # acceptable before relying on it for word-level matching.
    op.add_column(
        "articles",
        sa.Column(
            "title_zh_tsv",
            TSVECTOR,
            # Stored generated column: the database recomputes it on every
            # write to title_zh, so no trigger is needed for this column.
            sa.Computed(
                "to_tsvector('simple', coalesce(title_zh, ''))",
                persisted=True,
            ),
        ),
    )
    op.create_index(
        "ix_articles_title_zh_tsv",
        "articles",
        ["title_zh_tsv"],
        postgresql_using="gin",
    )

    # === 2) search_keywords: word-frequency candidate table ===
    op.create_table(
        "search_keywords",
        sa.Column("id", sa.BigInteger, primary_key=True),
        sa.Column("keyword", sa.Text, nullable=False),
        # Origin tag. This migration only ever writes 'ts_stat'; the original
        # design note also lists 'title_extract' and 'manual'.
        sa.Column("source", sa.String(32), nullable=False),
        # Ranking weight; refresh_search_keywords() stores ts_stat's nentry here.
        sa.Column("weight", sa.Integer, nullable=False, server_default="0"),
        # Every leading prefix of the keyword, e.g. '美联储' ->
        # ['美', '美联', '美联储'].
        # Lookup: prefix_keys @> ARRAY['美'] is served by the GIN index below.
        sa.Column("prefix_keys", ARRAY(sa.Text), nullable=False),
        sa.Column(
            "last_seen_at",
            sa.DateTime(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
        sa.UniqueConstraint("keyword", "source", name="uq_search_keywords_kw_src"),
    )
    op.create_index(
        "ix_search_keywords_prefix",
        "search_keywords",
        ["prefix_keys"],
        postgresql_using="gin",
    )
    op.create_index(
        "ix_search_keywords_source_weight",
        "search_keywords",
        ["source", "weight"],
    )
    op.create_index(
        "ix_search_keywords_keyword_btree",
        "search_keywords",
        ["keyword"],
    )

    # === 3) search_title_suggestions: prefixes of real article titles ===
    op.create_table(
        "search_title_suggestions",
        sa.Column("id", sa.BigInteger, primary_key=True),
        sa.Column(
            "article_id",
            sa.BigInteger,
            sa.ForeignKey("articles.id", ondelete="CASCADE"),
            nullable=False,
        ),
        # Which title the prefixes came from. The trigger below writes 'zh'
        # (from title_zh) or 'src' (fallback to title).
        sa.Column("title_lang", sa.String(8), nullable=False, server_default="zh"),
        sa.Column("prefix_keys", ARRAY(sa.Text), nullable=False),
        sa.Column("published_at", sa.DateTime(timezone=True), nullable=True),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
    )
    op.create_index(
        "ix_search_title_suggestions_prefix",
        "search_title_suggestions",
        ["prefix_keys"],
        postgresql_using="gin",
    )
    op.create_index(
        "ix_search_title_suggestions_article",
        "search_title_suggestions",
        ["article_id"],
    )
    op.create_index(
        "ix_search_title_suggestions_published",
        "search_title_suggestions",
        ["published_at"],
    )

    # === 4) articles trigger: keep search_title_suggestions in sync ===
    # The trigger function deletes the article's existing rows, then inserts
    # one new row built from title_zh, falling back to title when title_zh is
    # NULL or empty. If both are NULL/empty nothing is inserted.
    # Only the first 50 characters of the chosen title are used, which caps
    # prefix_keys at 50 elements per row.
    # NOTE(review): this migration does not backfill. Rows already present in
    # articles get no suggestion entry until the trigger fires for them;
    # confirm whether a backfill runs elsewhere.
    op.execute(
        """
        CREATE OR REPLACE FUNCTION rebuild_title_suggestions() RETURNS TRIGGER AS $$
        DECLARE
            src_text text;
            src_lang text;
            max_len int := 50;
        BEGIN
            -- 先删掉该文章旧条目
            DELETE FROM search_title_suggestions WHERE article_id = NEW.id;

            -- 优先用 title_zh(翻译后),没有再回退 title(短新闻路径)
            IF NEW.title_zh IS NOT NULL AND length(NEW.title_zh) > 0 THEN
                src_text := NEW.title_zh;
                src_lang := 'zh';
            ELSIF NEW.title IS NOT NULL AND length(NEW.title) > 0 THEN
                src_text := NEW.title;
                src_lang := 'src';
            ELSE
                RETURN NEW;
            END IF;

            -- 截断到 max_len 字符(prefix_keys 长度可控)
            src_text := substring(src_text, 1, max_len);

            -- 插入一条,prefix_keys 是从第 1 字到全词的所有前缀
            INSERT INTO search_title_suggestions
                (article_id, title_lang, prefix_keys, published_at)
            SELECT
                NEW.id,
                src_lang,
                ARRAY(
                    SELECT substring(src_text, 1, n)
                    FROM generate_series(1, length(src_text)) AS n
                ),
                NEW.published_at;

            RETURN NEW;
        END;
        $$ LANGUAGE plpgsql;
        """
    )

    # Fires on every insert, and on updates that name title_zh, title or
    # published_at in their SET list.
    op.execute(
        """
        CREATE TRIGGER trg_articles_rebuild_title_suggestions
        AFTER INSERT OR UPDATE OF title_zh, title, published_at ON articles
        FOR EACH ROW EXECUTE FUNCTION rebuild_title_suggestions();
        """
    )

    # === 5) cleanup when an article is deleted ===
    # Handled by the ON DELETE CASCADE foreign key above; no separate trigger.

    # === 6) search_keywords refresh function (called by the worker) ===
    # Full rebuild: TRUNCATE, then re-insert every ts_stat word of length >= 2.
    # Usage: SELECT refresh_search_keywords();
    #
    # The function body and the query string handed to ts_stat() are both
    # dollar-quoted, so they MUST use different tags. With a plain `$$` at both
    # levels the inner `$$` terminates the function body early and
    # CREATE FUNCTION fails with a syntax error.
    op.execute(
        """
        CREATE OR REPLACE FUNCTION refresh_search_keywords() RETURNS void AS $func$
        BEGIN
            TRUNCATE search_keywords;

            -- A) ts_stat 词频(title_zh + body_zh_text + commentary)
            INSERT INTO search_keywords (keyword, source, weight, prefix_keys)
            SELECT
                word,
                'ts_stat',
                nentry::int,
                ARRAY(
                    SELECT substring(word, 1, n)
                    FROM generate_series(1, length(word)) AS n
                )
            FROM ts_stat(
                $stat$SELECT to_tsvector('simple',
                    coalesce(title_zh, '') || ' ' ||
                    coalesce(body_zh_text, '') || ' ' ||
                    coalesce(commentary, '') || ' ' ||
                    coalesce(commentary_meituan, '')
                )
                FROM articles
                WHERE title_zh IS NOT NULL
                   OR body_zh_text IS NOT NULL
                   OR commentary IS NOT NULL
                   OR commentary_meituan IS NOT NULL$stat$
            ) AS s
            WHERE length(s.word) >= 2;  -- 过滤单字噪音(中文标点/单字停用词)

        END;
        $func$ LANGUAGE plpgsql;
        """
    )


def downgrade() -> None:
    """Remove everything this revision created.

    Order: the trigger and both SQL functions first, then each suggestion
    table (its indexes, then the table), and finally the tsvector index and
    column on ``articles``.
    """
    teardown_sql = (
        "DROP TRIGGER IF EXISTS trg_articles_rebuild_title_suggestions ON articles",
        "DROP FUNCTION IF EXISTS rebuild_title_suggestions()",
        "DROP FUNCTION IF EXISTS refresh_search_keywords()",
    )
    for statement in teardown_sql:
        op.execute(statement)

    # (table, indexes in drop order); indexes go before their table.
    owned_tables = (
        (
            "search_title_suggestions",
            (
                "ix_search_title_suggestions_published",
                "ix_search_title_suggestions_article",
                "ix_search_title_suggestions_prefix",
            ),
        ),
        (
            "search_keywords",
            (
                "ix_search_keywords_keyword_btree",
                "ix_search_keywords_source_weight",
                "ix_search_keywords_prefix",
            ),
        ),
    )
    for table, index_names in owned_tables:
        for index_name in index_names:
            op.drop_index(index_name, table_name=table)
        op.drop_table(table)

    # articles itself stays; only the objects added to it are removed.
    op.drop_index("ix_articles_title_zh_tsv", table_name="articles")
    op.drop_column("articles", "title_zh_tsv")