"""搜索建议固化表 + 触发器 设计: - search_keywords: ts_stat 词频表,worker 每日凌晨刷新(全量重建) - search_title_suggestions: 文章标题片段表,articles trigger 实时维护(增删改 title_zh 时) - 两表都有 prefix_keys text[] + GIN 索引,查询走 prefix_keys @> ARRAY['前缀'] (比 LIKE '%xxx%' 快 10-100x,100w 行也是亚毫秒) 数据源: - search_title_suggestions: articles.title_zh - 包括翻译后的中文标题 + 短新闻的原文(短新闻 title_zh 为空时回退 title) - 触发器只维护这一张表(写入频繁,实时) - search_keywords: - ts_stat 词频(从 articles.title_zh + body_zh_text + commentary + commentary_meituan 算) - 每日 worker 全量刷新(词频聚合慢,不适合每篇文章 trigger) 顺手把 commit 11 提到的 full-text search 基础做完: - articles.title_zh_tsv tsvector 列 + GIN 索引(用于未来 query 走 FTS) - 触发器自动维护 Revision ID: 0009 Revises: 0008 Create Date: 2026-06-15 """ from __future__ import annotations from typing import Sequence, Union import sqlalchemy as sa from alembic import op from sqlalchemy.dialects.postgresql import ARRAY, TSVECTOR revision: str = "0009" down_revision: Union[str, None] = "0008" branch_labels = None depends_on = None def upgrade() -> None: # === 1) articles: 加 tsvector 列 + GIN 索引(full-text search 基础)=== # 用 'simple' parser:对中文按字符切,免装 zhparser 扩展; # simple parser 对英文也 OK(按空格切),通用。 # future:搜索 ?q=xxx 可以改走 to_tsquery,这里只先建基础设施。 op.add_column( "articles", sa.Column( "title_zh_tsv", TSVECTOR, sa.Computed( "to_tsvector('simple', coalesce(title_zh, ''))", persisted=True, ), ), ) op.create_index( "ix_articles_title_zh_tsv", "articles", ["title_zh_tsv"], postgresql_using="gin", ) # === 2) search_keywords: 词频候选词表 === op.create_table( "search_keywords", sa.Column("id", sa.BigInteger, primary_key=True), sa.Column("keyword", sa.Text, nullable=False), # ts_stat / title_extract / manual sa.Column("source", sa.String(32), nullable=False), # 词频或文章数(权重) sa.Column("weight", sa.Integer, nullable=False, server_default="0"), # 预计算的前缀数组,['美','美联储','美联储宣'] for '美联储宣布...' # 查询: prefix_keys @> ARRAY['美'] 走 GIN 索引 sa.Column("prefix_keys", ARRAY(sa.Text), nullable=False), sa.Column( "last_seen_at", sa.DateTime(timezone=True), nullable=False, server_default=sa.text("now()"), ), sa.Column( "created_at", sa.DateTime(timezone=True), nullable=False, server_default=sa.text("now()"), ), sa.UniqueConstraint("keyword", "source", name="uq_search_keywords_kw_src"), ) op.create_index( "ix_search_keywords_prefix", "search_keywords", ["prefix_keys"], postgresql_using="gin", ) op.create_index( "ix_search_keywords_source_weight", "search_keywords", ["source", "weight"], ) op.create_index( "ix_search_keywords_keyword_btree", "search_keywords", ["keyword"], ) # === 3) search_title_suggestions: 真实文章标题片段表 === op.create_table( "search_title_suggestions", sa.Column("id", sa.BigInteger, primary_key=True), sa.Column( "article_id", sa.BigInteger, sa.ForeignKey("articles.id", ondelete="CASCADE"), nullable=False, ), # 用的字段:'title_zh' / 'title' (短新闻回退) sa.Column("title_lang", sa.String(8), nullable=False, server_default="zh"), sa.Column("prefix_keys", ARRAY(sa.Text), nullable=False), sa.Column("published_at", sa.DateTime(timezone=True), nullable=True), sa.Column( "created_at", sa.DateTime(timezone=True), nullable=False, server_default=sa.text("now()"), ), ) op.create_index( "ix_search_title_suggestions_prefix", "search_title_suggestions", ["prefix_keys"], postgresql_using="gin", ) op.create_index( "ix_search_title_suggestions_article", "search_title_suggestions", ["article_id"], ) op.create_index( "ix_search_title_suggestions_published", "search_title_suggestions", ["published_at"], ) # === 4) articles 触发器:title_zh 写时自动维护 search_title_suggestions === # 触发器函数:删除旧条目,按优先级(title_zh > title)插新条目 # 限制:标题字符数 > 50 时只取前 50 字符(prefix_keys 数组不会爆炸) op.execute( """ CREATE OR REPLACE FUNCTION rebuild_title_suggestions() RETURNS TRIGGER AS $$ DECLARE src_text text; src_lang text; max_len int := 50; BEGIN -- 先删掉该文章旧条目 DELETE FROM search_title_suggestions WHERE article_id = NEW.id; -- 优先用 title_zh(翻译后),没有再回退 title(短新闻路径) IF NEW.title_zh IS NOT NULL AND length(NEW.title_zh) > 0 THEN src_text := NEW.title_zh; src_lang := 'zh'; ELSIF NEW.title IS NOT NULL AND length(NEW.title) > 0 THEN src_text := NEW.title; src_lang := 'src'; ELSE RETURN NEW; END IF; -- 截断到 max_len 字符(prefix_keys 长度可控) src_text := substring(src_text, 1, max_len); -- 插入一条,prefix_keys 是从第 1 字到全词的所有前缀 INSERT INTO search_title_suggestions (article_id, title_lang, prefix_keys, published_at) SELECT NEW.id, src_lang, ARRAY( SELECT substring(src_text, 1, n) FROM generate_series(1, length(src_text)) AS n ), NEW.published_at; RETURN NEW; END; $$ LANGUAGE plpgsql; """ ) op.execute( """ CREATE TRIGGER trg_articles_rebuild_title_suggestions AFTER INSERT OR UPDATE OF title_zh, title, published_at ON articles FOR EACH ROW EXECUTE FUNCTION rebuild_title_suggestions(); """ ) # === 5) articles 删除时清理 === # 用 ON DELETE CASCADE 即可,不用单独触发器 # === 6) search_keywords 刷新函数(给 worker 调用) === # 设计:全量 truncate + insert(用 ts_stat + 标题聚合) # 调用方式: SELECT refresh_search_keywords(); op.execute( """ CREATE OR REPLACE FUNCTION refresh_search_keywords() RETURNS void AS $$ BEGIN TRUNCATE search_keywords; -- A) ts_stat 词频(title_zh + body_zh_text + commentary) INSERT INTO search_keywords (keyword, source, weight, prefix_keys) SELECT word, 'ts_stat', nentry::int, ARRAY( SELECT substring(word, 1, n) FROM generate_series(1, length(word)) AS n ) FROM ts_stat( $$SELECT to_tsvector('simple', coalesce(title_zh, '') || ' ' || coalesce(body_zh_text, '') || ' ' || coalesce(commentary, '') || ' ' || coalesce(commentary_meituan, '') ) FROM articles WHERE title_zh IS NOT NULL OR body_zh_text IS NOT NULL OR commentary IS NOT NULL OR commentary_meituan IS NOT NULL$$ ) AS s WHERE length(s.word) >= 2; -- 过滤单字噪音(中文标点/单字停用词) END; $$ LANGUAGE plpgsql; """ ) def downgrade() -> None: op.execute("DROP TRIGGER IF EXISTS trg_articles_rebuild_title_suggestions ON articles") op.execute("DROP FUNCTION IF EXISTS rebuild_title_suggestions()") op.execute("DROP FUNCTION IF EXISTS refresh_search_keywords()") op.drop_index("ix_search_title_suggestions_published", table_name="search_title_suggestions") op.drop_index("ix_search_title_suggestions_article", table_name="search_title_suggestions") op.drop_index("ix_search_title_suggestions_prefix", table_name="search_title_suggestions") op.drop_table("search_title_suggestions") op.drop_index("ix_search_keywords_keyword_btree", table_name="search_keywords") op.drop_index("ix_search_keywords_source_weight", table_name="search_keywords") op.drop_index("ix_search_keywords_prefix", table_name="search_keywords") op.drop_table("search_keywords") op.drop_index("ix_articles_title_zh_tsv", table_name="articles") op.drop_column("articles", "title_zh_tsv")