"""Search-suggestion schema fix: add zhparser Chinese word segmentation.

After 0009 ran we found that:

- The PG 16 alpine image has no Chinese segmentation extension, so the
  'simple' configuration treats a whole Chinese sentence as one token and
  word-frequency aggregation is useless.
- Fix: switch to a Debian image, install zhparser and create a
  ``chinese_zh`` text search configuration.
- Both the generated column ``articles.title_zh_tsv`` and the
  ``refresh_search_keywords()`` function are switched to ``chinese_zh``.

Steps:

1) CREATE EXTENSION zhparser
2) Drop ``articles.title_zh_tsv`` (and its index) so nothing depends on a
   possibly pre-existing ``chinese_zh`` configuration
3) (Re)create the ``chinese_zh`` text search configuration (zhparser based)
4) Re-add ``articles.title_zh_tsv`` generated with ``chinese_zh``
5) CREATE OR REPLACE ``refresh_search_keywords()`` using ``chinese_zh``
6) Refresh ``search_keywords`` immediately so data is usable right away

Note: docker-compose.yml was changed in the same commit so that postgres is
built from Dockerfile.postgres (debian bookworm + zhparser installed via
apt; alpine has no ready-made package).

Revision ID: 0010
Revises: 0009
Create Date: 2026-06-15
"""
from __future__ import annotations

from typing import Sequence, Union

import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects.postgresql import TSVECTOR

revision: str = "0010"
down_revision: Union[str, None] = "0009"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None

# Names shared by upgrade() and downgrade().
_TABLE = "articles"
_TSV_COLUMN = "title_zh_tsv"
_TSV_INDEX = "ix_articles_title_zh_tsv"


def _drop_title_tsv() -> None:
    """Drop the index on ``articles.title_zh_tsv`` and then the column."""
    op.drop_index(_TSV_INDEX, table_name=_TABLE)
    op.drop_column(_TABLE, _TSV_COLUMN)


def _add_title_tsv(config: str) -> None:
    """Add ``articles.title_zh_tsv`` generated with *config*, plus a GIN index.

    Args:
        config: Name of the text search configuration passed to
            ``to_tsvector`` in the generated-column expression.
    """
    op.add_column(
        _TABLE,
        sa.Column(
            _TSV_COLUMN,
            TSVECTOR,
            sa.Computed(
                f"to_tsvector('{config}', coalesce(title_zh, ''))",
                persisted=True,
            ),
        ),
    )
    op.create_index(
        _TSV_INDEX,
        _TABLE,
        [_TSV_COLUMN],
        postgresql_using="gin",
    )


def upgrade() -> None:
    """Switch title search and keyword aggregation to the zhparser config."""
    # 1) Install the extension.
    op.execute("CREATE EXTENSION IF NOT EXISTS zhparser;")

    # 2) Drop the generated column BEFORE touching the configuration. If
    #    chinese_zh already exists and the column was generated with it, the
    #    column depends on the configuration and a plain DROP would be
    #    refused by PostgreSQL.
    _drop_title_tsv()

    # 3) Build the text search configuration: zhparser is the parser and the
    #    listed zhparser token types are mapped to the built-in `simple`
    #    dictionary.
    #    Original author's note (not verifiable from this file): zhparser
    #    ships an 'scwsm' dictionary plus a 'default' tokenizer, which suits
    #    news-style text.
    #    One statement per execute() call: some drivers reject strings that
    #    contain several commands.
    op.execute("DROP TEXT SEARCH CONFIGURATION IF EXISTS chinese_zh;")
    op.execute("CREATE TEXT SEARCH CONFIGURATION chinese_zh (PARSER = zhparser);")
    op.execute(
        "ALTER TEXT SEARCH CONFIGURATION chinese_zh "
        "ADD MAPPING FOR n, a, v, i, e, l, u, x WITH simple;"
    )

    # 4) Re-add articles.title_zh_tsv using the new configuration.
    _add_title_tsv("chinese_zh")

    # 5) Replace refresh_search_keywords(): use chinese_zh and the
    #    single-argument ts_stat(text) signature.
    op.execute("DROP FUNCTION IF EXISTS refresh_search_keywords();")
    op.execute(
        """
        CREATE OR REPLACE FUNCTION refresh_search_keywords()
        RETURNS void AS $func$
        BEGIN
            TRUNCATE search_keywords;
            -- ts_stat(query text) 接受 SQL 字符串,内部执行并聚合词频
            -- 单参(等价 mask='abcd',聚合所有权重)
            -- ⚠️ 不能传 'a' mask — zhparser 不标 A 权重,会 0 行
            INSERT INTO search_keywords (keyword, source, weight, prefix_keys)
            SELECT
                word,
                'ts_stat',
                nentry::int,
                ARRAY(
                    SELECT substring(word, 1, n)
                    FROM generate_series(1, length(word)) AS n
                )
            FROM ts_stat(
                $$SELECT to_tsvector('chinese_zh',
                    coalesce(title_zh, '') || ' ' ||
                    coalesce(body_zh_text, '') || ' ' ||
                    coalesce(commentary, '') || ' ' ||
                    coalesce(commentary_meituan, '')
                ) FROM articles
                WHERE title_zh IS NOT NULL
                   OR body_zh_text IS NOT NULL
                   OR commentary IS NOT NULL
                   OR commentary_meituan IS NOT NULL$$
            ) AS s
            WHERE length(s.word) >= 2;
        END;
        $func$ LANGUAGE plpgsql;
        """
    )

    # 6) Run one refresh right away so data is available before the worker's
    #    next scheduled run at 03:00.
    op.execute("SELECT refresh_search_keywords();")


def downgrade() -> None:
    """Restore the 'simple'-based schema; existing keyword rows are kept."""
    # 1) Restore refresh_search_keywords(). Only the schema is reverted; the
    #    function is not invoked here, so search_keywords is left untouched.
    op.execute("DROP FUNCTION IF EXISTS refresh_search_keywords();")
    op.execute(
        """
        CREATE OR REPLACE FUNCTION refresh_search_keywords()
        RETURNS void AS $func$
        BEGIN
            TRUNCATE search_keywords;
            -- 降级回 simple parser(中文会被当成一整句,词频聚合无效,仅占位)
            INSERT INTO search_keywords (keyword, source, weight, prefix_keys)
            SELECT
                word,
                'ts_stat',
                nentry::int,
                ARRAY(
                    SELECT substring(word, 1, n)
                    FROM generate_series(1, length(word)) AS n
                )
            FROM ts_stat(
                $$SELECT to_tsvector('simple',
                    coalesce(title_zh, '') || ' ' ||
                    coalesce(body_zh_text, '') || ' ' ||
                    coalesce(commentary, '') || ' ' ||
                    coalesce(commentary_meituan, '')
                ) FROM articles
                WHERE title_zh IS NOT NULL
                   OR body_zh_text IS NOT NULL
                   OR commentary IS NOT NULL
                   OR commentary_meituan IS NOT NULL$$
            ) AS s
            WHERE length(s.word) >= 2;
        END;
        $func$ LANGUAGE plpgsql;
        """
    )

    # 2) Rebuild title_zh_tsv with 'simple'. This must happen before the
    #    configuration is dropped, since the current column is generated
    #    with chinese_zh.
    _drop_title_tsv()
    _add_title_tsv("simple")

    # 3) Drop the chinese_zh configuration.
    op.execute("DROP TEXT SEARCH CONFIGURATION IF EXISTS chinese_zh;")

    # 4) Leave the zhparser extension installed: other objects may depend on
    #    it, so keeping it is the safer choice on downgrade. To remove it
    #    manually, run: DROP EXTENSION IF EXISTS zhparser;