backend/alembic/versions/0010_zhparser_chinese.py

"""搜索建议 schema 修正 — 加 zhparser 中文分词支持。

0009 跑完之后,发现:
- PG 16 + alpine 镜像没有中文分词扩展,'simple' parser 把整句中文当一个 token,词频聚合无效
- 解决:换 Debian 镜像 + 装 zhparser + 创建 chinese_zh text search config
- 修正 articles.title_zh_tsv 生成列 + refresh_search_keywords() 函数都用 chinese_zh

步骤:
  1) CREATE EXTENSION zhparser
  2) 创建 chinese_zh text search config(基于 zhparser)
  3) 重建 articles.title_zh_tsv 用 chinese_zh(先 DROP,再 ADD)
  4) CREATE OR REPLACE refresh_search_keywords() 用 chinese_zh
  5) 触发 search_keywords 立即刷新,让数据立刻可用

注意:docker-compose.yml 同步改了 postgres 用 Dockerfile.postgres
(debian bookworm + apt 装 zhparser,alpine 没现成包)

Revision ID: 0010
Revises: 0009
Create Date: 2026-06-15
"""
from __future__ import annotations

from typing import Sequence, Union

import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects.postgresql import TSVECTOR


# Alembic revision identifiers: this migration's id and the id of its parent.
revision: str = "0010"
down_revision: Union[str, None] = "0009"
# No branch labels and no cross-revision dependencies are declared.
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Switch search tokenisation to the zhparser-backed ``chinese_zh`` config.

    Steps, in order:

    1. Install the ``zhparser`` extension (no-op if already present).
    2. (Re)create the ``chinese_zh`` text search configuration on top of it.
    3. Rebuild the generated column ``articles.title_zh_tsv`` and its GIN
       index so they are computed with ``chinese_zh``.
    4. Replace ``refresh_search_keywords()`` so it aggregates word
       frequencies with ``chinese_zh``.
    5. Run the refresh once so ``search_keywords`` is populated right away.
    """
    # 1) Install the extension.
    op.execute("CREATE EXTENSION IF NOT EXISTS zhparser;")

    # 2) Build the text search config: zhparser does the tokenising, and the
    #    listed zhparser token types are fed to the built-in `simple`
    #    dictionary. Token types without a mapping are discarded by PostgreSQL.
    op.execute("DROP TEXT SEARCH CONFIGURATION IF EXISTS chinese_zh;")
    op.execute(
        """
        CREATE TEXT SEARCH CONFIGURATION chinese_zh (PARSER = zhparser);
        ALTER TEXT SEARCH CONFIGURATION chinese_zh
            ADD MAPPING FOR n, a, v, i, e, l, u, x
            WITH simple;
        """
    )

    # 3) Rebuild articles.title_zh_tsv with the new config. The column is
    #    dropped and re-added (index first) so the stored values of every
    #    existing row are recomputed with `chinese_zh`.
    op.drop_index("ix_articles_title_zh_tsv", table_name="articles")
    op.drop_column("articles", "title_zh_tsv")
    op.add_column(
        "articles",
        sa.Column(
            "title_zh_tsv",
            TSVECTOR,
            sa.Computed(
                "to_tsvector('chinese_zh', coalesce(title_zh, ''))",
                persisted=True,
            ),
        ),
    )
    op.create_index(
        "ix_articles_title_zh_tsv",
        "articles",
        ["title_zh_tsv"],
        postgresql_using="gin",
    )

    # 4) Replace refresh_search_keywords() so it uses chinese_zh.
    #    ts_stat() is called WITHOUT a weights argument: the optional second
    #    argument restricts counting to occurrences carrying one of the listed
    #    weights, and to_tsvector() output that has not gone through
    #    setweight() only carries the default weight D. Passing 'a' therefore
    #    matched nothing and left search_keywords empty after every refresh.
    op.execute("DROP FUNCTION IF EXISTS refresh_search_keywords();")
    op.execute(
        """
        CREATE OR REPLACE FUNCTION refresh_search_keywords() RETURNS void AS $func$
        BEGIN
            TRUNCATE search_keywords;

            -- ts_stat(query text) 接受 SQL 字符串,内部执行并聚合词频
            -- no weights argument: every occurrence is counted (a filter of 'a' would match nothing, positions are weight D)
            INSERT INTO search_keywords (keyword, source, weight, prefix_keys)
            SELECT
                word,
                'ts_stat',
                nentry::int,
                ARRAY(
                    SELECT substring(word, 1, n)
                    FROM generate_series(1, length(word)) AS n
                )
            FROM ts_stat(
                $q$
                SELECT to_tsvector('chinese_zh',
                    coalesce(title_zh, '') || ' ' ||
                    coalesce(body_zh_text, '') || ' ' ||
                    coalesce(commentary, '') || ' ' ||
                    coalesce(commentary_meituan, '')
                )
                FROM articles
                WHERE title_zh IS NOT NULL
                   OR body_zh_text IS NOT NULL
                   OR commentary IS NOT NULL
                   OR commentary_meituan IS NOT NULL
                $q$
            )
            WHERE length(word) >= 2;
        END;
        $func$ LANGUAGE plpgsql;
        """
    )

    # 5) Refresh once immediately, so data is available before the worker's
    #    next scheduled 03:00 run.
    op.execute("SELECT refresh_search_keywords();")


def downgrade() -> None:
    """Revert search tokenisation from ``chinese_zh`` back to ``simple``.

    Only the schema is reverted: ``refresh_search_keywords()`` is recreated
    on top of the ``simple`` config, ``articles.title_zh_tsv`` and its GIN
    index are rebuilt with ``simple``, and the ``chinese_zh`` configuration
    is dropped. The ``zhparser`` extension itself is deliberately left
    installed, and ``search_keywords`` is not refreshed here.
    """
    # 1) Restore refresh_search_keywords() on the `simple` config. No attempt
    #    is made to downgrade the data; only the schema is restored.
    #    ts_stat() is called WITHOUT a weights argument: a weights filter of
    #    'a' only counts weight-A occurrences, while plain to_tsvector()
    #    output carries the default weight D, so the filter matched nothing.
    #    NOTE(review): revision 0009's original function is not visible from
    #    this file; confirm if a byte-for-byte restoration is required.
    op.execute("DROP FUNCTION IF EXISTS refresh_search_keywords();")
    op.execute(
        """
        CREATE OR REPLACE FUNCTION refresh_search_keywords() RETURNS void AS $func$
        BEGIN
            TRUNCATE search_keywords;

            -- 降级回 simple parser(中文会被当成一整句,词频聚合无效,仅占位)
            INSERT INTO search_keywords (keyword, source, weight, prefix_keys)
            SELECT
                word,
                'ts_stat',
                nentry::int,
                ARRAY(
                    SELECT substring(word, 1, n)
                    FROM generate_series(1, length(word)) AS n
                )
            FROM ts_stat(
                $q$
                SELECT to_tsvector('simple',
                    coalesce(title_zh, '') || ' ' ||
                    coalesce(body_zh_text, '') || ' ' ||
                    coalesce(commentary, '') || ' ' ||
                    coalesce(commentary_meituan, '')
                )
                FROM articles
                WHERE title_zh IS NOT NULL
                   OR body_zh_text IS NOT NULL
                   OR commentary IS NOT NULL
                   OR commentary_meituan IS NOT NULL
                $q$
            )
            WHERE length(word) >= 2;
        END;
        $func$ LANGUAGE plpgsql;
        """
    )

    # 2) Rebuild title_zh_tsv with `simple`. This must happen before the
    #    chinese_zh config is dropped, because the current generated column
    #    expression references that config.
    op.drop_index("ix_articles_title_zh_tsv", table_name="articles")
    op.drop_column("articles", "title_zh_tsv")
    op.add_column(
        "articles",
        sa.Column(
            "title_zh_tsv",
            TSVECTOR,
            sa.Computed(
                "to_tsvector('simple', coalesce(title_zh, ''))",
                persisted=True,
            ),
        ),
    )
    op.create_index(
        "ix_articles_title_zh_tsv",
        "articles",
        ["title_zh_tsv"],
        postgresql_using="gin",
    )

    # 3) Drop the chinese_zh config.
    op.execute("DROP TEXT SEARCH CONFIGURATION IF EXISTS chinese_zh;")

    # 4) The zhparser extension is intentionally NOT dropped: other objects
    #    may depend on it, so leaving it in place is the safer downgrade.
    #    To remove it anyway, run: DROP EXTENSION IF EXISTS zhparser;
feat(search): 装 zhparser 中文分词 + 0010 迁移修正 - Dockerfile.postgres: 从 alpine 切到 debian bookworm,apt 装 postgresql-16-zhparser - docker-compose.yml: postgres 改用 build 指向 Dockerfile.postgres - 0010 迁移: CREATE EXTENSION zhparser + 建 chinese_zh text search config + 重建 articles.title_zh_tsv 用 chinese_zh + 重写 refresh_search_keywords() 2026-06-15 18:46:09 +08:00			`"""搜索建议 schema 修正 — 加 zhparser 中文分词支持。`

			`0009 跑完之后,发现:`
			`- PG 16 + alpine 镜像没有中文分词扩展,'simple' parser 把整句中文当一个 token,词频聚合无效`
			`- 解决:换 Debian 镜像 + 装 zhparser + 创建 chinese_zh text search config`
			`- 修正 articles.title_zh_tsv 生成列 + refresh_search_keywords() 函数都用 chinese_zh`

			`步骤:`
			`1) CREATE EXTENSION zhparser`
			`2) 创建 chinese_zh text search config(基于 zhparser)`
			`3) 重建 articles.title_zh_tsv 用 chinese_zh(先 DROP,再 ADD)`
			`4) CREATE OR REPLACE refresh_search_keywords() 用 chinese_zh`
			`5) 触发 search_keywords 立即刷新,让数据立刻可用`

			`注意:docker-compose.yml 同步改了 postgres 用 Dockerfile.postgres`
			`(debian bookworm + apt 装 zhparser,alpine 没现成包)`

			`Revision ID: 0010`
			`Revises: 0009`
			`Create Date: 2026-06-15`
			`"""`
			`from __future__ import annotations`

			`from typing import Sequence, Union`

			`import sqlalchemy as sa`
			`from alembic import op`
			`from sqlalchemy.dialects.postgresql import TSVECTOR`


			`revision: str = "0010"`
			`down_revision: Union[str, None] = "0009"`
			`branch_labels = None`
			`depends_on = None`


			`def upgrade() -> None:`
			`# 1) 装扩展`
			`op.execute("CREATE EXTENSION IF NOT EXISTS zhparser;")`

			`# 2) 建 text search config:中文用 zhparser,简单词(英文/数字)用 simple parser`
			`# zhparser 自带一个 'scwsm' 字典 + 'default' tokenizer,适合新闻类语料`
			`op.execute("DROP TEXT SEARCH CONFIGURATION IF EXISTS chinese_zh;")`
			`op.execute(`
			`"""`
			`CREATE TEXT SEARCH CONFIGURATION chinese_zh (PARSER = zhparser);`
			`ALTER TEXT SEARCH CONFIGURATION chinese_zh`
			`ADD MAPPING FOR n, a, v, i, e, l, u, x`
			`WITH simple;`
			`"""`
			`)`

			`# 3) 重建 articles.title_zh_tsv 列(用新 config)`
			`op.drop_index("ix_articles_title_zh_tsv", table_name="articles")`
			`op.drop_column("articles", "title_zh_tsv")`
			`op.add_column(`
			`"articles",`
			`sa.Column(`
			`"title_zh_tsv",`
			`TSVECTOR,`
			`sa.Computed(`
			`"to_tsvector('chinese_zh', coalesce(title_zh, ''))",`
			`persisted=True,`
			`),`
			`),`
			`)`
			`op.create_index(`
			`"ix_articles_title_zh_tsv",`
			`"articles",`
			`["title_zh_tsv"],`
			`postgresql_using="gin",`
			`)`

			`# 4) 覆盖 refresh_search_keywords() — 用 chinese_zh + ts_stat(text) 正确签名`
			`op.execute("DROP FUNCTION IF EXISTS refresh_search_keywords();")`
			`op.execute(`
			`"""`
			`CREATE OR REPLACE FUNCTION refresh_search_keywords() RETURNS void AS $func$`
			`BEGIN`
			`TRUNCATE search_keywords;`

			`-- ts_stat(query text) 接受 SQL 字符串,内部执行并聚合词频`
			`-- 'a' = 任意权重(A/B/C/D 四档,这里聚合所有)`
			`INSERT INTO search_keywords (keyword, source, weight, prefix_keys)`
			`SELECT`
			`word,`
			`'ts_stat',`
			`nentry::int,`
			`ARRAY(`
			`SELECT substring(word, 1, n)`
			`FROM generate_series(1, length(word)) AS n`
			`)`
			`FROM ts_stat(`
			$q$
			`SELECT to_tsvector('chinese_zh',`
			`coalesce(title_zh, '') \|\| ' ' \|\|`
			`coalesce(body_zh_text, '') \|\| ' ' \|\|`
			`coalesce(commentary, '') \|\| ' ' \|\|`
			`coalesce(commentary_meituan, '')`
			`)`
			`FROM articles`
			`WHERE title_zh IS NOT NULL`
			`OR body_zh_text IS NOT NULL`
			`OR commentary IS NOT NULL`
			`OR commentary_meituan IS NOT NULL`
			`$q$, 'a'`
			`)`
			`WHERE length(word) >= 2;`
			`END;`
			`$func$ LANGUAGE plpgsql;`
			`"""`
			`)`

			`# 5) 立即跑一次刷新(让 worker 下次 03:00 之前就有数据可用)`
			`op.execute("SELECT refresh_search_keywords();")`


			`def downgrade() -> None:`
			`# 1) 恢复 refresh_search_keywords() — 不强求降级数据;只恢复 schema`
			`op.execute("DROP FUNCTION IF EXISTS refresh_search_keywords();")`
			`op.execute(`
			`"""`
			`CREATE OR REPLACE FUNCTION refresh_search_keywords() RETURNS void AS $func$`
			`BEGIN`
			`TRUNCATE search_keywords;`

			`-- 降级回 simple parser(中文会被当成一整句,词频聚合无效,仅占位)`
			`INSERT INTO search_keywords (keyword, source, weight, prefix_keys)`
			`SELECT`
			`word,`
			`'ts_stat',`
			`nentry::int,`
			`ARRAY(`
			`SELECT substring(word, 1, n)`
			`FROM generate_series(1, length(word)) AS n`
			`)`
			`FROM ts_stat(`
			$q$
			`SELECT to_tsvector('simple',`
			`coalesce(title_zh, '') \|\| ' ' \|\|`
			`coalesce(body_zh_text, '') \|\| ' ' \|\|`
			`coalesce(commentary, '') \|\| ' ' \|\|`
			`coalesce(commentary_meituan, '')`
			`)`
			`FROM articles`
			`WHERE title_zh IS NOT NULL`
			`OR body_zh_text IS NOT NULL`
			`OR commentary IS NOT NULL`
			`OR commentary_meituan IS NOT NULL`
			`$q$, 'a'`
			`)`
			`WHERE length(word) >= 2;`
			`END;`
			`$func$ LANGUAGE plpgsql;`
			`"""`
			`)`

			`# 2) 恢复 title_zh_tsv 用 simple`
			`op.drop_index("ix_articles_title_zh_tsv", table_name="articles")`
			`op.drop_column("articles", "title_zh_tsv")`
			`op.add_column(`
			`"articles",`
			`sa.Column(`
			`"title_zh_tsv",`
			`TSVECTOR,`
			`sa.Computed(`
			`"to_tsvector('simple', coalesce(title_zh, ''))",`
			`persisted=True,`
			`),`
			`),`
			`)`
			`op.create_index(`
			`"ix_articles_title_zh_tsv",`
			`"articles",`
			`["title_zh_tsv"],`
			`postgresql_using="gin",`
			`)`

			`# 3) 删 chinese_zh config`
			`op.execute("DROP TEXT SEARCH CONFIGURATION IF EXISTS chinese_zh;")`

			`# 4) 留 zhparser 扩展不删(其他东西可能依赖,降级时留着更安全)`
			`# 真要删: op.execute("DROP EXTENSION IF EXISTS zhparser;")`