feat(search): 装 zhparser 中文分词 + 0010 迁移修正

- Dockerfile.postgres: 从 alpine 切到 debian bookworm,apt 装 postgresql-16-zhparser
- docker-compose.yml: postgres 改用 build 指向 Dockerfile.postgres
- 0010 迁移: CREATE EXTENSION zhparser + 建 chinese_zh text search config + 重建 articles.title_zh_tsv 用 chinese_zh + 重写 refresh_search_keywords()
2026-06-15 18:46:09 +08:00
parent 2b94be2048
commit 557b7a708e
3 changed files with 216 additions and 1 deletions
--- a/backend/Dockerfile.postgres
+++ b/backend/Dockerfile.postgres
@@ -0,0 +1,27 @@
+# Postgres 16 + zhparser (Chinese word-segmentation parser for full-text search)
+# The base image moves from alpine to debian bookworm so that apt can install postgresql-16-zhparser;
+# alpine does not package zhparser, and building it from source there was judged not worth the cost.
+#
+# Build: docker build -f backend/Dockerfile.postgres -t diary-postgres:zh ./backend
+# Used through the build field of the postgres service in docker-compose.yml
+FROM postgres:16-bookworm
+
+# Install zhparser; it depends on the scws word-segmentation library.
+# The package is expected to pull in libscws and its other dependencies - TODO confirm.
+# NOTE(review): assumes an apt source enabled in this image really ships postgresql-16-zhparser; verify with a clean build.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        postgresql-16-zhparser \
+        ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+# Notes:
+# 1) zhparser itself does not need a shared_preload_libraries entry in postgresql.conf
+#    (that is only required by extensions such as pg_stat_statements / pg_cron)
+# 2) The extension is created per database (CREATE EXTENSION zhparser;), not at initdb time,
+#    so the entrypoint is left untouched and the migration runs CREATE EXTENSION
+# 3) On a PostgreSQL major-version upgrade the package name changes too (postgresql-XX-zhparser);
+#    the version is hard-coded in this Dockerfile and must be updated at that point
+
+# The entrypoint / cmd are inherited from postgres:16-bookworm (none are set here)
+# The pg_data volume is reused as-is. NOTE(review): musl -> glibc changes libc collation; confirm whether text indexes need REINDEX.
--- a/backend/alembic/versions/0010_zhparser_chinese.py
+++ b/backend/alembic/versions/0010_zhparser_chinese.py
@@ -0,0 +1,183 @@
+"""搜索建议 schema 修正 — 加 zhparser 中文分词支持。
+
+0009 跑完之后,发现:
+- PG 16 + alpine 镜像没有中文分词扩展,'simple' parser 把整句中文当一个 token,词频聚合无效
+- 解决:换 Debian 镜像 + 装 zhparser + 创建 chinese_zh text search config
+- 修正 articles.title_zh_tsv 生成列 + refresh_search_keywords() 函数都用 chinese_zh
+
+步骤:
+  1) CREATE EXTENSION zhparser
+  2) 创建 chinese_zh text search config(基于 zhparser)
+  3) 重建 articles.title_zh_tsv 用 chinese_zh(先 DROP,再 ADD)
+  4) CREATE OR REPLACE refresh_search_keywords() 用 chinese_zh
+  5) 触发 search_keywords 立即刷新,让数据立刻可用
+
+注意:docker-compose.yml 同步改了 postgres 用 Dockerfile.postgres
+(debian bookworm + apt 装 zhparser,alpine 没现成包)
+
+Revision ID: 0010
+Revises: 0009
+Create Date: 2026-06-15
+"""
+from __future__ import annotations
+
+from typing import Sequence, Union
+
+import sqlalchemy as sa
+from alembic import op
+from sqlalchemy.dialects.postgresql import TSVECTOR
+
+
+revision: str = "0010"  # identifier of this migration
+down_revision: Union[str, None] = "0009"  # migration this one is applied on top of
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Install zhparser, build the chinese_zh config, and move the title tsvector and keyword refresh onto it."""
+    # 1) Install the extension in this database.
+    op.execute("CREATE EXTENSION IF NOT EXISTS zhparser;")
+
+    # 2) Recreate chinese_zh on the zhparser parser; the listed token types go through the simple dictionary.
+    op.execute("DROP TEXT SEARCH CONFIGURATION IF EXISTS chinese_zh;")
+    op.execute(
+        """
+        CREATE TEXT SEARCH CONFIGURATION chinese_zh (PARSER = zhparser);
+        ALTER TEXT SEARCH CONFIGURATION chinese_zh
+            ADD MAPPING FOR n, a, v, i, e, l, u, x
+            WITH simple;
+        """
+    )
+
+    # 3) Rebuild the generated column articles.title_zh_tsv (and its GIN index) on the new config.
+    op.drop_index("ix_articles_title_zh_tsv", table_name="articles")
+    op.drop_column("articles", "title_zh_tsv")
+    op.add_column(
+        "articles",
+        sa.Column(
+            "title_zh_tsv",
+            TSVECTOR,
+            sa.Computed(
+                "to_tsvector('chinese_zh', coalesce(title_zh, ''))",
+                persisted=True,
+            ),
+        ),
+    )
+    op.create_index(
+        "ix_articles_title_zh_tsv",
+        "articles",
+        ["title_zh_tsv"],
+        postgresql_using="gin",
+    )
+
+    # 4) Replace refresh_search_keywords(): tokenize with chinese_zh and call ts_stat() without a weight filter.
+    op.execute("DROP FUNCTION IF EXISTS refresh_search_keywords();")
+    op.execute(
+        """
+        CREATE OR REPLACE FUNCTION refresh_search_keywords() RETURNS void AS $func$
+        BEGIN
+            TRUNCATE search_keywords;
+
+            -- ts_stat(query text) 接受 SQL 字符串,内部执行并聚合词频
+            -- No weights argument: these tsvectors are unweighted (default D), so a filter of 'a' would count nothing
+            INSERT INTO search_keywords (keyword, source, weight, prefix_keys)
+            SELECT
+                word,
+                'ts_stat',
+                nentry::int,
+                ARRAY(
+                    SELECT substring(word, 1, n)
+                    FROM generate_series(1, length(word)) AS n
+                )
+            FROM ts_stat(
+                $q$
+                SELECT to_tsvector('chinese_zh',
+                    coalesce(title_zh, '') || ' ' ||
+                    coalesce(body_zh_text, '') || ' ' ||
+                    coalesce(commentary, '') || ' ' ||
+                    coalesce(commentary_meituan, '')
+                )
+                FROM articles
+                WHERE title_zh IS NOT NULL
+                   OR body_zh_text IS NOT NULL
+                   OR commentary IS NOT NULL
+                   OR commentary_meituan IS NOT NULL
+                $q$
+            )
+            WHERE length(word) >= 2;
+        END;
+        $func$ LANGUAGE plpgsql;
+        """
+    )
+
+    # 5) Refresh once right away so keywords exist before the worker's next scheduled run (03:00 per the original note).
+    op.execute("SELECT refresh_search_keywords();")
+
+
+def downgrade() -> None:
+    """Revert to the 'simple' config: swap the function, rebuild title_zh_tsv, drop chinese_zh, keep the extension."""
+    # 1) Restore the function (schema only, no data). NOTE(review): ts_stat's 'a' filters by weight A - confirm it returns rows.
+    op.execute("DROP FUNCTION IF EXISTS refresh_search_keywords();")
+    op.execute(
+        """
+        CREATE OR REPLACE FUNCTION refresh_search_keywords() RETURNS void AS $func$
+        BEGIN
+            TRUNCATE search_keywords;
+
+            -- 降级回 simple parser(中文会被当成一整句,词频聚合无效,仅占位)
+            INSERT INTO search_keywords (keyword, source, weight, prefix_keys)
+            SELECT
+                word,
+                'ts_stat',
+                nentry::int,
+                ARRAY(
+                    SELECT substring(word, 1, n)
+                    FROM generate_series(1, length(word)) AS n
+                )
+            FROM ts_stat(
+                $q$
+                SELECT to_tsvector('simple',
+                    coalesce(title_zh, '') || ' ' ||
+                    coalesce(body_zh_text, '') || ' ' ||
+                    coalesce(commentary, '') || ' ' ||
+                    coalesce(commentary_meituan, '')
+                )
+                FROM articles
+                WHERE title_zh IS NOT NULL
+                   OR body_zh_text IS NOT NULL
+                   OR commentary IS NOT NULL
+                   OR commentary_meituan IS NOT NULL
+                $q$, 'a'
+            )
+            WHERE length(word) >= 2;
+        END;
+        $func$ LANGUAGE plpgsql;
+        """
+    )
+
+    # 2) Rebuild title_zh_tsv (and its GIN index) on the 'simple' config
+    op.drop_index("ix_articles_title_zh_tsv", table_name="articles")
+    op.drop_column("articles", "title_zh_tsv")
+    op.add_column(
+        "articles",
+        sa.Column(
+            "title_zh_tsv",
+            TSVECTOR,
+            sa.Computed(
+                "to_tsvector('simple', coalesce(title_zh, ''))",
+                persisted=True,
+            ),
+        ),
+    )
+    op.create_index(
+        "ix_articles_title_zh_tsv",
+        "articles",
+        ["title_zh_tsv"],
+        postgresql_using="gin",
+    )
+
+    # 3) Drop the chinese_zh config (the column rebuilt in step 2 uses 'simple')
+    op.execute("DROP TEXT SEARCH CONFIGURATION IF EXISTS chinese_zh;")
+
+    # 4) zhparser is deliberately left installed (other objects may depend on it); to remove: DROP EXTENSION IF EXISTS zhparser;
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -2,7 +2,12 @@ name: news-aggregator

 services:
  postgres:
-    image: postgres:16-alpine
+    # 从 alpine 切到自建镜像(Debian bookworm + zhparser 中文分词扩展)
+    # alpine 仓库没打包 zhparser,Debian 仓库有现成 apt 包 postgresql-16-zhparser
+    build:
+      context: ./backend
+      dockerfile: Dockerfile.postgres
+    image: diary-postgres:zh   # 自定义 tag,方便 docker images 识别
    restart: unless-stopped
    environment:
      POSTGRES_USER: ${POSTGRES_USER}