diff --git a/backend/Dockerfile.postgres b/backend/Dockerfile.postgres
new file mode 100644
index 0000000..d92d35b
--- /dev/null
+++ b/backend/Dockerfile.postgres
@@ -0,0 +1,27 @@
+# Postgres 16 + zhparser (Chinese word-segmentation extension)
+# Base image moved from alpine to Debian bookworm so postgresql-16-zhparser can be installed with apt.
+# The alpine repositories do not package zhparser; building it from source was judged not worth the cost.
+#
+# Build: docker build -f backend/Dockerfile.postgres -t diary-postgres:zh ./backend
+# Used through the `build` field of the postgres service in docker-compose.yml.
+FROM postgres:16-bookworm
+
+# Install zhparser; SCWS is the segmentation library zhparser depends on.
+# NOTE(review): presumably the package pulls in its SCWS dependency on its own -- verify.
+# NOTE(review): confirm postgresql-16-zhparser is served by the apt sources configured in this base image.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        postgresql-16-zhparser \
+        ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+# Notes:
+# 1) zhparser does not need shared_preload_libraries in postgresql.conf
+#    (that is only required by extensions such as pg_stat_statements / pg_cron).
+# 2) The extension is created per database (CREATE EXTENSION zhparser;), not at initdb time,
+#    so the entrypoint is left alone and the migration runs CREATE EXTENSION.
+# 3) On a PG major-version upgrade the package name changes with it (postgresql-XX-zhparser);
+#    the version is hard-coded in this Dockerfile and must be updated then.
+
+# The entrypoint / cmd of postgres:16-bookworm are inherited unchanged.
+# NOTE(review): reusing the pg_data volume moves it from musl to glibc; collation-dependent indexes may need REINDEX -- verify.
diff --git a/backend/alembic/versions/0010_zhparser_chinese.py b/backend/alembic/versions/0010_zhparser_chinese.py
new file mode 100644
index 0000000..83c1b88
--- /dev/null
+++ b/backend/alembic/versions/0010_zhparser_chinese.py
@@ -0,0 +1,183 @@
+"""Search-suggestion schema fix: add zhparser Chinese word segmentation.
+
+Found after 0009 had run:
+- The PG 16 alpine image has no Chinese segmentation extension; 'simple' treats a whole Chinese sentence as one token, so word-frequency aggregation is useless.
+- Fix: switch to a Debian image, install zhparser, and create the chinese_zh text search config.
+- Point both the articles.title_zh_tsv generated column and refresh_search_keywords() at chinese_zh.
+
+Steps:
+  1) CREATE EXTENSION zhparser
+  2) Create the chinese_zh text search config (zhparser as the parser)
+  3) Rebuild articles.title_zh_tsv on chinese_zh (DROP, then ADD)
+  4) Recreate refresh_search_keywords() on chinese_zh
+  5) Refresh search_keywords right away so the data is usable immediately
+
+Note: docker-compose.yml is changed alongside this migration to build postgres from
+Dockerfile.postgres (Debian bookworm with zhparser installed via apt; alpine has no package).
+
+Revision ID: 0010
+Revises: 0009
+Create Date: 2026-06-15
+"""
+from __future__ import annotations
+
+from typing import Sequence, Union
+
+import sqlalchemy as sa
+from alembic import op
+from sqlalchemy.dialects.postgresql import TSVECTOR
+
+
+revision: str = "0010"
+down_revision: Union[str, None] = "0009"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # 1) Install the extension.
+    op.execute("CREATE EXTENSION IF NOT EXISTS zhparser;")
+
+    # 2) Text search config: zhparser does the parsing; the token types listed below
+    #    are mapped to the built-in 'simple' dictionary (unmapped types yield no lexemes).
+    op.execute("DROP TEXT SEARCH CONFIGURATION IF EXISTS chinese_zh;")
+    op.execute(
+        """
+        CREATE TEXT SEARCH CONFIGURATION chinese_zh (PARSER = zhparser);
+        ALTER TEXT SEARCH CONFIGURATION chinese_zh
+            ADD MAPPING FOR n, a, v, i, e, l, u, x
+            WITH simple;
+        """
+    )
+
+    # 3) Rebuild articles.title_zh_tsv on the new config.
+    op.drop_index("ix_articles_title_zh_tsv", table_name="articles")
+    op.drop_column("articles", "title_zh_tsv")
+    op.add_column(
+        "articles",
+        sa.Column(
+            "title_zh_tsv",
+            TSVECTOR,
+            sa.Computed(
+                "to_tsvector('chinese_zh', coalesce(title_zh, ''))",
+                persisted=True,
+            ),
+        ),
+    )
+    op.create_index(
+        "ix_articles_title_zh_tsv",
+        "articles",
+        ["title_zh_tsv"],
+        postgresql_using="gin",
+    )
+
+    # 4) Recreate refresh_search_keywords() on chinese_zh, calling the one-argument ts_stat(text).
+    op.execute("DROP FUNCTION IF EXISTS refresh_search_keywords();")
+    op.execute(
+        """
+        CREATE OR REPLACE FUNCTION refresh_search_keywords() RETURNS void AS $func$
+        BEGIN
+            TRUNCATE search_keywords;
+
+            -- ts_stat(query text) 接受 SQL 字符串,内部执行并聚合词频
+            -- no weights filter: to_tsvector() lexemes carry default weight D, so 'a' would count none
+            INSERT INTO search_keywords (keyword, source, weight, prefix_keys)
+            SELECT
+                word,
+                'ts_stat',
+                nentry::int,
+                ARRAY(
+                    SELECT substring(word, 1, n)
+                    FROM generate_series(1, length(word)) AS n
+                )
+            FROM ts_stat(
+                $q$
+                SELECT to_tsvector('chinese_zh',
+                    coalesce(title_zh, '') || ' ' ||
+                    coalesce(body_zh_text, '') || ' ' ||
+                    coalesce(commentary, '') || ' ' ||
+                    coalesce(commentary_meituan, '')
+                )
+                FROM articles
+                WHERE title_zh IS NOT NULL
+                   OR body_zh_text IS NOT NULL
+                   OR commentary IS NOT NULL
+                   OR commentary_meituan IS NOT NULL
+                $q$
+            )
+            WHERE length(word) >= 2;
+        END;
+        $func$ LANGUAGE plpgsql;
+        """
+    )
+
+    # 5) Refresh once now so data is available before the worker's next 03:00 run.
+    op.execute("SELECT refresh_search_keywords();")
+
+
+def downgrade() -> None:
+    # 1) Restore refresh_search_keywords() on 'simple'; schema only, data is not migrated back.
+    op.execute("DROP FUNCTION IF EXISTS refresh_search_keywords();")
+    op.execute(
+        """
+        CREATE OR REPLACE FUNCTION refresh_search_keywords() RETURNS void AS $func$
+        BEGIN
+            TRUNCATE search_keywords;
+
+            -- 降级回 simple parser(中文会被当成一整句,词频聚合无效,仅占位)
+            INSERT INTO search_keywords (keyword, source, weight, prefix_keys)
+            SELECT
+                word,
+                'ts_stat',
+                nentry::int,
+                ARRAY(
+                    SELECT substring(word, 1, n)
+                    FROM generate_series(1, length(word)) AS n
+                )
+            FROM ts_stat(
+                $q$
+                SELECT to_tsvector('simple',
+                    coalesce(title_zh, '') || ' ' ||
+                    coalesce(body_zh_text, '') || ' ' ||
+                    coalesce(commentary, '') || ' ' ||
+                    coalesce(commentary_meituan, '')
+                )
+                FROM articles
+                WHERE title_zh IS NOT NULL
+                   OR body_zh_text IS NOT NULL
+                   OR commentary IS NOT NULL
+                   OR commentary_meituan IS NOT NULL
+                $q$
+            )
+            WHERE length(word) >= 2;
+        END;
+        $func$ LANGUAGE plpgsql;
+        """
+    )
+
+    # 2) Put title_zh_tsv back on 'simple'.
+    op.drop_index("ix_articles_title_zh_tsv", table_name="articles")
+    op.drop_column("articles", "title_zh_tsv")
+    op.add_column(
+        "articles",
+        sa.Column(
+            "title_zh_tsv",
+            TSVECTOR,
+            sa.Computed(
+                "to_tsvector('simple', coalesce(title_zh, ''))",
+                persisted=True,
+            ),
+        ),
+    )
+    op.create_index(
+        "ix_articles_title_zh_tsv",
+        "articles",
+        ["title_zh_tsv"],
+        postgresql_using="gin",
+    )
+
+    # 3) Drop the chinese_zh config (safe now that the column no longer references it).
+    op.execute("DROP TEXT SEARCH CONFIGURATION IF EXISTS chinese_zh;")
+
+    # 4) Leave the zhparser extension installed (other objects may depend on it).
+    #    To remove it: op.execute("DROP EXTENSION IF EXISTS zhparser;")
diff --git a/docker-compose.yml b/docker-compose.yml
index f680e24..a0dc417 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -2,7 +2,12 @@ name: news-aggregator
 
 services:
   postgres:
-    image: postgres:16-alpine
+    # Switched from alpine to a locally built image (Debian bookworm + zhparser extension).
+    # The alpine repositories do not package zhparser; see backend/Dockerfile.postgres.
+    build:
+      context: ./backend
+      dockerfile: Dockerfile.postgres
+    image: diary-postgres:zh  # custom tag, easy to spot in `docker images`
     restart: unless-stopped
     environment:
       POSTGRES_USER: ${POSTGRES_USER}