184 lines
6.1 KiB
Python
184 lines
6.1 KiB
Python
|
|
"""搜索建议 schema 修正 — 加 zhparser 中文分词支持。
|
||
|
|
|
||
|
|
0009 跑完之后,发现:
|
||
|
|
- PG 16 + alpine 镜像没有中文分词扩展,'simple' parser 把整句中文当一个 token,词频聚合无效
|
||
|
|
- 解决:换 Debian 镜像 + 装 zhparser + 创建 chinese_zh text search config
|
||
|
|
- 修正 articles.title_zh_tsv 生成列 + refresh_search_keywords() 函数都用 chinese_zh
|
||
|
|
|
||
|
|
步骤:
|
||
|
|
1) CREATE EXTENSION zhparser
|
||
|
|
2) 创建 chinese_zh text search config(基于 zhparser)
|
||
|
|
3) 重建 articles.title_zh_tsv 用 chinese_zh(先 DROP,再 ADD)
|
||
|
|
4) CREATE OR REPLACE refresh_search_keywords() 用 chinese_zh
|
||
|
|
5) 触发 search_keywords 立即刷新,让数据立刻可用
|
||
|
|
|
||
|
|
注意:docker-compose.yml 同步改了 postgres 用 Dockerfile.postgres
|
||
|
|
(debian bookworm + apt 装 zhparser,alpine 没现成包)
|
||
|
|
|
||
|
|
Revision ID: 0010
|
||
|
|
Revises: 0009
|
||
|
|
Create Date: 2026-06-15
|
||
|
|
"""
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
from typing import Sequence, Union
|
||
|
|
|
||
|
|
import sqlalchemy as sa
|
||
|
|
from alembic import op
|
||
|
|
from sqlalchemy.dialects.postgresql import TSVECTOR
|
||
|
|
|
||
|
|
|
||
|
|
# Alembic revision identifiers: this migration is 0010 and applies on top of 0009.
revision: str = "0010"
down_revision: Union[str, None] = "0009"
# No branch labels and no cross-branch dependencies are declared.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
|
||
|
|
|
||
|
|
|
||
|
|
def upgrade() -> None:
    """Move search-suggestion tokenisation onto the zhparser-backed ``chinese_zh`` config.

    Steps, in order:

    1. Install the ``zhparser`` extension if it is not already present.
    2. (Re)create the ``chinese_zh`` text search configuration on the
       ``zhparser`` parser and map its token types to the ``simple`` dictionary.
    3. Drop and re-add the generated column ``articles.title_zh_tsv`` (and its
       GIN index) so it is computed with ``chinese_zh``.
    4. Replace ``refresh_search_keywords()`` with a version that aggregates
       word frequencies through ``chinese_zh``.
    5. Call the refresh once so ``search_keywords`` is populated immediately.
    """
    # 1) Install the extension.
    op.execute("CREATE EXTENSION IF NOT EXISTS zhparser;")

    # 2) Build the text search configuration: zhparser does the tokenising and
    #    every mapped token type is handed to the built-in `simple` dictionary.
    #    One statement per execute() call, so this also works on drivers that
    #    reject multi-statement command strings.
    op.execute("DROP TEXT SEARCH CONFIGURATION IF EXISTS chinese_zh;")
    op.execute("CREATE TEXT SEARCH CONFIGURATION chinese_zh (PARSER = zhparser);")
    op.execute(
        """
        ALTER TEXT SEARCH CONFIGURATION chinese_zh
            ADD MAPPING FOR n, a, v, i, e, l, u, x
            WITH simple;
        """
    )

    # 3) Rebuild articles.title_zh_tsv with the new config. A generated
    #    column's expression cannot be altered in place, hence drop + add.
    op.drop_index("ix_articles_title_zh_tsv", table_name="articles")
    op.drop_column("articles", "title_zh_tsv")
    op.add_column(
        "articles",
        sa.Column(
            "title_zh_tsv",
            TSVECTOR,
            sa.Computed(
                "to_tsvector('chinese_zh', coalesce(title_zh, ''))",
                persisted=True,
            ),
        ),
    )
    op.create_index(
        "ix_articles_title_zh_tsv",
        "articles",
        ["title_zh_tsv"],
        postgresql_using="gin",
    )

    # 4) Replace refresh_search_keywords() with a chinese_zh-based version.
    #    ts_stat() is called WITHOUT a weights argument: a weights string such
    #    as 'a' restricts counting to occurrences labelled with that weight,
    #    and plain to_tsvector() output carries only the default weight D, so
    #    the previous `ts_stat(..., 'a')` call produced no rows at all.
    op.execute("DROP FUNCTION IF EXISTS refresh_search_keywords();")
    op.execute(
        """
        CREATE OR REPLACE FUNCTION refresh_search_keywords() RETURNS void AS $func$
        BEGIN
            TRUNCATE search_keywords;

            -- ts_stat(query text) 接受 SQL 字符串,内部执行并聚合词频
            -- No weights filter: to_tsvector() emits only default-weight (D)
            -- lexemes, so filtering on 'a' would count nothing.
            INSERT INTO search_keywords (keyword, source, weight, prefix_keys)
            SELECT
                word,
                'ts_stat',
                nentry::int,
                ARRAY(
                    SELECT substring(word, 1, n)
                    FROM generate_series(1, length(word)) AS n
                )
            FROM ts_stat(
                $q$
                SELECT to_tsvector('chinese_zh',
                    coalesce(title_zh, '') || ' ' ||
                    coalesce(body_zh_text, '') || ' ' ||
                    coalesce(commentary, '') || ' ' ||
                    coalesce(commentary_meituan, '')
                )
                FROM articles
                WHERE title_zh IS NOT NULL
                   OR body_zh_text IS NOT NULL
                   OR commentary IS NOT NULL
                   OR commentary_meituan IS NOT NULL
                $q$
            )
            WHERE length(word) >= 2;
        END;
        $func$ LANGUAGE plpgsql;
        """
    )

    # 5) Run one refresh right away so data is usable before the worker's next
    #    scheduled run (03:00 according to the original author's note).
    op.execute("SELECT refresh_search_keywords();")
|
||
|
|
|
||
|
|
|
||
|
|
def downgrade() -> None:
    """Revert the schema objects of this revision back to the ``simple`` config.

    Only schema is restored; keyword data is not migrated back. Steps:

    1. Replace ``refresh_search_keywords()`` with a ``simple``-based placeholder.
    2. Drop and re-add ``articles.title_zh_tsv`` (and its GIN index) computed
       with ``simple``.
    3. Drop the ``chinese_zh`` text search configuration.
    4. Leave the ``zhparser`` extension installed.
    """
    # 1) Restore refresh_search_keywords() on the `simple` config. Schema only:
    #    no attempt is made to downgrade the data itself.
    #    ts_stat() is called WITHOUT a weights argument: a weights string such
    #    as 'a' counts only occurrences labelled with that weight, and plain
    #    to_tsvector() output carries only the default weight D, so the former
    #    `ts_stat(..., 'a')` call produced no rows.
    op.execute("DROP FUNCTION IF EXISTS refresh_search_keywords();")
    op.execute(
        """
        CREATE OR REPLACE FUNCTION refresh_search_keywords() RETURNS void AS $func$
        BEGIN
            TRUNCATE search_keywords;

            -- 降级回 simple parser(中文会被当成一整句,词频聚合无效,仅占位)
            INSERT INTO search_keywords (keyword, source, weight, prefix_keys)
            SELECT
                word,
                'ts_stat',
                nentry::int,
                ARRAY(
                    SELECT substring(word, 1, n)
                    FROM generate_series(1, length(word)) AS n
                )
            FROM ts_stat(
                $q$
                SELECT to_tsvector('simple',
                    coalesce(title_zh, '') || ' ' ||
                    coalesce(body_zh_text, '') || ' ' ||
                    coalesce(commentary, '') || ' ' ||
                    coalesce(commentary_meituan, '')
                )
                FROM articles
                WHERE title_zh IS NOT NULL
                   OR body_zh_text IS NOT NULL
                   OR commentary IS NOT NULL
                   OR commentary_meituan IS NOT NULL
                $q$
            )
            WHERE length(word) >= 2;
        END;
        $func$ LANGUAGE plpgsql;
        """
    )

    # 2) Rebuild title_zh_tsv on `simple`. This must happen before the config
    #    is dropped below, while the column still references chinese_zh.
    op.drop_index("ix_articles_title_zh_tsv", table_name="articles")
    op.drop_column("articles", "title_zh_tsv")
    op.add_column(
        "articles",
        sa.Column(
            "title_zh_tsv",
            TSVECTOR,
            sa.Computed(
                "to_tsvector('simple', coalesce(title_zh, ''))",
                persisted=True,
            ),
        ),
    )
    op.create_index(
        "ix_articles_title_zh_tsv",
        "articles",
        ["title_zh_tsv"],
        postgresql_using="gin",
    )

    # 3) Remove the chinese_zh configuration.
    op.execute("DROP TEXT SEARCH CONFIGURATION IF EXISTS chinese_zh;")

    # 4) The zhparser extension is deliberately left installed: other objects
    #    may depend on it, so keeping it is the safer choice on downgrade.
    #    To remove it as well, run `DROP EXTENSION IF EXISTS zhparser;` manually.
|