Files
diary-news/backend/alembic/versions/0009_search_suggestions.py
mavis db4fd8699b fix(search): ts_stat 改单参(text),避免 'a' mask 静默 0 行
zhparser 不标 A 权重(也不标 B/C/D),传 'a' mask 给 ts_stat(text, weights) 会过滤掉所有词
但不报错,静默 0 行。改成 ts_stat(text) 单参(等价 mask='abcd',聚合所有权重)。

修:
- 0010 迁移里 refresh_search_keywords() 改用单参 ts_stat
- 0010 迁移 downgrade 部分同步修
- 0009 迁移 refresh_search_keywords() 同步修
- services/search.py _fallback_keywords 改用 chinese_zh + 单参 ts_stat
2026-06-15 19:19:19 +08:00

258 lines
9.1 KiB
Python

"""搜索建议固化表 + 触发器
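Design:
- search_keywords: ts_stat word-frequency table; a worker refreshes it (full rebuild) daily in the early morning
- search_title_suggestions: article-title prefix table, maintained in real time by a trigger on articles
  (fires on insert and on update of title_zh / title / published_at; deletes are covered by ON DELETE CASCADE)
- Both tables carry a prefix_keys text[] column with a GIN index; lookups use prefix_keys @> ARRAY['<prefix>']
  (author's estimate: 10-100x faster than LIKE '%xxx%', sub-millisecond even at 1,000,000 rows)
Data sources:
- search_title_suggestions: articles.title_zh
  - covers translated Chinese titles plus the original text of short news items (falls back to title when title_zh is empty)
  - the trigger maintains only this table (frequent writes, real time)
- search_keywords:
  - ts_stat word frequencies (computed from articles.title_zh + body_zh_text + commentary + commentary_meituan)
  - full refresh by the daily worker (frequency aggregation is slow, so it is not done in a per-article trigger)
Also finishes the full-text-search groundwork mentioned in commit 11:
- articles.title_zh_tsv tsvector column + GIN index (for future queries that go through FTS)
- kept up to date automatically (it is a stored generated column)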
Revision ID: 0009
Revises: 0008
Create Date: 2026-06-15
"""
from __future__ import annotations
from typing import Sequence, Union
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects.postgresql import ARRAY, TSVECTOR
# Alembic revision identifiers: this revision's ID and the revision it revises.
revision: str = "0009"
down_revision: Union[str, None] = "0008"
# No branch labels and no cross-revision dependencies for this migration.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Create the search-suggestion schema objects.

    Steps, in order:

    1. Add the stored generated column ``articles.title_zh_tsv`` and a GIN
       index on it.
    2. Create ``search_keywords`` and its indexes.
    3. Create ``search_title_suggestions`` and its indexes.
    4. Install the ``rebuild_title_suggestions()`` trigger function and the
       ``trg_articles_rebuild_title_suggestions`` trigger on ``articles``.
    5. Install ``refresh_search_keywords()``, meant to be called as
       ``SELECT refresh_search_keywords();``.

    The SQL inside the ``op.execute`` strings is kept flush-left so the text
    sent to the database is unchanged apart from the dollar-quote tags of
    ``refresh_search_keywords()``.
    """
    # === 1) articles: tsvector column + GIN index (full-text search groundwork) ===
    # Uses the built-in 'simple' text search configuration, so the zhparser
    # extension does not have to be installed.
    # NOTE(review): the original comment stated that 'simple' splits Chinese
    # text per character; confirm that against the PostgreSQL parser's actual
    # handling of CJK text before relying on it.
    # Future: a ?q=xxx search can be switched to to_tsquery; only the
    # infrastructure is created here.
    op.add_column(
        "articles",
        sa.Column(
            "title_zh_tsv",
            TSVECTOR,
            sa.Computed(
                "to_tsvector('simple', coalesce(title_zh, ''))",
                persisted=True,
            ),
        ),
    )
    op.create_index(
        "ix_articles_title_zh_tsv",
        "articles",
        ["title_zh_tsv"],
        postgresql_using="gin",
    )

    # === 2) search_keywords: word-frequency candidate table ===
    op.create_table(
        "search_keywords",
        sa.Column("id", sa.BigInteger, primary_key=True),
        sa.Column("keyword", sa.Text, nullable=False),
        # Origin of the keyword: ts_stat / title_extract / manual
        sa.Column("source", sa.String(32), nullable=False),
        # Term frequency or article count (used as the weight)
        sa.Column("weight", sa.Integer, nullable=False, server_default="0"),
        # Precomputed prefix array, e.g. ['美','美联储','美联储宣'] for '美联储宣布...'
        # Lookup: prefix_keys @> ARRAY['美'] uses the GIN index
        sa.Column("prefix_keys", ARRAY(sa.Text), nullable=False),
        sa.Column(
            "last_seen_at",
            sa.DateTime(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
        sa.UniqueConstraint("keyword", "source", name="uq_search_keywords_kw_src"),
    )
    op.create_index(
        "ix_search_keywords_prefix",
        "search_keywords",
        ["prefix_keys"],
        postgresql_using="gin",
    )
    op.create_index(
        "ix_search_keywords_source_weight",
        "search_keywords",
        ["source", "weight"],
    )
    op.create_index(
        "ix_search_keywords_keyword_btree",
        "search_keywords",
        ["keyword"],
    )

    # === 3) search_title_suggestions: prefixes of real article titles ===
    op.create_table(
        "search_title_suggestions",
        sa.Column("id", sa.BigInteger, primary_key=True),
        sa.Column(
            "article_id",
            sa.BigInteger,
            sa.ForeignKey("articles.id", ondelete="CASCADE"),
            nullable=False,
        ),
        # Which field supplied the text: the trigger function below writes
        # 'zh' when it used title_zh and 'src' when it fell back to title.
        sa.Column("title_lang", sa.String(8), nullable=False, server_default="zh"),
        sa.Column("prefix_keys", ARRAY(sa.Text), nullable=False),
        sa.Column("published_at", sa.DateTime(timezone=True), nullable=True),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
    )
    op.create_index(
        "ix_search_title_suggestions_prefix",
        "search_title_suggestions",
        ["prefix_keys"],
        postgresql_using="gin",
    )
    op.create_index(
        "ix_search_title_suggestions_article",
        "search_title_suggestions",
        ["article_id"],
    )
    op.create_index(
        "ix_search_title_suggestions_published",
        "search_title_suggestions",
        ["published_at"],
    )

    # === 4) articles trigger: keep search_title_suggestions in sync on write ===
    # The trigger function deletes the article's old row, then inserts a new
    # one built from title_zh, or from title when title_zh is NULL/empty.
    # Only the first 50 characters are used so the prefix_keys array stays
    # bounded.
    op.execute(
        """
CREATE OR REPLACE FUNCTION rebuild_title_suggestions() RETURNS TRIGGER AS $$
DECLARE
src_text text;
src_lang text;
max_len int := 50;
BEGIN
-- 先删掉该文章旧条目
DELETE FROM search_title_suggestions WHERE article_id = NEW.id;
-- 优先用 title_zh(翻译后),没有再回退 title(短新闻路径)
IF NEW.title_zh IS NOT NULL AND length(NEW.title_zh) > 0 THEN
src_text := NEW.title_zh;
src_lang := 'zh';
ELSIF NEW.title IS NOT NULL AND length(NEW.title) > 0 THEN
src_text := NEW.title;
src_lang := 'src';
ELSE
RETURN NEW;
END IF;
-- 截断到 max_len 字符(prefix_keys 长度可控)
src_text := substring(src_text, 1, max_len);
-- 插入一条,prefix_keys 是从第 1 字到全词的所有前缀
INSERT INTO search_title_suggestions
(article_id, title_lang, prefix_keys, published_at)
SELECT
NEW.id,
src_lang,
ARRAY(
SELECT substring(src_text, 1, n)
FROM generate_series(1, length(src_text)) AS n
),
NEW.published_at;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
"""
    )
    op.execute(
        """
CREATE TRIGGER trg_articles_rebuild_title_suggestions
AFTER INSERT OR UPDATE OF title_zh, title, published_at ON articles
FOR EACH ROW EXECUTE FUNCTION rebuild_title_suggestions();
"""
    )

    # === 5) cleanup when an article is deleted ===
    # Covered by ON DELETE CASCADE on article_id; no separate trigger needed.

    # === 6) search_keywords refresh function (called by the worker) ===
    # Full rebuild: TRUNCATE, then INSERT the ts_stat word frequencies.
    # Usage: SELECT refresh_search_keywords();
    #
    # The function body is quoted with $func$ and the query text handed to
    # ts_stat with $q$. Dollar-quoted strings only nest when their tags
    # differ; using $$ for both ends the function body at the inner quote and
    # the CREATE FUNCTION statement fails with a syntax error.
    op.execute(
        """
CREATE OR REPLACE FUNCTION refresh_search_keywords() RETURNS void AS $func$
BEGIN
TRUNCATE search_keywords;
-- A) ts_stat 词频(title_zh + body_zh_text + commentary)
INSERT INTO search_keywords (keyword, source, weight, prefix_keys)
SELECT
word,
'ts_stat',
nentry::int,
ARRAY(
SELECT substring(word, 1, n)
FROM generate_series(1, length(word)) AS n
)
FROM ts_stat(
$q$SELECT to_tsvector('simple',
coalesce(title_zh, '') || ' ' ||
coalesce(body_zh_text, '') || ' ' ||
coalesce(commentary, '') || ' ' ||
coalesce(commentary_meituan, '')
)
FROM articles
WHERE title_zh IS NOT NULL
OR body_zh_text IS NOT NULL
OR commentary IS NOT NULL
OR commentary_meituan IS NOT NULL$q$
) AS s
WHERE length(s.word) >= 2; -- 过滤单字噪音(中文标点/单字停用词)
END;
$func$ LANGUAGE plpgsql;
"""
    )
def downgrade() -> None:
    """Remove everything created by ``upgrade``, in reverse dependency order.

    The trigger is dropped before the function it calls; each table's indexes
    are dropped before the table itself; the ``articles.title_zh_tsv`` index
    and column go last.
    """
    # Trigger first, then the two PL/pgSQL functions.
    teardown_sql = (
        "DROP TRIGGER IF EXISTS trg_articles_rebuild_title_suggestions ON articles",
        "DROP FUNCTION IF EXISTS rebuild_title_suggestions()",
        "DROP FUNCTION IF EXISTS refresh_search_keywords()",
    )
    for statement in teardown_sql:
        op.execute(statement)

    # (table, indexes to drop before the table), in the order they are removed.
    owned_tables = (
        (
            "search_title_suggestions",
            (
                "ix_search_title_suggestions_published",
                "ix_search_title_suggestions_article",
                "ix_search_title_suggestions_prefix",
            ),
        ),
        (
            "search_keywords",
            (
                "ix_search_keywords_keyword_btree",
                "ix_search_keywords_source_weight",
                "ix_search_keywords_prefix",
            ),
        ),
    )
    for table, index_names in owned_tables:
        for index_name in index_names:
            op.drop_index(index_name, table_name=table)
        op.drop_table(table)

    # Finally undo the full-text-search column added to articles.
    op.drop_index("ix_articles_title_zh_tsv", table_name="articles")
    op.drop_column("articles", "title_zh_tsv")