zhparser 不标 A 权重(也不标 B/C/D),传 'a' mask 给 ts_stat(text, weights) 会过滤掉所有词,但不报错,静默返回 0 行。改成 ts_stat(text) 单参(等价 mask='abcd',聚合所有权重)。
修:
- 0010 迁移里 refresh_search_keywords() 改用单参 ts_stat
- 0010 迁移 downgrade 部分同步修
- 0009 迁移 refresh_search_keywords() 同步修
- services/search.py _fallback_keywords 改用 chinese_zh + 单参 ts_stat
258 lines
9.1 KiB
Python
258 lines
9.1 KiB
Python
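"""Persisted search-suggestion tables + trigger

Design:
- search_keywords: ts_stat term-frequency table, refreshed by a worker every
  day in the early morning (full rebuild)
- search_title_suggestions: article-title fragment table, maintained in real
  time by a trigger on articles (when title_zh is inserted/updated/deleted)
- Both tables have prefix_keys text[] + a GIN index; lookups use
  prefix_keys @> ARRAY['prefix']
  (10-100x faster than LIKE '%xxx%'; sub-millisecond even at 1M rows)

Data sources:
- search_title_suggestions: articles.title_zh
  - covers translated Chinese titles + the original text of short news
    (falls back to title when a short news item has an empty title_zh)
  - the trigger maintains only this table (frequent writes, real time)
- search_keywords:
  - ts_stat term frequencies (computed from articles.title_zh + body_zh_text
    + commentary + commentary_meituan)
  - full refresh by the daily worker (term-frequency aggregation is slow and
    not suitable for a per-article trigger)

Also finishes the full-text search groundwork mentioned in commit 11:
- articles.title_zh_tsv tsvector column + GIN index (so future queries can
  go through FTS)
- kept up to date automatically (implemented as a stored generated column
  rather than a trigger)

Revision ID: 0009
Revises: 0008
Create Date: 2026-06-15
"""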
|
|
from __future__ import annotations
|
|
|
|
from typing import Sequence, Union
|
|
|
|
import sqlalchemy as sa
|
|
from alembic import op
|
|
from sqlalchemy.dialects.postgresql import ARRAY, TSVECTOR
|
|
|
|
|
|
# Alembic revision identifiers.
# This migration's own id.
revision: str = "0009"
# Id of the migration this one is applied on top of.
down_revision: Union[str, None] = "0008"
# No branch labels and no cross-branch dependencies are declared.
branch_labels = None
depends_on = None
|
|
|
|
|
|
def upgrade() -> None:
    """Create the search-suggestion schema objects for revision 0009.

    Steps, in order:

    1. Add the stored generated column ``articles.title_zh_tsv`` and a GIN
       index on it.
    2. Create ``search_keywords`` plus its three indexes.
    3. Create ``search_title_suggestions`` plus its three indexes.
    4. Install the ``rebuild_title_suggestions()`` trigger function and the
       ``trg_articles_rebuild_title_suggestions`` trigger on ``articles``.
    5. Install ``refresh_search_keywords()``, which truncates and refills
       ``search_keywords`` from ``ts_stat`` each time it is called.

    The SQL function bodies use distinct dollar-quote tags wherever a
    dollar-quoted string is nested inside another one, because PostgreSQL
    terminates a dollar-quoted string at the first matching tag.
    """
    # === 1) articles: tsvector column + GIN index (full-text search groundwork) ===
    # Uses the built-in 'simple' text search configuration, so no zhparser
    # extension is needed for this column.
    # NOTE(review): the original comment assumed 'simple' splits Chinese text
    # per character (and English on whitespace); confirm the real tokenization
    # before relying on it.
    # Nothing in this migration queries the column yet; it is infrastructure
    # for a future ?q=xxx search that could go through to_tsquery.
    op.add_column(
        "articles",
        sa.Column(
            "title_zh_tsv",
            TSVECTOR,
            sa.Computed(
                "to_tsvector('simple', coalesce(title_zh, ''))",
                persisted=True,
            ),
        ),
    )
    op.create_index(
        "ix_articles_title_zh_tsv",
        "articles",
        ["title_zh_tsv"],
        postgresql_using="gin",
    )

    # === 2) search_keywords: term-frequency candidate keyword table ===
    op.create_table(
        "search_keywords",
        sa.Column("id", sa.BigInteger, primary_key=True),
        sa.Column("keyword", sa.Text, nullable=False),
        # ts_stat / title_extract / manual
        sa.Column("source", sa.String(32), nullable=False),
        # Term frequency or article count (used as the weight).
        sa.Column("weight", sa.Integer, nullable=False, server_default="0"),
        # Precomputed array of every prefix of the keyword,
        # e.g. '美联储' -> ['美', '美联', '美联储'].
        # Lookup: prefix_keys @> ARRAY['美'] is served by the GIN index below.
        sa.Column("prefix_keys", ARRAY(sa.Text), nullable=False),
        sa.Column(
            "last_seen_at",
            sa.DateTime(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
        sa.UniqueConstraint("keyword", "source", name="uq_search_keywords_kw_src"),
    )
    op.create_index(
        "ix_search_keywords_prefix",
        "search_keywords",
        ["prefix_keys"],
        postgresql_using="gin",
    )
    op.create_index(
        "ix_search_keywords_source_weight",
        "search_keywords",
        ["source", "weight"],
    )
    op.create_index(
        "ix_search_keywords_keyword_btree",
        "search_keywords",
        ["keyword"],
    )

    # === 3) search_title_suggestions: fragments of real article titles ===
    op.create_table(
        "search_title_suggestions",
        sa.Column("id", sa.BigInteger, primary_key=True),
        sa.Column(
            "article_id",
            sa.BigInteger,
            sa.ForeignKey("articles.id", ondelete="CASCADE"),
            nullable=False,
        ),
        # Which article field the prefixes came from. The trigger function
        # below writes 'zh' (from title_zh) or 'src' (fallback to title).
        sa.Column("title_lang", sa.String(8), nullable=False, server_default="zh"),
        sa.Column("prefix_keys", ARRAY(sa.Text), nullable=False),
        sa.Column("published_at", sa.DateTime(timezone=True), nullable=True),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
    )
    op.create_index(
        "ix_search_title_suggestions_prefix",
        "search_title_suggestions",
        ["prefix_keys"],
        postgresql_using="gin",
    )
    op.create_index(
        "ix_search_title_suggestions_article",
        "search_title_suggestions",
        ["article_id"],
    )
    op.create_index(
        "ix_search_title_suggestions_published",
        "search_title_suggestions",
        ["published_at"],
    )

    # === 4) articles trigger: keep search_title_suggestions in sync ===
    # The trigger function deletes the article's old row, then inserts a new
    # one built from title_zh if non-empty, otherwise from title.
    # Titles are cut to the first 50 characters so prefix_keys stays bounded.
    op.execute(
        """
        CREATE OR REPLACE FUNCTION rebuild_title_suggestions() RETURNS TRIGGER AS $$
        DECLARE
            src_text text;
            src_lang text;
            max_len int := 50;
        BEGIN
            -- 先删掉该文章旧条目
            DELETE FROM search_title_suggestions WHERE article_id = NEW.id;

            -- 优先用 title_zh(翻译后),没有再回退 title(短新闻路径)
            IF NEW.title_zh IS NOT NULL AND length(NEW.title_zh) > 0 THEN
                src_text := NEW.title_zh;
                src_lang := 'zh';
            ELSIF NEW.title IS NOT NULL AND length(NEW.title) > 0 THEN
                src_text := NEW.title;
                src_lang := 'src';
            ELSE
                RETURN NEW;
            END IF;

            -- 截断到 max_len 字符(prefix_keys 长度可控)
            src_text := substring(src_text, 1, max_len);

            -- 插入一条,prefix_keys 是从第 1 字到全词的所有前缀
            INSERT INTO search_title_suggestions
                (article_id, title_lang, prefix_keys, published_at)
            SELECT
                NEW.id,
                src_lang,
                ARRAY(
                    SELECT substring(src_text, 1, n)
                    FROM generate_series(1, length(src_text)) AS n
                ),
                NEW.published_at;

            RETURN NEW;
        END;
        $$ LANGUAGE plpgsql;
        """
    )

    op.execute(
        """
        CREATE TRIGGER trg_articles_rebuild_title_suggestions
        AFTER INSERT OR UPDATE OF title_zh, title, published_at ON articles
        FOR EACH ROW EXECUTE FUNCTION rebuild_title_suggestions();
        """
    )

    # === 5) cleanup on article deletion ===
    # Handled by ON DELETE CASCADE on search_title_suggestions.article_id;
    # no separate trigger is needed.

    # === 6) search_keywords refresh function (called by the worker) ===
    # Full rebuild: TRUNCATE, then INSERT the ts_stat term frequencies.
    # Usage: SELECT refresh_search_keywords();
    #
    # The function body is quoted with $func$ and the query handed to
    # ts_stat() with $q$. Using plain $$ for both would end the function body
    # at the inner $$ and make CREATE FUNCTION fail with a syntax error.
    op.execute(
        """
        CREATE OR REPLACE FUNCTION refresh_search_keywords() RETURNS void AS $func$
        BEGIN
            TRUNCATE search_keywords;

            -- A) ts_stat 词频(title_zh + body_zh_text + commentary)
            INSERT INTO search_keywords (keyword, source, weight, prefix_keys)
            SELECT
                word,
                'ts_stat',
                nentry::int,
                ARRAY(
                    SELECT substring(word, 1, n)
                    FROM generate_series(1, length(word)) AS n
                )
            FROM ts_stat(
                $q$SELECT to_tsvector('simple',
                    coalesce(title_zh, '') || ' ' ||
                    coalesce(body_zh_text, '') || ' ' ||
                    coalesce(commentary, '') || ' ' ||
                    coalesce(commentary_meituan, '')
                )
                FROM articles
                WHERE title_zh IS NOT NULL
                   OR body_zh_text IS NOT NULL
                   OR commentary IS NOT NULL
                   OR commentary_meituan IS NOT NULL$q$
            ) AS s
            WHERE length(s.word) >= 2;  -- 过滤单字噪音(中文标点/单字停用词)

        END;
        $func$ LANGUAGE plpgsql;
        """
    )
|
|
|
|
|
|
def downgrade() -> None:
    """Remove everything created by ``upgrade()``, in reverse dependency order.

    The trigger and both SQL functions are dropped first, then the two
    suggestion tables with their indexes, and finally the
    ``articles.title_zh_tsv`` column with its index.
    """
    # Trigger first, then the functions it and the worker rely on.
    drop_statements = (
        "DROP TRIGGER IF EXISTS trg_articles_rebuild_title_suggestions ON articles",
        "DROP FUNCTION IF EXISTS rebuild_title_suggestions()",
        "DROP FUNCTION IF EXISTS refresh_search_keywords()",
    )
    for statement in drop_statements:
        op.execute(statement)

    # search_title_suggestions: indexes, then the table itself.
    title_suggestion_indexes = (
        "ix_search_title_suggestions_published",
        "ix_search_title_suggestions_article",
        "ix_search_title_suggestions_prefix",
    )
    for index_name in title_suggestion_indexes:
        op.drop_index(index_name, table_name="search_title_suggestions")
    op.drop_table("search_title_suggestions")

    # search_keywords: indexes, then the table itself.
    keyword_indexes = (
        "ix_search_keywords_keyword_btree",
        "ix_search_keywords_source_weight",
        "ix_search_keywords_prefix",
    )
    for index_name in keyword_indexes:
        op.drop_index(index_name, table_name="search_keywords")
    op.drop_table("search_keywords")

    # articles: drop the GIN index before the generated tsvector column.
    op.drop_index("ix_articles_title_zh_tsv", table_name="articles")
    op.drop_column("articles", "title_zh_tsv")
|