feat(search): 智能搜索建议 - 固化候选词表 (search_keywords + search_title_suggestions)
后端: - alembic 0009: 两张固化表 + GIN prefix_keys 索引 + articles trigger - /api/v1/search/suggestions: 混合 A(高频词 ts_stat) + B(真实标题) + 冷启动 fallback - worker 每日 03:00 + 启动时刷新 search_keywords - 顺便填 commit 11 TODO: articles.title_zh_tsv + GIN 索引(未来 FTS 基础) 前端: - NInput -> NAutoComplete + debounce 250ms - 选标题 -> 跳详情;选关键词 -> 填入 + 触发搜索 - AbortController 防 race condition 性能: prefix_keys @> ARRAY[prefix] 走 GIN 亚毫秒,100w 行也稳
This commit is contained in:
261
backend/alembic/versions/0009_search_suggestions.py
Normal file
261
backend/alembic/versions/0009_search_suggestions.py
Normal file
@@ -0,0 +1,261 @@
|
||||
"""搜索建议固化表 + 触发器
|
||||
|
||||
设计:
|
||||
- search_keywords: ts_stat 词频表,worker 每日凌晨刷新(全量重建)
|
||||
- search_title_suggestions: 文章标题片段表,articles trigger 实时维护(增删改 title_zh 时)
|
||||
- 两表都有 prefix_keys text[] + GIN 索引,查询走 prefix_keys @> ARRAY['前缀']
|
||||
(比 LIKE '%xxx%' 快 10-100x,100w 行也是亚毫秒)
|
||||
|
||||
数据源:
|
||||
- search_title_suggestions: articles.title_zh
|
||||
- 包括翻译后的中文标题 + 短新闻的原文(短新闻 title_zh 为空时回退 title)
|
||||
- 触发器只维护这一张表(写入频繁,实时)
|
||||
- search_keywords:
|
||||
- ts_stat 词频(从 articles.title_zh + body_zh_text + commentary + commentary_meituan 算)
|
||||
- 每日 worker 全量刷新(词频聚合慢,不适合每篇文章 trigger)
|
||||
|
||||
顺手把 commit 11 提到的 full-text search 基础做完:
|
||||
- articles.title_zh_tsv tsvector 列 + GIN 索引(用于未来 query 走 FTS)
|
||||
- 触发器自动维护
|
||||
|
||||
Revision ID: 0009
|
||||
Revises: 0008
|
||||
Create Date: 2026-06-15
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Sequence, Union
|
||||
|
||||
import sqlalchemy as sa
|
||||
from alembic import op
|
||||
from sqlalchemy.dialects.postgresql import ARRAY, TSVECTOR
|
||||
|
||||
|
||||
revision: str = "0009"
|
||||
down_revision: Union[str, None] = "0008"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create the search-suggestion schema objects.

    Steps, in order:

    1. ``articles.title_zh_tsv``: stored generated tsvector column plus a
       GIN index (groundwork for future full-text search).
    2. ``search_keywords``: word-frequency candidate table with a GIN index
       on ``prefix_keys``.
    3. ``search_title_suggestions``: one row per article holding the
       prefixes of its title, with a GIN index on ``prefix_keys``.
    4. ``rebuild_title_suggestions()`` trigger function and the
       ``trg_articles_rebuild_title_suggestions`` trigger on ``articles``.
    5. ``refresh_search_keywords()``: SQL function that rebuilds
       ``search_keywords`` from ``ts_stat``; invoked with
       ``SELECT refresh_search_keywords();``.
    """
    # === 1) articles: tsvector column + GIN index (full-text search base) ===
    # The built-in 'simple' configuration is used so that no extra extension
    # (e.g. zhparser) has to be installed.
    # NOTE(review): the original comment stated that 'simple' splits Chinese
    # text per character and English on whitespace; confirm the tokenisation
    # of unspaced CJK text before relying on it for FTS queries.
    # Future: ?q=xxx search can switch to to_tsquery; only the infrastructure
    # is created here.
    op.add_column(
        "articles",
        sa.Column(
            "title_zh_tsv",
            TSVECTOR,
            sa.Computed(
                "to_tsvector('simple', coalesce(title_zh, ''))",
                persisted=True,
            ),
        ),
    )
    op.create_index(
        "ix_articles_title_zh_tsv",
        "articles",
        ["title_zh_tsv"],
        postgresql_using="gin",
    )

    # === 2) search_keywords: word-frequency candidate table ===
    op.create_table(
        "search_keywords",
        sa.Column("id", sa.BigInteger, primary_key=True),
        sa.Column("keyword", sa.Text, nullable=False),
        # One of: ts_stat / title_extract / manual
        sa.Column("source", sa.String(32), nullable=False),
        # Term frequency or article count (ranking weight)
        sa.Column("weight", sa.Integer, nullable=False, server_default="0"),
        # Precomputed prefix array, e.g. ['美', '美联', '美联储', ...].
        # Queried as: prefix_keys @> ARRAY['美'], served by the GIN index.
        sa.Column("prefix_keys", ARRAY(sa.Text), nullable=False),
        sa.Column(
            "last_seen_at",
            sa.DateTime(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
        sa.UniqueConstraint("keyword", "source", name="uq_search_keywords_kw_src"),
    )
    op.create_index(
        "ix_search_keywords_prefix",
        "search_keywords",
        ["prefix_keys"],
        postgresql_using="gin",
    )
    op.create_index(
        "ix_search_keywords_source_weight",
        "search_keywords",
        ["source", "weight"],
    )
    op.create_index(
        "ix_search_keywords_keyword_btree",
        "search_keywords",
        ["keyword"],
    )

    # === 3) search_title_suggestions: real article title prefixes ===
    op.create_table(
        "search_title_suggestions",
        sa.Column("id", sa.BigInteger, primary_key=True),
        sa.Column(
            "article_id",
            sa.BigInteger,
            sa.ForeignKey("articles.id", ondelete="CASCADE"),
            nullable=False,
        ),
        # Which field the prefixes came from; the trigger below writes
        # 'zh' (title_zh) or 'src' (fallback to title).
        sa.Column("title_lang", sa.String(8), nullable=False, server_default="zh"),
        sa.Column("prefix_keys", ARRAY(sa.Text), nullable=False),
        sa.Column("published_at", sa.DateTime(timezone=True), nullable=True),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
    )
    op.create_index(
        "ix_search_title_suggestions_prefix",
        "search_title_suggestions",
        ["prefix_keys"],
        postgresql_using="gin",
    )
    op.create_index(
        "ix_search_title_suggestions_article",
        "search_title_suggestions",
        ["article_id"],
    )
    op.create_index(
        "ix_search_title_suggestions_published",
        "search_title_suggestions",
        ["published_at"],
    )

    # === 4) articles trigger: keep search_title_suggestions in sync ===
    # The trigger function deletes the article's old row and inserts a new
    # one, preferring title_zh over title. Titles are cut to the first 50
    # characters so the prefix_keys array stays bounded.
    # NOTE(review): the trigger only fires for rows written after this
    # migration; articles that already exist get no suggestion row here.
    op.execute(
        """
        CREATE OR REPLACE FUNCTION rebuild_title_suggestions() RETURNS TRIGGER AS $$
        DECLARE
            src_text text;
            src_lang text;
            max_len int := 50;
        BEGIN
            -- 先删掉该文章旧条目
            DELETE FROM search_title_suggestions WHERE article_id = NEW.id;

            -- 优先用 title_zh(翻译后),没有再回退 title(短新闻路径)
            IF NEW.title_zh IS NOT NULL AND length(NEW.title_zh) > 0 THEN
                src_text := NEW.title_zh;
                src_lang := 'zh';
            ELSIF NEW.title IS NOT NULL AND length(NEW.title) > 0 THEN
                src_text := NEW.title;
                src_lang := 'src';
            ELSE
                RETURN NEW;
            END IF;

            -- 截断到 max_len 字符(prefix_keys 长度可控)
            src_text := substring(src_text, 1, max_len);

            -- 插入一条,prefix_keys 是从第 1 字到全词的所有前缀
            INSERT INTO search_title_suggestions
                (article_id, title_lang, prefix_keys, published_at)
            SELECT
                NEW.id,
                src_lang,
                ARRAY(
                    SELECT substring(src_text, 1, n)
                    FROM generate_series(1, length(src_text)) AS n
                ),
                NEW.published_at;

            RETURN NEW;
        END;
        $$ LANGUAGE plpgsql;
        """
    )

    op.execute(
        """
        CREATE TRIGGER trg_articles_rebuild_title_suggestions
        AFTER INSERT OR UPDATE OF title_zh, title, published_at ON articles
        FOR EACH ROW EXECUTE FUNCTION rebuild_title_suggestions();
        """
    )

    # === 5) cleanup on article delete ===
    # Handled by ON DELETE CASCADE on article_id; no dedicated trigger needed.

    # === 6) search_keywords refresh function (called by the worker) ===
    # Design: full truncate + insert from ts_stat.
    # Usage: SELECT refresh_search_keywords();
    #
    # ts_stat(sqlquery text [, weights text]) takes the *text* of a query
    # that returns a single tsvector column; it does not accept a text-search
    # configuration name or an inline subquery. The SELECT is therefore
    # passed as a dollar-quoted string ($stat$ ... $stat$), nested inside a
    # function body that uses a different tag ($func$) so the two quotings
    # cannot terminate each other.
    # NOTE(review): TRUNCATE removes rows of every `source` (including any
    # 'manual' ones) and locks the table for the duration of the rebuild;
    # confirm that is acceptable for the suggestions endpoint.
    # NOTE(review): unlike the title trigger, keyword prefixes are not
    # length-capped; very long lexemes produce large prefix_keys arrays.
    op.execute(
        """
        CREATE OR REPLACE FUNCTION refresh_search_keywords() RETURNS void AS $func$
        BEGIN
            TRUNCATE search_keywords;

            -- A) ts_stat 词频(title_zh + body_zh_text + commentary)
            INSERT INTO search_keywords (keyword, source, weight, prefix_keys)
            SELECT
                word,
                'ts_stat',
                nentry::int,
                ARRAY(
                    SELECT substring(word, 1, n)
                    FROM generate_series(1, length(word)) AS n
                )
            FROM ts_stat($stat$
                SELECT to_tsvector(
                    'simple',
                    coalesce(title_zh, '') || ' ' ||
                    coalesce(body_zh_text, '') || ' ' ||
                    coalesce(commentary, '') || ' ' ||
                    coalesce(commentary_meituan, '')
                )
                FROM articles
                WHERE title_zh IS NOT NULL
                   OR body_zh_text IS NOT NULL
                   OR commentary IS NOT NULL
                   OR commentary_meituan IS NOT NULL
            $stat$)
            WHERE length(word) >= 2;  -- 过滤单字噪音(中文标点/单字停用词)

        END;
        $func$ LANGUAGE plpgsql;
        """
    )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Remove everything created by ``upgrade``, in reverse dependency order.

    The trigger is dropped before its function, each suggestion table has
    its indexes dropped before the table itself, and the ``articles``
    tsvector index and column go last.
    """
    # Trigger first (it references the function), then both SQL functions.
    drop_statements = (
        "DROP TRIGGER IF EXISTS trg_articles_rebuild_title_suggestions ON articles",
        "DROP FUNCTION IF EXISTS rebuild_title_suggestions()",
        "DROP FUNCTION IF EXISTS refresh_search_keywords()",
    )
    for statement in drop_statements:
        op.execute(statement)

    # (table, indexes to drop before the table) in the original drop order.
    tables_with_indexes = (
        (
            "search_title_suggestions",
            (
                "ix_search_title_suggestions_published",
                "ix_search_title_suggestions_article",
                "ix_search_title_suggestions_prefix",
            ),
        ),
        (
            "search_keywords",
            (
                "ix_search_keywords_keyword_btree",
                "ix_search_keywords_source_weight",
                "ix_search_keywords_prefix",
            ),
        ),
    )
    for table_name, index_names in tables_with_indexes:
        for index_name in index_names:
            op.drop_index(index_name, table_name=table_name)
        op.drop_table(table_name)

    # Finally undo the full-text-search groundwork on articles.
    op.drop_index("ix_articles_title_zh_tsv", table_name="articles")
    op.drop_column("articles", "title_zh_tsv")
|
||||
Reference in New Issue
Block a user