From db4fd8699b5f4a9d354db99a31f7fcadf0942468 Mon Sep 17 00:00:00 2001 From: mavis Date: Mon, 15 Jun 2026 19:19:19 +0800 Subject: [PATCH] =?UTF-8?q?fix(search):=20ts=5Fstat=20=E6=94=B9=E5=8D=95?= =?UTF-8?q?=E5=8F=82(text),=E9=81=BF=E5=85=8D=20'a'=20mask=20=E9=9D=99?= =?UTF-8?q?=E9=BB=98=200=20=E8=A1=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit to_tsvector 产出的词位不带显式权重(zhparser 不标 A/B/C,未标注的词位按默认权重 D 处理),传 'a' mask 给 ts_stat(text, weights) 会过滤掉所有词 但不报错,静默 0 行。改成 ts_stat(text) 单参(不按权重过滤,等价 mask='abcd',聚合所有权重)。 修: - 0010 迁移里 refresh_search_keywords() 改用单参 ts_stat - 0010 迁移 downgrade 部分同步修 - 0009 迁移 refresh_search_keywords() 同步修 - services/search.py _fallback_keywords 改用 chinese_zh + 单参 ts_stat,子查询加 LIMIT 500(无 ORDER BY,最多取任意 500 篇文章做实时聚合) --- .../versions/0009_search_suggestions.py | 28 ++++++++----------- .../alembic/versions/0010_zhparser_chinese.py | 23 +++++++-------- backend/app/services/search.py | 20 ++++++------- 3 files changed, 31 insertions(+), 40 deletions(-) diff --git a/backend/alembic/versions/0009_search_suggestions.py b/backend/alembic/versions/0009_search_suggestions.py index 4433ce9..33e9822 100644 --- a/backend/alembic/versions/0009_search_suggestions.py +++ b/backend/alembic/versions/0009_search_suggestions.py @@ -218,23 +218,19 @@ def upgrade() -> None: FROM generate_series(1, length(word)) AS n ) FROM ts_stat( - 'simple', - ( - SELECT to_tsvector( - 'simple', - coalesce(title_zh, '') || ' ' || - coalesce(body_zh_text, '') || ' ' || - coalesce(commentary, '') || ' ' || - coalesce(commentary_meituan, '') - ) - FROM articles - WHERE title_zh IS NOT NULL - OR body_zh_text IS NOT NULL - OR commentary IS NOT NULL - OR commentary_meituan IS NOT NULL + $$SELECT to_tsvector('simple', + coalesce(title_zh, '') || ' ' || + coalesce(body_zh_text, '') || ' ' || + coalesce(commentary, '') || ' ' || + coalesce(commentary_meituan, '') ) - ) - WHERE length(word) >= 2; -- 过滤单字噪音(中文标点/单字停用词) + FROM articles + WHERE title_zh IS NOT NULL + OR body_zh_text IS NOT NULL + OR commentary IS NOT NULL + OR 
commentary_meituan IS NOT NULL$$ + ) AS s + WHERE length(s.word) >= 2; -- 过滤单字噪音(中文标点/单字停用词) END; $$ LANGUAGE plpgsql; diff --git a/backend/alembic/versions/0010_zhparser_chinese.py b/backend/alembic/versions/0010_zhparser_chinese.py index 83c1b88..221e4fe 100644 --- a/backend/alembic/versions/0010_zhparser_chinese.py +++ b/backend/alembic/versions/0010_zhparser_chinese.py @@ -80,7 +80,8 @@ def upgrade() -> None: TRUNCATE search_keywords; -- ts_stat(query text) 接受 SQL 字符串,内部执行并聚合词频 - -- 'a' = 任意权重(A/B/C/D 四档,这里聚合所有) + -- 单参(等价 mask='abcd',聚合所有权重) + -- ⚠️ 不能传 'a' mask — zhparser 不标 A 权重,会 0 行 INSERT INTO search_keywords (keyword, source, weight, prefix_keys) SELECT word, @@ -91,8 +92,7 @@ def upgrade() -> None: FROM generate_series(1, length(word)) AS n ) FROM ts_stat( - $q$ - SELECT to_tsvector('chinese_zh', + $$SELECT to_tsvector('chinese_zh', coalesce(title_zh, '') || ' ' || coalesce(body_zh_text, '') || ' ' || coalesce(commentary, '') || ' ' || @@ -102,10 +102,9 @@ def upgrade() -> None: WHERE title_zh IS NOT NULL OR body_zh_text IS NOT NULL OR commentary IS NOT NULL - OR commentary_meituan IS NOT NULL - $q$, 'a' - ) - WHERE length(word) >= 2; + OR commentary_meituan IS NOT NULL$$ + ) AS s + WHERE length(s.word) >= 2; END; $func$ LANGUAGE plpgsql; """ @@ -135,8 +134,7 @@ def downgrade() -> None: FROM generate_series(1, length(word)) AS n ) FROM ts_stat( - $q$ - SELECT to_tsvector('simple', + $$SELECT to_tsvector('simple', coalesce(title_zh, '') || ' ' || coalesce(body_zh_text, '') || ' ' || coalesce(commentary, '') || ' ' || @@ -146,10 +144,9 @@ def downgrade() -> None: WHERE title_zh IS NOT NULL OR body_zh_text IS NOT NULL OR commentary IS NOT NULL - OR commentary_meituan IS NOT NULL - $q$, 'a' - ) - WHERE length(word) >= 2; + OR commentary_meituan IS NOT NULL$$ + ) AS s + WHERE length(s.word) >= 2; END; $func$ LANGUAGE plpgsql; """ diff --git a/backend/app/services/search.py b/backend/app/services/search.py index 427e7cf..45eb43a 100644 --- 
a/backend/app/services/search.py +++ b/backend/app/services/search.py @@ -130,8 +130,9 @@ class SearchService: async def _fallback_keywords(self, q: str, limit: int) -> list[dict]: """回退:ts_stat 实时聚合(慢但能用)。 - - 从 articles.title_zh + body_zh_text 实时 to_tsvector - - 适用:search_keywords 表空 + ts_stat 之前的全量聚合 + - 从 articles.title_zh + body_zh_text 实时 to_tsvector(chinese_zh) + - 适用:search_keywords 表空 + worker 没刷新过 + - ts_stat(text) 单参 — 第二参 weights mask 不能传 'a'(zhparser 不标 A 权重会 0 行) """ from sqlalchemy import text @@ -139,16 +140,13 @@ class SearchService: """ SELECT word, nentry::int AS weight FROM ts_stat( - 'simple', - ( - SELECT to_tsvector( - 'simple', - coalesce(title_zh, '') || ' ' || coalesce(body_zh_text, '') - ) - FROM articles - WHERE title_zh IS NOT NULL OR body_zh_text IS NOT NULL + $$SELECT to_tsvector('chinese_zh', + coalesce(title_zh, '') || ' ' || coalesce(body_zh_text, '') ) - ) + FROM articles + WHERE title_zh IS NOT NULL OR body_zh_text IS NOT NULL + LIMIT 500$$ + ) AS s WHERE word LIKE :prefix ORDER BY nentry DESC LIMIT :lim