fix(search): ts_stat 改单参(text),避免 'a' mask 静默 0 行

zhparser 不标 A 权重(也不标 B/C/D),把 'a' mask 传给 ts_stat(text, weights) 会过滤掉所有词但不报错,结果静默返回 0 行。改成 ts_stat(text) 单参(等价 mask='abcd',聚合所有权重)。

修改:
- 0010 迁移里 refresh_search_keywords() 改用单参 ts_stat
- 0010 迁移 downgrade 部分同步修
- 0009 迁移 refresh_search_keywords() 同步修
- services/search.py _fallback_keywords 改用 chinese_zh + 单参 ts_stat
2026-06-15 19:19:19 +08:00
parent e85a27f69d
commit db4fd8699b
3 changed files with 31 additions and 40 deletions
--- a/backend/alembic/versions/0009_search_suggestions.py
+++ b/backend/alembic/versions/0009_search_suggestions.py
@@ -218,10 +218,7 @@ def upgrade() -> None:
                    FROM generate_series(1, length(word)) AS n
                )
            FROM ts_stat(
-                'simple',
+                $$SELECT to_tsvector('simple',
                (
                    SELECT to_tsvector(
                        'simple',
                    coalesce(title_zh, '') || ' ' ||
                    coalesce(body_zh_text, '') || ' ' ||
                    coalesce(commentary, '') || ' ' ||
@@ -231,10 +228,9 @@ def upgrade() -> None:
                WHERE title_zh IS NOT NULL
                   OR body_zh_text IS NOT NULL
                   OR commentary IS NOT NULL
-                       OR commentary_meituan IS NOT NULL
+                   OR commentary_meituan IS NOT NULL$$
-                )
+            ) AS s
-            )
+            WHERE length(s.word) >= 2;  -- 过滤单字噪音(中文标点/单字停用词)
            WHERE length(word) >= 2;  -- 过滤单字噪音(中文标点/单字停用词)
        END;
        $$ LANGUAGE plpgsql;
--- a/backend/alembic/versions/0010_zhparser_chinese.py
+++ b/backend/alembic/versions/0010_zhparser_chinese.py
@@ -80,7 +80,8 @@ def upgrade() -> None:
            TRUNCATE search_keywords;
            -- ts_stat(query text) 接受 SQL 字符串,内部执行并聚合词频
-            -- 'a' = 任意权重(A/B/C/D 四档,这里聚合所有)
+            -- 单参(等价 mask='abcd',聚合所有权重)
            -- ⚠️ 不能传 'a' mask — zhparser 不标 A 权重,会 0 行
            INSERT INTO search_keywords (keyword, source, weight, prefix_keys)
            SELECT
                word,
@@ -91,8 +92,7 @@ def upgrade() -> None:
                    FROM generate_series(1, length(word)) AS n
                )
            FROM ts_stat(
-                $q$
+                $$SELECT to_tsvector('chinese_zh',
                SELECT to_tsvector('chinese_zh',
                    coalesce(title_zh, '') || ' ' ||
                    coalesce(body_zh_text, '') || ' ' ||
                    coalesce(commentary, '') || ' ' ||
@@ -102,10 +102,9 @@ def upgrade() -> None:
                WHERE title_zh IS NOT NULL
                   OR body_zh_text IS NOT NULL
                   OR commentary IS NOT NULL
-                   OR commentary_meituan IS NOT NULL
+                   OR commentary_meituan IS NOT NULL$$
-                $q$, 'a'
+            ) AS s
-            )
+            WHERE length(s.word) >= 2;
            WHERE length(word) >= 2;
        END;
        $func$ LANGUAGE plpgsql;
        """
@@ -135,8 +134,7 @@ def downgrade() -> None:
                    FROM generate_series(1, length(word)) AS n
                )
            FROM ts_stat(
-                $q$
+                $$SELECT to_tsvector('simple',
                SELECT to_tsvector('simple',
                    coalesce(title_zh, '') || ' ' ||
                    coalesce(body_zh_text, '') || ' ' ||
                    coalesce(commentary, '') || ' ' ||
@@ -146,10 +144,9 @@ def downgrade() -> None:
                WHERE title_zh IS NOT NULL
                   OR body_zh_text IS NOT NULL
                   OR commentary IS NOT NULL
-                   OR commentary_meituan IS NOT NULL
+                   OR commentary_meituan IS NOT NULL$$
-                $q$, 'a'
+            ) AS s
-            )
+            WHERE length(s.word) >= 2;
            WHERE length(word) >= 2;
        END;
        $func$ LANGUAGE plpgsql;
        """
--- a/backend/app/services/search.py
+++ b/backend/app/services/search.py
@@ -130,8 +130,9 @@ class SearchService:
    async def _fallback_keywords(self, q: str, limit: int) -> list[dict]:
        """回退:ts_stat 实时聚合(慢但能用)。
-        - 从 articles.title_zh + body_zh_text 实时 to_tsvector
+        - 从 articles.title_zh + body_zh_text 实时 to_tsvector(chinese_zh)
-        - 适用:search_keywords 表空 + ts_stat 之前的全量聚合
+        - 适用:search_keywords 表空 + worker 没刷新过
        - ts_stat(text) 单参 — 第二参 weights mask 不能传 'a'(zhparser 不标 A 权重会 0 行)
        """
        from sqlalchemy import text
@@ -139,16 +140,13 @@ class SearchService:
            """
            SELECT word, nentry::int AS weight
            FROM ts_stat(
-                'simple',
+                $$SELECT to_tsvector('chinese_zh',
                (
                    SELECT to_tsvector(
                        'simple',
                    coalesce(title_zh, '') || ' ' || coalesce(body_zh_text, '')
                )
                FROM articles
                WHERE title_zh IS NOT NULL OR body_zh_text IS NOT NULL
-                )
+                LIMIT 500$$
-            )
+            ) AS s
            WHERE word LIKE :prefix
            ORDER BY nentry DESC
            LIMIT :lim