zhparser 不标 A 权重(也不标 B/C/D),传 'a' mask 给 ts_stat(text, weights) 会过滤掉所有词 但不报错,静默 0 行。改成 ts_stat(text) 单参(等价 mask='abcd',聚合所有权重)。 修: - 0010 迁移里 refresh_search_keywords() 改用单参 ts_stat - 0010 迁移 downgrade 部分同步修 - 0009 迁移 refresh_search_keywords() 同步修 - services/search.py _fallback_keywords 改用 chinese_zh + 单参 ts_stat
159 lines
5.6 KiB
Python
159 lines
5.6 KiB
Python
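"""Search-suggestion service: blends A (high-frequency words), B (real titles) and a cold-start fallback.

- A: search_keywords(prefix_keys @> ARRAY['美'], ORDER BY weight DESC)
- B: search_title_suggestions(prefix_keys @> ARRAY['美'], ORDER BY published_at DESC)
- fallback: when either lookup returns no rows, query articles live instead
  (ILIKE on titles, ts_stat aggregation for keywords) — covers cold start /
  the worker not having refreshed the tables yet
"""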
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
|
|
from sqlalchemy import desc, select
|
|
from sqlalchemy.dialects.postgresql import ARRAY
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from app.models.article import Article
|
|
from app.models.search_keyword import SearchKeyword
|
|
from app.models.search_title_suggestion import SearchTitleSuggestion
|
|
|
|
logger = logging.getLogger("news.search")
|
|
|
|
|
|
class SearchService:
    """Search-suggestion service.

    Given a prefix, produces two candidate groups:

    - ``titles``: real article titles read from ``SearchTitleSuggestion``,
      newest ``published_at`` first.
    - ``keywords``: high-frequency words read from ``SearchKeyword``,
      highest ``weight`` first.

    Whenever one of the groups comes back empty it is recomputed with a live
    query against ``articles`` (cold-start fallback).
    """

    # Upper bound enforced on the ``limit`` argument of ``suggestions``.
    MAX_LIMIT = 20

    # Escape character for LIKE / ILIKE patterns built from user input.
    # A non-backslash character is used so the raw SQL below does not depend
    # on the server's backslash-in-string-literal settings.
    _LIKE_ESCAPE = "/"

    def __init__(self, session: AsyncSession):
        self.session = session

    @classmethod
    def _like_prefix(cls, q: str) -> str:
        """Build a LIKE pattern matching values that start with literal ``q``.

        ``%`` and ``_`` (and the escape character itself) are escaped so that
        user input cannot inject wildcards; the caller must pass
        ``_LIKE_ESCAPE`` as the statement's ESCAPE character.
        """
        esc = cls._LIKE_ESCAPE
        literal = (
            q.replace(esc, esc + esc)  # escape the escape char first
            .replace("%", esc + "%")
            .replace("_", esc + "_")
        )
        return f"{literal}%"

    async def suggestions(
        self,
        q: str,
        limit: int = 10,
    ) -> dict[str, list[dict]]:
        """Return search suggestions for a prefix.

        Args:
            q: Prefix typed by the user; surrounding whitespace is stripped.
            limit: Maximum number of entries per group. Values above
                ``MAX_LIMIT`` are capped; values <= 0 yield empty groups
                without touching the database.

        Returns:
            ``{"query": q, "titles": [...], "keywords": [...]}`` where

            - each ``titles`` element is
              ``{"id": article_id, "published_at": iso_string_or_None, "lang": ...}``
            - each ``keywords`` element is
              ``{"word": ..., "weight": ..., "source": ...}``
        """
        q = q.strip()
        limit = min(limit, self.MAX_LIMIT)
        if not q or limit <= 0:
            return {"query": q, "titles": [], "keywords": []}

        # 1) Pre-computed title suggestions (plan B): array containment on
        #    prefix_keys, newest first.
        title_stmt = (
            select(
                SearchTitleSuggestion.article_id,
                SearchTitleSuggestion.published_at,
                SearchTitleSuggestion.title_lang,
            )
            .where(SearchTitleSuggestion.prefix_keys.contains([q]))
            .order_by(desc(SearchTitleSuggestion.published_at))
            .limit(limit)
        )
        title_rows = (await self.session.execute(title_stmt)).all()
        titles = [
            {
                "id": row.article_id,
                "published_at": row.published_at.isoformat() if row.published_at else None,
                "lang": row.title_lang,
            }
            for row in title_rows
        ]

        # 2) Pre-computed keywords (plan A): array containment on prefix_keys,
        #    heaviest first.
        kw_stmt = (
            select(SearchKeyword.keyword, SearchKeyword.weight, SearchKeyword.source)
            .where(SearchKeyword.prefix_keys.contains([q]))
            .order_by(desc(SearchKeyword.weight))
            .limit(limit)
        )
        kw_rows = (await self.session.execute(kw_stmt)).all()
        keywords = [
            {"word": row.keyword, "weight": row.weight, "source": row.source}
            for row in kw_rows
        ]

        # 3) Cold-start fallback: each group that came back empty is
        #    recomputed live from articles (e.g. freshly created database or
        #    truncated suggestion tables).
        if not titles:
            titles = await self._fallback_titles(q, limit)
        if not keywords:
            keywords = await self._fallback_keywords(q, limit)

        return {"query": q, "titles": titles, "keywords": keywords}

    async def _fallback_titles(self, q: str, limit: int) -> list[dict]:
        """Fallback: prefix-match article titles live (slow but works).

        - Matches articles whose ``title_zh`` OR ``title`` starts with ``q``
          (case-insensitive); ``q`` is matched literally, wildcards in it are
          escaped.
        - Only non-duplicate articles published in the last 7 days are
          considered, newest first.
        """
        from datetime import datetime, timedelta, timezone

        from sqlalchemy import or_

        # Naive UTC cutoff.
        # NOTE(review): presumably Article.published_at is stored as naive
        # UTC — confirm against the model.
        since = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(days=7)
        pattern = self._like_prefix(q)
        esc = self._LIKE_ESCAPE
        stmt = (
            select(Article.id, Article.published_at, Article.title_zh, Article.title)
            .where(
                Article.published_at >= since,
                Article.duplicate_of.is_(None),
                or_(
                    Article.title_zh.ilike(pattern, escape=esc),
                    Article.title.ilike(pattern, escape=esc),
                ),
            )
            .order_by(desc(Article.published_at))
            .limit(limit)
        )
        rows = (await self.session.execute(stmt)).all()
        return [
            {
                "id": row.id,
                "published_at": row.published_at.isoformat() if row.published_at else None,
                # "zh" when a translated title exists, otherwise the source title.
                "lang": "zh" if row.title_zh else "src",
            }
            for row in rows
        ]

    async def _fallback_keywords(self, q: str, limit: int) -> list[dict]:
        """Fallback: aggregate keywords live with ``ts_stat`` (slow but works).

        - Builds ``to_tsvector('chinese_zh', title_zh || ' ' || body_zh_text)``
          over up to 500 articles and counts word occurrences.
        - Intended for when ``search_keywords`` has no matching rows (table
          empty / worker has not refreshed yet).
        - Uses the single-argument ``ts_stat(text)``: passing a weights mask
          such as ``'a'`` as second argument filters out every word because
          zhparser does not assign A weights, silently yielding 0 rows.
        - ``q`` is matched literally; LIKE wildcards in it are escaped.
        """
        from sqlalchemy import text

        # NOTE(review): the inner LIMIT 500 has no ORDER BY, so which rows are
        # sampled is left to the planner.
        # The ESCAPE character in the SQL must stay in sync with _LIKE_ESCAPE.
        sql = text(
            """
            SELECT word, nentry::int AS weight
            FROM ts_stat(
                $$SELECT to_tsvector('chinese_zh',
                    coalesce(title_zh, '') || ' ' || coalesce(body_zh_text, '')
                )
                FROM articles
                WHERE title_zh IS NOT NULL OR body_zh_text IS NOT NULL
                LIMIT 500$$
            ) AS s
            WHERE word LIKE :prefix ESCAPE '/'
            ORDER BY nentry DESC
            LIMIT :lim
            """
        )
        rows = (
            await self.session.execute(
                sql, {"prefix": self._like_prefix(q), "lim": limit}
            )
        ).all()
        return [{"word": r.word, "weight": r.weight, "source": "ts_stat_live"} for r in rows]
|