68 lines
1.9 KiB
Python
68 lines
1.9 KiB
Python
|
|
"""Fetcher 抽象基类 + 通用工具。"""
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import hashlib
|
||
|
|
from abc import ABC, abstractmethod
|
||
|
|
from dataclasses import dataclass, field
|
||
|
|
from datetime import datetime
|
||
|
|
from typing import Any
|
||
|
|
|
||
|
|
import httpx
|
||
|
|
|
||
|
|
from app.config import settings
|
||
|
|
|
||
|
|
|
||
|
|
def normalize_url(url: str) -> str:
    """Return a canonical form of *url*, intended as input for ``url_hash``.

    Surrounding whitespace is stripped, the scheme and network location are
    lower-cased, query parameters whose name starts with ``utm_``
    (case-insensitive) are removed and the remaining ones re-encoded, the
    fragment is discarded, and trailing slashes are removed from the path
    (an empty path becomes ``/``).
    """
    from urllib.parse import urlsplit, urlunsplit, parse_qsl, urlencode

    parts = urlsplit(url.strip())

    # Keep every query pair except the utm_* tracking parameters.
    kept_pairs = []
    for name, value in parse_qsl(parts.query, keep_blank_values=True):
        if name.lower().startswith("utm_"):
            continue
        kept_pairs.append((name, value))

    # Strip trailing slashes, but never leave the path empty.
    trimmed_path = parts.path.rstrip("/")
    if not trimmed_path:
        trimmed_path = "/"

    return urlunsplit(
        (
            parts.scheme.lower(),
            parts.netloc.lower(),
            trimmed_path,
            urlencode(kept_pairs),
            "",  # the fragment is always dropped
        )
    )
|
||
|
|
|
||
|
|
|
||
|
|
def url_hash(url: str) -> str:
    """Return the hexadecimal SHA-1 digest of the normalized form of *url*."""
    canonical = normalize_url(url)
    digest = hashlib.sha1(canonical.encode())
    return digest.hexdigest()
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class FetchedItem:
    """Uniform return structure: one entry waiting to be stored."""

    # Required fields.
    url: str
    title: str

    # Content; the HTML body is optional, the text body defaults to empty.
    body_html: str | None = None
    body_text: str = ""

    # Optional metadata, left as None when not supplied.
    published_at: datetime | None = None
    lang: str | None = None
    author: str | None = None
    image_url: str | None = None
    guid: str | None = None

    # Free-form extra data; each instance gets its own fresh dict.
    raw: dict[str, Any] = field(default_factory=dict)
|
||
|
|
|
||
|
|
|
||
|
|
class BaseFetcher(ABC):
    """Abstract base class for fetchers bound to a single source URL."""

    def __init__(self, url: str, headers: dict | None = None):
        """Store the target URL and the request headers.

        A falsy ``headers`` value (``None`` or an empty dict) is replaced by
        a default ``User-Agent`` header.
        """
        self.url = url
        if headers:
            self.headers = headers
        else:
            self.headers = {"User-Agent": "DiaryNews/0.1 (+https://github.com/)"}

    @abstractmethod
    async def fetch(self) -> list[FetchedItem]:
        """Fetch and parse the source, returning a list of FetchedItem."""

    async def _http_get(self) -> bytes:
        """GET ``self.url`` and return the raw response body as bytes.

        Redirects are followed and the timeout is read from
        ``settings.fetch_timeout``. ``raise_for_status()`` is called on the
        response before its content is returned.
        """
        client_options = {
            "timeout": settings.fetch_timeout,
            "follow_redirects": True,
            "headers": self.headers,
        }
        async with httpx.AsyncClient(**client_options) as http:
            response = await http.get(self.url)
            response.raise_for_status()
            return response.content
|