"""RSS / Atom fetcher(基于 feedparser)。

增强:对 content 太短(< BODY_MIN_LEN)的 item,自动去 article URL 抓全文
用 trafilatura 抽取(从 RSS 摘要升级到全文)。
"""
from __future__ import annotations

import logging
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime

import feedparser
import httpx
import trafilatura
from bs4 import BeautifulSoup
from dateutil import parser as dtp

from app.services.fetchers.base import BaseFetcher, FetchedItem

# Module-level logger used by the RSS fetcher.
logger = logging.getLogger("news.fetcher.rss")

# If the body supplied by the feed has fewer characters than this, the
# fetcher downloads the article URL and extracts the full text instead.
BODY_MIN_LEN = 500


class RSSFetcher(BaseFetcher):
    """Fetch an RSS/Atom feed and turn its entries into ``FetchedItem`` objects.

    Entries whose feed-supplied body is shorter than ``BODY_MIN_LEN``
    characters are upgraded by downloading the article page and extracting
    its main content with trafilatura.
    """

    # Browser-like headers: many sites (e.g. NHK news.web) answer crawler
    # User-Agents with HTTP 403.
    _BROWSER_HEADERS = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "ja,en-US;q=0.9,en;q=0.8,zh;q=0.7",
    }

    async def fetch(self) -> list[FetchedItem]:
        """Download and parse the feed, returning one item per usable entry.

        Entries without a link/id or without a non-blank title are skipped.

        Returns:
            The list of extracted items, in feed order.

        Raises:
            RuntimeError: if feedparser flags the document as malformed and
                no entries could be recovered from it.
        """
        raw = await self._http_get()
        try:
            payload = raw.decode("utf-8")
        except UnicodeDecodeError:
            # Not valid UTF-8: hand feedparser the raw bytes so it can detect
            # the real encoding, instead of replacing undecodable bytes with
            # U+FFFD and corrupting the text.
            payload = raw
        feed = feedparser.parse(payload)
        if feed.bozo and not feed.entries:
            # The whole document failed to parse.
            raise RuntimeError(f"RSS parse failed: {feed.bozo_exception}")

        # Created lazily by _fetch_fulltext() and shared by every entry of
        # this fetch() call.
        self._http_client: httpx.AsyncClient | None = None
        items: list[FetchedItem] = []
        try:
            for entry in feed.entries:
                url = entry.get("link") or entry.get("id")
                if not url:
                    continue
                title = (entry.get("title") or "").strip()
                if not title:
                    continue

                body_html, body_text = self._extract_from_entry(entry)

                # Body too short: fetch the article page and keep the
                # extracted text only when it is actually longer.
                if len(body_text) < BODY_MIN_LEN:
                    full_html, full_text = await self._fetch_fulltext(url)
                    if full_text and len(full_text) > len(body_text):
                        body_text = full_text
                        body_html = full_html or body_html

                items.append(
                    FetchedItem(
                        url=url,
                        title=title,
                        body_html=body_html,
                        body_text=body_text,
                        published_at=_parse_dt(
                            entry.get("published")
                            or entry.get("updated")
                            or entry.get("created")
                        ),
                        lang=entry.get("language") or feed.feed.get("language"),
                        author=entry.get("author"),
                        image_url=self._extract_image(entry),
                        guid=entry.get("id") or entry.get("guid"),
                    )
                )
        finally:
            # Always release the connection pool, even if building an item
            # raised, and drop the reference so a closed client is never
            # reused.
            client, self._http_client = self._http_client, None
            if client is not None:
                await client.aclose()
        return items

    @staticmethod
    def _extract_from_entry(e) -> tuple[str | None, str]:
        """Return ``(body_html, body_text)`` taken from a feed entry.

        The longest ``content`` value is preferred; ``summary`` is the
        fallback. The text is the HTML with script/style/noscript elements
        removed. Returns ``(None, "")`` when the entry carries no body.
        """
        body_html = None
        contents = e.get("content")
        if contents:
            # `or ""` guards against a content item whose value is None.
            longest = max(contents, key=lambda c: len(c.get("value") or ""))
            body_html = longest.get("value")
        if not body_html:
            body_html = e.get("summary")
        if not body_html:
            return None, ""
        soup = BeautifulSoup(body_html, "lxml")
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        return body_html, soup.get_text(separator="\n", strip=True)

    @staticmethod
    def _extract_image(e) -> str | None:
        """Return the first image URL found on the entry, or ``None``.

        Sources are tried in order: ``media_content``, ``media_thumbnail``,
        then any enclosure whose MIME type starts with ``image/``. A source
        that has no usable URL is skipped so that later sources still get a
        chance.
        """
        for key in ("media_content", "media_thumbnail"):
            media = e.get(key)
            if not media:
                continue
            try:
                candidate = media[0].get("url")
            except (IndexError, KeyError, TypeError, AttributeError):
                continue
            if candidate:
                return candidate
        for enc in e.get("enclosures") or ():
            # `or ""` guards against an enclosure whose type is None.
            if (enc.get("type") or "").startswith("image/"):
                candidate = enc.get("href") or enc.get("url")
                if candidate:
                    return candidate
        return None

    async def _fetch_fulltext(self, url: str) -> tuple[str | None, str]:
        """Download the article at ``url`` and extract its main content.

        Best effort: any download or extraction failure is logged as a
        warning and reported as ``(None, "")``.

        Returns:
            ``(html, text)`` as produced by trafilatura; either may be an
            empty string when nothing could be extracted.
        """
        try:
            if self._http_client is None:
                self._http_client = httpx.AsyncClient(
                    follow_redirects=True,
                    timeout=20,
                    headers=self._BROWSER_HEADERS,
                )
            r = await self._http_client.get(url)
            r.raise_for_status()
        except Exception as e:
            logger.warning("fulltext fetch failed for %s: %s", url, e)
            return None, ""

        options = {
            "include_comments": False,
            "include_tables": False,
            "favor_recall": True,
        }
        try:
            page = r.text
            html = trafilatura.extract(page, output_format="html", **options) or ""
            text = trafilatura.extract(page, output_format="txt", **options) or ""
        except Exception as e:
            logger.warning("trafilatura extract failed for %s: %s", url, e)
            return None, ""
        return html, text


def _parse_dt(s: str | None) -> datetime | None:
    if not s:
        return None
    try:
        dt = dtp.parse(s)
    except (ValueError, TypeError, dtp.ParserError):
        try:
            dt = parsedate_to_datetime(s)
        except Exception:
            return None
    if dt is None:
        return None
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    return dt.astimezone(timezone.utc)