""" 爬虫模块 - 网站评论抓取 """ import requests from lxml import etree import time from typing import List, Dict, Optional from urllib.parse import urljoin from bs4 import BeautifulSoup import random class SpiderManager: """爬虫管理器""" def __init__(self, config: Dict): self.config = config self.session = requests.Session() self.session.headers.update({ 'User-Agent': config.get('user_agent', 'Mozilla/5.0') }) self.retry_times = config.get('retry_times', 3) self.retry_interval = config.get('retry_interval', 5) def fetch(self, url: str = None, xpath: str = None) -> List[Dict]: """ 抓取网页评论 返回评论列表,每个元素包含 content 和 url """ target_url = url or self.config.get('target_url', '') target_xpath = xpath or self.config.get('xpath', '') if not target_url: return [] html = self._fetch_with_retry(target_url) if not html: return [] return self._parse_comments(html, target_xpath, target_url) def _fetch_with_retry(self, url: str, max_retries: int = None) -> Optional[str]: """带重试的网页获取""" max_retries = max_retries or self.retry_times for attempt in range(max_retries): try: response = self.session.get(url, timeout=30) response.raise_for_status() response.encoding = response.apparent_encoding return response.text except requests.RequestException as e: print(f"请求失败 (尝试 {attempt + 1}/{max_retries}): {e}") if attempt < max_retries - 1: time.sleep(self.retry_interval + random.uniform(0, 2)) else: return None return None def _parse_comments(self, html: str, xpath: str, base_url: str) -> List[Dict]: """解析评论""" comments = [] try: # 使用 lxml 解析 tree = etree.HTML(html) elements = tree.xpath(xpath) for elem in elements: try: text = elem.text_content().strip() if text: # 获取链接的 href(如果存在) href = elem.get('href') full_url = urljoin(base_url, href) if href else base_url comments.append({ 'content': text, 'url': full_url }) except Exception as e: print(f"解析元素失败: {e}") continue except Exception as e: print(f"XPath解析失败: {e}") # 备选解析方法 comments = self._fallback_parse(html, base_url) return comments def _fallback_parse(self, html: str, base_url: str) -> List[Dict]: """备选解析方法 - 使用 BeautifulSoup""" comments = [] try: soup = BeautifulSoup(html, 'lxml') # 尝试查找常见的评论元素 # 这里可以根据实际网站结构调整选择器 elements = soup.find_all(['a', 'div', 'p', 'span'], class_=lambda x: x and 'linkblack' in x if x else False) for elem in elements[:50]: # 限制数量 text = elem.get_text().strip() if text and len(text) > 5: comments.append({ 'content': text, 'url': base_url }) except Exception as e: print(f"备选解析失败: {e}") return comments def set_user_agent(self, user_agent: str): """更新User-Agent""" self.session.headers.update({'User-Agent': user_agent}) def update_config(self, config: Dict): """更新配置""" self.config.update(config) if 'user_agent' in config: self.set_user_agent(config['user_agent']) if 'retry_times' in config: self.retry_times = config['retry_times'] if 'retry_interval' in config: self.retry_interval = config['retry_interval'] def get_fetch_interval(self) -> int: """获取爬取间隔""" return self.config.get('fetch_interval', 60)