131 lines
4.3 KiB
Python
131 lines
4.3 KiB
Python
"""
Spider module - scrapes comments from web pages.
"""
|
||
import requests
|
||
from lxml import etree
|
||
import time
|
||
from typing import List, Dict, Optional
|
||
from urllib.parse import urljoin
|
||
from bs4 import BeautifulSoup
|
||
import random
|
||
|
||
|
||
class SpiderManager:
    """Fetch a web page over HTTP and extract comment entries from it.

    Configuration keys read by this class (all optional):
    ``user_agent``, ``retry_times``, ``retry_interval``, ``target_url``,
    ``xpath`` and ``fetch_interval``.
    """

    def __init__(self, config: Dict):
        """Create the HTTP session and cache the retry settings.

        Args:
            config: Settings dictionary. It is stored by reference, so
                ``update_config`` mutates the same object.
        """
        self.config = config
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': config.get('user_agent', 'Mozilla/5.0')
        })
        self.retry_times = config.get('retry_times', 3)
        self.retry_interval = config.get('retry_interval', 5)

    def fetch(self, url: Optional[str] = None, xpath: Optional[str] = None) -> List[Dict]:
        """Download a page and return the comments found on it.

        Args:
            url: Page to fetch; falls back to ``config['target_url']``.
            xpath: XPath selecting the comment nodes; falls back to
                ``config['xpath']``.

        Returns:
            A list of dicts with the keys ``content`` and ``url``. The list
            is empty when no URL is configured or the download fails.
        """
        target_url = url or self.config.get('target_url', '')
        target_xpath = xpath or self.config.get('xpath', '')

        if not target_url:
            return []

        html = self._fetch_with_retry(target_url)
        if not html:
            return []

        return self._parse_comments(html, target_xpath, target_url)

    def _fetch_with_retry(self, url: str, max_retries: Optional[int] = None) -> Optional[str]:
        """Return the page text, retrying on request errors.

        Args:
            url: Page to download.
            max_retries: Total number of attempts; defaults to
                ``self.retry_times`` when ``None``.

        Returns:
            The decoded response body, or ``None`` if every attempt failed.
        """
        if max_retries is None:
            max_retries = self.retry_times
        # A configured value of 0 used to skip the request entirely; always
        # try at least once so a "no retries" setting still fetches the page.
        max_retries = max(1, max_retries)

        for attempt in range(max_retries):
            try:
                response = self.session.get(url, timeout=30)
                response.raise_for_status()
                # Use the encoding detected from the body rather than the
                # one declared in the response headers.
                response.encoding = response.apparent_encoding
                return response.text
            except requests.RequestException as e:
                print(f"请求失败 (尝试 {attempt + 1}/{max_retries}): {e}")
                # Sleep (with jitter) only when another attempt follows.
                if attempt < max_retries - 1:
                    time.sleep(self.retry_interval + random.uniform(0, 2))

        return None

    @staticmethod
    def _extract_comment(item, base_url: str) -> Optional[Dict]:
        """Convert one XPath match into a comment dict.

        Args:
            item: Either a string (from ``text()`` / ``@attr`` expressions)
                or an element exposing ``itertext()`` and ``get()``.
            base_url: URL of the page, used to resolve a relative ``href``
                and as the default comment URL.

        Returns:
            ``{'content': ..., 'url': ...}``, or ``None`` when the match
            contains no non-whitespace text.
        """
        if isinstance(item, str):
            text = item.strip()
            href = None
        else:
            # Plain lxml.etree elements have no text_content() (that method
            # belongs to lxml.html elements), so join the text nodes instead.
            text = ''.join(item.itertext()).strip()
            href = item.get('href')

        if not text:
            return None

        return {
            'content': text,
            'url': urljoin(base_url, href) if href else base_url,
        }

    def _parse_comments(self, html: str, xpath: str, base_url: str) -> List[Dict]:
        """Extract comments from ``html`` using ``xpath``.

        If parsing or XPath evaluation raises, the BeautifulSoup based
        fallback parser is used instead.

        Returns:
            A list of dicts with the keys ``content`` and ``url``.
        """
        comments = []

        try:
            tree = etree.HTML(html)
            matches = tree.xpath(xpath)
            # string(...) style expressions return one string, not a list;
            # wrap it so it is not iterated character by character.
            if isinstance(matches, str):
                matches = [matches]

            for item in matches:
                try:
                    comment = self._extract_comment(item, base_url)
                except Exception as e:
                    # One malformed node must not abort the whole page.
                    print(f"解析元素失败: {e}")
                    continue
                if comment is not None:
                    comments.append(comment)

        except Exception as e:
            print(f"XPath解析失败: {e}")
            # Fallback parsing method.
            comments = self._fallback_parse(html, base_url)

        return comments

    def _fallback_parse(self, html: str, base_url: str) -> List[Dict]:
        """Fallback parser built on BeautifulSoup.

        Collects text from ``a``/``div``/``p``/``span`` tags that have a
        class containing ``linkblack``. Only the first 50 matches are
        inspected and texts of 5 characters or fewer are skipped.

        Returns:
            A list of dicts with the keys ``content`` and ``url``; every
            ``url`` is ``base_url``.
        """
        comments = []
        try:
            soup = BeautifulSoup(html, 'lxml')

            # Adjust this selector to the structure of the target site.
            elements = soup.find_all(
                ['a', 'div', 'p', 'span'],
                class_=lambda value: bool(value) and 'linkblack' in value,
            )

            for elem in elements[:50]:  # cap the number of inspected nodes
                text = elem.get_text().strip()
                if text and len(text) > 5:
                    comments.append({
                        'content': text,
                        'url': base_url
                    })
        except Exception as e:
            print(f"备选解析失败: {e}")

        return comments

    def set_user_agent(self, user_agent: str):
        """Replace the User-Agent header sent by the session."""
        self.session.headers.update({'User-Agent': user_agent})

    def update_config(self, config: Dict):
        """Merge ``config`` into the current settings and apply them.

        The ``user_agent``, ``retry_times`` and ``retry_interval`` keys are
        also pushed to the session / cached attributes when present.
        """
        self.config.update(config)
        if 'user_agent' in config:
            self.set_user_agent(config['user_agent'])
        if 'retry_times' in config:
            self.retry_times = config['retry_times']
        if 'retry_interval' in config:
            self.retry_interval = config['retry_interval']

    def get_fetch_interval(self) -> int:
        """Return ``config['fetch_interval']``, defaulting to 60."""
        return self.config.get('fetch_interval', 60)
|