Files
guba-indicator/spider.py

131 lines
4.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Spider module - scrapes comments from web pages.
"""
import requests
from lxml import etree
import time
from typing import List, Dict, Optional
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import random
class SpiderManager:
    """Fetch a web page and extract comment entries from it.

    Settings are read from the ``config`` mapping given to the constructor:

    * ``user_agent`` -- User-Agent header value (default ``'Mozilla/5.0'``).
    * ``retry_times`` -- number of attempts per request (default 3).
    * ``retry_interval`` -- base delay in seconds between attempts (default 5).
    * ``target_url`` / ``xpath`` -- defaults used by :meth:`fetch`.
    * ``fetch_interval`` -- value reported by :meth:`get_fetch_interval`
      (default 60).
    """

    def __init__(self, config: Dict):
        """Store ``config`` and create the HTTP session used for all requests."""
        self.config = config
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': config.get('user_agent', 'Mozilla/5.0')
        })
        self.retry_times = config.get('retry_times', 3)
        self.retry_interval = config.get('retry_interval', 5)

    def fetch(self, url: Optional[str] = None, xpath: Optional[str] = None) -> List[Dict]:
        """Download a page and return the comments found on it.

        Args:
            url: Page to download; falls back to ``config['target_url']``.
            xpath: XPath selecting the comment nodes; falls back to
                ``config['xpath']``.

        Returns:
            A list of dicts with the keys ``content`` and ``url``. The list is
            empty when no URL is configured or the download failed.
        """
        target_url = url or self.config.get('target_url', '')
        target_xpath = xpath or self.config.get('xpath', '')
        if not target_url:
            return []
        html = self._fetch_with_retry(target_url)
        if not html:
            return []
        return self._parse_comments(html, target_xpath, target_url)

    def _fetch_with_retry(self, url: str, max_retries: Optional[int] = None) -> Optional[str]:
        """Download ``url``, retrying on request errors.

        Args:
            url: Address to download.
            max_retries: Number of attempts; a falsy value (``None`` or ``0``)
                falls back to ``self.retry_times``.

        Returns:
            The decoded response body, or ``None`` when every attempt failed.
        """
        max_retries = max_retries or self.retry_times
        for attempt in range(max_retries):
            try:
                response = self.session.get(url, timeout=30)
                response.raise_for_status()
                # Decode with the encoding detected from the body rather than
                # the one announced in the response headers.
                response.encoding = response.apparent_encoding
                return response.text
            except requests.RequestException as e:
                print(f"请求失败 (尝试 {attempt + 1}/{max_retries}): {e}")
                # Sleep (with up to 2s of random jitter) only when another
                # attempt follows.
                if attempt < max_retries - 1:
                    time.sleep(self.retry_interval + random.uniform(0, 2))
        return None

    @staticmethod
    def _build_comment(elem, base_url: str) -> Optional[Dict]:
        """Turn one XPath result into a comment dict.

        Args:
            elem: Either an element node or a string result (as produced by
                XPath expressions such as ``text()`` or ``@attr``).
            base_url: Page URL, used to resolve a relative ``href`` and as the
                comment URL when the node carries no ``href``.

        Returns:
            ``{'content': ..., 'url': ...}``, or ``None`` when the node has no
            non-whitespace text.
        """
        if isinstance(elem, str):
            # String results carry no attributes, so there is no href to read.
            text = elem.strip()
            href = None
        else:
            # Elements returned by etree.HTML() are plain etree elements and
            # do not provide text_content(); itertext() walks the text of the
            # element and all of its descendants.
            text = ''.join(elem.itertext()).strip()
            href = elem.get('href')
        if not text:
            return None
        return {
            'content': text,
            'url': urljoin(base_url, href) if href else base_url,
        }

    def _parse_comments(self, html: str, xpath: str, base_url: str) -> List[Dict]:
        """Extract comments from ``html`` using ``xpath``.

        A node that cannot be converted is reported and skipped. If parsing or
        evaluating the XPath itself fails, the result of
        :meth:`_fallback_parse` is returned instead.
        """
        comments = []
        try:
            tree = etree.HTML(html)
            elements = tree.xpath(xpath)
            # An expression like string(...) yields a single string; wrap it
            # so it is treated as one result instead of being iterated
            # character by character.
            if isinstance(elements, str):
                elements = [elements]
            for elem in elements:
                try:
                    comment = self._build_comment(elem, base_url)
                except Exception as e:
                    print(f"解析元素失败: {e}")
                    continue
                if comment is not None:
                    comments.append(comment)
        except Exception as e:
            print(f"XPath解析失败: {e}")
            # Fall back to the BeautifulSoup-based parser.
            comments = self._fallback_parse(html, base_url)
        return comments

    def _fallback_parse(self, html: str, base_url: str) -> List[Dict]:
        """Fallback parser based on BeautifulSoup.

        Looks at ``a``/``div``/``p``/``span`` tags whose class contains
        ``linkblack``, inspects at most the first 50 of them and keeps those
        whose stripped text is longer than 5 characters. Every comment gets
        ``base_url`` as its URL.
        """
        def has_linkblack(value):
            # The class value may be None for tags without a class attribute.
            return bool(value) and 'linkblack' in value

        comments = []
        try:
            soup = BeautifulSoup(html, 'lxml')
            # The selector may need adjusting to the structure of the
            # target site.
            elements = soup.find_all(['a', 'div', 'p', 'span'], class_=has_linkblack)
            for elem in elements[:50]:  # cap the number of inspected tags
                text = elem.get_text().strip()
                if text and len(text) > 5:
                    comments.append({
                        'content': text,
                        'url': base_url
                    })
        except Exception as e:
            print(f"备选解析失败: {e}")
        return comments

    def set_user_agent(self, user_agent: str):
        """Replace the User-Agent header sent with every request."""
        self.session.headers.update({'User-Agent': user_agent})

    def update_config(self, config: Dict):
        """Merge ``config`` into the stored configuration.

        The keys ``user_agent``, ``retry_times`` and ``retry_interval`` are
        also applied to the session header / cached attributes.
        """
        self.config.update(config)
        if 'user_agent' in config:
            self.set_user_agent(config['user_agent'])
        if 'retry_times' in config:
            self.retry_times = config['retry_times']
        if 'retry_interval' in config:
            self.retry_interval = config['retry_interval']

    def get_fetch_interval(self) -> int:
        """Return ``config['fetch_interval']``, defaulting to 60."""
        return self.config.get('fetch_interval', 60)