Files
guba-indicator/spider.py
xiaji 10ce2ba17b feat(配置): 增加playwright_dir配置项并优化浏览器路径查找逻辑
添加playwright_dir配置项以支持自定义浏览器路径
优化打包环境和开发环境下的浏览器路径查找逻辑,优先使用配置中的路径
删除不再使用的build.bat打包脚本
2026-01-21 11:54:21 +08:00

414 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
爬虫模块 - 网站评论抓取
"""
import requests
from lxml import etree
import time
from typing import List, Dict, Optional
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import random
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from loguru import logger
class SpiderManager:
    """Scraper manager: fetches web pages and extracts comments, an index quote and a screenshot.

    The instance keeps a shared ``requests.Session`` (with a configurable
    User-Agent), the configuration dict it was built from, and the retry
    settings read from that dict (``retry_times`` and ``retry_interval``).
    """

    def __init__(self, config: Dict):
        """Create the HTTP session and read retry settings from *config*.

        Args:
            config: Settings dict. Keys read here: ``user_agent``
                (default ``'Mozilla/5.0'``), ``retry_times`` (default 3),
                ``retry_interval`` (default 5) and ``target_url`` (logging only).
        """
        self.config = config
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': config.get('user_agent', 'Mozilla/5.0')
        })
        self.retry_times = config.get('retry_times', 3)
        self.retry_interval = config.get('retry_interval', 5)
        logger.info(f"爬虫管理器初始化完成目标URL: {config.get('target_url', '')}")

    def fetch(self, url: Optional[str] = None, xpath: Optional[str] = None) -> List[Dict]:
        """Fetch a page and extract its comments.

        Args:
            url: Page to fetch; falls back to ``config['target_url']``.
            xpath: XPath selecting the comment nodes; falls back to
                ``config['xpath']``.

        Returns:
            A list of dicts, each with ``content`` and ``url`` keys. Empty
            when no URL is configured or the page could not be downloaded.
        """
        target_url = url or self.config.get('target_url', '')
        target_xpath = xpath or self.config.get('xpath', '')
        if not target_url:
            logger.warning("未设置目标URL")
            return []
        logger.info(f"开始抓取: {target_url}")
        html = self._fetch_with_retry(target_url)
        if not html:
            logger.warning("网页获取失败")
            return []
        comments = self._parse_comments(html, target_xpath, target_url)
        logger.info(f"解析完成,获取到 {len(comments)} 条评论")
        return comments

    def _fetch_with_retry(self, url: str, max_retries: int = None) -> Optional[str]:
        """Download *url*, retrying on ``requests.RequestException``.

        Args:
            url: Page to download.
            max_retries: Number of attempts; a falsy value falls back to
                ``self.retry_times``.

        Returns:
            The decoded response body, or ``None`` once every attempt failed.
        """
        # A retry count of 0 (or a negative one) used to yield an empty range,
        # so no request was ever sent and None came back without any log line.
        # Always make at least one attempt.
        max_retries = max(1, max_retries or self.retry_times or 1)
        for attempt in range(max_retries):
            try:
                logger.debug(f"尝试 {attempt + 1}/{max_retries} 获取网页")
                response = self.session.get(url, timeout=30)
                response.raise_for_status()
                # Use the encoding detected from the body rather than the header.
                response.encoding = response.apparent_encoding
                logger.debug(f"网页获取成功,大小: {len(response.text)} 字节")
                return response.text
            except requests.RequestException as e:
                logger.warning(f"请求失败 (尝试 {attempt + 1}/{max_retries}): {e}")
                if attempt < max_retries - 1:
                    # Fixed interval plus up to 2 s of random jitter between attempts.
                    time.sleep(self.retry_interval + random.uniform(0, 2))
        logger.error(f"所有重试均失败: {url}")
        return None

    def get_page_title(self, url: Optional[str] = None) -> str:
        """Return the text of the page's ``<title>`` element.

        Args:
            url: Page to fetch; falls back to ``config['target_url']``.

        Returns:
            The stripped title, or ``""`` when there is no URL, the download
            fails, no title exists or parsing raises.
        """
        target_url = url or self.config.get('target_url', '')
        if not target_url:
            logger.warning("未设置目标URL")
            return ""
        logger.info(f"获取页面标题: {target_url}")
        html = self._fetch_with_retry(target_url)
        if not html:
            logger.warning("网页获取失败")
            return ""
        try:
            # Parse the title with lxml.
            tree = etree.HTML(html)
            title_elements = tree.xpath('//title/text()')
            if not title_elements:
                logger.warning("未找到页面标题")
                return ""
            title = title_elements[0].strip()
            logger.info(f"获取到页面标题: {title}")
            return title
        except Exception as e:
            logger.error(f"解析页面标题失败: {e}")
            return ""

    def _parse_comments(self, html: str, xpath: str, base_url: str) -> List[Dict]:
        """Extract comments from *html* using *xpath*.

        Args:
            html: Page source.
            xpath: Expression selecting comment nodes (elements, or strings
                such as ``.../text()``).
            base_url: URL used to resolve relative ``href`` values and as the
                comment URL when a node has no ``href``.

        Returns:
            A list of ``{'content': ..., 'url': ...}`` dicts. If the XPath
            evaluation itself fails, the result of ``_fallback_parse``.
        """
        comments = []
        try:
            # Parse with lxml.
            tree = etree.HTML(html)
            elements = tree.xpath(xpath)
            if isinstance(elements, str):
                # string(...)-style expressions return one string, not a list;
                # iterating it would produce one "comment" per character.
                elements = [elements]
            logger.debug(f"XPath 匹配到 {len(elements)} 个元素")
            for elem in elements:
                try:
                    if isinstance(elem, str):
                        # text() / @attr results are plain strings without attributes.
                        text = elem.strip()
                        href = None
                    else:
                        # with_tail=False: the tail is the text *after* the
                        # element and is not part of the comment itself.
                        text = etree.tostring(
                            elem, method='text', encoding='unicode', with_tail=False
                        ).strip()
                        # Use the node's own href when it has one.
                        href = elem.get('href')
                    if text:
                        full_url = urljoin(base_url, href) if href else base_url
                        comments.append({
                            'content': text,
                            'url': full_url
                        })
                except Exception as e:
                    logger.error(f"解析元素失败: {e}")
                    continue
        except Exception as e:
            logger.error(f"XPath解析失败: {e}")
            # Fall back to the BeautifulSoup-based parser.
            comments = self._fallback_parse(html, base_url)
        return comments

    def _fallback_parse(self, html: str, base_url: str) -> List[Dict]:
        """Fallback comment parser based on BeautifulSoup.

        Looks at ``a``/``div``/``p``/``span`` tags whose class contains
        ``linkblack``, considers at most the first 50 of them and keeps
        texts longer than 5 characters.

        Returns:
            A list of ``{'content': ..., 'url': base_url}`` dicts; empty on error.
        """
        comments = []
        try:
            logger.debug("使用备选解析方法")
            soup = BeautifulSoup(html, 'lxml')
            # Look for common comment elements; adjust the selector to the
            # structure of the actual site if needed.
            elements = soup.find_all(
                ['a', 'div', 'p', 'span'],
                class_=lambda css: bool(css) and 'linkblack' in css,
            )
            for elem in elements[:50]:  # cap the number of candidates
                text = elem.get_text().strip()
                if text and len(text) > 5:
                    comments.append({
                        'content': text,
                        'url': base_url
                    })
            logger.debug(f"备选解析获取到 {len(comments)} 条评论")
        except Exception as e:
            logger.error(f"备选解析失败: {e}")
        return comments

    def _fetch_sse_with_selenium(self, url: str) -> Optional[str]:
        """Load *url* in headless Chrome via Selenium and return the page source.

        Waits up to 30 s for an element with id ``wmt_china`` and then
        another 2 s before reading the source.

        Returns:
            The page HTML, or ``None`` on any error. The driver is always quit.
        """
        driver = None
        try:
            # Configure Chrome.
            chrome_options = Options()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument('--window-size=1920,1080')
            chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
            # Create the WebDriver and open the page.
            driver = webdriver.Chrome(options=chrome_options)
            driver.get(url)
            # Wait for the page to load.
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.ID, "wmt_china"))
            )
            # Extra pause so dynamically loaded data can settle.
            time.sleep(2)
            html = driver.page_source
            logger.debug(f"Selenium 获取页面成功,大小: {len(html)} 字节")
            return html
        except Exception as e:
            logger.error(f"Selenium 获取页面失败: {e}")
            return None
        finally:
            if driver:
                driver.quit()

    def fetch_sse_stock_data(self) -> Dict[str, float]:
        """Fetch the current value of index ``sh000001`` from the Sina quote endpoint.

        Returns:
            ``{'time': 'HH:MM', 'value': <float>}`` where ``time`` is the
            local wall-clock time of the call (a string, despite the
            annotation) and ``value`` is field 3 of the response. An empty
            dict on request, parsing or any other error.
        """
        from datetime import datetime
        import re
        sse_url = "https://hq.sinajs.cn/list=sh000001"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Referer': 'https://finance.sina.com.cn/'
        }
        logger.info(f"通过新浪JS接口获取上证指数数据: {sse_url}")
        try:
            response = self.session.get(sse_url, timeout=30, headers=headers)
            response.raise_for_status()
            content = response.text
            logger.debug(f"获取到响应: {content}")
            # The payload is a JS assignment whose value is a comma-separated string.
            pattern = r'var hq_str_sh000001="([^"]+)"'
            match = re.search(pattern, content)
            if not match:
                logger.warning("未能解析新浪JS接口返回数据")
                return {}
            data_fields = match.group(1).split(',')
            if len(data_fields) < 32:
                logger.warning(f"数据字段不足: {len(data_fields)}")
                return {}
            stock_name = data_fields[0]
            current_price = float(data_fields[3]) if data_fields[3] else 0.0
            current_time = datetime.now().strftime("%H:%M")
            logger.info(f"✓ 成功获取 {stock_name}: {current_price} (时间: {current_time})")
            return {
                'time': current_time,
                'value': current_price
            }
        except requests.RequestException as e:
            logger.error(f"请求新浪JS接口失败: {e}")
            return {}
        except (ValueError, IndexError) as e:
            logger.error(f"解析数据失败: {e}")
            return {}
        except Exception as e:
            logger.error(f"获取股票数据异常: {e}")
            return {}

    def set_user_agent(self, user_agent: str):
        """Replace the User-Agent header sent by the shared session."""
        self.session.headers.update({'User-Agent': user_agent})

    def update_config(self, config: Dict):
        """Merge *config* into the current settings and refresh derived state.

        ``user_agent``, ``retry_times`` and ``retry_interval`` are applied
        to the session / instance attributes when present in *config*.
        """
        self.config.update(config)
        if 'user_agent' in config:
            self.set_user_agent(config['user_agent'])
        if 'retry_times' in config:
            self.retry_times = config['retry_times']
        if 'retry_interval' in config:
            self.retry_interval = config['retry_interval']

    def get_fetch_interval(self) -> int:
        """Return ``config['fetch_interval']``, defaulting to 60."""
        return self.config.get('fetch_interval', 60)

    @staticmethod
    def _find_browser_executable(playwright_dir: str) -> Optional[str]:
        """Return the path of a Chromium ``chrome.exe`` under *playwright_dir*, or ``None``.

        The three historically supported locations are checked first, in
        their original order. After that, any ``chromium-<revision>``
        directory is accepted (highest numeric revision first), so a
        bundle built with a different revision than 1091 is still found.
        """
        import glob
        import os

        candidates = [
            os.path.join(playwright_dir, 'chromium-1091', 'chrome-win', 'chrome.exe'),
            os.path.join(playwright_dir, 'chromium', 'chrome-win', 'chrome.exe'),
            os.path.join(playwright_dir, 'chrome-win', 'chrome.exe'),
        ]

        def _revision(path: str) -> int:
            # .../chromium-<rev>/chrome-win/chrome.exe -> <rev>, or -1 if not numeric
            folder = os.path.basename(os.path.dirname(os.path.dirname(path)))
            suffix = folder.rpartition('-')[2]
            return int(suffix) if suffix.isdigit() else -1

        pattern = os.path.join(
            glob.escape(playwright_dir), 'chromium-*', 'chrome-win', 'chrome.exe'
        )
        candidates.extend(sorted(glob.glob(pattern), key=_revision, reverse=True))

        for candidate in candidates:
            if os.path.exists(candidate):
                return candidate
        return None

    def _resolve_playwright_dir(self):
        """Work out the output directory and the Playwright browser directory.

        Returns:
            ``(current_dir, playwright_dir)``. ``current_dir`` is the folder
            of the executable (frozen build) or of this source file.
            ``playwright_dir`` is the configured ``playwright_dir`` when it
            exists on disk, otherwise (frozen builds only) the first
            existing bundled ``ms-playwright`` folder, otherwise ``None``.
        """
        import os
        import sys

        # Read the browser path from the configuration first.
        config_playwright_dir = self.config.get('playwright_dir')
        if config_playwright_dir:
            logger.info(f"从配置中获取到Playwright浏览器路径: {config_playwright_dir}")
        config_dir_usable = bool(config_playwright_dir) and os.path.exists(config_playwright_dir)

        if getattr(sys, 'frozen', False):
            # Packaged (frozen) environment.
            current_dir = os.path.dirname(sys.executable)
            if config_dir_usable:
                # The configured path takes precedence.
                playwright_dir = config_playwright_dir
                logger.info(f"使用配置中指定的Playwright浏览器路径: {playwright_dir}")
            else:
                # Configured path missing or unset: try the bundled locations.
                possible_paths = [
                    os.path.join(current_dir, '_internal', 'ms-playwright'),
                    os.path.join(current_dir, 'ms-playwright'),
                    os.path.join(os.path.dirname(current_dir), 'ms-playwright')
                ]
                playwright_dir = None
                for path in possible_paths:
                    if os.path.exists(path):
                        playwright_dir = path
                        logger.info(f"找到Playwright浏览器路径: {playwright_dir}")
                        break
                if not playwright_dir:
                    logger.warning("未找到Playwright浏览器路径尝试使用系统默认路径")
        else:
            # Development environment.
            current_dir = os.path.dirname(os.path.abspath(__file__))
            if config_dir_usable:
                # The configured path takes precedence.
                playwright_dir = config_playwright_dir
                logger.info(f"使用配置中指定的Playwright浏览器路径: {playwright_dir}")
            else:
                playwright_dir = None
            logger.info(f"开发环境,当前目录: {current_dir}")
        return current_dir, playwright_dir

    def fetch_sse_screenshot(self) -> str:
        """Take a screenshot of one element on the SSE home page.

        Opens ``https://www.sse.com.cn/`` in headless Chromium via
        Playwright, locates the target element by XPath and saves it as
        ``sse_screenshot.png`` next to the executable / this source file.

        Returns:
            The screenshot file path, or ``""`` when Playwright is not
            installed, the element is not found, or any other error occurs.
        """
        import os
        import sys

        url = "https://www.sse.com.cn/"
        xpath = "//div[contains(@class,'gray_bg')]//div[contains(@class,'col-md-7')]"
        logger.info(f"开始爬取上证所网页截图: {url}")
        logger.info(f"目标XPath: {xpath}")

        current_dir, playwright_dir = self._resolve_playwright_dir()
        output_dir = current_dir
        screenshot_path = os.path.join(output_dir, "sse_screenshot.png")
        try:
            # Imported inside the try so that a missing Playwright install is
            # reported by the ImportError handler below instead of propagating.
            from playwright.sync_api import sync_playwright

            with sync_playwright() as p:
                # Browser launch options.
                browser_launch_options = {
                    'headless': True
                }
                if playwright_dir:
                    executable_path = self._find_browser_executable(playwright_dir)
                    if executable_path:
                        browser_launch_options['executable_path'] = executable_path
                        logger.info(f"使用自定义浏览器路径: {executable_path}")
                    else:
                        logger.warning("未找到浏览器可执行文件,使用系统默认浏览器")
                browser = p.chromium.launch(**browser_launch_options)
                try:
                    page = browser.new_page()
                    page.set_default_timeout(60000)
                    logger.info("正在访问页面...")
                    page.goto(url, wait_until="networkidle")
                    logger.info("等待页面加载完成...")
                    page.wait_for_load_state("domcontentloaded")
                    page.wait_for_timeout(5000)
                    logger.info(f"查找XPath元素: {xpath}")
                    matches = page.locator(f"xpath={xpath}")
                    if matches.count() == 0:
                        logger.warning("✗ 未找到目标元素")
                        return ""
                    # Locator actions are strict and raise when the locator
                    # resolves to more than one node; pin the first match.
                    element = matches.first
                    logger.info("✓ 找到目标元素")
                    is_visible = element.is_visible()
                    logger.info(f"元素可见: {is_visible}")
                    if not is_visible:
                        logger.info("元素不可见,尝试滚动到可见区域...")
                        element.scroll_into_view_if_needed()
                        page.wait_for_timeout(2000)
                    logger.info(f"正在截取元素截图到: {screenshot_path}")
                    element.screenshot(path=screenshot_path)
                    logger.info("✓ 截屏成功")
                    return screenshot_path
                finally:
                    # Close the browser on every path, including errors.
                    browser.close()
        except ImportError as e:
            logger.error(f"Playwright导入失败: {e}")
            logger.error("请确保已安装playwright: pip install playwright")
            logger.error("并安装浏览器: python -m playwright install chromium")
            return ""
        except Exception as e:
            logger.error(f"爬取上证所截图失败: {e}")
            logger.exception(e)  # log the full traceback
            # Log environment details to help debugging.
            logger.error(f"环境信息 - Frozen: {getattr(sys, 'frozen', False)}")
            logger.error(f"当前目录: {current_dir}")
            logger.error(f"Playwright目录: {playwright_dir}")
            return ""