refactor(spider): 将Playwright替换为Selenium实现网页截图功能。

此版本打包生成的 exe 已可以正常运行。

移除Playwright相关代码,改用Selenium实现上证所网页截图功能。修改包括:
1. 删除Playwright依赖和配置逻辑
2. 添加Selenium相关配置和异常处理
3. 优化截图流程和日志记录
同时删除不再需要的build_new.bat打包脚本。
This commit is contained in:
2026-01-23 11:55:31 +08:00
parent 5d79cd9e8f
commit df9348ca95
4 changed files with 66 additions and 151 deletions

View File

@@ -1,48 +0,0 @@
@echo off
REM Packaging script: installs dependencies, removes old build output, runs
REM PyInstaller against build.spec, and copies resource files next to the
REM generated executable.
REM Switch the console code page to 65001 (UTF-8); chcp's own output is discarded.
chcp 65001 >nul
echo ========================================
echo 股吧人气指示器 - 打包工具
echo ========================================
REM Check for and install the required dependencies
echo 检查并安装依赖...
pip install -r requirements.txt
REM Install the Playwright browser (Chromium)
echo 安装Playwright浏览器...
python -m playwright install chromium
REM Check whether pyinstaller is installed; "pip show" output is discarded and
REM only its exit code is used to decide whether to install it
pip show pyinstaller >nul 2>&1
if errorlevel 1 (
echo 正在安装 pyinstaller...
pip install pyinstaller
)
REM Clean up old build artifacts
echo 清理旧的构建文件...
if exist "build" rmdir /s /q build
if exist "dist" rmdir /s /q dist
if exist "guba-indicator.spec" del guba-indicator.spec
echo 开始打包...
pyinstaller build.spec --noconfirm
REM Success is judged by whether the expected executable exists afterwards
if exist "dist\guba-indicator\guba-indicator.exe" (
echo ========================================
echo 打包成功!
echo 可执行文件位置: dist\guba-indicator\guba-indicator.exe
echo ========================================
REM Copy the required resource files; copy output and errors are discarded,
REM so a missing source file is not reported
echo 复制资源文件...
copy guba.ico dist\guba-indicator\ >nul 2>&1
copy indicator.ico dist\guba-indicator\ >nul 2>&1
copy config.json dist\guba-indicator\ >nul 2>&1
echo 资源文件复制完成!
) else (
echo 打包失败,请检查错误信息
)
REM Keep the console window open until a key is pressed
pause

BIN
main.exe

Binary file not shown.

169
spider.py
View File

@@ -276,129 +276,89 @@ class SpiderManager:
def fetch_sse_screenshot(self) -> str: def fetch_sse_screenshot(self) -> str:
""" """
爬取上证所网页指定元素截图 使用Selenium爬取上证所网页指定元素截图
返回截图文件路径 返回截图文件路径
""" """
from playwright.sync_api import sync_playwright
import os import os
import sys import sys
url = "https://www.sse.com.cn/" url = "https://www.sse.com.cn/"
xpath = "//div[contains(@class,'gray_bg')]//div[contains(@class,'col-md-7')]" xpath = "//div[contains(@class,'gray_bg')]//div[contains(@class,'col-md-7')]"
logger.info(f"开始爬取上证所网页截图: {url}") logger.info(f"开始使用Selenium爬取上证所网页截图: {url}")
logger.info(f"目标XPath: {xpath}") logger.info(f"目标XPath: {xpath}")
# 首先从配置中读取浏览器路径
config_playwright_dir = self.config.get('playwright_dir')
if config_playwright_dir:
logger.info(f"从配置中获取到Playwright浏览器路径: {config_playwright_dir}")
# 获取当前脚本目录 # 获取当前脚本目录
if getattr(sys, 'frozen', False): if getattr(sys, 'frozen', False):
# 打包后的环境 # 打包后的环境
current_dir = os.path.dirname(sys.executable) current_dir = os.path.dirname(sys.executable)
# 优先使用配置中的路径
if config_playwright_dir and os.path.exists(config_playwright_dir):
playwright_dir = config_playwright_dir
logger.info(f"使用配置中指定的Playwright浏览器路径: {playwright_dir}")
else:
# 配置路径不存在或未设置尝试多个可能的Playwright浏览器路径
possible_paths = [
os.path.join(current_dir, '_internal', 'ms-playwright'),
os.path.join(current_dir, 'ms-playwright'),
os.path.join(os.path.dirname(current_dir), 'ms-playwright')
]
playwright_dir = None
for path in possible_paths:
if os.path.exists(path):
playwright_dir = path
logger.info(f"找到Playwright浏览器路径: {playwright_dir}")
break
if not playwright_dir:
logger.warning("未找到Playwright浏览器路径尝试使用系统默认路径")
else: else:
# 开发环境 # 开发环境
current_dir = os.path.dirname(os.path.abspath(__file__)) current_dir = os.path.dirname(os.path.abspath(__file__))
# 优先使用配置中的路径
if config_playwright_dir and os.path.exists(config_playwright_dir):
playwright_dir = config_playwright_dir
logger.info(f"使用配置中指定的Playwright浏览器路径: {playwright_dir}")
else:
playwright_dir = None
logger.info(f"开发环境,当前目录: {current_dir}")
output_dir = current_dir output_dir = current_dir
screenshot_path = os.path.join(output_dir, "sse_screenshot.png") screenshot_path = os.path.join(output_dir, "sse_screenshot.png")
logger.info(f"截图将保存到: {screenshot_path}")
driver = None
try: try:
with sync_playwright() as p: # 配置Chrome选项
# 设置浏览器路径 chrome_options = Options()
browser_launch_options = { chrome_options.add_argument('--headless')
'headless': True chrome_options.add_argument('--no-sandbox')
} chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-gpu')
if playwright_dir: chrome_options.add_argument('--window-size=1920,1080')
# 尝试多个可能的浏览器可执行文件路径 chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
possible_executables = [
os.path.join(playwright_dir, 'chromium-1091', 'chrome-win', 'chrome.exe'), # 设置Chrome浏览器路径如果配置了
os.path.join(playwright_dir, 'chromium', 'chrome-win', 'chrome.exe'), chrome_path = self.config.get('chrome_path')
os.path.join(playwright_dir, 'chrome-win', 'chrome.exe') if chrome_path and os.path.exists(chrome_path):
] logger.info(f"使用配置中指定的Chrome路径: {chrome_path}")
chrome_options.binary_location = chrome_path
for executable_path in possible_executables:
if os.path.exists(executable_path): # 创建WebDriver
browser_launch_options['executable_path'] = executable_path logger.info("初始化Chrome WebDriver...")
logger.info(f"使用自定义浏览器路径: {executable_path}") driver = webdriver.Chrome(options=chrome_options)
break driver.set_page_load_timeout(60)
driver.implicitly_wait(30)
if 'executable_path' not in browser_launch_options:
logger.warning("未找到浏览器可执行文件,使用系统默认浏览器") # 访问页面
logger.info("正在访问页面...")
browser = p.chromium.launch(**browser_launch_options) driver.get(url)
page = browser.new_page()
# 等待页面加载完成
page.set_default_timeout(60000) logger.info("等待页面加载完成...")
logger.info("正在访问页面...") WebDriverWait(driver, 30).until(
page.goto(url, wait_until="networkidle") EC.presence_of_element_located((By.XPATH, "//body"))
)
logger.info("等待页面加载完成...") # 额外等待确保数据加载完成
page.wait_for_load_state("domcontentloaded") time.sleep(5)
page.wait_for_timeout(5000)
# 查找目标元素
logger.info(f"查找XPath元素: {xpath}") logger.info(f"查找XPath元素: {xpath}")
element = page.locator(f"xpath={xpath}") element = WebDriverWait(driver, 30).until(
EC.presence_of_element_located((By.XPATH, xpath))
if element.count() > 0: )
logger.info("✓ 找到目标元素")
logger.info("✓ 找到目标元素")
is_visible = element.is_visible()
logger.info(f"元素可见: {is_visible}") # 检查元素是否可见
is_visible = element.is_displayed()
if not is_visible: logger.info(f"元素可见: {is_visible}")
logger.info("元素不可见,尝试滚动到可见区域...")
element.scroll_into_view_if_needed() if not is_visible:
page.wait_for_timeout(2000) logger.info("元素不可见,尝试滚动到可见区域...")
driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center', inline: 'center'});", element)
logger.info(f"正在截取元素截图到: {screenshot_path}") time.sleep(2)
element.screenshot(path=screenshot_path)
logger.info("✓ 截屏成功") # 截取元素截图
browser.close() logger.info(f"正在截取元素截图到: {screenshot_path}")
return screenshot_path element.screenshot(screenshot_path)
else:
logger.warning("✗ 未找到目标元素") logger.info("✓ 截屏成功")
browser.close() return screenshot_path
return ""
except ImportError as e:
logger.error(f"Playwright导入失败: {e}")
logger.error("请确保已安装playwright: pip install playwright")
logger.error("并安装浏览器: python -m playwright install chromium")
return ""
except Exception as e: except Exception as e:
logger.error(f"爬取上证所截图失败: {e}") logger.error(f"爬取上证所截图失败: {e}")
logger.exception(e) # 记录详细异常 logger.exception(e) # 记录详细异常
@@ -406,8 +366,11 @@ class SpiderManager:
# 记录环境信息用于调试 # 记录环境信息用于调试
logger.error(f"环境信息 - Frozen: {getattr(sys, 'frozen', False)}") logger.error(f"环境信息 - Frozen: {getattr(sys, 'frozen', False)}")
logger.error(f"当前目录: {current_dir}") logger.error(f"当前目录: {current_dir}")
logger.error(f"Playwright目录: {playwright_dir}")
return "" return ""
finally:
if driver:
logger.info("关闭WebDriver...")
driver.quit()

Binary file not shown.

Before

Width:  |  Height:  |  Size: 58 KiB

After

Width:  |  Height:  |  Size: 43 KiB