refactor(spider): 将Playwright替换为Selenium实现网页截图功能。
这个版本的 exe 可以运行了。
移除 Playwright 相关代码,改用 Selenium 实现上证所网页截图功能。修改包括:
1. 删除 Playwright 依赖和配置逻辑;
2. 添加 Selenium 相关配置和异常处理;
3. 优化截图流程和日志记录。
同时删除不再需要的 build_new.bat 打包脚本。
This commit is contained in:
@@ -1,48 +0,0 @@
|
|||||||
@echo off
|
|
||||||
chcp 65001 >nul
|
|
||||||
echo ========================================
|
|
||||||
echo 股吧人气指示器 - 打包工具
|
|
||||||
echo ========================================
|
|
||||||
|
|
||||||
REM 检查并安装必要的依赖
|
|
||||||
echo 检查并安装依赖...
|
|
||||||
pip install -r requirements.txt
|
|
||||||
|
|
||||||
REM 安装Playwright浏览器
|
|
||||||
echo 安装Playwright浏览器...
|
|
||||||
python -m playwright install chromium
|
|
||||||
|
|
||||||
REM 检查 pyinstaller 是否安装
|
|
||||||
pip show pyinstaller >nul 2>&1
|
|
||||||
if errorlevel 1 (
|
|
||||||
echo 正在安装 pyinstaller...
|
|
||||||
pip install pyinstaller
|
|
||||||
)
|
|
||||||
|
|
||||||
REM 清理旧的构建文件
|
|
||||||
echo 清理旧的构建文件...
|
|
||||||
if exist "build" rmdir /s /q build
|
|
||||||
if exist "dist" rmdir /s /q dist
|
|
||||||
if exist "guba-indicator.spec" del guba-indicator.spec
|
|
||||||
|
|
||||||
echo 开始打包...
|
|
||||||
pyinstaller build.spec --noconfirm
|
|
||||||
|
|
||||||
if exist "dist\guba-indicator\guba-indicator.exe" (
|
|
||||||
echo ========================================
|
|
||||||
echo 打包成功!
|
|
||||||
echo 可执行文件位置: dist\guba-indicator\guba-indicator.exe
|
|
||||||
echo ========================================
|
|
||||||
|
|
||||||
REM 复制必要的资源文件
|
|
||||||
echo 复制资源文件...
|
|
||||||
copy guba.ico dist\guba-indicator\ >nul 2>&1
|
|
||||||
copy indicator.ico dist\guba-indicator\ >nul 2>&1
|
|
||||||
copy config.json dist\guba-indicator\ >nul 2>&1
|
|
||||||
|
|
||||||
echo 资源文件复制完成!
|
|
||||||
) else (
|
|
||||||
echo 打包失败,请检查错误信息
|
|
||||||
)
|
|
||||||
|
|
||||||
pause
|
|
||||||
169
spider.py
169
spider.py
@@ -276,129 +276,89 @@ class SpiderManager:
|
|||||||
|
|
||||||
def fetch_sse_screenshot(self) -> str:
|
def fetch_sse_screenshot(self) -> str:
|
||||||
"""
|
"""
|
||||||
爬取上证所网页指定元素截图
|
使用Selenium爬取上证所网页指定元素截图
|
||||||
返回截图文件路径
|
返回截图文件路径
|
||||||
"""
|
"""
|
||||||
from playwright.sync_api import sync_playwright
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
url = "https://www.sse.com.cn/"
|
url = "https://www.sse.com.cn/"
|
||||||
xpath = "//div[contains(@class,'gray_bg')]//div[contains(@class,'col-md-7')]"
|
xpath = "//div[contains(@class,'gray_bg')]//div[contains(@class,'col-md-7')]"
|
||||||
|
|
||||||
logger.info(f"开始爬取上证所网页截图: {url}")
|
logger.info(f"开始使用Selenium爬取上证所网页截图: {url}")
|
||||||
logger.info(f"目标XPath: {xpath}")
|
logger.info(f"目标XPath: {xpath}")
|
||||||
|
|
||||||
# 首先从配置中读取浏览器路径
|
|
||||||
config_playwright_dir = self.config.get('playwright_dir')
|
|
||||||
if config_playwright_dir:
|
|
||||||
logger.info(f"从配置中获取到Playwright浏览器路径: {config_playwright_dir}")
|
|
||||||
|
|
||||||
# 获取当前脚本目录
|
# 获取当前脚本目录
|
||||||
if getattr(sys, 'frozen', False):
|
if getattr(sys, 'frozen', False):
|
||||||
# 打包后的环境
|
# 打包后的环境
|
||||||
current_dir = os.path.dirname(sys.executable)
|
current_dir = os.path.dirname(sys.executable)
|
||||||
|
|
||||||
# 优先使用配置中的路径
|
|
||||||
if config_playwright_dir and os.path.exists(config_playwright_dir):
|
|
||||||
playwright_dir = config_playwright_dir
|
|
||||||
logger.info(f"使用配置中指定的Playwright浏览器路径: {playwright_dir}")
|
|
||||||
else:
|
|
||||||
# 配置路径不存在或未设置,尝试多个可能的Playwright浏览器路径
|
|
||||||
possible_paths = [
|
|
||||||
os.path.join(current_dir, '_internal', 'ms-playwright'),
|
|
||||||
os.path.join(current_dir, 'ms-playwright'),
|
|
||||||
os.path.join(os.path.dirname(current_dir), 'ms-playwright')
|
|
||||||
]
|
|
||||||
|
|
||||||
playwright_dir = None
|
|
||||||
for path in possible_paths:
|
|
||||||
if os.path.exists(path):
|
|
||||||
playwright_dir = path
|
|
||||||
logger.info(f"找到Playwright浏览器路径: {playwright_dir}")
|
|
||||||
break
|
|
||||||
|
|
||||||
if not playwright_dir:
|
|
||||||
logger.warning("未找到Playwright浏览器路径,尝试使用系统默认路径")
|
|
||||||
else:
|
else:
|
||||||
# 开发环境
|
# 开发环境
|
||||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
# 优先使用配置中的路径
|
|
||||||
if config_playwright_dir and os.path.exists(config_playwright_dir):
|
|
||||||
playwright_dir = config_playwright_dir
|
|
||||||
logger.info(f"使用配置中指定的Playwright浏览器路径: {playwright_dir}")
|
|
||||||
else:
|
|
||||||
playwright_dir = None
|
|
||||||
logger.info(f"开发环境,当前目录: {current_dir}")
|
|
||||||
|
|
||||||
output_dir = current_dir
|
output_dir = current_dir
|
||||||
screenshot_path = os.path.join(output_dir, "sse_screenshot.png")
|
screenshot_path = os.path.join(output_dir, "sse_screenshot.png")
|
||||||
|
logger.info(f"截图将保存到: {screenshot_path}")
|
||||||
|
|
||||||
|
driver = None
|
||||||
try:
|
try:
|
||||||
with sync_playwright() as p:
|
# 配置Chrome选项
|
||||||
# 设置浏览器路径
|
chrome_options = Options()
|
||||||
browser_launch_options = {
|
chrome_options.add_argument('--headless')
|
||||||
'headless': True
|
chrome_options.add_argument('--no-sandbox')
|
||||||
}
|
chrome_options.add_argument('--disable-dev-shm-usage')
|
||||||
|
chrome_options.add_argument('--disable-gpu')
|
||||||
if playwright_dir:
|
chrome_options.add_argument('--window-size=1920,1080')
|
||||||
# 尝试多个可能的浏览器可执行文件路径
|
chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
|
||||||
possible_executables = [
|
|
||||||
os.path.join(playwright_dir, 'chromium-1091', 'chrome-win', 'chrome.exe'),
|
# 设置Chrome浏览器路径(如果配置了)
|
||||||
os.path.join(playwright_dir, 'chromium', 'chrome-win', 'chrome.exe'),
|
chrome_path = self.config.get('chrome_path')
|
||||||
os.path.join(playwright_dir, 'chrome-win', 'chrome.exe')
|
if chrome_path and os.path.exists(chrome_path):
|
||||||
]
|
logger.info(f"使用配置中指定的Chrome路径: {chrome_path}")
|
||||||
|
chrome_options.binary_location = chrome_path
|
||||||
for executable_path in possible_executables:
|
|
||||||
if os.path.exists(executable_path):
|
# 创建WebDriver
|
||||||
browser_launch_options['executable_path'] = executable_path
|
logger.info("初始化Chrome WebDriver...")
|
||||||
logger.info(f"使用自定义浏览器路径: {executable_path}")
|
driver = webdriver.Chrome(options=chrome_options)
|
||||||
break
|
driver.set_page_load_timeout(60)
|
||||||
|
driver.implicitly_wait(30)
|
||||||
if 'executable_path' not in browser_launch_options:
|
|
||||||
logger.warning("未找到浏览器可执行文件,使用系统默认浏览器")
|
# 访问页面
|
||||||
|
logger.info("正在访问页面...")
|
||||||
browser = p.chromium.launch(**browser_launch_options)
|
driver.get(url)
|
||||||
page = browser.new_page()
|
|
||||||
|
# 等待页面加载完成
|
||||||
page.set_default_timeout(60000)
|
logger.info("等待页面加载完成...")
|
||||||
logger.info("正在访问页面...")
|
WebDriverWait(driver, 30).until(
|
||||||
page.goto(url, wait_until="networkidle")
|
EC.presence_of_element_located((By.XPATH, "//body"))
|
||||||
|
)
|
||||||
logger.info("等待页面加载完成...")
|
# 额外等待确保数据加载完成
|
||||||
page.wait_for_load_state("domcontentloaded")
|
time.sleep(5)
|
||||||
page.wait_for_timeout(5000)
|
|
||||||
|
# 查找目标元素
|
||||||
logger.info(f"查找XPath元素: {xpath}")
|
logger.info(f"查找XPath元素: {xpath}")
|
||||||
element = page.locator(f"xpath={xpath}")
|
element = WebDriverWait(driver, 30).until(
|
||||||
|
EC.presence_of_element_located((By.XPATH, xpath))
|
||||||
if element.count() > 0:
|
)
|
||||||
logger.info("✓ 找到目标元素")
|
|
||||||
|
logger.info("✓ 找到目标元素")
|
||||||
is_visible = element.is_visible()
|
|
||||||
logger.info(f"元素可见: {is_visible}")
|
# 检查元素是否可见
|
||||||
|
is_visible = element.is_displayed()
|
||||||
if not is_visible:
|
logger.info(f"元素可见: {is_visible}")
|
||||||
logger.info("元素不可见,尝试滚动到可见区域...")
|
|
||||||
element.scroll_into_view_if_needed()
|
if not is_visible:
|
||||||
page.wait_for_timeout(2000)
|
logger.info("元素不可见,尝试滚动到可见区域...")
|
||||||
|
driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center', inline: 'center'});", element)
|
||||||
logger.info(f"正在截取元素截图到: {screenshot_path}")
|
time.sleep(2)
|
||||||
element.screenshot(path=screenshot_path)
|
|
||||||
logger.info("✓ 截屏成功")
|
# 截取元素截图
|
||||||
browser.close()
|
logger.info(f"正在截取元素截图到: {screenshot_path}")
|
||||||
return screenshot_path
|
element.screenshot(screenshot_path)
|
||||||
else:
|
|
||||||
logger.warning("✗ 未找到目标元素")
|
logger.info("✓ 截屏成功")
|
||||||
browser.close()
|
return screenshot_path
|
||||||
return ""
|
|
||||||
|
|
||||||
except ImportError as e:
|
|
||||||
logger.error(f"Playwright导入失败: {e}")
|
|
||||||
logger.error("请确保已安装playwright: pip install playwright")
|
|
||||||
logger.error("并安装浏览器: python -m playwright install chromium")
|
|
||||||
return ""
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"爬取上证所截图失败: {e}")
|
logger.error(f"爬取上证所截图失败: {e}")
|
||||||
logger.exception(e) # 记录详细异常
|
logger.exception(e) # 记录详细异常
|
||||||
@@ -406,8 +366,11 @@ class SpiderManager:
|
|||||||
# 记录环境信息用于调试
|
# 记录环境信息用于调试
|
||||||
logger.error(f"环境信息 - Frozen: {getattr(sys, 'frozen', False)}")
|
logger.error(f"环境信息 - Frozen: {getattr(sys, 'frozen', False)}")
|
||||||
logger.error(f"当前目录: {current_dir}")
|
logger.error(f"当前目录: {current_dir}")
|
||||||
logger.error(f"Playwright目录: {playwright_dir}")
|
|
||||||
|
|
||||||
return ""
|
return ""
|
||||||
|
finally:
|
||||||
|
if driver:
|
||||||
|
logger.info("关闭WebDriver...")
|
||||||
|
driver.quit()
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 58 KiB After Width: | Height: | Size: 43 KiB |
Reference in New Issue
Block a user