refactor(spider): 将Playwright替换为Selenium实现网页截图功能。

这个版本的exe可以运行了。

移除Playwright相关代码，改用Selenium实现上证所网页截图功能。修改包括：
1. 删除Playwright依赖和配置逻辑
2. 添加Selenium相关配置和异常处理
3. 优化截图流程和日志记录
同时删除不再需要的build_new.bat打包脚本。
This commit is contained in:
2026-01-23 11:55:31 +08:00
parent 5d79cd9e8f
commit df9348ca95
4 changed files with 66 additions and 151 deletions

169
spider.py
View File

@@ -276,129 +276,89 @@ class SpiderManager:
def fetch_sse_screenshot(self) -> str:
    """Capture a screenshot of one element on the SSE home page using Selenium.

    Loads https://www.sse.com.cn/ in headless Chrome, waits for the element
    matched by the hard-coded XPath, and saves an element-level screenshot as
    ``sse_screenshot.png`` next to the executable (frozen build) or next to
    this source file (development run).

    If ``self.config`` has a ``chrome_path`` entry that points at an existing
    path, it is used as the Chrome binary; otherwise Selenium's default Chrome
    lookup applies.

    Returns:
        The screenshot file path on success, or an empty string on failure.
        Exceptions raised while driving the browser are logged and swallowed.
    """
    import os
    import sys

    url = "https://www.sse.com.cn/"
    xpath = "//div[contains(@class,'gray_bg')]//div[contains(@class,'col-md-7')]"

    logger.info(f"开始使用Selenium爬取上证所网页截图: {url}")
    logger.info(f"目标XPath: {xpath}")

    # A frozen (packaged) build has no usable __file__, so the screenshot is
    # written next to the executable instead of next to the source file.
    if getattr(sys, 'frozen', False):
        current_dir = os.path.dirname(sys.executable)
    else:
        current_dir = os.path.dirname(os.path.abspath(__file__))
        logger.info(f"开发环境,当前目录: {current_dir}")

    screenshot_path = os.path.join(current_dir, "sse_screenshot.png")
    logger.info(f"截图将保存到: {screenshot_path}")

    driver = None
    try:
        # Headless Chrome with a fixed window size so the page layout (and
        # therefore the captured element) is reproducible between runs.
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

        # Optional override of the Chrome binary from configuration.
        chrome_path = self.config.get('chrome_path')
        if chrome_path:
            if os.path.exists(chrome_path):
                logger.info(f"使用配置中指定的Chrome路径: {chrome_path}")
                chrome_options.binary_location = chrome_path
            else:
                # Surface a misconfiguration instead of silently ignoring it.
                logger.warning(f"配置的Chrome路径不存在,使用系统默认Chrome: {chrome_path}")

        logger.info("初始化Chrome WebDriver...")
        driver = webdriver.Chrome(options=chrome_options)
        driver.set_page_load_timeout(60)
        # No implicit wait is set: the Selenium documentation warns that
        # mixing implicit and explicit waits gives unpredictable wait times,
        # and every lookup below already uses an explicit WebDriverWait.

        logger.info("正在访问页面...")
        driver.get(url)

        logger.info("等待页面加载完成...")
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.XPATH, "//body"))
        )
        # Fixed extra delay to give the page's data time to finish loading
        # before the target element is looked up.
        time.sleep(5)

        logger.info(f"查找XPath元素: {xpath}")
        element = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        )
        logger.info("✓ 找到目标元素")

        is_visible = element.is_displayed()
        logger.info(f"元素可见: {is_visible}")
        if not is_visible:
            logger.info("元素不可见,尝试滚动到可见区域...")
            driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center', inline: 'center'});", element)
            # Let the smooth-scroll animation settle before capturing.
            time.sleep(2)

        logger.info(f"正在截取元素截图到: {screenshot_path}")
        element.screenshot(screenshot_path)
        logger.info("✓ 截屏成功")
        return screenshot_path
    except Exception as e:
        # One record carrying both the message and the traceback.
        logger.exception(f"爬取上证所截图失败: {e}")
        # Environment details to help debug packaged-build failures.
        logger.error(f"环境信息 - Frozen: {getattr(sys, 'frozen', False)}")
        logger.error(f"当前目录: {current_dir}")
        return ""
    finally:
        if driver is not None:
            logger.info("关闭WebDriver...")
            try:
                driver.quit()
            except Exception as quit_error:
                # A failing shutdown must not replace the method's result
                # with an exception.
                logger.warning(f"关闭WebDriver失败: {quit_error}")