refactor(spider): 用Selenium替换Playwright实现网页截图功能
这个版本的exe可以运行了。移除Playwright相关代码,改用Selenium实现上证所网页截图功能。修改包括:
1. 删除Playwright依赖和配置逻辑;
2. 添加Selenium相关配置和异常处理;
3. 优化截图流程和日志记录。
同时删除不再需要的build_new.bat打包脚本。
This commit is contained in:
169
spider.py
169
spider.py
@@ -276,129 +276,89 @@ class SpiderManager:
|
||||
|
||||
def fetch_sse_screenshot(self) -> str:
|
||||
"""
|
||||
爬取上证所网页指定元素截图
|
||||
使用Selenium爬取上证所网页指定元素截图
|
||||
返回截图文件路径
|
||||
"""
|
||||
from playwright.sync_api import sync_playwright
|
||||
import os
|
||||
import sys
|
||||
|
||||
url = "https://www.sse.com.cn/"
|
||||
xpath = "//div[contains(@class,'gray_bg')]//div[contains(@class,'col-md-7')]"
|
||||
|
||||
logger.info(f"开始爬取上证所网页截图: {url}")
|
||||
logger.info(f"开始使用Selenium爬取上证所网页截图: {url}")
|
||||
logger.info(f"目标XPath: {xpath}")
|
||||
|
||||
# 首先从配置中读取浏览器路径
|
||||
config_playwright_dir = self.config.get('playwright_dir')
|
||||
if config_playwright_dir:
|
||||
logger.info(f"从配置中获取到Playwright浏览器路径: {config_playwright_dir}")
|
||||
|
||||
# 获取当前脚本目录
|
||||
if getattr(sys, 'frozen', False):
|
||||
# 打包后的环境
|
||||
current_dir = os.path.dirname(sys.executable)
|
||||
|
||||
# 优先使用配置中的路径
|
||||
if config_playwright_dir and os.path.exists(config_playwright_dir):
|
||||
playwright_dir = config_playwright_dir
|
||||
logger.info(f"使用配置中指定的Playwright浏览器路径: {playwright_dir}")
|
||||
else:
|
||||
# 配置路径不存在或未设置,尝试多个可能的Playwright浏览器路径
|
||||
possible_paths = [
|
||||
os.path.join(current_dir, '_internal', 'ms-playwright'),
|
||||
os.path.join(current_dir, 'ms-playwright'),
|
||||
os.path.join(os.path.dirname(current_dir), 'ms-playwright')
|
||||
]
|
||||
|
||||
playwright_dir = None
|
||||
for path in possible_paths:
|
||||
if os.path.exists(path):
|
||||
playwright_dir = path
|
||||
logger.info(f"找到Playwright浏览器路径: {playwright_dir}")
|
||||
break
|
||||
|
||||
if not playwright_dir:
|
||||
logger.warning("未找到Playwright浏览器路径,尝试使用系统默认路径")
|
||||
else:
|
||||
# 开发环境
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
# 优先使用配置中的路径
|
||||
if config_playwright_dir and os.path.exists(config_playwright_dir):
|
||||
playwright_dir = config_playwright_dir
|
||||
logger.info(f"使用配置中指定的Playwright浏览器路径: {playwright_dir}")
|
||||
else:
|
||||
playwright_dir = None
|
||||
logger.info(f"开发环境,当前目录: {current_dir}")
|
||||
|
||||
|
||||
output_dir = current_dir
|
||||
screenshot_path = os.path.join(output_dir, "sse_screenshot.png")
|
||||
logger.info(f"截图将保存到: {screenshot_path}")
|
||||
|
||||
driver = None
|
||||
try:
|
||||
with sync_playwright() as p:
|
||||
# 设置浏览器路径
|
||||
browser_launch_options = {
|
||||
'headless': True
|
||||
}
|
||||
|
||||
if playwright_dir:
|
||||
# 尝试多个可能的浏览器可执行文件路径
|
||||
possible_executables = [
|
||||
os.path.join(playwright_dir, 'chromium-1091', 'chrome-win', 'chrome.exe'),
|
||||
os.path.join(playwright_dir, 'chromium', 'chrome-win', 'chrome.exe'),
|
||||
os.path.join(playwright_dir, 'chrome-win', 'chrome.exe')
|
||||
]
|
||||
|
||||
for executable_path in possible_executables:
|
||||
if os.path.exists(executable_path):
|
||||
browser_launch_options['executable_path'] = executable_path
|
||||
logger.info(f"使用自定义浏览器路径: {executable_path}")
|
||||
break
|
||||
|
||||
if 'executable_path' not in browser_launch_options:
|
||||
logger.warning("未找到浏览器可执行文件,使用系统默认浏览器")
|
||||
|
||||
browser = p.chromium.launch(**browser_launch_options)
|
||||
page = browser.new_page()
|
||||
|
||||
page.set_default_timeout(60000)
|
||||
logger.info("正在访问页面...")
|
||||
page.goto(url, wait_until="networkidle")
|
||||
|
||||
logger.info("等待页面加载完成...")
|
||||
page.wait_for_load_state("domcontentloaded")
|
||||
page.wait_for_timeout(5000)
|
||||
|
||||
logger.info(f"查找XPath元素: {xpath}")
|
||||
element = page.locator(f"xpath={xpath}")
|
||||
|
||||
if element.count() > 0:
|
||||
logger.info("✓ 找到目标元素")
|
||||
|
||||
is_visible = element.is_visible()
|
||||
logger.info(f"元素可见: {is_visible}")
|
||||
|
||||
if not is_visible:
|
||||
logger.info("元素不可见,尝试滚动到可见区域...")
|
||||
element.scroll_into_view_if_needed()
|
||||
page.wait_for_timeout(2000)
|
||||
|
||||
logger.info(f"正在截取元素截图到: {screenshot_path}")
|
||||
element.screenshot(path=screenshot_path)
|
||||
logger.info("✓ 截屏成功")
|
||||
browser.close()
|
||||
return screenshot_path
|
||||
else:
|
||||
logger.warning("✗ 未找到目标元素")
|
||||
browser.close()
|
||||
return ""
|
||||
|
||||
except ImportError as e:
|
||||
logger.error(f"Playwright导入失败: {e}")
|
||||
logger.error("请确保已安装playwright: pip install playwright")
|
||||
logger.error("并安装浏览器: python -m playwright install chromium")
|
||||
return ""
|
||||
# 配置Chrome选项
|
||||
chrome_options = Options()
|
||||
chrome_options.add_argument('--headless')
|
||||
chrome_options.add_argument('--no-sandbox')
|
||||
chrome_options.add_argument('--disable-dev-shm-usage')
|
||||
chrome_options.add_argument('--disable-gpu')
|
||||
chrome_options.add_argument('--window-size=1920,1080')
|
||||
chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
|
||||
|
||||
# 设置Chrome浏览器路径(如果配置了)
|
||||
chrome_path = self.config.get('chrome_path')
|
||||
if chrome_path and os.path.exists(chrome_path):
|
||||
logger.info(f"使用配置中指定的Chrome路径: {chrome_path}")
|
||||
chrome_options.binary_location = chrome_path
|
||||
|
||||
# 创建WebDriver
|
||||
logger.info("初始化Chrome WebDriver...")
|
||||
driver = webdriver.Chrome(options=chrome_options)
|
||||
driver.set_page_load_timeout(60)
|
||||
driver.implicitly_wait(30)
|
||||
|
||||
# 访问页面
|
||||
logger.info("正在访问页面...")
|
||||
driver.get(url)
|
||||
|
||||
# 等待页面加载完成
|
||||
logger.info("等待页面加载完成...")
|
||||
WebDriverWait(driver, 30).until(
|
||||
EC.presence_of_element_located((By.XPATH, "//body"))
|
||||
)
|
||||
# 额外等待确保数据加载完成
|
||||
time.sleep(5)
|
||||
|
||||
# 查找目标元素
|
||||
logger.info(f"查找XPath元素: {xpath}")
|
||||
element = WebDriverWait(driver, 30).until(
|
||||
EC.presence_of_element_located((By.XPATH, xpath))
|
||||
)
|
||||
|
||||
logger.info("✓ 找到目标元素")
|
||||
|
||||
# 检查元素是否可见
|
||||
is_visible = element.is_displayed()
|
||||
logger.info(f"元素可见: {is_visible}")
|
||||
|
||||
if not is_visible:
|
||||
logger.info("元素不可见,尝试滚动到可见区域...")
|
||||
driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center', inline: 'center'});", element)
|
||||
time.sleep(2)
|
||||
|
||||
# 截取元素截图
|
||||
logger.info(f"正在截取元素截图到: {screenshot_path}")
|
||||
element.screenshot(screenshot_path)
|
||||
|
||||
logger.info("✓ 截屏成功")
|
||||
return screenshot_path
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"爬取上证所截图失败: {e}")
|
||||
logger.exception(e) # 记录详细异常
|
||||
@@ -406,8 +366,11 @@ class SpiderManager:
|
||||
# 记录环境信息用于调试
|
||||
logger.error(f"环境信息 - Frozen: {getattr(sys, 'frozen', False)}")
|
||||
logger.error(f"当前目录: {current_dir}")
|
||||
logger.error(f"Playwright目录: {playwright_dir}")
|
||||
|
||||
return ""
|
||||
finally:
|
||||
if driver:
|
||||
logger.info("关闭WebDriver...")
|
||||
driver.quit()
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user