diff --git a/build_new.bat b/build_new.bat deleted file mode 100644 index fc1507d..0000000 --- a/build_new.bat +++ /dev/null @@ -1,48 +0,0 @@ -@echo off -chcp 65001 >nul -echo ======================================== -echo 股吧人气指示器 - 打包工具 -echo ======================================== - -REM 检查并安装必要的依赖 -echo 检查并安装依赖... -pip install -r requirements.txt - -REM 安装Playwright浏览器 -echo 安装Playwright浏览器... -python -m playwright install chromium - -REM 检查 pyinstaller 是否安装 -pip show pyinstaller >nul 2>&1 -if errorlevel 1 ( - echo 正在安装 pyinstaller... - pip install pyinstaller -) - -REM 清理旧的构建文件 -echo 清理旧的构建文件... -if exist "build" rmdir /s /q build -if exist "dist" rmdir /s /q dist -if exist "guba-indicator.spec" del guba-indicator.spec - -echo 开始打包... -pyinstaller build.spec --noconfirm - -if exist "dist\guba-indicator\guba-indicator.exe" ( - echo ======================================== - echo 打包成功! - echo 可执行文件位置: dist\guba-indicator\guba-indicator.exe - echo ======================================== - - REM 复制必要的资源文件 - echo 复制资源文件... - copy guba.ico dist\guba-indicator\ >nul 2>&1 - copy indicator.ico dist\guba-indicator\ >nul 2>&1 - copy config.json dist\guba-indicator\ >nul 2>&1 - - echo 资源文件复制完成! -) else ( - echo 打包失败,请检查错误信息 -) - -pause \ No newline at end of file diff --git a/main.exe b/main.exe index e921eb8..537a664 100644 Binary files a/main.exe and b/main.exe differ diff --git a/spider.py b/spider.py index d1b0c85..bb0f83c 100644 --- a/spider.py +++ b/spider.py @@ -276,129 +276,89 @@ class SpiderManager: def fetch_sse_screenshot(self) -> str: """ - 爬取上证所网页指定元素截图 + 使用Selenium爬取上证所网页指定元素截图 返回截图文件路径 """ - from playwright.sync_api import sync_playwright import os import sys url = "https://www.sse.com.cn/" xpath = "//div[contains(@class,'gray_bg')]//div[contains(@class,'col-md-7')]" - logger.info(f"开始爬取上证所网页截图: {url}") + logger.info(f"开始使用Selenium爬取上证所网页截图: {url}") logger.info(f"目标XPath: {xpath}") - # 首先从配置中读取浏览器路径 - config_playwright_dir = self.config.get('playwright_dir') - if config_playwright_dir: - logger.info(f"从配置中获取到Playwright浏览器路径: {config_playwright_dir}") - # 获取当前脚本目录 if getattr(sys, 'frozen', False): # 打包后的环境 current_dir = os.path.dirname(sys.executable) - - # 优先使用配置中的路径 - if config_playwright_dir and os.path.exists(config_playwright_dir): - playwright_dir = config_playwright_dir - logger.info(f"使用配置中指定的Playwright浏览器路径: {playwright_dir}") - else: - # 配置路径不存在或未设置,尝试多个可能的Playwright浏览器路径 - possible_paths = [ - os.path.join(current_dir, '_internal', 'ms-playwright'), - os.path.join(current_dir, 'ms-playwright'), - os.path.join(os.path.dirname(current_dir), 'ms-playwright') - ] - - playwright_dir = None - for path in possible_paths: - if os.path.exists(path): - playwright_dir = path - logger.info(f"找到Playwright浏览器路径: {playwright_dir}") - break - - if not playwright_dir: - logger.warning("未找到Playwright浏览器路径,尝试使用系统默认路径") else: # 开发环境 current_dir = os.path.dirname(os.path.abspath(__file__)) - - # 优先使用配置中的路径 - if config_playwright_dir and os.path.exists(config_playwright_dir): - playwright_dir = config_playwright_dir - logger.info(f"使用配置中指定的Playwright浏览器路径: {playwright_dir}") - else: - playwright_dir = None - logger.info(f"开发环境,当前目录: {current_dir}") - + output_dir = current_dir screenshot_path = os.path.join(output_dir, "sse_screenshot.png") + logger.info(f"截图将保存到: {screenshot_path}") + driver = None try: - with sync_playwright() as p: - # 设置浏览器路径 - browser_launch_options = { - 'headless': True - } - - if playwright_dir: - # 尝试多个可能的浏览器可执行文件路径 - possible_executables = [ - os.path.join(playwright_dir, 'chromium-1091', 'chrome-win', 'chrome.exe'), - os.path.join(playwright_dir, 'chromium', 'chrome-win', 'chrome.exe'), - os.path.join(playwright_dir, 'chrome-win', 'chrome.exe') - ] - - for executable_path in possible_executables: - if os.path.exists(executable_path): - browser_launch_options['executable_path'] = executable_path - logger.info(f"使用自定义浏览器路径: {executable_path}") - break - - if 'executable_path' not in browser_launch_options: - logger.warning("未找到浏览器可执行文件,使用系统默认浏览器") - - browser = p.chromium.launch(**browser_launch_options) - page = browser.new_page() - - page.set_default_timeout(60000) - logger.info("正在访问页面...") - page.goto(url, wait_until="networkidle") - - logger.info("等待页面加载完成...") - page.wait_for_load_state("domcontentloaded") - page.wait_for_timeout(5000) - - logger.info(f"查找XPath元素: {xpath}") - element = page.locator(f"xpath={xpath}") - - if element.count() > 0: - logger.info("✓ 找到目标元素") - - is_visible = element.is_visible() - logger.info(f"元素可见: {is_visible}") - - if not is_visible: - logger.info("元素不可见,尝试滚动到可见区域...") - element.scroll_into_view_if_needed() - page.wait_for_timeout(2000) - - logger.info(f"正在截取元素截图到: {screenshot_path}") - element.screenshot(path=screenshot_path) - logger.info("✓ 截屏成功") - browser.close() - return screenshot_path - else: - logger.warning("✗ 未找到目标元素") - browser.close() - return "" - - except ImportError as e: - logger.error(f"Playwright导入失败: {e}") - logger.error("请确保已安装playwright: pip install playwright") - logger.error("并安装浏览器: python -m playwright install chromium") - return "" + # 配置Chrome选项 + chrome_options = Options() + chrome_options.add_argument('--headless') + chrome_options.add_argument('--no-sandbox') + chrome_options.add_argument('--disable-dev-shm-usage') + chrome_options.add_argument('--disable-gpu') + chrome_options.add_argument('--window-size=1920,1080') + chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36') + + # 设置Chrome浏览器路径(如果配置了) + chrome_path = self.config.get('chrome_path') + if chrome_path and os.path.exists(chrome_path): + logger.info(f"使用配置中指定的Chrome路径: {chrome_path}") + chrome_options.binary_location = chrome_path + + # 创建WebDriver + logger.info("初始化Chrome WebDriver...") + driver = webdriver.Chrome(options=chrome_options) + driver.set_page_load_timeout(60) + driver.implicitly_wait(30) + + # 访问页面 + logger.info("正在访问页面...") + driver.get(url) + + # 等待页面加载完成 + logger.info("等待页面加载完成...") + WebDriverWait(driver, 30).until( + EC.presence_of_element_located((By.XPATH, "//body")) + ) + # 额外等待确保数据加载完成 + time.sleep(5) + + # 查找目标元素 + logger.info(f"查找XPath元素: {xpath}") + element = WebDriverWait(driver, 30).until( + EC.presence_of_element_located((By.XPATH, xpath)) + ) + + logger.info("✓ 找到目标元素") + + # 检查元素是否可见 + is_visible = element.is_displayed() + logger.info(f"元素可见: {is_visible}") + + if not is_visible: + logger.info("元素不可见,尝试滚动到可见区域...") + driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center', inline: 'center'});", element) + time.sleep(2) + + # 截取元素截图 + logger.info(f"正在截取元素截图到: {screenshot_path}") + element.screenshot(screenshot_path) + + logger.info("✓ 截屏成功") + return screenshot_path + except Exception as e: logger.error(f"爬取上证所截图失败: {e}") logger.exception(e) # 记录详细异常 @@ -406,8 +366,11 @@ class SpiderManager: # 记录环境信息用于调试 logger.error(f"环境信息 - Frozen: {getattr(sys, 'frozen', False)}") logger.error(f"当前目录: {current_dir}") - logger.error(f"Playwright目录: {playwright_dir}") return "" + finally: + if driver: + logger.info("关闭WebDriver...") + driver.quit() diff --git a/sse_screenshot.png b/sse_screenshot.png index f69f07e..2183f16 100644 Binary files a/sse_screenshot.png and b/sse_screenshot.png differ