feat: 添加上证所截图功能并优化股票数据获取

- 新增上证所网页元素截图功能,使用Playwright实现
- 优化股票数据获取方式,改用新浪财经JS接口
- 调整情感分析评分规则为7级分类
- 添加截图显示组件到主界面
- 更新依赖项,替换playwright为selenium
This commit is contained in:
2026-01-13 17:06:18 +08:00
parent 33c2af5348
commit 9d33a8e179
7 changed files with 291 additions and 84 deletions

281
spider.py
View File

@@ -8,6 +8,12 @@ from typing import List, Dict, Optional
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import random
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from loguru import logger
@@ -174,84 +180,105 @@ class SpiderManager:
logger.debug(f"当前时间 {current_time.strftime('%H:%M')} 是否为交易时间: {is_trading}")
return is_trading
def _fetch_sse_with_selenium(self, url: str) -> Optional[str]:
    """Load ``url`` in headless Chrome and return the rendered page source.

    The page is considered loaded once an element with id ``wmt_china`` is
    present (waiting up to 30 seconds); a further fixed 2-second pause
    follows before the source is read.

    Args:
        url: Address of the page to open.

    Returns:
        The page HTML as a string, or ``None`` if any step fails (the
        failure is logged, not raised).
    """
    # Command-line switches handed to Chrome, applied in this order.
    launch_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--window-size=1920,1080',
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    )
    browser = None
    try:
        opts = Options()
        for flag in launch_flags:
            opts.add_argument(flag)

        browser = webdriver.Chrome(options=opts)
        browser.get(url)

        # Block until the marker element exists in the DOM.
        marker = (By.ID, "wmt_china")
        WebDriverWait(browser, 30).until(EC.presence_of_element_located(marker))

        # Extra fixed pause so late-arriving data can finish rendering.
        time.sleep(2)

        page_html = browser.page_source
        logger.debug(f"Selenium 获取页面成功,大小: {len(page_html)} 字节")
        return page_html
    except Exception as e:
        logger.error(f"Selenium 获取页面失败: {e}")
        return None
    finally:
        # Always shut the browser down, whether or not the fetch succeeded.
        if browser is not None:
            browser.quit()
def fetch_sse_stock_data(self) -> Dict[str, float]:
    """Fetch the current Shanghai Composite Index value from the Sina Finance JS endpoint.

    The endpoint answers with a JavaScript assignment of the form
    ``var hq_str_sh000001="<comma-separated fields>"``. Field 0 is used as
    the display name and field 3 as the current price.

    Returns:
        ``{'time': 'HH:MM', 'value': <price>}`` on success, where ``time`` is
        the local wall-clock time of the fetch. An empty dict is returned
        when it is not trading time, when the request fails, or when the
        payload cannot be parsed; failures are logged, never raised.
    """
    from datetime import datetime
    import re

    # Skip the request entirely outside trading hours.
    if not self.is_trading_time():
        logger.info("当前为非交易时间,跳过股票数据爬取")
        return {}

    sse_url = "https://hq.sinajs.cn/list=sh000001"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Referer': 'https://finance.sina.com.cn/'
    }
    logger.info(f"通过新浪JS接口获取上证指数数据: {sse_url}")

    try:
        response = self.session.get(sse_url, timeout=30, headers=headers)
        response.raise_for_status()
        content = response.text
        logger.debug(f"获取到响应: {content}")

        # Pull the quoted, comma-separated payload out of the JS assignment.
        pattern = r'var hq_str_sh000001="([^"]+)"'
        match = re.search(pattern, content)
        if not match:
            logger.warning("未能解析新浪JS接口返回数据")
            return {}

        data_fields = match.group(1).split(',')
        # A shorter payload is treated as malformed.
        if len(data_fields) < 32:
            logger.warning(f"数据字段不足: {len(data_fields)}")
            return {}

        stock_name = data_fields[0]
        # An empty price field is reported as 0.0 rather than as an error.
        current_price = float(data_fields[3]) if data_fields[3] else 0.0
        current_time = datetime.now().strftime("%H:%M")
        logger.info(f"✓ 成功获取 {stock_name}: {current_price} (时间: {current_time})")
        return {
            'time': current_time,
            'value': current_price
        }
    except requests.RequestException as e:
        logger.error(f"请求新浪JS接口失败: {e}")
        return {}
    except (ValueError, IndexError) as e:
        logger.error(f"解析数据失败: {e}")
        return {}
    except Exception as e:
        # Last-resort guard so a scraping failure never propagates to the caller.
        logger.error(f"获取股票数据异常: {e}")
        return {}
def set_user_agent(self, user_agent: str):
"""更新User-Agent"""
@@ -270,3 +297,119 @@ class SpiderManager:
def get_fetch_interval(self) -> int:
    """Return the ``fetch_interval`` config entry, or 60 when it is not set."""
    interval = self.config.get('fetch_interval', 60)
    return interval
def fetch_sse_screenshot(self) -> str:
    """Screenshot one element of the SSE home page with headless Chromium.

    The page is opened with Playwright, the target element is located by
    XPath, scrolled into view if it is not visible, and saved as
    ``sse_screenshot.png`` in the directory containing this module.

    Returns:
        The absolute path of the saved screenshot, or an empty string when
        the element is not found or any step fails (including Playwright
        not being installed). Failures are logged, never raised.
    """
    import os

    url = "https://www.sse.com.cn/"
    xpath = "//div[contains(@class,'gray_bg')]//div[contains(@class,'col-md-7')]"
    logger.info(f"开始爬取上证所网页截图: {url}")
    logger.info(f"目标XPath: {xpath}")

    output_dir = os.path.dirname(os.path.abspath(__file__))
    screenshot_path = os.path.join(output_dir, "sse_screenshot.png")

    try:
        # Imported inside the try so a missing Playwright install is reported
        # through the normal "" failure path instead of raising ImportError.
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.set_default_timeout(60000)

                logger.info("正在访问页面...")
                page.goto(url, wait_until="networkidle")

                logger.info("等待页面加载完成...")
                page.wait_for_load_state("domcontentloaded")
                # Fixed pause to let late-rendered content settle.
                page.wait_for_timeout(5000)

                logger.info(f"查找XPath元素: {xpath}")
                # Pin to the first match: single-element locator actions
                # raise a strict-mode error when several nodes match.
                element = page.locator(f"xpath={xpath}").first
                if element.count() == 0:
                    logger.warning("✗ 未找到目标元素")
                    return ""

                logger.info("✓ 找到目标元素")
                is_visible = element.is_visible()
                logger.info(f"元素可见: {is_visible}")
                if not is_visible:
                    logger.info("元素不可见,尝试滚动到可见区域...")
                    element.scroll_into_view_if_needed()
                    page.wait_for_timeout(2000)

                logger.info(f"正在截取元素截图到: {screenshot_path}")
                element.screenshot(path=screenshot_path)
                logger.info("✓ 截屏成功")
                return screenshot_path
            finally:
                # Close the browser on every exit path, including errors.
                browser.close()
    except Exception as e:
        logger.error(f"爬取上证所截图失败: {e}")
        return ""