feat: 添加上证所截图功能并优化股票数据获取

- 新增上证所网页元素截图功能,使用Playwright实现
- 优化股票数据获取方式,改用新浪财经JS接口
- 调整情感分析评分规则为7级分类
- 添加截图显示组件到主界面
- 更新依赖项,将requirements.txt中的playwright替换为selenium(注意:截图功能的代码仍然导入playwright)
This commit is contained in:
2026-01-13 17:06:18 +08:00
parent 33c2af5348
commit 9d33a8e179
7 changed files with 291 additions and 84 deletions

1
.gitignore vendored
View File

@@ -50,3 +50,4 @@ Thumbs.db
# 临时文件
*.tmp
*.bak
test*.*

View File

@@ -16,14 +16,17 @@ class LLMAnalyzer:
SYSTEM_PROMPT = """你是一个专业的情感分析助手。你的任务是分析股吧/论坛评论的情感倾向,判断投资者对该股票的态度。
评分规则:
- 0-30: 极度悲观/看空(利空、暴跌、绝望等情绪)
- 31-50: 悲观/中性(担忧、谨慎、观望等情绪)
- 51-70: 偏乐观/中性(看好、希望、期待等情绪)
- 71-100: 极度乐观/看涨(利好、暴涨、兴奋等情绪)
- 0-30: 极度悲观(利空、暴跌、绝望等情绪)
- 30-39: 悲观(看空、担忧、谨慎等情绪)
- 39-45: 偏悲观(谨慎观望、保守等情绪)
- 45-55: 中立(观望、客观等情绪)
- 55-65: 偏乐观(看好、希望等情绪)
- 65-70: 乐观(看涨、信心等情绪)
- 70-100: 极度乐观(利好、暴涨、兴奋等情绪)
请直接输出一个JSON格式的结果包含两个字段
- score: 0-100的整数评分
- label: 简短的态度描述(如"强烈看跌""谨慎观望""温和看涨""强烈看涨"
- label: 简短的态度描述(如"极度悲观""悲观""偏悲观""中立""偏乐观""乐观""极度乐观"
注意:
1. 只返回JSON不要有其他文字

23
main.py
View File

@@ -24,6 +24,7 @@ class BackendWorker(QObject):
error_occurred = Signal(str)
status_update = Signal(str)
stock_data_fetched = Signal(str, float) # 股票数据获取信号
sse_screenshot_fetched = Signal(str) # 上证所截图获取信号
def __init__(self, config_manager: ConfigManager, db_manager: DatabaseManager,
spider: SpiderManager, analyzer: LLMAnalyzer):
@@ -197,6 +198,21 @@ class BackendWorker(QObject):
except Exception as e:
logger.error(f"爬取股票数据失败: {str(e)}")
def fetch_sse_screenshot(self):
    """Capture the SSE page screenshot and publish its path.

    The capture itself is delegated to ``self.spider.fetch_sse_screenshot()``.
    A truthy result is emitted through the ``sse_screenshot_fetched`` signal;
    a falsy result is only logged as a warning. Any exception raised along
    the way is logged and swallowed, so this method never raises.
    """
    try:
        logger.info("开始爬取上证所网页截图")
        path = self.spider.fetch_sse_screenshot()
        if not path:
            # Nothing usable came back, so there is nothing to publish.
            logger.warning("未能获取有效的截图")
            return
        logger.info(f"成功获取截图: {path}")
        self.sse_screenshot_fetched.emit(path)
    except Exception as exc:
        logger.error(f"爬取截图失败: {str(exc)}")
def manual_refresh(self):
"""手动刷新"""
logger.info("用户手动刷新")
@@ -283,6 +299,7 @@ def main():
worker.analysis_finished.connect(window.update_indicator)
worker.error_occurred.connect(lambda msg: window.show_message("错误", msg))
worker.stock_data_fetched.connect(window.add_waveform_data)
worker.sse_screenshot_fetched.connect(window.update_sse_screenshot)
# 启动时从数据库初始化指示器显示
worker._update_indicator()
@@ -302,6 +319,12 @@ def main():
stock_timer.start(60000) # 每分钟爬取一次股票数据
logger.info("股票数据爬取定时器已启动间隔60秒")
# 启动上证所截图爬取定时器
screenshot_timer = QTimer()
screenshot_timer.timeout.connect(worker.fetch_sse_screenshot)
screenshot_timer.start(300000) # 每5分钟爬取一次截图
logger.info("上证所截图爬取定时器已启动间隔300秒")
# 确保应用退出时清理线程
def cleanup():
logger.info("清理资源,停止后台线程...")

View File

@@ -4,9 +4,9 @@ PySide6 GUI界面模块
from PySide6.QtWidgets import (QWidget, QVBoxLayout, QHBoxLayout, QLabel,
QPushButton, QSlider, QDialog, QFormLayout,
QLineEdit, QSpinBox, QMessageBox, QSystemTrayIcon,
QMenu, QTextEdit, QGroupBox, QDialogButtonBox, QCheckBox)
QMenu, QTextEdit, QGroupBox, QDialogButtonBox, QCheckBox, QScrollArea)
from PySide6.QtCore import Qt, QTimer, Signal, QPoint
from PySide6.QtGui import QFont, QColor, QPainter, QBrush, QPen, QIcon, QAction
from PySide6.QtGui import QFont, QColor, QPainter, QBrush, QPen, QIcon, QAction, QPixmap
from typing import Callable, Optional
from loguru import logger
@@ -95,16 +95,20 @@ class SentimentIndicator(QWidget):
def get_description(self, score: int) -> str:
"""Return the short sentiment description text for ``score``.

The score is compared against ascending upper bounds and the text of the
first matching band is returned; the final ``else`` covers everything above
the last bound.
"""
# NOTE(review): the lines below contain two different threshold ladders
# (20/40/60/80 and 30/39/45/55/65/70) back to back, e.g. `elif score < 40:`
# is immediately followed by `if score < 30:`. This looks like a diff view
# showing removed and added lines together without markers -- confirm
# against the real file before editing the logic.
if score < 20:
return "极度看跌"
elif score < 40:
if score < 30:
return "极度悲观"
elif score < 39:
return "悲观"
elif score < 45:
return "偏悲观"
elif score < 60:
# NOTE(review): the empty string here (and in the `< 55` branch) may be a
# truncated label rather than an intentional blank -- TODO confirm.
return ""
elif score < 80:
elif score < 55:
return ""
elif score < 65:
return "偏乐观"
elif score < 70:
return "乐观"
else:
return "极度看涨"
return "极度乐观"
class ConfigDialog(QDialog):
@@ -263,6 +267,22 @@ class MainWindow(QWidget):
self.waveform_widget = WaveformWidget()
self.waveform_widget.setMinimumHeight(200)
# 上证所截图显示
screenshot_group = QGroupBox("上证所行情")
screenshot_layout = QVBoxLayout(screenshot_group)
self.screenshot_label = QLabel("等待截图...")
self.screenshot_label.setAlignment(Qt.AlignCenter)
self.screenshot_label.setMinimumSize(400, 200)
self.screenshot_label.setStyleSheet("border: 1px solid #ccc; background-color: #f0f0f0;")
screenshot_scroll = QScrollArea()
screenshot_scroll.setWidget(self.screenshot_label)
screenshot_scroll.setWidgetResizable(True)
screenshot_scroll.setMinimumHeight(150)
screenshot_layout.addWidget(screenshot_scroll)
# 按钮
btn_layout = QHBoxLayout()
self.refresh_btn = QPushButton("刷新")
@@ -279,6 +299,7 @@ class MainWindow(QWidget):
layout.addWidget(self.score_label)
layout.addWidget(self.status_label)
layout.addWidget(self.waveform_widget)
layout.addWidget(screenshot_group)
layout.addLayout(btn_layout)
# 设置窗口标志(无边框、可拖拽)
@@ -426,6 +447,22 @@ class MainWindow(QWidget):
self.waveform_widget.add_data_point(time_str, value)
logger.info(f"添加波形图数据点: 时间={time_str}, 值={value}")
def update_sse_screenshot(self, screenshot_path: str):
    """Display the screenshot stored at ``screenshot_path`` in the screenshot label.

    The image is loaded into a ``QPixmap``, scaled to the label's current
    size (aspect ratio kept, smooth transformation) and set on
    ``self.screenshot_label``. If the file cannot be loaded, the label shows
    a failure message instead.

    Args:
        screenshot_path: Path of the image file to display.
    """
    logger.info(f"更新截图显示: {screenshot_path}")
    pixmap = QPixmap(screenshot_path)

    if pixmap.isNull():
        # QPixmap yields a null pixmap when the file is missing or unreadable.
        self.screenshot_label.setText("截图加载失败")
        logger.warning("截图加载失败")
        return

    scaled = pixmap.scaled(
        self.screenshot_label.size(),
        Qt.KeepAspectRatio,
        Qt.SmoothTransformation,
    )
    # setPixmap() already clears the label's previous content (placeholder or
    # error text). Do not call setText("") afterwards: QLabel.setText() is
    # documented to clear previous content too, which would discard the
    # pixmap that was just set.
    self.screenshot_label.setPixmap(scaled)
    logger.info("截图显示更新成功")
class QCheckBox(QPushButton):
"""自定义复选框"""

View File

@@ -3,4 +3,4 @@ requests>=2.31.0
beautifulsoup4>=4.12.0
lxml>=4.9.0
openai>=1.0.0
playwright>=1.40.0
selenium>=4.15.0

281
spider.py
View File

@@ -8,6 +8,12 @@ from typing import List, Dict, Optional
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import random
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from loguru import logger
@@ -174,84 +180,105 @@ class SpiderManager:
logger.debug(f"当前时间 {current_time.strftime('%H:%M')} 是否为交易时间: {is_trading}")
return is_trading
def _fetch_sse_with_selenium(self, url: str) -> Optional[str]:
    """Load ``url`` in headless Chrome and return the rendered page source.

    After navigation the method waits up to 30 seconds for an element with
    id ``wmt_china`` to be present, sleeps two more seconds, and then reads
    the page source.

    Args:
        url: Address of the page to load.

    Returns:
        The page HTML, or ``None`` if any step fails (the error is logged).
        The WebDriver is quit in every case where it was created.
    """
    chrome_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--window-size=1920,1080',
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    )
    browser = None
    try:
        # Assemble the Chrome options from the flag list above.
        opts = Options()
        for flag in chrome_flags:
            opts.add_argument(flag)

        browser = webdriver.Chrome(options=opts)
        browser.get(url)

        # Block until the marker element exists in the DOM.
        marker_present = EC.presence_of_element_located((By.ID, "wmt_china"))
        WebDriverWait(browser, 30).until(marker_present)

        # Extra fixed pause after the marker element has appeared.
        time.sleep(2)

        page_html = browser.page_source
        logger.debug(f"Selenium 获取页面成功,大小: {len(page_html)} 字节")
        return page_html
    except Exception as exc:
        logger.error(f"Selenium 获取页面失败: {exc}")
        return None
    finally:
        if browser:
            browser.quit()
def fetch_sse_stock_data(self) -> Dict[str, float]:
"""
Fetch the current SSE Composite Index (sh000001) value.

The visible code requests the Sina Finance JS endpoint, extracts the quoted
``hq_str_sh000001`` payload, splits it on commas and reads field 3 as the
price. Outside trading hours (per ``self.is_trading_time()``) nothing is
fetched.

Returns a dict with ``'time'`` (an ``HH:MM`` string) and ``'value'`` (float),
or an empty dict when the fetch is skipped or fails.
"""
# NOTE(review): this span also contains the statements of an lxml/XPath
# scraper of https://www.sse.com.cn/ interleaved with the Sina-endpoint code
# (e.g. `sse_url` is assigned twice and there are two generic
# `except Exception` handlers). It looks like a diff view showing removed and
# added lines together -- confirm against the real file before changing logic.
# NOTE(review): the annotation says Dict[str, float], but 'time' holds a str.
# Skip the fetch outside trading hours.
from datetime import datetime
import re
if not self.is_trading_time():
logger.info("当前为非交易时间,跳过股票数据爬取")
return {}
sse_url = "https://www.sse.com.cn/"
xpath = "//*[@id=\"hq_controller\"]/table/tbody/tr/td[1]/rowspan[2]/i[1]"
logger.info(f"开始爬取上海证券交易所数据: {sse_url}")
html = self._fetch_with_retry(sse_url)
if not html:
logger.warning("上海证券交易所网页获取失败")
return {}
sse_url = "https://hq.sinajs.cn/list=sh000001"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Referer': 'https://finance.sina.com.cn/'
}
logger.info(f"通过新浪JS接口获取上证指数数据: {sse_url}")
try:
# Parse the HTML with lxml.
tree = etree.HTML(html)
# Try to read the stock value element.
elements = tree.xpath(xpath)
if not elements:
logger.warning("未找到股票数据元素尝试备用XPath")
# Try the fallback XPaths in order; stop at the first one that matches.
backup_xpaths = [
"//*[@id='hq_area']",
"//*[@id='hq_controller']//td[contains(@class, 'price')]//text()",
"//*[contains(@class, 'stock-price')]//text()",
"//*[contains(@class, 'price')]//text()"
]
for backup_xpath in backup_xpaths:
elements = tree.xpath(backup_xpath)
if elements:
logger.info(f"使用备用XPath获取到数据")
break
if elements:
# Take the text content of the first match and try to convert it to a number.
text_content = etree.tostring(elements[0], method='text', encoding='unicode').strip()
logger.info(f"获取到股票数据文本: {text_content}")
# Extract the numeric part (it may contain commas, a decimal point, etc.).
import re
# Match digits, including comma separators and an optional decimal part.
numbers = re.findall(r'[\d,]+(?:\.\d+)?', text_content)
if numbers:
# Strip the commas and convert to float.
value_str = numbers[0].replace(',', '')
try:
stock_value = float(value_str)
# Timestamp the data point with the current local time (HH:MM).
from datetime import datetime
current_time = datetime.now().strftime("%H:%M")
logger.info(f"成功获取股票数据: 时间={current_time}, 值={stock_value}")
return {
'time': current_time,
'value': stock_value
}
except ValueError as e:
logger.error(f"数值转换失败: {value_str}, 错误: {e}")
else:
logger.warning(f"未找到有效数值: {text_content}")
else:
logger.warning("未找到股票数据元素")
response = self.session.get(sse_url, timeout=30, headers=headers)
response.raise_for_status()
content = response.text
logger.debug(f"获取到响应: {content}")
# The payload is the quoted string assigned to `var hq_str_sh000001`.
pattern = r'var hq_str_sh000001="([^"]+)"'
match = re.search(pattern, content)
if not match:
logger.warning("未能解析新浪JS接口返回数据")
return {}
data_fields = match.group(1).split(',')
# Fewer than 32 comma-separated fields is treated as an unusable response.
if len(data_fields) < 32:
logger.warning(f"数据字段不足: {len(data_fields)}")
return {}
stock_name = data_fields[0]
# Field 3 is read as the current price; an empty field becomes 0.0.
current_price = float(data_fields[3]) if data_fields[3] else 0.0
current_time = datetime.now().strftime("%H:%M")
logger.info(f"✓ 成功获取 {stock_name}: {current_price} (时间: {current_time})")
return {
'time': current_time,
'value': current_price
}
except requests.RequestException as e:
logger.error(f"请求新浪JS接口失败: {e}")
return {}
except (ValueError, IndexError) as e:
logger.error(f"解析数据失败: {e}")
return {}
except Exception as e:
logger.error(f"解析上海证券交易所数据失败: {e}")
return {}
logger.error(f"获取股票数据异常: {e}")
return {}
def set_user_agent(self, user_agent: str):
"""更新User-Agent"""
@@ -270,3 +297,119 @@ class SpiderManager:
def get_fetch_interval(self) -> int:
"""Return the configured fetch interval.

Reads ``'fetch_interval'`` from ``self.config`` and falls back to 60 when
the key is absent.
"""
return self.config.get('fetch_interval', 60)
def fetch_sse_screenshot(self) -> str:
    """Capture a screenshot of one element of the SSE home page.

    Opens https://www.sse.com.cn/ in headless Chromium through Playwright,
    locates the element selected by the XPath below and saves its screenshot
    as ``sse_screenshot.png`` in the directory containing this module.

    This method was previously defined twice with identical bodies (the
    second definition silently replaced the first); only one is kept.

    Returns:
        The screenshot file path on success, or an empty string when the
        target element is not found or any error occurs. Errors are logged,
        never raised.
    """
    # Imported lazily so the module can be loaded without Playwright.
    # NOTE(review): requires the `playwright` package and an installed
    # Chromium build -- confirm it is still listed in the project dependencies.
    from playwright.sync_api import sync_playwright
    import os

    url = "https://www.sse.com.cn/"
    xpath = "//div[contains(@class,'gray_bg')]//div[contains(@class,'col-md-7')]"
    logger.info(f"开始爬取上证所网页截图: {url}")
    logger.info(f"目标XPath: {xpath}")

    output_dir = os.path.dirname(os.path.abspath(__file__))
    screenshot_path = os.path.join(output_dir, "sse_screenshot.png")

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.set_default_timeout(60000)

                logger.info("正在访问页面...")
                page.goto(url, wait_until="networkidle")
                logger.info("等待页面加载完成...")
                page.wait_for_load_state("domcontentloaded")
                # Fixed extra pause before looking for the element.
                page.wait_for_timeout(5000)

                logger.info(f"查找XPath元素: {xpath}")
                # Locators are strict: is_visible()/screenshot() raise when
                # the XPath matches several nodes, so pin the first match.
                element = page.locator(f"xpath={xpath}").first
                if element.count() == 0:
                    logger.warning("✗ 未找到目标元素")
                    return ""

                logger.info("✓ 找到目标元素")
                is_visible = element.is_visible()
                logger.info(f"元素可见: {is_visible}")
                if not is_visible:
                    logger.info("元素不可见,尝试滚动到可见区域...")
                    element.scroll_into_view_if_needed()
                    page.wait_for_timeout(2000)

                logger.info(f"正在截取元素截图到: {screenshot_path}")
                element.screenshot(path=screenshot_path)
                logger.info("✓ 截屏成功")
                return screenshot_path
            finally:
                # Close the browser on every path, including when navigation
                # or the screenshot raises.
                browser.close()
    except Exception as e:
        logger.error(f"爬取上证所截图失败: {e}")
        return ""

BIN
sse_screenshot.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 57 KiB