feat: 添加上证所截图功能并优化股票数据获取
- 新增上证所网页元素截图功能,使用 Playwright 实现 - 优化股票数据获取方式,改用新浪财经 JS 接口 - 调整情感分析评分规则为 7 级分类 - 添加截图显示组件到主界面 - 更新依赖项:requirements 中以 selenium 替换 playwright(注意:截图功能仍依赖 Playwright)
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -50,3 +50,4 @@ Thumbs.db
|
||||
# 临时文件
|
||||
*.tmp
|
||||
*.bak
|
||||
test*.*
|
||||
|
||||
@@ -16,14 +16,17 @@ class LLMAnalyzer:
|
||||
SYSTEM_PROMPT = """你是一个专业的情感分析助手。你的任务是分析股吧/论坛评论的情感倾向,判断投资者对该股票的态度。
|
||||
|
||||
评分规则:
|
||||
- 0-30: 极度悲观/看空(利空、暴跌、绝望等情绪)
|
||||
- 31-50: 偏悲观/中性(担忧、谨慎、观望等情绪)
|
||||
- 51-70: 偏乐观/中性(看好、希望、期待等情绪)
|
||||
- 71-100: 极度乐观/看涨(利好、暴涨、兴奋等情绪)
|
||||
- 0-30: 极度悲观(利空、暴跌、绝望等情绪)
|
||||
- 30-39: 悲观(看空、担忧、谨慎等情绪)
|
||||
- 39-45: 偏悲观(谨慎观望、保守等情绪)
|
||||
- 45-55: 中立(观望、客观等情绪)
|
||||
- 55-65: 偏乐观(看好、希望等情绪)
|
||||
- 65-70: 乐观(看涨、信心等情绪)
|
||||
- 70-100: 极度乐观(利好、暴涨、兴奋等情绪)
|
||||
|
||||
请直接输出一个JSON格式的结果,包含两个字段:
|
||||
- score: 0-100的整数评分
|
||||
- label: 简短的态度描述(如"强烈看跌"、"谨慎观望"、"温和看涨"、"强烈看涨"等)
|
||||
- label: 简短的态度描述(如"极度悲观"、"悲观"、"偏悲观"、"中立"、"偏乐观"、"乐观"、"极度乐观")
|
||||
|
||||
注意:
|
||||
1. 只返回JSON,不要有其他文字
|
||||
|
||||
23
main.py
23
main.py
@@ -24,6 +24,7 @@ class BackendWorker(QObject):
|
||||
error_occurred = Signal(str)
|
||||
status_update = Signal(str)
|
||||
stock_data_fetched = Signal(str, float) # 股票数据获取信号
|
||||
sse_screenshot_fetched = Signal(str) # 上证所截图获取信号
|
||||
|
||||
def __init__(self, config_manager: ConfigManager, db_manager: DatabaseManager,
|
||||
spider: SpiderManager, analyzer: LLMAnalyzer):
|
||||
@@ -197,6 +198,21 @@ class BackendWorker(QObject):
|
||||
except Exception as e:
|
||||
logger.error(f"爬取股票数据失败: {str(e)}")
|
||||
|
||||
def fetch_sse_screenshot(self):
    """Fetch the SSE page screenshot through the spider and publish its path.

    Emits ``sse_screenshot_fetched`` with the path when the spider returns a
    non-empty one; otherwise only logs a warning. Any exception is logged and
    swallowed, so this method never raises to its caller.
    """
    try:
        logger.info("开始爬取上证所网页截图")
        path = self.spider.fetch_sse_screenshot()

        # Guard clause: an empty/None path means the spider produced nothing.
        if not path:
            logger.warning("未能获取有效的截图")
            return

        logger.info(f"成功获取截图: {path}")
        self.sse_screenshot_fetched.emit(path)
    except Exception as e:
        logger.error(f"爬取截图失败: {str(e)}")
|
||||
|
||||
def manual_refresh(self):
|
||||
"""手动刷新"""
|
||||
logger.info("用户手动刷新")
|
||||
@@ -283,6 +299,7 @@ def main():
|
||||
worker.analysis_finished.connect(window.update_indicator)
|
||||
worker.error_occurred.connect(lambda msg: window.show_message("错误", msg))
|
||||
worker.stock_data_fetched.connect(window.add_waveform_data)
|
||||
worker.sse_screenshot_fetched.connect(window.update_sse_screenshot)
|
||||
|
||||
# 启动时从数据库初始化指示器显示
|
||||
worker._update_indicator()
|
||||
@@ -302,6 +319,12 @@ def main():
|
||||
stock_timer.start(60000) # 每分钟爬取一次股票数据
|
||||
logger.info("股票数据爬取定时器已启动,间隔60秒")
|
||||
|
||||
# 启动上证所截图爬取定时器
|
||||
screenshot_timer = QTimer()
|
||||
screenshot_timer.timeout.connect(worker.fetch_sse_screenshot)
|
||||
screenshot_timer.start(300000) # 每5分钟爬取一次截图
|
||||
logger.info("上证所截图爬取定时器已启动,间隔300秒")
|
||||
|
||||
# 确保应用退出时清理线程
|
||||
def cleanup():
|
||||
logger.info("清理资源,停止后台线程...")
|
||||
|
||||
@@ -4,9 +4,9 @@ PySide6 GUI界面模块
|
||||
from PySide6.QtWidgets import (QWidget, QVBoxLayout, QHBoxLayout, QLabel,
|
||||
QPushButton, QSlider, QDialog, QFormLayout,
|
||||
QLineEdit, QSpinBox, QMessageBox, QSystemTrayIcon,
|
||||
QMenu, QTextEdit, QGroupBox, QDialogButtonBox, QCheckBox)
|
||||
QMenu, QTextEdit, QGroupBox, QDialogButtonBox, QCheckBox, QScrollArea)
|
||||
from PySide6.QtCore import Qt, QTimer, Signal, QPoint
|
||||
from PySide6.QtGui import QFont, QColor, QPainter, QBrush, QPen, QIcon, QAction
|
||||
from PySide6.QtGui import QFont, QColor, QPainter, QBrush, QPen, QIcon, QAction, QPixmap
|
||||
from typing import Callable, Optional
|
||||
from loguru import logger
|
||||
|
||||
@@ -95,16 +95,20 @@ class SentimentIndicator(QWidget):
|
||||
|
||||
def get_description(self, score: int) -> str:
    """Map a sentiment score to its seven-level description label.

    Args:
        score: Sentiment score; the bands below are designed for 0-100.

    Returns:
        The label of the first band whose exclusive upper bound is greater
        than ``score``; scores of 70 and above map to the top band.
    """
    # (exclusive upper bound, label), in ascending order.
    bands = (
        (30, "极度悲观"),
        (39, "悲观"),
        (45, "偏悲观"),
        (55, "中立"),
        (65, "偏乐观"),
        (70, "乐观"),
    )
    for upper_bound, label in bands:
        if score < upper_bound:
            return label
    return "极度乐观"
|
||||
|
||||
|
||||
class ConfigDialog(QDialog):
|
||||
@@ -263,6 +267,22 @@ class MainWindow(QWidget):
|
||||
self.waveform_widget = WaveformWidget()
|
||||
self.waveform_widget.setMinimumHeight(200)
|
||||
|
||||
# 上证所截图显示
|
||||
screenshot_group = QGroupBox("上证所行情")
|
||||
screenshot_layout = QVBoxLayout(screenshot_group)
|
||||
|
||||
self.screenshot_label = QLabel("等待截图...")
|
||||
self.screenshot_label.setAlignment(Qt.AlignCenter)
|
||||
self.screenshot_label.setMinimumSize(400, 200)
|
||||
self.screenshot_label.setStyleSheet("border: 1px solid #ccc; background-color: #f0f0f0;")
|
||||
|
||||
screenshot_scroll = QScrollArea()
|
||||
screenshot_scroll.setWidget(self.screenshot_label)
|
||||
screenshot_scroll.setWidgetResizable(True)
|
||||
screenshot_scroll.setMinimumHeight(150)
|
||||
|
||||
screenshot_layout.addWidget(screenshot_scroll)
|
||||
|
||||
# 按钮
|
||||
btn_layout = QHBoxLayout()
|
||||
self.refresh_btn = QPushButton("刷新")
|
||||
@@ -279,6 +299,7 @@ class MainWindow(QWidget):
|
||||
layout.addWidget(self.score_label)
|
||||
layout.addWidget(self.status_label)
|
||||
layout.addWidget(self.waveform_widget)
|
||||
layout.addWidget(screenshot_group)
|
||||
layout.addLayout(btn_layout)
|
||||
|
||||
# 设置窗口标志(无边框、可拖拽)
|
||||
@@ -426,6 +447,22 @@ class MainWindow(QWidget):
|
||||
self.waveform_widget.add_data_point(time_str, value)
|
||||
logger.info(f"添加波形图数据点: 时间={time_str}, 值={value}")
|
||||
|
||||
def update_sse_screenshot(self, screenshot_path: str):
    """Display the screenshot stored at ``screenshot_path`` in the screenshot label.

    The image is scaled to the label's current size, keeping its aspect
    ratio. If the file cannot be loaded as an image, the label shows a
    failure message instead.

    Args:
        screenshot_path: Path of the image file to load.
    """
    logger.info(f"更新截图显示: {screenshot_path}")
    pixmap = QPixmap(screenshot_path)

    if pixmap.isNull():
        self.screenshot_label.setText("截图加载失败")
        logger.warning("截图加载失败")
        return

    scaled = pixmap.scaled(
        self.screenshot_label.size(),
        Qt.KeepAspectRatio,
        Qt.SmoothTransformation,
    )
    # setPixmap() replaces whatever the label showed before (including the
    # placeholder text), so no setText("") is needed afterwards; setting
    # text is documented to clear the label's content, i.e. the pixmap.
    self.screenshot_label.setPixmap(scaled)
    logger.info("截图显示更新成功")
|
||||
|
||||
|
||||
class QCheckBox(QPushButton):
|
||||
"""自定义复选框"""
|
||||
|
||||
@@ -3,4 +3,4 @@ requests>=2.31.0
|
||||
beautifulsoup4>=4.12.0
|
||||
lxml>=4.9.0
|
||||
openai>=1.0.0
|
||||
playwright>=1.40.0
|
||||
selenium>=4.15.0
|
||||
|
||||
281
spider.py
281
spider.py
@@ -8,6 +8,12 @@ from typing import List, Dict, Optional
|
||||
from urllib.parse import urljoin
|
||||
from bs4 import BeautifulSoup
|
||||
import random
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from loguru import logger
|
||||
|
||||
|
||||
@@ -174,84 +180,105 @@ class SpiderManager:
|
||||
logger.debug(f"当前时间 {current_time.strftime('%H:%M')} 是否为交易时间: {is_trading}")
|
||||
return is_trading
|
||||
|
||||
def _fetch_sse_with_selenium(self, url: str) -> Optional[str]:
    """Load ``url`` in headless Chrome via Selenium and return the page HTML.

    Waits up to 30 seconds for an element with id ``wmt_china`` to be
    present, then sleeps a further 2 seconds before reading the page source.

    Args:
        url: Address of the page to load.

    Returns:
        The page source on success, or ``None`` if anything fails. The
        driver is always quit once it has been created.
    """
    # Chrome command-line switches, applied in this order.
    browser_args = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--window-size=1920,1080',
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    )

    driver = None
    try:
        opts = Options()
        for arg in browser_args:
            opts.add_argument(arg)

        driver = webdriver.Chrome(options=opts)
        driver.get(url)

        # Block until the marker element exists in the DOM.
        marker_present = EC.presence_of_element_located((By.ID, "wmt_china"))
        WebDriverWait(driver, 30).until(marker_present)

        # Extra fixed delay after the marker element appears.
        time.sleep(2)

        page_html = driver.page_source
        logger.debug(f"Selenium 获取页面成功,大小: {len(page_html)} 字节")
        return page_html
    except Exception as e:
        logger.error(f"Selenium 获取页面失败: {e}")
        return None
    finally:
        if driver:
            driver.quit()
|
||||
|
||||
def fetch_sse_stock_data(self) -> Dict[str, float]:
    """Fetch the current Shanghai Composite index value from the Sina JS quote endpoint.

    Outside trading hours (per ``self.is_trading_time()``) no request is made.

    Returns:
        ``{'time': 'HH:MM', 'value': <price>}`` on success, where ``time`` is
        the local wall-clock time of the fetch (a string) and ``value`` is the
        parsed price. An empty dict is returned when outside trading hours,
        when the request fails, or when the response cannot be parsed.
    """
    from datetime import datetime
    import re

    if not self.is_trading_time():
        logger.info("当前为非交易时间,跳过股票数据爬取")
        return {}

    sse_url = "https://hq.sinajs.cn/list=sh000001"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Referer': 'https://finance.sina.com.cn/'
    }
    logger.info(f"通过新浪JS接口获取上证指数数据: {sse_url}")

    try:
        response = self.session.get(sse_url, timeout=30, headers=headers)
        response.raise_for_status()

        content = response.text
        logger.debug(f"获取到响应: {content}")

        # The payload is parsed as: var hq_str_sh000001="f0,f1,f2,...";
        pattern = r'var hq_str_sh000001="([^"]+)"'
        match = re.search(pattern, content)
        if not match:
            logger.warning("未能解析新浪JS接口返回数据")
            return {}

        data_fields = match.group(1).split(',')
        if len(data_fields) < 32:
            logger.warning(f"数据字段不足: {len(data_fields)}")
            return {}

        stock_name = data_fields[0]
        price_text = data_fields[3].strip()
        if not price_text:
            # An empty price field is "no data", not a real quote of 0.0;
            # reporting it would add a bogus zero point for consumers.
            logger.warning("新浪JS接口返回的当前价格为空")
            return {}

        current_price = float(price_text)
        current_time = datetime.now().strftime("%H:%M")

        logger.info(f"✓ 成功获取 {stock_name}: {current_price} (时间: {current_time})")

        return {
            'time': current_time,
            'value': current_price
        }

    except requests.RequestException as e:
        logger.error(f"请求新浪JS接口失败: {e}")
        return {}
    except (ValueError, IndexError) as e:
        logger.error(f"解析数据失败: {e}")
        return {}
    except Exception as e:
        logger.error(f"获取股票数据异常: {e}")
        return {}
|
||||
|
||||
def set_user_agent(self, user_agent: str):
|
||||
"""更新User-Agent"""
|
||||
@@ -270,3 +297,119 @@ class SpiderManager:
|
||||
def get_fetch_interval(self) -> int:
    """Return the ``fetch_interval`` config value, or 60 when it is not set."""
    fallback_interval = 60
    interval = self.config.get('fetch_interval', fallback_interval)
    return interval
|
||||
|
||||
def fetch_sse_screenshot(self) -> str:
    """Capture a screenshot of one element of the SSE home page.

    Opens https://www.sse.com.cn/ in headless Chromium through Playwright,
    locates the target element by XPath and saves its screenshot as
    ``sse_screenshot.png`` in the directory containing this module.

    Returns:
        The screenshot file path on success, or an empty string when the
        element is not found or any error occurs (errors are logged).
    """
    import os

    url = "https://www.sse.com.cn/"
    xpath = "//div[contains(@class,'gray_bg')]//div[contains(@class,'col-md-7')]"

    logger.info(f"开始爬取上证所网页截图: {url}")
    logger.info(f"目标XPath: {xpath}")

    output_dir = os.path.dirname(os.path.abspath(__file__))
    screenshot_path = os.path.join(output_dir, "sse_screenshot.png")

    try:
        # Imported inside the try so a missing Playwright install is logged
        # and reported as "" like every other failure.
        # NOTE(review): this commit's requirements change lists selenium in
        # place of playwright — confirm Playwright is still installed.
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.set_default_timeout(60000)

                logger.info("正在访问页面...")
                page.goto(url, wait_until="networkidle")

                logger.info("等待页面加载完成...")
                page.wait_for_load_state("domcontentloaded")
                page.wait_for_timeout(5000)

                logger.info(f"查找XPath元素: {xpath}")
                matches = page.locator(f"xpath={xpath}")

                if matches.count() == 0:
                    logger.warning("✗ 未找到目标元素")
                    return ""

                logger.info("✓ 找到目标元素")

                # The XPath may match several nodes; single-element actions
                # on a multi-match locator raise a strictness error, so pin
                # the first match explicitly.
                element = matches.first

                is_visible = element.is_visible()
                logger.info(f"元素可见: {is_visible}")

                if not is_visible:
                    logger.info("元素不可见,尝试滚动到可见区域...")
                    element.scroll_into_view_if_needed()
                    page.wait_for_timeout(2000)

                logger.info(f"正在截取元素截图到: {screenshot_path}")
                element.screenshot(path=screenshot_path)
                logger.info("✓ 截屏成功")
                return screenshot_path
            finally:
                # Close the browser on every path, including exceptions.
                browser.close()

    except Exception as e:
        logger.error(f"爬取上证所截图失败: {e}")
        return ""
|
||||
|
||||
BIN
sse_screenshot.png
Normal file
BIN
sse_screenshot.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 57 KiB |
Reference in New Issue
Block a user