diff --git a/.gitignore b/.gitignore index 5feab33..2316695 100644 --- a/.gitignore +++ b/.gitignore @@ -50,3 +50,4 @@ Thumbs.db # 临时文件 *.tmp *.bak +test*.* diff --git a/llm_analyzer.py b/llm_analyzer.py index 132f539..b884369 100644 --- a/llm_analyzer.py +++ b/llm_analyzer.py @@ -16,14 +16,17 @@ class LLMAnalyzer: SYSTEM_PROMPT = """你是一个专业的情感分析助手。你的任务是分析股吧/论坛评论的情感倾向,判断投资者对该股票的态度。 评分规则: -- 0-30: 极度悲观/看空(利空、暴跌、绝望等情绪) -- 31-50: 偏悲观/中性(担忧、谨慎、观望等情绪) -- 51-70: 偏乐观/中性(看好、希望、期待等情绪) -- 71-100: 极度乐观/看涨(利好、暴涨、兴奋等情绪) +- 0-30: 极度悲观(利空、暴跌、绝望等情绪) +- 30-39: 悲观(看空、担忧、谨慎等情绪) +- 39-45: 偏悲观(谨慎观望、保守等情绪) +- 45-55: 中立(观望、客观等情绪) +- 55-65: 偏乐观(看好、希望等情绪) +- 65-70: 乐观(看涨、信心等情绪) +- 70-100: 极度乐观(利好、暴涨、兴奋等情绪) 请直接输出一个JSON格式的结果,包含两个字段: - score: 0-100的整数评分 -- label: 简短的态度描述(如"强烈看跌"、"谨慎观望"、"温和看涨"、"强烈看涨"等) +- label: 简短的态度描述(如"极度悲观"、"悲观"、"偏悲观"、"中立"、"偏乐观"、"乐观"、"极度乐观") 注意: 1. 只返回JSON,不要有其他文字 diff --git a/main.py b/main.py index e18e7e5..ded22e0 100644 --- a/main.py +++ b/main.py @@ -24,6 +24,7 @@ class BackendWorker(QObject): error_occurred = Signal(str) status_update = Signal(str) stock_data_fetched = Signal(str, float) # 股票数据获取信号 + sse_screenshot_fetched = Signal(str) # 上证所截图获取信号 def __init__(self, config_manager: ConfigManager, db_manager: DatabaseManager, spider: SpiderManager, analyzer: LLMAnalyzer): @@ -197,6 +198,21 @@ class BackendWorker(QObject): except Exception as e: logger.error(f"爬取股票数据失败: {str(e)}") + def fetch_sse_screenshot(self): + """爬取上证所网页元素截图""" + try: + logger.info("开始爬取上证所网页截图") + screenshot_path = self.spider.fetch_sse_screenshot() + + if screenshot_path: + logger.info(f"成功获取截图: {screenshot_path}") + self.sse_screenshot_fetched.emit(screenshot_path) + else: + logger.warning("未能获取有效的截图") + + except Exception as e: + logger.error(f"爬取截图失败: {str(e)}") + def manual_refresh(self): """手动刷新""" logger.info("用户手动刷新") @@ -283,6 +299,7 @@ def main(): worker.analysis_finished.connect(window.update_indicator) worker.error_occurred.connect(lambda msg: window.show_message("错误", msg)) worker.stock_data_fetched.connect(window.add_waveform_data) + worker.sse_screenshot_fetched.connect(window.update_sse_screenshot) # 启动时从数据库初始化指示器显示 worker._update_indicator() @@ -302,6 +319,12 @@ def main(): stock_timer.start(60000) # 每分钟爬取一次股票数据 logger.info("股票数据爬取定时器已启动,间隔60秒") + # 启动上证所截图爬取定时器 + screenshot_timer = QTimer() + screenshot_timer.timeout.connect(worker.fetch_sse_screenshot) + screenshot_timer.start(300000) # 每5分钟爬取一次截图 + logger.info("上证所截图爬取定时器已启动,间隔300秒") + # 确保应用退出时清理线程 def cleanup(): logger.info("清理资源,停止后台线程...") diff --git a/main_window.py b/main_window.py index 384cbfe..002106d 100644 --- a/main_window.py +++ b/main_window.py @@ -4,9 +4,9 @@ PySide6 GUI界面模块 from PySide6.QtWidgets import (QWidget, QVBoxLayout, QHBoxLayout, QLabel, QPushButton, QSlider, QDialog, QFormLayout, QLineEdit, QSpinBox, QMessageBox, QSystemTrayIcon, - QMenu, QTextEdit, QGroupBox, QDialogButtonBox, QCheckBox) + QMenu, QTextEdit, QGroupBox, QDialogButtonBox, QCheckBox, QScrollArea) from PySide6.QtCore import Qt, QTimer, Signal, QPoint -from PySide6.QtGui import QFont, QColor, QPainter, QBrush, QPen, QIcon, QAction +from PySide6.QtGui import QFont, QColor, QPainter, QBrush, QPen, QIcon, QAction, QPixmap from typing import Callable, Optional from loguru import logger @@ -95,16 +95,20 @@ class SentimentIndicator(QWidget): def get_description(self, score: int) -> str: """获取描述文本""" - if score < 20: - return "极度看跌" - elif score < 40: + if score < 30: + return "极度悲观" + elif score < 39: + return "悲观" + elif score < 45: return "偏悲观" - elif score < 60: - return "中性" - elif score < 80: + elif score < 55: + return "中立" + elif score < 65: return "偏乐观" + elif score < 70: + return "乐观" else: - return "极度看涨" + return "极度乐观" class ConfigDialog(QDialog): @@ -263,6 +267,22 @@ class MainWindow(QWidget): self.waveform_widget = WaveformWidget() self.waveform_widget.setMinimumHeight(200) + # 上证所截图显示 + screenshot_group = QGroupBox("上证所行情") + screenshot_layout = QVBoxLayout(screenshot_group) + + self.screenshot_label = QLabel("等待截图...") + self.screenshot_label.setAlignment(Qt.AlignCenter) + self.screenshot_label.setMinimumSize(400, 200) + self.screenshot_label.setStyleSheet("border: 1px solid #ccc; background-color: #f0f0f0;") + + screenshot_scroll = QScrollArea() + screenshot_scroll.setWidget(self.screenshot_label) + screenshot_scroll.setWidgetResizable(True) + screenshot_scroll.setMinimumHeight(150) + + screenshot_layout.addWidget(screenshot_scroll) + # 按钮 btn_layout = QHBoxLayout() self.refresh_btn = QPushButton("刷新") @@ -279,6 +299,7 @@ class MainWindow(QWidget): layout.addWidget(self.score_label) layout.addWidget(self.status_label) layout.addWidget(self.waveform_widget) + layout.addWidget(screenshot_group) layout.addLayout(btn_layout) # 设置窗口标志(无边框、可拖拽) @@ -426,6 +447,22 @@ class MainWindow(QWidget): self.waveform_widget.add_data_point(time_str, value) logger.info(f"添加波形图数据点: 时间={time_str}, 值={value}") + def update_sse_screenshot(self, screenshot_path: str): + """更新上证所截图显示""" + logger.info(f"更新截图显示: {screenshot_path}") + pixmap = QPixmap(screenshot_path) + if not pixmap.isNull(): + self.screenshot_label.setPixmap(pixmap.scaled( + self.screenshot_label.size(), + Qt.KeepAspectRatio, + Qt.SmoothTransformation + )) + self.screenshot_label.setText("") + logger.info("截图显示更新成功") + else: + self.screenshot_label.setText("截图加载失败") + logger.warning("截图加载失败") + class QCheckBox(QPushButton): """自定义复选框""" diff --git a/requirements.txt b/requirements.txt index bddce44..418135f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,4 @@ requests>=2.31.0 beautifulsoup4>=4.12.0 lxml>=4.9.0 openai>=1.0.0 -playwright>=1.40.0 +selenium>=4.15.0 diff --git a/spider.py b/spider.py index 60d2180..1e1e65c 100644 --- a/spider.py +++ b/spider.py @@ -8,6 +8,12 @@ from typing import List, Dict, Optional from urllib.parse import urljoin from bs4 import BeautifulSoup import random +from selenium import webdriver +from selenium.webdriver.chrome.service import Service +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC from loguru import logger @@ -174,84 +180,105 @@ class SpiderManager: logger.debug(f"当前时间 {current_time.strftime('%H:%M')} 是否为交易时间: {is_trading}") return is_trading + def _fetch_sse_with_selenium(self, url: str) -> Optional[str]: + """使用 Selenium 获取页面内容""" + driver = None + try: + # 配置 Chrome 选项 + chrome_options = Options() + chrome_options.add_argument('--headless') + chrome_options.add_argument('--no-sandbox') + chrome_options.add_argument('--disable-dev-shm-usage') + chrome_options.add_argument('--disable-gpu') + chrome_options.add_argument('--window-size=1920,1080') + chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36') + + # 创建 WebDriver + driver = webdriver.Chrome(options=chrome_options) + + # 访问页面 + driver.get(url) + + # 等待页面加载 + WebDriverWait(driver, 30).until( + EC.presence_of_element_located((By.ID, "wmt_china")) + ) + + # 额外等待确保数据加载完成 + time.sleep(2) + + # 获取页面内容 + html = driver.page_source + + logger.debug(f"Selenium 获取页面成功,大小: {len(html)} 字节") + return html + + except Exception as e: + logger.error(f"Selenium 获取页面失败: {e}") + return None + finally: + if driver: + driver.quit() + def fetch_sse_stock_data(self) -> Dict[str, float]: """ - 爬取上海证券交易所股票数据 + 使用新浪财经JS接口获取上证指数数据 返回包含时间和数值的字典 """ - # 检查是否为交易时间 + from datetime import datetime + import re + if not self.is_trading_time(): logger.info("当前为非交易时间,跳过股票数据爬取") return {} - - sse_url = "https://www.sse.com.cn/" - xpath = "//*[@id=\"hq_controller\"]/table/tbody/tr/td[1]/rowspan[2]/i[1]" - - logger.info(f"开始爬取上海证券交易所数据: {sse_url}") - - html = self._fetch_with_retry(sse_url) - if not html: - logger.warning("上海证券交易所网页获取失败") - return {} - + + sse_url = "https://hq.sinajs.cn/list=sh000001" + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Referer': 'https://finance.sina.com.cn/' + } + logger.info(f"通过新浪JS接口获取上证指数数据: {sse_url}") + try: - # 使用 lxml 解析 - tree = etree.HTML(html) - - # 尝试获取股票数值 - elements = tree.xpath(xpath) - if not elements: - logger.warning("未找到股票数据元素,尝试备用XPath") - # 尝试备用XPath - backup_xpaths = [ - "//*[@id='hq_area']", - "//*[@id='hq_controller']//td[contains(@class, 'price')]//text()", - "//*[contains(@class, 'stock-price')]//text()", - "//*[contains(@class, 'price')]//text()" - ] - - for backup_xpath in backup_xpaths: - elements = tree.xpath(backup_xpath) - if elements: - logger.info(f"使用备用XPath获取到数据") - break - - if elements: - # 获取文本内容并尝试转换为数值 - text_content = etree.tostring(elements[0], method='text', encoding='unicode').strip() - logger.info(f"获取到股票数据文本: {text_content}") - - # 提取数值(可能包含逗号、小数点等) - import re - # 匹配数字(包括小数点和逗号分隔符) - numbers = re.findall(r'[\d,]+(?:\.\d+)?', text_content) - if numbers: - # 去除逗号并转换为浮点数 - value_str = numbers[0].replace(',', '') - try: - stock_value = float(value_str) - - # 获取当前时间 - from datetime import datetime - current_time = datetime.now().strftime("%H:%M") - - logger.info(f"成功获取股票数据: 时间={current_time}, 值={stock_value}") - - return { - 'time': current_time, - 'value': stock_value - } - except ValueError as e: - logger.error(f"数值转换失败: {value_str}, 错误: {e}") - else: - logger.warning(f"未找到有效数值: {text_content}") - else: - logger.warning("未找到股票数据元素") - + response = self.session.get(sse_url, timeout=30, headers=headers) + response.raise_for_status() + + content = response.text + logger.debug(f"获取到响应: {content}") + + pattern = r'var hq_str_sh000001="([^"]+)"' + match = re.search(pattern, content) + + if not match: + logger.warning("未能解析新浪JS接口返回数据") + return {} + + data_fields = match.group(1).split(',') + + if len(data_fields) < 32: + logger.warning(f"数据字段不足: {len(data_fields)}") + return {} + + stock_name = data_fields[0] + current_price = float(data_fields[3]) if data_fields[3] else 0.0 + current_time = datetime.now().strftime("%H:%M") + + logger.info(f"✓ 成功获取 {stock_name}: {current_price} (时间: {current_time})") + + return { + 'time': current_time, + 'value': current_price + } + + except requests.RequestException as e: + logger.error(f"请求新浪JS接口失败: {e}") + return {} + except (ValueError, IndexError) as e: + logger.error(f"解析数据失败: {e}") + return {} except Exception as e: - logger.error(f"解析上海证券交易所数据失败: {e}") - - return {} + logger.error(f"获取股票数据异常: {e}") + return {} def set_user_agent(self, user_agent: str): """更新User-Agent""" @@ -270,3 +297,119 @@ class SpiderManager: def get_fetch_interval(self) -> int: """获取爬取间隔""" return self.config.get('fetch_interval', 60) + + def fetch_sse_screenshot(self) -> str: + """ + 爬取上证所网页指定元素截图 + 返回截图文件路径 + """ + from playwright.sync_api import sync_playwright + import os + + url = "https://www.sse.com.cn/" + xpath = "//div[contains(@class,'gray_bg')]//div[contains(@class,'col-md-7')]" + + logger.info(f"开始爬取上证所网页截图: {url}") + logger.info(f"目标XPath: {xpath}") + + output_dir = os.path.dirname(os.path.abspath(__file__)) + screenshot_path = os.path.join(output_dir, "sse_screenshot.png") + + try: + with sync_playwright() as p: + browser = p.chromium.launch(headless=True) + page = browser.new_page() + + page.set_default_timeout(60000) + logger.info("正在访问页面...") + page.goto(url, wait_until="networkidle") + + logger.info("等待页面加载完成...") + page.wait_for_load_state("domcontentloaded") + page.wait_for_timeout(5000) + + logger.info(f"查找XPath元素: {xpath}") + element = page.locator(f"xpath={xpath}") + + if element.count() > 0: + logger.info("✓ 找到目标元素") + + is_visible = element.is_visible() + logger.info(f"元素可见: {is_visible}") + + if not is_visible: + logger.info("元素不可见,尝试滚动到可见区域...") + element.scroll_into_view_if_needed() + page.wait_for_timeout(2000) + + logger.info(f"正在截取元素截图到: {screenshot_path}") + element.screenshot(path=screenshot_path) + logger.info("✓ 截屏成功") + browser.close() + return screenshot_path + else: + logger.warning("✗ 未找到目标元素") + browser.close() + return "" + + except Exception as e: + logger.error(f"爬取上证所截图失败: {e}") + return "" + + def fetch_sse_screenshot(self) -> str: + """ + 爬取上证所网页指定元素截图 + 返回截图文件路径 + """ + from playwright.sync_api import sync_playwright + import os + + url = "https://www.sse.com.cn/" + xpath = "//div[contains(@class,'gray_bg')]//div[contains(@class,'col-md-7')]" + + logger.info(f"开始爬取上证所网页截图: {url}") + logger.info(f"目标XPath: {xpath}") + + output_dir = os.path.dirname(os.path.abspath(__file__)) + screenshot_path = os.path.join(output_dir, "sse_screenshot.png") + + try: + with sync_playwright() as p: + browser = p.chromium.launch(headless=True) + page = browser.new_page() + + page.set_default_timeout(60000) + logger.info("正在访问页面...") + page.goto(url, wait_until="networkidle") + + logger.info("等待页面加载完成...") + page.wait_for_load_state("domcontentloaded") + page.wait_for_timeout(5000) + + logger.info(f"查找XPath元素: {xpath}") + element = page.locator(f"xpath={xpath}") + + if element.count() > 0: + logger.info("✓ 找到目标元素") + + is_visible = element.is_visible() + logger.info(f"元素可见: {is_visible}") + + if not is_visible: + logger.info("元素不可见,尝试滚动到可见区域...") + element.scroll_into_view_if_needed() + page.wait_for_timeout(2000) + + logger.info(f"正在截取元素截图到: {screenshot_path}") + element.screenshot(path=screenshot_path) + logger.info("✓ 截屏成功") + browser.close() + return screenshot_path + else: + logger.warning("✗ 未找到目标元素") + browser.close() + return "" + + except Exception as e: + logger.error(f"爬取上证所截图失败: {e}") + return "" diff --git a/sse_screenshot.png b/sse_screenshot.png new file mode 100644 index 0000000..5ca40f0 Binary files /dev/null and b/sse_screenshot.png differ