diff --git a/gui.py b/gui.py index f75beae..e49d035 100644 --- a/gui.py +++ b/gui.py @@ -43,6 +43,7 @@ class CaptureWorker(QThread): status_signal = Signal(str) finished_signal = Signal() error_signal = Signal(str) + div_processed_signal = Signal(int, int) # (当前div序号, 总div数) def __init__(self, scroll_capture: ScrollCaptureOCR): super().__init__() @@ -59,17 +60,24 @@ class CaptureWorker(QThread): self.scroll_capture.previous_ocr_result = [] self.scroll_capture.scroll_count = 0 self.scroll_capture.all_results = [] + self.scroll_capture.processed_divs = [] + self.scroll_capture.last_div_signature = None + self.scroll_capture.total_scroll_distance = 0 + self.scroll_capture.is_first_capture = True # 循环处理 while self.is_running and self.scroll_capture.process_once(): progress = min(self.scroll_capture.scroll_count * 10, 90) self.progress_signal.emit(progress) - self.log_signal.emit(f"第 {self.scroll_capture.scroll_count} 次截屏完成") + self.log_signal.emit(f"✓ 第 {self.scroll_capture.scroll_count} 次截屏完成," + f"累计滚动 {self.scroll_capture.total_scroll_distance} 像素") # 保存最终结果 if self.scroll_capture.all_results: self.scroll_capture.save_final_result() - self.log_signal.emit(f"✓ 共处理 {len(self.scroll_capture.all_results)} 次截屏") + total_divs = sum(len(result.get('texts', [])) for result in self.scroll_capture.all_results) + self.log_signal.emit(f"✓ 共处理 {len(self.scroll_capture.all_results)} 次截屏," + f"识别 {total_divs} 个内容区域") self.progress_signal.emit(100) self.status_signal.emit("完成") @@ -263,10 +271,11 @@ class MainWindow(QMainWindow): # 使用说明 help_text = QLabel( "使用步骤:
" - "1. 点击「开始截屏」按钮
" + "1. 点击「开始」按钮
" "2. 按住鼠标左键拖动选择区域
" - "3. 程序自动滚动截屏并OCR识别
" - "4. 检测到重复内容时自动停止" + "3. 程序自动分割div并逐个OCR
" + "4. 智能计算滚动距离,自动翻页
" + "5. 完成后点击「结束」按钮" ) help_text.setFont(QFont("Microsoft YaHei", 10)) help_text.setStyleSheet("color: #555555; line-height: 1.6;") @@ -320,16 +329,10 @@ class MainWindow(QMainWindow): button_layout.addStretch() - # 停止按钮 - self.stop_btn = ModernButton("停止", primary=False) - self.stop_btn.setEnabled(False) - self.stop_btn.clicked.connect(self.stop_capture) - button_layout.addWidget(self.stop_btn) - - # 开始按钮 - self.start_btn = ModernButton("开始截屏", primary=True) - self.start_btn.clicked.connect(self.start_capture) - button_layout.addWidget(self.start_btn) + # 操作按钮(开始/结束 二合一) + self.action_btn = ModernButton("开始", primary=True) + self.action_btn.clicked.connect(self.on_action_button_clicked) + button_layout.addWidget(self.action_btn) # 清空日志按钮 self.clear_btn = ModernButton("清空日志", primary=False) @@ -341,7 +344,7 @@ class MainWindow(QMainWindow): main_layout.addLayout(button_layout) # === 底部信息 === - footer = QLabel("按 Ctrl+F9 也可以快速启动 | 输出目录: ./output/") + footer = QLabel("点击「开始」按钮启动 | 输出目录: ./output/") footer.setFont(QFont("Microsoft YaHei", 9)) footer.setStyleSheet("color: #999999; margin-top: 10px;") footer.setAlignment(Qt.AlignCenter) @@ -359,10 +362,10 @@ class MainWindow(QMainWindow): tray_menu.addSeparator() - start_action = tray_menu.addAction("开始截屏") + start_action = tray_menu.addAction("开始") start_action.triggered.connect(self.start_capture) - stop_action = tray_menu.addAction("停止") + stop_action = tray_menu.addAction("结束") stop_action.triggered.connect(self.stop_capture) tray_menu.addSeparator() @@ -407,6 +410,13 @@ class MainWindow(QMainWindow): self.log_text.append_log(message, level) + def on_action_button_clicked(self): + """操作按钮点击事件""" + if self.action_btn.text() == "开始": + self.start_capture() + else: + self.stop_capture() + def start_capture(self): """开始截屏""" # 检查OCR服务 @@ -454,9 +464,9 @@ class MainWindow(QMainWindow): self.worker.start() - # 更新UI状态 - self.start_btn.setEnabled(False) - self.stop_btn.setEnabled(True) + # 更新UI状态 - 按钮变为"结束" + self.action_btn.setText("结束") + self.action_btn.update_style() self.progress_bar.setValue(0) def stop_capture(self): @@ -464,15 +474,16 @@ class MainWindow(QMainWindow): if self.worker and self.worker.isRunning(): self.worker.stop() self.worker.wait(1000) - self.log_text.append_log("用户手动停止", "WARNING") + self.log_text.append_log("用户手动结束", "WARNING") - self.start_btn.setEnabled(True) - self.stop_btn.setEnabled(False) - self.status_label.setText("已停止") + # 更新UI状态 - 按钮恢复为"开始" + self.action_btn.setText("开始") + self.action_btn.update_style() + self.status_label.setText("就绪") self.status_label.setStyleSheet(""" QLabel { - color: #F44336; - background-color: #ffebee; + color: #4CAF50; + background-color: #e8f5e9; padding: 6px 16px; border-radius: 16px; } @@ -516,15 +527,17 @@ class MainWindow(QMainWindow): def on_finished(self): """任务完成回调""" - self.start_btn.setEnabled(True) - self.stop_btn.setEnabled(False) + # 按钮恢复为"开始" + self.action_btn.setText("开始") + self.action_btn.update_style() self.log_text.append_log("✓ 截屏OCR任务已完成", "INFO") def on_error(self, error_msg: str): """错误回调""" self.log_text.append_log(f"✗ 错误: {error_msg}", "ERROR") - self.start_btn.setEnabled(True) - self.stop_btn.setEnabled(False) + # 按钮恢复为"开始" + self.action_btn.setText("开始") + self.action_btn.update_style() def clear_logs(self): """清空日志""" @@ -572,7 +585,7 @@ def main(): window.show() # 显示启动提示 - window.log_text.append_log("程序已启动,点击「开始截屏」按钮开始", "INFO") + window.log_text.append_log("程序已启动,点击「开始」按钮开始", "INFO") window.log_text.append_log(f"OCR引擎: {Config.OCR_ENGINE}", "INFO") sys.exit(app.exec()) diff --git a/main.py b/main.py index df400bc..15daf71 100644 --- a/main.py +++ b/main.py @@ -9,7 +9,7 @@ import base64 import io import tempfile from dataclasses import dataclass, field -from typing import List, Tuple, Optional, Callable +from typing import List, Tuple, Optional, Callable, Dict from pathlib import Path import cv2 @@ -381,6 +381,12 @@ class ScrollCaptureOCR: self.scroll_count = 0 self.all_results: List[dict] = [] + # 新增:记录已处理的div信息 + self.processed_divs: List[Dict] = [] # 已处理的所有div信息 + self.last_div_signature: Optional[str] = None # 最后一个div的签名(用于去重) + self.total_scroll_distance: int = 0 # 累计滚动距离 + self.is_first_capture: bool = True # 是否是第一次截图 + # 创建输出目录 Path(self.config.OUTPUT_DIR).mkdir(exist_ok=True) @@ -393,6 +399,21 @@ class ScrollCaptureOCR: screenshot = pyautogui.screenshot(region=(left, top, right - left, bottom - top)) return cv2.cvtColor(np.array(screenshot), cv2.COLOR_RGB2BGR) + def get_div_signature(self, div: DivRegion, image: np.ndarray) -> str: + """ + 生成div的签名,用于判断是否是同一个div + 使用div的图像内容的哈希值 + """ + import hashlib + div_image = image[div.top:div.bottom, div.left:div.right] + # 缩小图像以加快计算 + small = cv2.resize(div_image, (32, 32)) + # 计算平均哈希 + gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY) + avg = gray.mean() + hash_str = ''.join(['1' if p > avg else '0' for p in gray.flatten()]) + return hashlib.md5(hash_str.encode()).hexdigest()[:16] + def scroll_screen(self, distance: int): """在截图区域执行滚动""" if not self.capture_region: @@ -460,13 +481,19 @@ class ScrollCaptureOCR: """ 执行一次处理循环 返回False表示应该停止 + + 新逻辑: + 1. 第一次截图:处理所有div + 2. 后续截图:只处理新的div(跳过已处理的) + 3. 滚动距离 = 最后一个新div的底部位置 + 空白间隔 """ logger.info(f"=== 第 {self.scroll_count + 1} 次截屏 ===") print(f"\n>>> 第 {self.scroll_count + 1} 次截屏处理中...") # 1. 截取当前屏幕 image = self.capture_screen() - logger.info(f"截图完成,尺寸: {image.shape[1]}x{image.shape[0]}") + height, width = image.shape[:2] + logger.info(f"截图完成,尺寸: {width}x{height}") # 2. 分析图像,定位div边界 analysis = self.image_analyzer.analyze(image) @@ -476,32 +503,101 @@ class ScrollCaptureOCR: print("警告: 未检测到内容区域") return False - # 3. OCR提取文字 - current_texts = self.ocr_engine.recognize_divs(image, analysis.divs) - print(f"识别到 {len(current_texts)} 段文字") + logger.info(f"检测到 {len(analysis.divs)} 个div区域") + print(f"检测到 {len(analysis.divs)} 个内容区域") + + # 3. 识别新的div(跳过已处理的) + new_divs = [] + current_texts = [] + last_processed_signature = None + + for i, div in enumerate(analysis.divs): + # 生成div签名 + div_signature = self.get_div_signature(div, image) + + # 检查是否是已处理的div + is_processed = False + if not self.is_first_capture and self.processed_divs: + # 与已处理的div比较 + for processed in self.processed_divs: + if processed.get('signature') == div_signature: + is_processed = True + logger.info(f"跳过已处理的div {i+1}") + break + + if is_processed: + continue + + # 新的div,进行处理 + new_divs.append({ + 'div': div, + 'signature': div_signature, + 'index': i + }) + + logger.info(f"处理新的div {i+1},位置: {div.top}-{div.bottom}") + print(f" 处理新区域 {i+1}/{len(analysis.divs)}...") + + # 截取单个div区域 + div_image = image[div.top:div.bottom, div.left:div.right] + + # OCR识别 + texts = self.ocr_engine.recognize(div_image) + div.text = "\n".join(texts) + current_texts.extend(texts) + + # 保存单个div的结果 + self.save_div_result(self.scroll_count, i, div_image, texts, div) + + # 记录处理的div + self.processed_divs.append({ + 'signature': div_signature, + 'text': div.text, + 'scroll_count': self.scroll_count, + 'div_index': i + }) + + last_processed_signature = div_signature + logger.info(f" 识别到 {len(texts)} 段文字") + + # 如果没有新的div,说明已经到底 + if not new_divs: + logger.info("没有新的div需要处理,可能已到达底部") + print("✓ 没有新的内容需要处理") + return False + + print(f"✓ 本次处理 {len(new_divs)} 个新区域,共识别 {len(current_texts)} 段文字") for i, text in enumerate(current_texts[:3], 1): preview = text[:50] + "..." if len(text) > 50 else text print(f" [{i}] {preview}") if len(current_texts) > 3: print(f" ... 还有 {len(current_texts) - 3} 段文字") - # 4. 保存结果 + # 4. 保存完整截图结果 self.save_result(self.scroll_count, image, current_texts) - # 5. 判断是否到达底部(OCR结果重复) + # 5. 判断是否到达底部(没有新内容或OCR结果重复) if self.check_duplicate(current_texts): print("\n>>> 检测到内容重复,已到达底部,处理完成 <<<") return False self.previous_ocr_result = current_texts - # 6. 计算滚动距离 - scroll_distance = self.image_analyzer.calculate_scroll_distance(analysis) + # 6. 计算滚动距离 - 基于最后一个新div的位置 + if new_divs: + last_new_div = new_divs[-1]['div'] + scroll_distance = self.calculate_scroll_based_on_last_div( + last_new_div, analysis.gaps, height + ) + else: + scroll_distance = int(height * 0.8) # 默认滚动80%高度 # 7. 执行滚动 self.scroll_screen(scroll_distance) + self.total_scroll_distance += scroll_distance self.scroll_count += 1 + self.is_first_capture = False # 标记不再是第一次 # 检查最大滚动次数 if self.scroll_count >= self.config.MAX_SCROLL_COUNT: @@ -511,6 +607,67 @@ class ScrollCaptureOCR: return True + def calculate_scroll_based_on_last_div(self, last_div: DivRegion, gaps: List[GapInfo], image_height: int) -> int: + """ + 基于最后一个div计算滚动距离 + + 策略:滚动到最后一个div的底部 + 其后空白间隔 + 这样可以让下一个新div出现在截图区域的顶部 + """ + last_div_bottom = last_div.bottom + + # 查找最后一个div之后的空白间隔 + last_gap_height = 0 + for gap in gaps: + if gap.start_row >= last_div.bottom: + last_gap_height = gap.height + break + + # 计算滚动距离 + scroll_distance = last_div_bottom + last_gap_height + + # 确保至少滚动一定距离(避免 stuck) + min_scroll = 50 + scroll_distance = max(scroll_distance, min_scroll) + + # 限制最大滚动距离(不超过图片高度的90%,保留一些重叠) + max_scroll = int(image_height * 0.9) + scroll_distance = min(scroll_distance, max_scroll) + + logger.info(f"滚动距离计算: 最后div底部={last_div_bottom}, " + f"空白间隔={last_gap_height}, 滚动距离={scroll_distance}") + print(f" 滚动距离: {scroll_distance} 像素 (基于最后一个内容区域)") + + return int(scroll_distance) + + def save_div_result(self, scroll_index: int, div_index: int, image: np.ndarray, texts: List[str], div: DivRegion): + """保存单个div的结果""" + timestamp = time.strftime("%Y%m%d_%H%M%S") + + # 保存div图片 + div_dir = Path(self.config.OUTPUT_DIR) / f"scroll_{scroll_index:03d}" + div_dir.mkdir(parents=True, exist_ok=True) + + image_path = div_dir / f"div_{div_index:02d}_{timestamp}.png" + cv2.imwrite(str(image_path), image) + + # 保存div OCR结果 + result = { + "scroll_index": scroll_index, + "div_index": div_index, + "timestamp": timestamp, + "div_position": {"top": div.top, "bottom": div.bottom, "left": div.left, "right": div.right}, + "image_path": str(image_path), + "texts": texts + } + + # 也添加到总结果中 + if not hasattr(self, '_div_results'): + self._div_results = [] + self._div_results.append(result) + + logger.debug(f"保存div结果: scroll={scroll_index}, div={div_index}, 文字数={len(texts)}") + def run(self): """主运行流程""" print("=" * 60)