feat: 优化滚动截屏逻辑并改进UI交互

- 新增div签名机制用于内容去重
- 实现基于最后一个div位置的智能滚动计算
- 合并开始/停止按钮为单一操作按钮
- 增加处理进度和滚动距离的详细日志
- 优化UI状态显示和提示信息
This commit is contained in:
2026-03-06 17:24:27 +08:00
parent 41ff658e31
commit 4178bfed06
2 changed files with 211 additions and 41 deletions

77
gui.py
View File

@@ -43,6 +43,7 @@ class CaptureWorker(QThread):
status_signal = Signal(str)
finished_signal = Signal()
error_signal = Signal(str)
div_processed_signal = Signal(int, int) # (当前div序号, 总div数)
def __init__(self, scroll_capture: ScrollCaptureOCR):
super().__init__()
@@ -59,17 +60,24 @@ class CaptureWorker(QThread):
self.scroll_capture.previous_ocr_result = []
self.scroll_capture.scroll_count = 0
self.scroll_capture.all_results = []
self.scroll_capture.processed_divs = []
self.scroll_capture.last_div_signature = None
self.scroll_capture.total_scroll_distance = 0
self.scroll_capture.is_first_capture = True
# 循环处理
while self.is_running and self.scroll_capture.process_once():
progress = min(self.scroll_capture.scroll_count * 10, 90)
self.progress_signal.emit(progress)
self.log_signal.emit(f"{self.scroll_capture.scroll_count} 次截屏完成")
self.log_signal.emit(f"{self.scroll_capture.scroll_count} 次截屏完成"
f"累计滚动 {self.scroll_capture.total_scroll_distance} 像素")
# 保存最终结果
if self.scroll_capture.all_results:
self.scroll_capture.save_final_result()
self.log_signal.emit(f"✓ 共处理 {len(self.scroll_capture.all_results)} 次截屏")
total_divs = sum(len(result.get('texts', [])) for result in self.scroll_capture.all_results)
self.log_signal.emit(f"✓ 共处理 {len(self.scroll_capture.all_results)} 次截屏,"
f"识别 {total_divs} 个内容区域")
self.progress_signal.emit(100)
self.status_signal.emit("完成")
@@ -263,10 +271,11 @@ class MainWindow(QMainWindow):
# 使用说明
help_text = QLabel(
"<b>使用步骤:</b><br>"
"1. 点击「开始截屏」按钮<br>"
"1. 点击「开始」按钮<br>"
"2. 按住鼠标左键拖动选择区域<br>"
"3. 程序自动滚动截屏并OCR识别<br>"
"4. 检测到重复内容时自动停止"
"3. 程序自动分割div并逐个OCR<br>"
"4. 智能计算滚动距离,自动翻页<br>"
"5. 完成后点击「结束」按钮"
)
help_text.setFont(QFont("Microsoft YaHei", 10))
help_text.setStyleSheet("color: #555555; line-height: 1.6;")
@@ -320,16 +329,10 @@ class MainWindow(QMainWindow):
button_layout.addStretch()
# 停止按钮
self.stop_btn = ModernButton("停止", primary=False)
self.stop_btn.setEnabled(False)
self.stop_btn.clicked.connect(self.stop_capture)
button_layout.addWidget(self.stop_btn)
# 开始按钮
self.start_btn = ModernButton("开始截屏", primary=True)
self.start_btn.clicked.connect(self.start_capture)
button_layout.addWidget(self.start_btn)
# 操作按钮(开始/结束 二合一)
self.action_btn = ModernButton("开始", primary=True)
self.action_btn.clicked.connect(self.on_action_button_clicked)
button_layout.addWidget(self.action_btn)
# 清空日志按钮
self.clear_btn = ModernButton("清空日志", primary=False)
@@ -341,7 +344,7 @@ class MainWindow(QMainWindow):
main_layout.addLayout(button_layout)
# === 底部信息 ===
footer = QLabel("按 Ctrl+F9 也可以快速启动 | 输出目录: ./output/")
footer = QLabel("点击「开始」按钮启动 | 输出目录: ./output/")
footer.setFont(QFont("Microsoft YaHei", 9))
footer.setStyleSheet("color: #999999; margin-top: 10px;")
footer.setAlignment(Qt.AlignCenter)
@@ -359,10 +362,10 @@ class MainWindow(QMainWindow):
tray_menu.addSeparator()
start_action = tray_menu.addAction("开始截屏")
start_action = tray_menu.addAction("开始")
start_action.triggered.connect(self.start_capture)
stop_action = tray_menu.addAction("停止")
stop_action = tray_menu.addAction("结束")
stop_action.triggered.connect(self.stop_capture)
tray_menu.addSeparator()
@@ -407,6 +410,13 @@ class MainWindow(QMainWindow):
self.log_text.append_log(message, level)
def on_action_button_clicked(self):
    """Handle a click on the combined start/stop action button.

    The button caption doubles as the state flag: when it reads "开始"
    the click starts a capture, and with any other caption it stops the
    capture.
    """
    handler = self.start_capture if self.action_btn.text() == "开始" else self.stop_capture
    handler()
def start_capture(self):
"""开始截屏"""
# 检查OCR服务
@@ -454,9 +464,9 @@ class MainWindow(QMainWindow):
self.worker.start()
# 更新UI状态
self.start_btn.setEnabled(False)
self.stop_btn.setEnabled(True)
# 更新UI状态 - 按钮变为"结束"
self.action_btn.setText("结束")
self.action_btn.update_style()
self.progress_bar.setValue(0)
def stop_capture(self):
@@ -464,15 +474,16 @@ class MainWindow(QMainWindow):
if self.worker and self.worker.isRunning():
self.worker.stop()
self.worker.wait(1000)
self.log_text.append_log("用户手动停止", "WARNING")
self.log_text.append_log("用户手动结束", "WARNING")
self.start_btn.setEnabled(True)
self.stop_btn.setEnabled(False)
self.status_label.setText("已停止")
# 更新UI状态 - 按钮恢复为"开始"
self.action_btn.setText("开始")
self.action_btn.update_style()
self.status_label.setText("就绪")
self.status_label.setStyleSheet("""
QLabel {
color: #F44336;
background-color: #ffebee;
color: #4CAF50;
background-color: #e8f5e9;
padding: 6px 16px;
border-radius: 16px;
}
@@ -516,15 +527,17 @@ class MainWindow(QMainWindow):
def on_finished(self):
"""任务完成回调"""
self.start_btn.setEnabled(True)
self.stop_btn.setEnabled(False)
# 按钮恢复为"开始"
self.action_btn.setText("开始")
self.action_btn.update_style()
self.log_text.append_log("✓ 截屏OCR任务已完成", "INFO")
def on_error(self, error_msg: str):
"""错误回调"""
self.log_text.append_log(f"✗ 错误: {error_msg}", "ERROR")
self.start_btn.setEnabled(True)
self.stop_btn.setEnabled(False)
# 按钮恢复为"开始"
self.action_btn.setText("开始")
self.action_btn.update_style()
def clear_logs(self):
"""清空日志"""
@@ -572,7 +585,7 @@ def main():
window.show()
# 显示启动提示
window.log_text.append_log("程序已启动,点击「开始截屏」按钮开始", "INFO")
window.log_text.append_log("程序已启动,点击「开始」按钮开始", "INFO")
window.log_text.append_log(f"OCR引擎: {Config.OCR_ENGINE}", "INFO")
sys.exit(app.exec())

175
main.py
View File

@@ -9,7 +9,7 @@ import base64
import io
import tempfile
from dataclasses import dataclass, field
from typing import List, Tuple, Optional, Callable
from typing import List, Tuple, Optional, Callable, Dict
from pathlib import Path
import cv2
@@ -381,6 +381,12 @@ class ScrollCaptureOCR:
self.scroll_count = 0
self.all_results: List[dict] = []
# 新增记录已处理的div信息
self.processed_divs: List[Dict] = [] # 已处理的所有div信息
self.last_div_signature: Optional[str] = None # 最后一个div的签名用于去重
self.total_scroll_distance: int = 0 # 累计滚动距离
self.is_first_capture: bool = True # 是否是第一次截图
# 创建输出目录
Path(self.config.OUTPUT_DIR).mkdir(exist_ok=True)
@@ -393,6 +399,21 @@ class ScrollCaptureOCR:
screenshot = pyautogui.screenshot(region=(left, top, right - left, bottom - top))
return cv2.cvtColor(np.array(screenshot), cv2.COLOR_RGB2BGR)
def get_div_signature(self, div: DivRegion, image: np.ndarray) -> str:
    """Return a short signature used to decide whether two divs are the same.

    The div's pixels are cropped out of ``image``, shrunk to 32x32,
    converted to grayscale and thresholded against their own mean to form
    a string of '0'/'1' characters. The first 16 hex characters of the MD5
    digest of that string are returned.

    Args:
        div: Region whose ``top``/``bottom``/``left``/``right`` bound the crop.
        image: Full screenshot the region refers to; converted with
            ``cv2.COLOR_BGR2GRAY``, so BGR channel order is expected.

    Returns:
        A 16-character hexadecimal string.
    """
    import hashlib

    region = image[div.top:div.bottom, div.left:div.right]
    # Downscale first so the hash is computed over a fixed, small pixel count.
    thumbnail = cv2.cvtColor(cv2.resize(region, (32, 32)), cv2.COLOR_BGR2GRAY)
    threshold = thumbnail.mean()
    # One bit per pixel: '1' when strictly brighter than the mean.
    brighter = (thumbnail > threshold).ravel()
    bit_string = ''.join('1' if flag else '0' for flag in brighter)
    digest = hashlib.md5(bit_string.encode()).hexdigest()
    return digest[:16]
def scroll_screen(self, distance: int):
"""在截图区域执行滚动"""
if not self.capture_region:
@@ -460,13 +481,19 @@ class ScrollCaptureOCR:
"""
执行一次处理循环
返回False表示应该停止
新逻辑:
1. 第一次截图处理所有div
2. 后续截图只处理新的div跳过已处理的
3. 滚动距离 = 最后一个新div的底部位置 + 空白间隔
"""
logger.info(f"=== 第 {self.scroll_count + 1} 次截屏 ===")
print(f"\n>>> 第 {self.scroll_count + 1} 次截屏处理中...")
# 1. 截取当前屏幕
image = self.capture_screen()
logger.info(f"截图完成,尺寸: {image.shape[1]}x{image.shape[0]}")
height, width = image.shape[:2]
logger.info(f"截图完成,尺寸: {width}x{height}")
# 2. 分析图像定位div边界
analysis = self.image_analyzer.analyze(image)
@@ -476,32 +503,101 @@ class ScrollCaptureOCR:
print("警告: 未检测到内容区域")
return False
# 3. OCR提取文字
current_texts = self.ocr_engine.recognize_divs(image, analysis.divs)
print(f"识别到 {len(current_texts)} 段文字")
logger.info(f"检测到 {len(analysis.divs)} 个div区域")
print(f"检测到 {len(analysis.divs)} 个内容区域")
# 3. 识别新的div跳过已处理的
new_divs = []
current_texts = []
last_processed_signature = None
for i, div in enumerate(analysis.divs):
# 生成div签名
div_signature = self.get_div_signature(div, image)
# 检查是否是已处理的div
is_processed = False
if not self.is_first_capture and self.processed_divs:
# 与已处理的div比较
for processed in self.processed_divs:
if processed.get('signature') == div_signature:
is_processed = True
logger.info(f"跳过已处理的div {i+1}")
break
if is_processed:
continue
# 新的div进行处理
new_divs.append({
'div': div,
'signature': div_signature,
'index': i
})
logger.info(f"处理新的div {i+1},位置: {div.top}-{div.bottom}")
print(f" 处理新区域 {i+1}/{len(analysis.divs)}...")
# 截取单个div区域
div_image = image[div.top:div.bottom, div.left:div.right]
# OCR识别
texts = self.ocr_engine.recognize(div_image)
div.text = "\n".join(texts)
current_texts.extend(texts)
# 保存单个div的结果
self.save_div_result(self.scroll_count, i, div_image, texts, div)
# 记录处理的div
self.processed_divs.append({
'signature': div_signature,
'text': div.text,
'scroll_count': self.scroll_count,
'div_index': i
})
last_processed_signature = div_signature
logger.info(f" 识别到 {len(texts)} 段文字")
# 如果没有新的div说明已经到底
if not new_divs:
logger.info("没有新的div需要处理可能已到达底部")
print("✓ 没有新的内容需要处理")
return False
print(f"✓ 本次处理 {len(new_divs)} 个新区域,共识别 {len(current_texts)} 段文字")
for i, text in enumerate(current_texts[:3], 1):
preview = text[:50] + "..." if len(text) > 50 else text
print(f" [{i}] {preview}")
if len(current_texts) > 3:
print(f" ... 还有 {len(current_texts) - 3} 段文字")
# 4. 保存结果
# 4. 保存完整截图结果
self.save_result(self.scroll_count, image, current_texts)
# 5. 判断是否到达底部OCR结果重复
# 5. 判断是否到达底部(没有新内容或OCR结果重复)
if self.check_duplicate(current_texts):
print("\n>>> 检测到内容重复,已到达底部,处理完成 <<<")
return False
self.previous_ocr_result = current_texts
# 6. 计算滚动距离
scroll_distance = self.image_analyzer.calculate_scroll_distance(analysis)
# 6. 计算滚动距离 - 基于最后一个新div的位置
if new_divs:
last_new_div = new_divs[-1]['div']
scroll_distance = self.calculate_scroll_based_on_last_div(
last_new_div, analysis.gaps, height
)
else:
scroll_distance = int(height * 0.8) # 默认滚动80%高度
# 7. 执行滚动
self.scroll_screen(scroll_distance)
self.total_scroll_distance += scroll_distance
self.scroll_count += 1
self.is_first_capture = False # 标记不再是第一次
# 检查最大滚动次数
if self.scroll_count >= self.config.MAX_SCROLL_COUNT:
@@ -511,6 +607,67 @@ class ScrollCaptureOCR:
return True
def calculate_scroll_based_on_last_div(self, last_div: DivRegion, gaps: List[GapInfo], image_height: int) -> int:
    """Compute the scroll distance from the position of the last div.

    The raw distance is the bottom edge of ``last_div`` plus the height of
    the first gap in ``gaps`` whose ``start_row`` is at or below that edge
    (0 when there is no such gap). The intent is that the next unseen div
    ends up at the top of the capture region after scrolling.

    The raw value is then raised to at least 50 pixels and finally capped
    at 90% of ``image_height``; the cap is applied last, so it wins if the
    two limits conflict.

    Args:
        last_div: The last div handled in the current screenshot.
        gaps: Blank gaps detected in the screenshot, scanned in list order.
        image_height: Height of the screenshot in pixels.

    Returns:
        The scroll distance in pixels.
    """
    bottom_edge = last_div.bottom

    # Height of the first gap starting at or below the div's bottom edge.
    trailing_gap = next(
        (gap.height for gap in gaps if gap.start_row >= last_div.bottom),
        0,
    )

    floor = 50  # always move a little, to avoid getting stuck
    ceiling = int(image_height * 0.9)  # keep some overlap between captures
    distance = min(max(bottom_edge + trailing_gap, floor), ceiling)

    logger.info(f"滚动距离计算: 最后div底部={bottom_edge}, "
                f"空白间隔={trailing_gap}, 滚动距离={distance}")
    print(f" 滚动距离: {distance} 像素 (基于最后一个内容区域)")
    return int(distance)
def save_div_result(self, scroll_index: int, div_index: int, image: np.ndarray, texts: List[str], div: DivRegion):
    """Save one div's cropped image and record its OCR result in memory.

    The image is written to
    ``<OUTPUT_DIR>/scroll_<scroll_index>/div_<div_index>_<timestamp>.png``.
    The result record is only appended to ``self._div_results`` (created on
    first use); this method does not write the record itself to disk.

    Args:
        scroll_index: Index of the screenshot the div was found in.
        div_index: Index of the div within that screenshot.
        image: Cropped pixels of the div, passed straight to ``cv2.imwrite``.
        texts: OCR text segments recognised inside the div.
        div: Region whose coordinates are stored in the record.
    """
    timestamp = time.strftime("%Y%m%d_%H%M%S")

    # Save the div image under a per-screenshot directory.
    div_dir = Path(self.config.OUTPUT_DIR) / f"scroll_{scroll_index:03d}"
    div_dir.mkdir(parents=True, exist_ok=True)
    image_path = div_dir / f"div_{div_index:02d}_{timestamp}.png"
    # cv2.imwrite can signal a failed write through a False return value;
    # log it so a missing file is not silently recorded as saved.
    if not cv2.imwrite(str(image_path), image):
        logger.warning(f"保存div图片失败: {image_path}")

    # Build the per-div OCR record.
    result = {
        "scroll_index": scroll_index,
        "div_index": div_index,
        "timestamp": timestamp,
        "div_position": {"top": div.top, "bottom": div.bottom, "left": div.left, "right": div.right},
        "image_path": str(image_path),
        "texts": texts
    }

    # Accumulate the record; the list is created lazily on first call.
    if not hasattr(self, '_div_results'):
        self._div_results = []
    self._div_results.append(result)

    logger.debug(f"保存div结果: scroll={scroll_index}, div={div_index}, 文字数={len(texts)}")
def run(self):
"""主运行流程"""
print("=" * 60)