feat: 优化滚动截屏逻辑并改进UI交互

- 新增div签名机制用于内容去重
- 实现基于最后一个div位置的智能滚动计算
- 合并开始/停止按钮为单一操作按钮
- 增加处理进度和滚动距离的详细日志
- 优化UI状态显示和提示信息
This commit is contained in:
2026-03-06 17:24:27 +08:00
parent 41ff658e31
commit 4178bfed06
2 changed files with 211 additions and 41 deletions

175
main.py
View File

@@ -9,7 +9,7 @@ import base64
import io
import tempfile
from dataclasses import dataclass, field
from typing import List, Tuple, Optional, Callable
from typing import List, Tuple, Optional, Callable, Dict
from pathlib import Path
import cv2
@@ -381,6 +381,12 @@ class ScrollCaptureOCR:
self.scroll_count = 0
self.all_results: List[dict] = []
# 新增记录已处理的div信息
self.processed_divs: List[Dict] = [] # 已处理的所有div信息
self.last_div_signature: Optional[str] = None # 最后一个div的签名用于去重
self.total_scroll_distance: int = 0 # 累计滚动距离
self.is_first_capture: bool = True # 是否是第一次截图
# 创建输出目录
Path(self.config.OUTPUT_DIR).mkdir(exist_ok=True)
@@ -393,6 +399,21 @@ class ScrollCaptureOCR:
screenshot = pyautogui.screenshot(region=(left, top, right - left, bottom - top))
return cv2.cvtColor(np.array(screenshot), cv2.COLOR_RGB2BGR)
def get_div_signature(self, div: DivRegion, image: np.ndarray) -> str:
    """
    Build a short fingerprint of a div's pixels for duplicate detection.

    The region ``div.top:div.bottom, div.left:div.right`` is cropped out of
    ``image``, shrunk to 32x32, converted to grayscale and thresholded
    against its own mean brightness. The resulting string of '0'/'1'
    characters is MD5-hashed and the first 16 hex characters are returned.

    Args:
        div: Region whose ``top``/``bottom``/``left``/``right`` bound the crop.
        image: Image the region is cut from; it is passed through
            ``cv2.COLOR_BGR2GRAY``, so a BGR image is expected.

    Returns:
        A 16-character hexadecimal signature.
    """
    import hashlib

    region = image[div.top:div.bottom, div.left:div.right]
    # Downscale first so the per-pixel loop below stays cheap.
    thumbnail = cv2.resize(region, (32, 32))
    grayscale = cv2.cvtColor(thumbnail, cv2.COLOR_BGR2GRAY)
    threshold = grayscale.mean()
    # Average hash: one character per pixel, '1' when brighter than the mean.
    bits = ''.join(
        '1' if pixel > threshold else '0'
        for pixel in grayscale.flatten()
    )
    digest = hashlib.md5(bits.encode()).hexdigest()
    return digest[:16]
def scroll_screen(self, distance: int):
"""在截图区域执行滚动"""
if not self.capture_region:
@@ -460,13 +481,19 @@ class ScrollCaptureOCR:
"""
执行一次处理循环
返回False表示应该停止
新逻辑:
1. 第一次截图处理所有div
2. 后续截图只处理新的div跳过已处理的
3. 滚动距离 = 最后一个新div的底部位置 + 空白间隔
"""
logger.info(f"=== 第 {self.scroll_count + 1} 次截屏 ===")
print(f"\n>>> 第 {self.scroll_count + 1} 次截屏处理中...")
# 1. 截取当前屏幕
image = self.capture_screen()
logger.info(f"截图完成,尺寸: {image.shape[1]}x{image.shape[0]}")
height, width = image.shape[:2]
logger.info(f"截图完成,尺寸: {width}x{height}")
# 2. 分析图像定位div边界
analysis = self.image_analyzer.analyze(image)
@@ -476,32 +503,101 @@ class ScrollCaptureOCR:
print("警告: 未检测到内容区域")
return False
# 3. OCR提取文字
current_texts = self.ocr_engine.recognize_divs(image, analysis.divs)
print(f"识别到 {len(current_texts)} 段文字")
logger.info(f"检测到 {len(analysis.divs)} 个div区域")
print(f"检测到 {len(analysis.divs)} 个内容区域")
# 3. 识别新的div跳过已处理的
new_divs = []
current_texts = []
last_processed_signature = None
for i, div in enumerate(analysis.divs):
# 生成div签名
div_signature = self.get_div_signature(div, image)
# 检查是否是已处理的div
is_processed = False
if not self.is_first_capture and self.processed_divs:
# 与已处理的div比较
for processed in self.processed_divs:
if processed.get('signature') == div_signature:
is_processed = True
logger.info(f"跳过已处理的div {i+1}")
break
if is_processed:
continue
# 新的div进行处理
new_divs.append({
'div': div,
'signature': div_signature,
'index': i
})
logger.info(f"处理新的div {i+1},位置: {div.top}-{div.bottom}")
print(f" 处理新区域 {i+1}/{len(analysis.divs)}...")
# 截取单个div区域
div_image = image[div.top:div.bottom, div.left:div.right]
# OCR识别
texts = self.ocr_engine.recognize(div_image)
div.text = "\n".join(texts)
current_texts.extend(texts)
# 保存单个div的结果
self.save_div_result(self.scroll_count, i, div_image, texts, div)
# 记录处理的div
self.processed_divs.append({
'signature': div_signature,
'text': div.text,
'scroll_count': self.scroll_count,
'div_index': i
})
last_processed_signature = div_signature
logger.info(f" 识别到 {len(texts)} 段文字")
# 如果没有新的div说明已经到底
if not new_divs:
logger.info("没有新的div需要处理可能已到达底部")
print("✓ 没有新的内容需要处理")
return False
print(f"✓ 本次处理 {len(new_divs)} 个新区域,共识别 {len(current_texts)} 段文字")
for i, text in enumerate(current_texts[:3], 1):
preview = text[:50] + "..." if len(text) > 50 else text
print(f" [{i}] {preview}")
if len(current_texts) > 3:
print(f" ... 还有 {len(current_texts) - 3} 段文字")
# 4. 保存结果
# 4. 保存完整截图结果
self.save_result(self.scroll_count, image, current_texts)
# 5. 判断是否到达底部OCR结果重复
# 5. 判断是否到达底部(没有新内容或OCR结果重复)
if self.check_duplicate(current_texts):
print("\n>>> 检测到内容重复,已到达底部,处理完成 <<<")
return False
self.previous_ocr_result = current_texts
# 6. 计算滚动距离
scroll_distance = self.image_analyzer.calculate_scroll_distance(analysis)
# 6. 计算滚动距离 - 基于最后一个新div的位置
if new_divs:
last_new_div = new_divs[-1]['div']
scroll_distance = self.calculate_scroll_based_on_last_div(
last_new_div, analysis.gaps, height
)
else:
scroll_distance = int(height * 0.8) # 默认滚动80%高度
# 7. 执行滚动
self.scroll_screen(scroll_distance)
self.total_scroll_distance += scroll_distance
self.scroll_count += 1
self.is_first_capture = False # 标记不再是第一次
# 检查最大滚动次数
if self.scroll_count >= self.config.MAX_SCROLL_COUNT:
@@ -511,6 +607,67 @@ class ScrollCaptureOCR:
return True
def calculate_scroll_based_on_last_div(self, last_div: DivRegion, gaps: List[GapInfo], image_height: int) -> int:
    """
    Work out how far to scroll after handling ``last_div``.

    The raw distance is the bottom edge of ``last_div`` plus the height of
    the first entry in ``gaps`` whose ``start_row`` is at or beyond that
    edge (0 when no such gap exists). The value is then raised to at least
    50 and finally capped at 90% of ``image_height``. The stated intent is
    that the next unseen div ends up near the top of the capture region.

    Args:
        last_div: The last div handled in the current capture.
        gaps: Gaps reported by the image analysis; scanned in list order.
        image_height: Height of the captured image in pixels.

    Returns:
        The scroll distance as an ``int``.
    """
    div_bottom = last_div.bottom

    # Height of the first gap that begins at or after the div's bottom edge.
    trailing_gap = next(
        (gap.height for gap in gaps if gap.start_row >= last_div.bottom),
        0,
    )

    # Never scroll by less than this, so the capture loop cannot stall.
    floor = 50
    distance = max(div_bottom + trailing_gap, floor)

    # Cap at 90% of the image height to keep some overlap between captures.
    # The cap is applied after the floor, so the cap wins if the two conflict.
    ceiling = int(image_height * 0.9)
    distance = min(distance, ceiling)

    logger.info(f"滚动距离计算: 最后div底部={div_bottom}, "
                f"空白间隔={trailing_gap}, 滚动距离={distance}")
    print(f" 滚动距离: {distance} 像素 (基于最后一个内容区域)")

    return int(distance)
def save_div_result(self, scroll_index: int, div_index: int, image: np.ndarray, texts: List[str], div: DivRegion):
    """
    Write one div's image to disk and remember its OCR result in memory.

    The image is written as a PNG under
    ``<OUTPUT_DIR>/scroll_<scroll_index>/div_<div_index>_<timestamp>.png``.
    A result record (indices, timestamp, div coordinates, image path and
    texts) is appended to ``self._div_results``, which is created on first
    use. The record itself is not written to disk by this method.

    Args:
        scroll_index: Index of the capture the div came from.
        div_index: Index of the div within that capture.
        image: Pixels of the div, handed to ``cv2.imwrite``.
        texts: OCR text segments recognised for the div.
        div: Region whose coordinates are stored in the record.
    """
    timestamp = time.strftime("%Y%m%d_%H%M%S")

    # One sub-directory per capture keeps the div images grouped.
    div_dir = Path(self.config.OUTPUT_DIR) / f"scroll_{scroll_index:03d}"
    div_dir.mkdir(parents=True, exist_ok=True)

    image_path = div_dir / f"div_{div_index:02d}_{timestamp}.png"
    # cv2.imwrite reports failure through its boolean return value rather
    # than by raising, so check it; otherwise a missing file goes unnoticed
    # while its path is still recorded below.
    if not cv2.imwrite(str(image_path), image):
        logger.warning(f"保存div图片失败: {image_path}")

    result = {
        "scroll_index": scroll_index,
        "div_index": div_index,
        "timestamp": timestamp,
        "div_position": {"top": div.top, "bottom": div.bottom, "left": div.left, "right": div.right},
        "image_path": str(image_path),
        "texts": texts
    }

    # Lazily create the in-memory accumulator the first time a div is saved.
    if not hasattr(self, '_div_results'):
        self._div_results = []
    self._div_results.append(result)

    logger.debug(f"保存div结果: scroll={scroll_index}, div={div_index}, 文字数={len(texts)}")
def run(self):
"""主运行流程"""
print("=" * 60)