Files
weixin-holiday-message/batch_ocr_resume.py

232 lines
7.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
断点续传OCR识别
"""
import os
import sqlite3
import requests
import base64
from PIL import Image
import glob
import json
import time
def ocr_image(image_path, url="http://localhost:11434/api/chat",
              model="glm-ocr", timeout=60):
    """Send one image to the OCR chat endpoint and return the recognised text.

    The image file is read, base64-encoded and posted together with a fixed
    prompt asking for one contact name per line.

    Args:
        image_path: Path of the image file to recognise.
        url: Chat endpoint that receives the JSON payload.
        model: Model name placed in the payload.
        timeout: Request timeout in seconds.

    Returns:
        The ``message.content`` string of the JSON response, or ``""`` when
        the request fails, the server answers with an HTTP error status, or
        the response carries no content.

    Raises:
        OSError: If ``image_path`` cannot be read (this happens before the
            request and is intentionally not swallowed).
    """
    with open(image_path, 'rb') as f:
        image_base64 = base64.b64encode(f.read()).decode('utf-8')

    payload = {
        "model": model,
        "messages": [{
            "role": "user",
            "content": """识别图片中的所有联系人名称。要求:
1. 只输出联系人名称,每行一个
2. 忽略分组标题如星号、字母A-Z等
3. 忽略数字统计
4. 不要添加任何其他内容""",
            "images": [image_base64]
        }],
        "stream": False
    }

    # Deliberately broad: one bad image must not abort a long batch run, so
    # every request/decoding failure is reported and mapped to "".
    try:
        response = requests.post(url, json=payload, timeout=timeout)
        # Without this an HTTP error body (no 'message' key) would be
        # indistinguishable from "the image contained no text".
        response.raise_for_status()
        message = response.json().get('message') or {}
        # `or ''` guards against an explicit null content, which would
        # otherwise reach the caller as None.
        return message.get('content') or ''
    except Exception as e:
        print(f"OCR失败: {e}")
        return ""
# Exact strings that are never accepted as a contact name (presumably
# section headers / menu entries of the screenshotted contact list).
_NON_CONTACT_LABELS = frozenset([
    "公众号", "服务号", "企业微信联系人", "我的企业", "联系人",
    "星标朋友", "新的朋友", "群聊", "标签", "仅聊天", "设备",
])

# A line starting with one of these characters is rejected.
_REJECTED_PREFIXES = (">", "!")


def is_valid_contact(line):
    """Return True if an OCR output line looks like a contact name.

    A line is rejected when, after trimming whitespace, it is shorter than
    two characters, exactly equals one of the known non-contact labels, or
    starts with ``>`` or ``!``.

    Args:
        line: One raw line of OCR output.

    Returns:
        bool: Whether the line should be treated as a contact name.
    """
    line = line.strip()
    # Covers empty strings and single characters; this also makes a separate
    # single-letter check redundant.
    if len(line) < 2:
        return False
    if line in _NON_CONTACT_LABELS:
        return False
    return not line.startswith(_REJECTED_PREFIXES)
def clean_contact_name(name):
"""清理联系人名称"""
name = name.strip('"\'')
name = name.rstrip(',,。::')
return name.strip()
def get_existing_contacts(db_path=r'D:\夏骥\微信研究\contacts.db'):
    """Return the set of contact names currently stored in the database.

    Args:
        db_path: Path of the SQLite database holding the ``contacts`` table.

    Returns:
        set: Every value of the ``name`` column of ``contacts``.

    Raises:
        sqlite3.Error: If the database cannot be queried.
    """
    conn = sqlite3.connect(db_path)
    # try/finally so the connection is released even when the query fails
    # (e.g. the table does not exist yet).
    try:
        rows = conn.execute('SELECT name FROM contacts').fetchall()
    finally:
        conn.close()
    return {row[0] for row in rows}
def add_new_contacts(new_contacts, db_path=r'D:\夏骥\微信研究\contacts.db'):
    """Insert new contact names into the ``contacts`` table.

    Each name gets the next free id (``MAX(id) + 1`` onwards, in the order
    given), an empty category, the default blessing text and
    ``selected = False``. All rows are written in a single transaction:
    either every name is inserted or none is.

    Args:
        new_contacts: Iterable of contact names; may be empty or None.
        db_path: Path of the SQLite database holding the ``contacts`` table.

    Returns:
        int: Number of rows inserted (0 when there is nothing to insert).

    Raises:
        sqlite3.Error: If the insert fails; the transaction is rolled back.
    """
    if not new_contacts:
        return 0
    names = list(new_contacts)
    if not names:
        return 0

    blessing = '马年新春快乐!愿您在新的一年里,事业腾飞,马到成功!'

    conn = sqlite3.connect(db_path)
    try:
        # `with conn` commits on success and rolls back on an exception,
        # so a failure half-way through cannot leave a partial batch behind.
        with conn:
            # Continue numbering after the current largest id (0 if empty).
            max_id = conn.execute('SELECT MAX(id) FROM contacts').fetchone()[0] or 0
            rows = [
                (new_id, name, '', blessing, False)
                for new_id, name in enumerate(names, start=max_id + 1)
            ]
            conn.executemany('''
                INSERT INTO contacts (id, name, category, blessing, selected)
                VALUES (?, ?, ?, ?, ?)
            ''', rows)
    finally:
        conn.close()
    return len(rows)
def save_progress(progress, path=r'D:\夏骥\微信研究\ocr_progress.json'):
    """Persist the progress dictionary as UTF-8 JSON.

    The data is first written to ``path + '.tmp'`` and then moved over the
    target with ``os.replace``, so a failure while writing never leaves a
    truncated progress file behind (which would reset the run on the next
    start).

    Args:
        progress: JSON-serialisable progress data.
        path: Destination file.

    Raises:
        TypeError: If ``progress`` is not JSON-serialisable.
        OSError: If the file cannot be written or replaced.
    """
    tmp_path = path + '.tmp'
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(progress, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file is gone; this only
        # cleans up a partial file left by a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
def load_progress(path=r'D:\夏骥\微信研究\ocr_progress.json'):
    """Load previously saved progress, falling back to a fresh state.

    Args:
        path: Progress file to read.

    Returns:
        dict: The stored progress, or
        ``{"processed": 0, "total": 0, "new_contacts": []}`` when the file
        is missing, unreadable, not valid JSON, or does not contain a JSON
        object.
    """
    default = {"processed": 0, "total": 0, "new_contacts": []}
    try:
        with open(path, 'r', encoding='utf-8') as f:
            progress = json.load(f)
    except FileNotFoundError:
        # Expected on the very first run.
        return default
    except (OSError, ValueError) as e:
        # ValueError covers both invalid JSON and undecodable bytes.
        # Unlike a bare `except`, this lets KeyboardInterrupt/SystemExit
        # propagate.
        print(f"进度文件无法读取,将从头开始: {e}")
        return default
    # Callers use dict methods on the result; anything else is unusable.
    if not isinstance(progress, dict):
        return default
    return progress
def _contact_sort_key(name):
    """Sort key for insertion order: names whose first character is alphabetic
    come first (compared case-insensitively), all others afterwards."""
    if name and name[0].isalpha():
        return (False, name.lower())
    return (True, name)


def _export_contacts_json(db_path, json_file):
    """Dump every row of the contacts table to ``json_file``; return the row count."""
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute('SELECT * FROM contacts ORDER BY id').fetchall()
    finally:
        conn.close()
    all_contacts = [
        {
            "id": row[0],
            "name": row[1],
            "category": row[2],
            "blessing": row[3],
            "selected": bool(row[4]),
        }
        for row in rows
    ]
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(all_contacts, f, ensure_ascii=False, indent=2)
    return len(all_contacts)


def main():
    """Resume OCR over the screenshot directory and store new contacts.

    Steps:
      1. List the ``*.png`` screenshots (sorted by path).
      2. Load saved progress and skip the screenshots already processed.
      3. OCR each remaining screenshot, collecting names that are neither in
         the database nor already collected; progress is saved every
         ``batch_size`` images and once more when the loop ends, including
         when it ends because of an exception or Ctrl+C.
      4. Insert the collected names that are still missing from the database
         and rewrite the JSON export from the database contents.
    """
    print("=" * 60)
    print("断点续传OCR识别")
    print("=" * 60)

    # Locate the screenshots.
    scroll_dir = r"D:\夏骥\微信研究\scroll_complete"
    screenshots = sorted(glob.glob(os.path.join(scroll_dir, "*.png")))
    if not screenshots:
        print("未找到截图文件!")
        return
    total = len(screenshots)
    print(f"找到 {total} 张截图")

    # Restore the state of a previous run.
    progress = load_progress()
    processed = progress.get("processed", 0)
    all_new_contacts = set(progress.get("new_contacts", []))
    print(f"已处理: {processed} 张,已发现新联系人: {len(all_new_contacts)}")

    # Names already stored in the database.
    existing_contacts = get_existing_contacts()
    print(f"数据库中已有 {len(existing_contacts)} 个联系人")

    skipped_count = 0
    batch_size = 50  # save progress every 50 screenshots
    start_time = time.time()

    try:
        for i, path in enumerate(screenshots[processed:], start=processed):
            elapsed = time.time() - start_time
            # `elapsed` is sampled before image i is processed, so it covers
            # exactly the images finished in this run, and image i itself
            # still counts as remaining.
            done_this_run = i - processed
            avg_time = elapsed / done_this_run if done_this_run else 0
            remaining = avg_time * (total - i)
            percent = (i + 1) / total * 100
            print(f"\n[{i+1}/{total}] {percent:.1f}% | 预计剩余: {remaining/60:.1f}分钟")
            print(f" 文件: {os.path.basename(path)}")

            result = ocr_image(path)
            new_in_this = 0
            for line in result.strip().split('\n'):
                line = line.strip()
                if not is_valid_contact(line):
                    continue
                cleaned = clean_contact_name(line)
                if not cleaned or len(cleaned) < 2:
                    continue
                if cleaned in existing_contacts:
                    skipped_count += 1
                elif cleaned not in all_new_contacts:
                    new_in_this += 1
                    all_new_contacts.add(cleaned)
                    print(f" ✓ 新: {cleaned}")
            print(f" 本轮: +{new_in_this} | 累计新: {len(all_new_contacts)} | 跳过: {skipped_count}")

            # Record the state after this image; written to disk per batch.
            progress["processed"] = i + 1
            progress["total"] = total
            progress["new_contacts"] = list(all_new_contacts)
            if (i + 1) % batch_size == 0:
                save_progress(progress)
                print(f" 💾 进度已保存 ({i+1}/{total})")

            # Summary every 10 screenshots.
            if (i + 1) % 10 == 0:
                print(f"\n{'='*60}")
                print(f"📊 进度汇总: {i+1}/{total} ({percent:.1f}%)")
                print(f"⏱️ 已用时间: {elapsed/60:.1f}分钟 | 预计剩余: {remaining/60:.1f}分钟")
                print(f"👤 新联系人: {len(all_new_contacts)} | 跳过: {skipped_count}")
                print(f"{'='*60}\n")
    finally:
        # Final save. Running it in `finally` keeps the work done since the
        # last batch save when the loop is interrupted or an image fails.
        save_progress(progress)

    print(f"\n{'='*60}")
    print(f"OCR完成")
    print(f"发现新联系人: {len(all_new_contacts)}")
    print(f"跳过已存在: {skipped_count}")

    # Store the results.
    if all_new_contacts:
        # The progress file keeps names found by earlier runs. If such a run
        # already inserted them, they are in `existing_contacts` now and
        # inserting them again would create duplicate rows.
        pending = all_new_contacts - existing_contacts
        added = add_new_contacts(sorted(pending, key=_contact_sort_key))
        print(f"成功入库: {added}")

        # Refresh the JSON export from the database.
        # NOTE(review): indentation was lost in the reviewed source; the
        # export is assumed to belong to this branch — confirm.
        json_file = r"D:\夏骥\微信研究\contacts_data.json"
        contact_count = _export_contacts_json(r'D:\夏骥\微信研究\contacts.db', json_file)
        print(f"JSON数据已更新: {json_file}")
        print(f"数据库总联系人: {contact_count}")
# Run the resumable OCR job only when executed as a script, not on import.
if __name__ == "__main__":
    main()