# -*- coding: utf-8 -*-
"""Parallel OCR of contact-list screenshots using a process pool.

Every ``*.png`` in ``SCROLL_DIR`` is posted (base64-encoded) to the chat
endpoint at ``OCR_URL``.  Contact names found in the replies that are not yet
in the ``contacts`` table of the SQLite database at ``DB_PATH`` are inserted,
and the whole table is then exported to ``JSON_PATH``.
"""
import base64
import glob
import json
import os
import sqlite3
import time
from contextlib import closing
from multiprocessing import Pool, Manager

import requests
from PIL import Image

# File locations (kept exactly as in the original script).
DB_PATH = r'D:\夏骥\微信研究\contacts.db'
SCROLL_DIR = r"D:\夏骥\微信研究\scroll_complete"
JSON_PATH = r"D:\夏骥\微信研究\contacts_data.json"

# OCR service settings.
OCR_URL = "http://localhost:11434/api/chat"
OCR_MODEL = "glm-ocr"
OCR_TIMEOUT_SECONDS = 60
OCR_PROMPT = (
    "识别图片中的所有联系人名称。要求: "
    "1. 只输出联系人名称,每行一个 "
    "2. 忽略分组标题(如星号、字母A-Z等) "
    "3. 忽略数字统计 "
    "4. 不要添加任何其他内容"
)

# Number of worker processes used for the OCR requests.
POOL_PROCESSES = 4

# Lines containing any of these substrings are UI labels, not contact names.
EXCLUDED_KEYWORDS = ('公众号', '服务号', '企业微信', '联系人', '星标朋友', '新的朋友')

# Trailing punctuation removed from a recognised name: ASCII comma, full-width
# comma (U+FF0C), ideographic full stop (U+3002), ASCII colon and full-width
# colon (U+FF1A).  Escapes are used so the full-width forms cannot be lost to
# character normalisation.
_TRAILING_PUNCTUATION = ",\uff0c\u3002:\uff1a"

# Greeting stored in the ``blessing`` column of every newly inserted contact.
DEFAULT_BLESSING = '马年新春快乐!愿您在新的一年里,事业腾飞,马到成功!'


def _parse_contacts(text):
    """Extract plausible contact names from the OCR reply *text*.

    A line is kept when its stripped length is in ``[2, 50)`` and it contains
    none of ``EXCLUDED_KEYWORDS``.  Surrounding quotes and trailing
    punctuation are then removed, and the cleaned name is kept only if it is
    still at least two characters long.

    Returns:
        list[str]: cleaned names in the order they appear in *text*.
    """
    names = []
    for raw_line in text.strip().split('\n'):
        line = raw_line.strip()
        if not 2 <= len(line) < 50:
            continue
        if any(keyword in line for keyword in EXCLUDED_KEYWORDS):
            continue
        name = line.strip('"\'').rstrip(_TRAILING_PUNCTUATION)
        # Stripping may leave an empty or one-character residue; drop it here
        # so the reported count matches what can actually be stored.
        if len(name) >= 2:
            names.append(name)
    return names


def ocr_image(args):
    """Run OCR on a single screenshot and return the contact names found.

    Args:
        args: ``(image_path, idx, total)`` tuple, where ``idx`` is the
            zero-based position of the image and ``total`` the number of
            images; both are used only for progress output.

    Returns:
        list[str]: recognised contact names, or ``[]`` if anything fails
        (unreadable file, HTTP error, malformed reply).  Failures are printed,
        never raised, so one bad image cannot abort the whole pool run.
    """
    image_path, idx, total = args
    label = f"[{idx+1}/{total}] {os.path.basename(image_path)}"
    try:
        with open(image_path, 'rb') as f:
            image_base64 = base64.b64encode(f.read()).decode('utf-8')

        payload = {
            "model": OCR_MODEL,
            "messages": [{
                "role": "user",
                "content": OCR_PROMPT,
                "images": [image_base64],
            }],
            "stream": False,
        }
        response = requests.post(OCR_URL, json=payload,
                                 timeout=OCR_TIMEOUT_SECONDS)
        # Without this, an HTTP error body would be parsed as an empty reply
        # and silently reported as "0 contacts found".
        response.raise_for_status()

        message = response.json().get('message') or {}
        contacts = _parse_contacts(message.get('content') or '')
    except Exception as e:  # deliberate best-effort boundary for a worker
        print(f"{label}: 失败 - {e}")
        return []

    print(f"{label}: 发现 {len(contacts)} 个联系人")
    return contacts


def get_existing_contacts():
    """Return the set of contact names already stored in the database."""
    # closing() guarantees the connection is released even if the query fails.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        rows = conn.execute('SELECT name FROM contacts').fetchall()
    return {row[0] for row in rows}


def add_new_contacts(new_contacts):
    """Insert *new_contacts* into the ``contacts`` table.

    IDs continue from the current ``MAX(id)``; every new row gets an empty
    category, ``DEFAULT_BLESSING`` and ``selected = False``.  All rows are
    written in one transaction, which is rolled back if any insert fails.

    Args:
        new_contacts: iterable of contact names, inserted in iteration order.

    Returns:
        int: number of rows inserted (``0`` for an empty or ``None`` input).
    """
    if not new_contacts:
        return 0

    names = list(new_contacts)
    with closing(sqlite3.connect(DB_PATH)) as conn:
        # ``with conn`` commits on success and rolls back on an exception.
        with conn:
            max_id = conn.execute(
                'SELECT MAX(id) FROM contacts').fetchone()[0] or 0
            rows = [
                (new_id, name, '', DEFAULT_BLESSING, False)
                for new_id, name in enumerate(names, start=max_id + 1)
            ]
            conn.executemany(
                'INSERT INTO contacts (id, name, category, blessing, selected) '
                'VALUES (?, ?, ?, ?, ?)',
                rows,
            )
    return len(rows)


def _contact_sort_key(name):
    """Sort key: names starting with a letter first, compared case-insensitively."""
    starts_alpha = bool(name) and name[0].isalpha()
    return (not starts_alpha, name.lower() if starts_alpha else name)


def _export_contacts_json():
    """Dump the whole ``contacts`` table to ``JSON_PATH``; return the row count."""
    with closing(sqlite3.connect(DB_PATH)) as conn:
        # Columns are named explicitly so the mapping below does not depend
        # on the physical column order of the table.
        rows = conn.execute(
            'SELECT id, name, category, blessing, selected '
            'FROM contacts ORDER BY id'
        ).fetchall()

    records = [
        {
            "id": row[0],
            "name": row[1],
            "category": row[2],
            "blessing": row[3],
            "selected": bool(row[4]),
        }
        for row in rows
    ]
    with open(JSON_PATH, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=2)
    return len(records)


def main():
    """OCR all screenshots in parallel, store new contacts, refresh the JSON."""
    print("=" * 60)
    print("并行OCR识别")
    print("=" * 60)

    # Collect the screenshots to process.
    screenshots = sorted(glob.glob(os.path.join(SCROLL_DIR, "*.png")))
    if not screenshots:
        print("未找到截图文件!")
        return
    print(f"找到 {len(screenshots)} 张截图")

    # Names already in the database are skipped later on.
    existing_contacts = get_existing_contacts()
    print(f"数据库中已有 {len(existing_contacts)} 个联系人")

    args_list = [(path, i, len(screenshots))
                 for i, path in enumerate(screenshots)]

    # Fan the OCR requests out over a small process pool.
    print("\n开始并行OCR识别...")
    with Pool(processes=POOL_PROCESSES) as pool:
        results = pool.map(ocr_image, args_list)

    # Merge per-image results, de-duplicating and dropping known names.
    all_contacts = set()
    for contacts in results:
        for name in contacts:
            if name and len(name) >= 2 and name not in existing_contacts:
                all_contacts.add(name)

    print(f"\n{'='*60}")
    print("OCR完成!")
    print(f"发现新联系人: {len(all_contacts)} 个")

    # Store the new names.
    if all_contacts:
        added = add_new_contacts(sorted(all_contacts, key=_contact_sort_key))
        print(f"成功入库: {added} 个")

    # NOTE(review): the original indentation was lost, so it is unclear
    # whether the export was nested under the ``if`` above.  It is run
    # unconditionally here; the export is idempotent and keeps the JSON file
    # in sync with the database either way.
    total_in_db = _export_contacts_json()
    print(f"JSON数据已更新: {JSON_PATH}")
    print(f"数据库总联系人: {total_in_db} 个")


if __name__ == '__main__':
    main()