# -*- coding: utf-8 -*-
"""Run OCR over the full contact-list screenshots and export the names.

Every PNG in ``SCROLL_DIR`` is sent to a local chat endpoint together with a
prompt asking for contact names only. The returned text is filtered line by
line, de-duplicated across screenshots, and written out as a JSON file and a
plain-text listing.
"""
import base64
import glob
import json
import os

import requests
from PIL import Image  # imported by the original script; not referenced below

# OCR service settings.
OCR_URL = "http://localhost:11434/api/chat"
OCR_MODEL = "glm-ocr"
OCR_TIMEOUT = 60  # seconds, passed to requests.post
OCR_PROMPT = """识别图片中的所有联系人名称。要求: 1. 只输出联系人名称,每行一个 2. 忽略分组标题(如星号、字母A-Z等) 3. 忽略数字统计 4. 不要添加任何其他内容"""

# Input directory and output files.
SCROLL_DIR = r"D:\夏骥\微信研究\scroll_full"
JSON_FILE = r"D:\夏骥\微信研究\contacts_data.json"
RESULT_FILE = r"D:\夏骥\微信研究\ocr_result_full.txt"

# Greeting stored with every exported contact.
DEFAULT_BLESSING = "马年新春快乐!愿您在新的一年里,事业腾飞,马到成功!"

# Lines equal to one of these labels are never treated as contacts.
INVALID_LABELS = frozenset({
    "公众号", "服务号", "企业微信联系人", "我的企业", "联系人",
    "星标朋友", "新的朋友", "群聊", "标签", "仅聊天", "设备",
})

# Lines starting with any of these are rejected (markup / JSON-looking output).
_REJECTED_PREFIXES = (">", "!", '"', "{", "[", "```")

_QUOTE_CHARS = '"\''
_TRAILING_PUNCT = ',,。::'


def ocr_image(image_path):
    """Send one image to the OCR chat endpoint and return the reply text.

    Args:
        image_path: Path of the image file to read and upload (base64).

    Returns:
        The ``message.content`` string of the JSON reply, or ``""`` when the
        request fails, the server answers with an error status, the body is
        not JSON, or the reply has no string content.

    Raises:
        OSError: If the image file cannot be opened or read.
    """
    with open(image_path, 'rb') as f:
        image_base64 = base64.b64encode(f.read()).decode('utf-8')

    payload = {
        "model": OCR_MODEL,
        "messages": [{
            "role": "user",
            "content": OCR_PROMPT,
            "images": [image_base64]
        }],
        "stream": False
    }

    try:
        response = requests.post(OCR_URL, json=payload, timeout=OCR_TIMEOUT)
        # Report HTTP errors instead of silently treating them as "no text".
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"OCR失败: {e}")
        return ""

    # Tolerate unexpected reply shapes (missing keys, null content) so the
    # caller always receives a string.
    message = data.get('message') if isinstance(data, dict) else None
    content = message.get('content') if isinstance(message, dict) else None
    return content if isinstance(content, str) else ""


def is_valid_contact(line):
    """Return True when *line* looks like a contact name.

    A line is rejected when, after trimming whitespace, it is shorter than
    two characters, equals one of ``INVALID_LABELS``, or starts with one of
    ``_REJECTED_PREFIXES``.
    """
    line = line.strip()
    if len(line) < 2:
        return False
    if line in INVALID_LABELS:
        return False
    return not line.startswith(_REJECTED_PREFIXES)


def clean_contact_name(name):
    """Strip surrounding whitespace, quotes and trailing punctuation.

    Quotes and trailing punctuation are removed together on the right-hand
    side so that input such as ``'张三',`` does not keep a dangling quote.
    """
    name = name.strip().lstrip(_QUOTE_CHARS)
    name = name.rstrip(_QUOTE_CHARS + _TRAILING_PUNCT)
    return name.strip()


def _extract_contacts(ocr_text):
    """Return the cleaned contact names found in one OCR reply, in order."""
    names = []
    for raw in ocr_text.splitlines():
        if not is_valid_contact(raw):
            continue
        cleaned = clean_contact_name(raw)
        # Validate again after cleaning: a line such as "联系人:" only
        # matches the excluded labels once its trailing colon is removed.
        if is_valid_contact(cleaned):
            names.append(cleaned)
    return names


def _sort_key(name):
    """Sort names starting with an alphabetic character first, ignoring case."""
    starts_alpha = bool(name) and name[0].isalpha()
    return (not starts_alpha, name.lower() if starts_alpha else name)


def main():
    """OCR every screenshot, collect unique contacts, and write both outputs."""
    print("=" * 60)
    print("批量OCR识别完整截图")
    print("=" * 60)

    # Locate the screenshots.
    scroll_dir = SCROLL_DIR
    if not os.path.exists(scroll_dir):
        print(f"目录不存在: {scroll_dir}")
        print("请先运行 scroll_full_contacts.py 进行截图")
        return

    screenshots = sorted(glob.glob(os.path.join(scroll_dir, "*.png")))
    if not screenshots:
        print("未找到截图文件!")
        return

    total = len(screenshots)
    print(f"找到 {total} 张截图")

    all_contacts = set()
    for i, path in enumerate(screenshots, start=1):
        print(f"\n[{i}/{total}] {os.path.basename(path)}")
        new_count = 0
        for name in _extract_contacts(ocr_image(path)):
            if name not in all_contacts:
                all_contacts.add(name)
                new_count += 1
                print(f" + {name}")
        print(f" 本轮新增 {new_count},累计 {len(all_contacts)}")

    # Build the JSON records; ids are 1-based positions in sorted order.
    contacts_json = [
        {
            "id": idx,
            "name": name,
            "category": "",
            "blessing": DEFAULT_BLESSING,
            "selected": False
        }
        for idx, name in enumerate(sorted(all_contacts, key=_sort_key), start=1)
    ]

    # Save the JSON file.
    json_file = JSON_FILE
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(contacts_json, f, ensure_ascii=False, indent=2)
    print(f"\nJSON数据已保存: {json_file}")

    # Save the plain-text listing.
    result_file = RESULT_FILE
    with open(result_file, 'w', encoding='utf-8') as f:
        f.write("微信通讯录OCR识别结果(完整)\n")
        f.write(f"共截图 {total} 张\n")
        f.write(f"共识别 {len(contacts_json)} 个联系人\n")
        f.write("=" * 60 + "\n\n")
        f.writelines(f"{c['name']}\n" for c in contacts_json)

    print(f"结果已保存: {result_file}")
    print(f"\n共识别到 {len(contacts_json)} 个不重复联系人")


if __name__ == '__main__':
    main()