# batch_ocr.py

# -*- coding: utf-8 -*-
"""
对已截图的图片进行OCR识别，并生成前端可用的JSON数据
"""
import os
import requests
import base64
from PIL import Image
import glob
import json
import re


def ocr_image(image_path):
    """Run OCR on a single image and return the model's raw text reply.

    The file is read, base64-encoded and posted to the chat endpoint at
    ``http://localhost:11434/api/chat`` with the ``glm-ocr`` model and a
    prompt asking for one contact name per line.

    Args:
        image_path: Path of the image file to recognise.

    Returns:
        The ``message.content`` string of the JSON reply, or ``""`` when the
        request fails, the server answers with an HTTP error status, the body
        is not valid JSON, or the reply lacks a string ``message.content``.

    Raises:
        OSError: If the image file cannot be opened or read (not caught here).
    """
    with open(image_path, 'rb') as f:
        image_base64 = base64.b64encode(f.read()).decode('utf-8')

    url = "http://localhost:11434/api/chat"
    payload = {
        "model": "glm-ocr",
        "messages": [{
            "role": "user",
            "content": """识别图片中的所有联系人名称。要求：
1. 只输出联系人名称，每行一个
2. 忽略分组标题（如星号、字母A-Z等）
3. 忽略数字统计
4. 不要添加任何其他内容""",
            "images": [image_base64]
        }],
        "stream": False
    }

    try:
        response = requests.post(url, json=payload, timeout=60)
        # Treat 4xx/5xx replies as failures instead of silently parsing an
        # error body that carries no "message" key.
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException subclass.
        print(f"OCR失败: {e}")
        return ""

    # Validate the reply shape step by step so a null or unexpected value
    # never reaches the caller, which calls str methods on the result.
    message = data.get('message') if isinstance(data, dict) else None
    content = message.get('content') if isinstance(message, dict) else None
    return content if isinstance(content, str) else ""


# A line that exactly matches one of these strings is not a contact name.
_NON_CONTACT_LABELS = frozenset({
    "公众号", "服务号", "企业微信联系人", "我的企业", "联系人",
    "星标朋友", "新的朋友", "群聊", "标签", "仅聊天", "设备",
})

# Leading markers of lines to discard: '>' and '!' prefixes, plus
# JSON / code-fence style output ('"', '{', '[', '```').
_REJECTED_PREFIXES = ('>', '!', '"', '{', '[', '```')


def is_valid_contact(line):
    """Return True when an OCR output line looks like a contact name.

    Args:
        line: One line of OCR text; surrounding whitespace is ignored.

    Returns:
        False for blank lines, lines equal to a known non-contact label,
        single alphabetic characters, and lines starting with one of the
        rejected prefixes; True otherwise.
    """
    line = line.strip()
    if not line:
        return False
    if line in _NON_CONTACT_LABELS:
        return False
    # Single letters are index headings. NOTE(review): str.isalpha() is
    # Unicode-aware, so a one-character CJK name is rejected here as well.
    if len(line) == 1 and line.isalpha():
        return False
    # str.startswith accepts a tuple, so one call covers every prefix. The
    # former extra check for a leading '"' plus ':' was unreachable because
    # any line starting with '"' is already rejected here.
    return not line.startswith(_REJECTED_PREFIXES)


def clean_contact_name(name):
    """Strip wrapping quotes, trailing punctuation and whitespace from a name.

    The three strips are repeated until the string stops changing, so mixed
    residue such as ``'name',`` or ``name, `` is removed completely instead
    of leaving a stray quote or comma behind.

    Args:
        name: Raw contact name taken from an OCR output line.

    Returns:
        The cleaned name; may be an empty string if nothing else remains.
    """
    while True:
        # Each pass can only shorten the string, so the loop terminates.
        trimmed = name.strip().strip('"\'').rstrip(',，。：:')
        if trimmed == name:
            return name
        name = trimmed


def _contact_sort_key(name):
    """Sort key placing names whose first character is alphabetic first.

    Alphabetic-initial names (per ``str.isalpha``) are compared
    case-insensitively; all other names follow, compared as-is.
    """
    if name and name[0].isalpha():
        return (False, name.lower())
    return (True, name)


def main():
    """OCR every PNG in the screenshot folder and export the contact names.

    Each screenshot is passed to ``ocr_image``; every returned line that
    ``is_valid_contact`` accepts is normalised with ``clean_contact_name``
    and collected without duplicates. The sorted names are then written as a
    JSON list of contact records and as a plain-text report.
    """
    print("=" * 60)
    print("批量OCR识别截图")
    print("=" * 60)

    # Collect all screenshots.
    scroll_dir = r"D:\夏骥\微信研究\scroll"
    screenshots = sorted(glob.glob(os.path.join(scroll_dir, "*.png")))
    total = len(screenshots)

    print(f"找到 {total} 张截图")

    all_contacts = set()

    for i, path in enumerate(screenshots, start=1):
        print(f"\n[{i}/{total}] {os.path.basename(path)}")
        # Guard against a None result so the line loop never crashes.
        result = ocr_image(path) or ""

        new_count = 0
        for line in result.splitlines():
            line = line.strip()
            if not is_valid_contact(line):
                continue
            cleaned = clean_contact_name(line)
            # Skip names that cleaned down to nothing: storing "" would
            # inflate the reported totals and leave a gap in the JSON ids.
            if not cleaned or cleaned in all_contacts:
                continue
            all_contacts.add(cleaned)
            new_count += 1
            print(f"  + {cleaned}")

        print(f"  本轮新增 {new_count}，累计 {len(all_contacts)}")

    # Sort once and reuse the order for both output files.
    ordered = sorted(all_contacts, key=_contact_sort_key)

    # Build the JSON records for the front end.
    contacts_json = [
        {
            "id": idx,
            "name": name,
            "category": "",
            "blessing": "马年新春快乐！愿您在新的一年里，事业腾飞，马到成功！",
            "selected": False,
        }
        for idx, name in enumerate(ordered, start=1)
    ]

    # Save as a JSON file (for front-end import).
    json_file = r"D:\夏骥\微信研究\contacts_data.json"
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(contacts_json, f, ensure_ascii=False, indent=2)

    print(f"\nJSON数据已保存: {json_file}")

    # Save the plain-text report.
    print("\n" + "=" * 60)
    print("保存结果...")

    result_file = r"D:\夏骥\微信研究\ocr_result.txt"
    with open(result_file, 'w', encoding='utf-8') as f:
        f.write("微信通讯录OCR识别结果\n")
        f.write(f"共截图 {total} 张\n")
        f.write(f"共识别 {len(all_contacts)} 个联系人\n")
        f.write("=" * 60 + "\n\n")
        f.writelines(f"{name}\n" for name in ordered)

    print(f"结果已保存: {result_file}")
    print(f"共识别到 {len(all_contacts)} 个不重复联系人")

# Run the batch OCR pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()