Files
weixin-holiday-message/batch_ocr_full.py

145 lines
4.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
对完整截图进行OCR识别
"""
import os
import requests
import base64
from PIL import Image
import glob
import json
def ocr_image(image_path, url="http://localhost:11434/api/chat",
              model="glm-ocr", timeout=60):
    """Run OCR on one screenshot and return the recognized text.

    The image is base64-encoded and posted to a chat-style HTTP endpoint
    together with a prompt asking for contact names only, one per line.

    Args:
        image_path: Path of the image file to recognize.
        url: Chat endpoint to post to.
        model: Model name placed in the request payload.
        timeout: Request timeout in seconds.

    Returns:
        The ``message.content`` string from the JSON reply, or ``""`` when
        the request fails, the server answers with an HTTP error status,
        the body is not valid JSON, or no text content is present.

    Raises:
        OSError: If ``image_path`` cannot be read (not treated as an OCR
            failure, so a bad path is not silently ignored).
    """
    with open(image_path, 'rb') as f:
        image_base64 = base64.b64encode(f.read()).decode('utf-8')

    # Built line by line so the prompt text does not depend on source
    # indentation; the resulting string is a single newline-joined prompt.
    prompt = (
        "识别图片中的所有联系人名称。要求:\n"
        "1. 只输出联系人名称,每行一个\n"
        "2. 忽略分组标题如星号、字母A-Z等\n"
        "3. 忽略数字统计\n"
        "4. 不要添加任何其他内容"
    )
    payload = {
        "model": model,
        "messages": [{
            "role": "user",
            "content": prompt,
            "images": [image_base64],
        }],
        "stream": False,
    }

    try:
        response = requests.post(url, json=payload, timeout=timeout)
        # Without this check an HTTP error reply would be parsed like a
        # success and silently produce an empty result.
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"OCR失败: {e}")
        return ""

    # Be defensive about the reply shape: always hand the caller a str so
    # it can split the text without checking for None first.
    message = data.get('message') if isinstance(data, dict) else None
    content = message.get('content') if isinstance(message, dict) else None
    return content if isinstance(content, str) else ""
def is_valid_contact(line):
    """Return True if an OCR output line looks like a contact name.

    A line is rejected when, after trimming whitespace, it is:
      * shorter than two characters (empty lines, single index letters),
      * exactly one of the known non-contact section labels, or
      * prefixed with ``>``, ``!``, a double quote, ``{``, ``[`` or a
        code fence, which indicates markup/JSON rather than a name.

    Args:
        line: One line of OCR output.

    Returns:
        bool: True when the line should be kept as a contact name.
    """
    text = line.strip()

    # The length check also covers empty lines and single letters, so no
    # separate single-letter test is needed.
    if len(text) < 2:
        return False

    non_contact_labels = {
        "公众号", "服务号", "企业微信联系人", "我的企业", "联系人",
        "星标朋友", "新的朋友", "群聊", "标签", "仅聊天", "设备",
    }
    if text in non_contact_labels:
        return False

    # str.startswith accepts a tuple, so one call covers every prefix.
    # Any line starting with a double quote is rejected here, which also
    # covers quoted "key": value fragments.
    rejected_prefixes = (">", "!", '"', "{", "[", "```")
    return not text.startswith(rejected_prefixes)
def clean_contact_name(name):
    """Strip surrounding quotes, trailing punctuation and whitespace.

    Quotes (single or double) are removed from both ends; commas, colons
    (ASCII and fullwidth) and the ideographic full stop are removed from
    the end only. Stripping repeats until nothing changes, so mixed
    wrappers such as a quoted name followed by a comma are fully cleaned.

    Args:
        name: Raw contact name taken from an OCR output line.

    Returns:
        str: The cleaned name; may be empty.
    """
    quotes = '"\''
    # \uff0c and \uff1a are the fullwidth comma and colon, written as
    # escapes so they cannot be confused with (or collapsed into) their
    # ASCII look-alikes.
    trailing_punct = ',\uff0c。:\uff1a'

    previous = None
    while name != previous:
        previous = name
        name = name.strip().strip(quotes).rstrip(trailing_punct)
    return name.strip()
def _contact_sort_key(name):
    """Sort key: names starting with an alphabetic character come first.

    Alphabetic-initial names are compared case-insensitively; all other
    names are compared by their raw text.
    """
    starts_alpha = bool(name) and name[0].isalpha()
    return (not starts_alpha, name.lower() if starts_alpha else name)


def main(base_dir=r"D:\夏骥\微信研究"):
    """OCR every screenshot and save the de-duplicated contact list.

    Reads all ``*.png`` files from ``<base_dir>/scroll_full``, runs
    ``ocr_image`` on each, filters and cleans the returned lines, and
    writes the unique names to ``contacts_data.json`` (records with a
    default blessing text) and ``ocr_result_full.txt`` (plain list), both
    inside ``base_dir``. Progress is printed to stdout.

    Args:
        base_dir: Working directory that holds the ``scroll_full``
            screenshot folder and receives the output files.

    Returns:
        None. Returns early, writing nothing, when the screenshot folder
        is missing or contains no PNG files.
    """
    print("=" * 60)
    print("批量OCR识别完整截图")
    print("=" * 60)

    # Locate the screenshot directory.
    scroll_dir = os.path.join(base_dir, "scroll_full")
    if not os.path.exists(scroll_dir):
        print(f"目录不存在: {scroll_dir}")
        print("请先运行 scroll_full_contacts.py 进行截图")
        return

    screenshots = sorted(glob.glob(os.path.join(scroll_dir, "*.png")))
    if not screenshots:
        print("未找到截图文件!")
        return
    print(f"找到 {len(screenshots)} 张截图")

    all_contacts = set()
    total = len(screenshots)
    for number, path in enumerate(screenshots, start=1):
        print(f"\n[{number}/{total}] {os.path.basename(path)}")
        # Treat a missing result as "no text" so one bad reply cannot
        # abort the whole batch.
        result = ocr_image(path) or ""
        new_count = 0
        for line in result.splitlines():
            line = line.strip()
            if not is_valid_contact(line):
                continue
            cleaned = clean_contact_name(line)
            # Cleaning can shorten a line below the two-character minimum.
            if len(cleaned) >= 2 and cleaned not in all_contacts:
                new_count += 1
                print(f" + {cleaned}")
                all_contacts.add(cleaned)
        print(f" 本轮新增 {new_count},累计 {len(all_contacts)}")

    # Build the JSON records. Every stored name already has at least two
    # characters, so ids are consecutive starting at 1.
    contacts_json = [
        {
            "id": contact_id,
            "name": name,
            "category": "",
            "blessing": "马年新春快乐!愿您在新的一年里,事业腾飞,马到成功!",
            "selected": False,
        }
        for contact_id, name in enumerate(
            sorted(all_contacts, key=_contact_sort_key), start=1
        )
    ]

    # Save the JSON file.
    json_file = os.path.join(base_dir, "contacts_data.json")
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(contacts_json, f, ensure_ascii=False, indent=2)
    print(f"\nJSON数据已保存: {json_file}")

    # Save the plain-text result.
    result_file = os.path.join(base_dir, "ocr_result_full.txt")
    with open(result_file, 'w', encoding='utf-8') as f:
        f.write("微信通讯录OCR识别结果完整\n")
        f.write(f"共截图 {len(screenshots)}\n")
        f.write(f"共识别 {len(contacts_json)} 个联系人\n")
        f.write("=" * 60 + "\n\n")
        f.writelines(f"{c['name']}\n" for c in contacts_json)
    print(f"结果已保存: {result_file}")
    print(f"\n共识别到 {len(contacts_json)} 个不重复联系人")
# Run the batch OCR only when executed as a script, not when imported.
if __name__ == "__main__":
    main()