Files
weixin-holiday-message/batch_ocr.py

143 lines
4.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
对已截图的图片进行OCR识别并生成前端可用的JSON数据
"""
import os
import requests
import base64
from PIL import Image
import glob
import json
import re
def ocr_image(image_path):
    """Run OCR on a single image and return the recognized text.

    The image file is base64-encoded and posted to the chat endpoint at
    ``http://localhost:11434/api/chat`` using the ``glm-ocr`` model, together
    with a prompt asking for contact names only, one per line.

    Args:
        image_path: Path of the image file to recognize.

    Returns:
        The ``message.content`` string from the JSON reply, or ``""`` when
        the request fails, the server answers with an HTTP error status, or
        the reply has no usable content.

    Raises:
        OSError: If the image file cannot be opened or read (the read happens
            before the guarded request section).
    """
    with open(image_path, 'rb') as f:
        image_base64 = base64.b64encode(f.read()).decode('utf-8')

    url = "http://localhost:11434/api/chat"
    prompt = (
        "识别图片中的所有联系人名称。要求:\n"
        "1. 只输出联系人名称,每行一个\n"
        "2. 忽略分组标题如星号、字母A-Z等\n"
        "3. 忽略数字统计\n"
        "4. 不要添加任何其他内容"
    )
    payload = {
        "model": "glm-ocr",
        "messages": [{
            "role": "user",
            "content": prompt,
            "images": [image_base64],
        }],
        "stream": False,
    }

    # Deliberately best-effort: one failing image must not abort the whole
    # batch, so any error is reported and mapped to an empty result.
    try:
        response = requests.post(url, json=payload, timeout=60)
        # Surface 4xx/5xx replies as failures instead of silently returning
        # an empty result from an error body that has no "message" key.
        response.raise_for_status()
        message = response.json().get('message') or {}
        # Guard against a null content so callers always receive a str.
        return message.get('content') or ""
    except Exception as e:
        print(f"OCR失败: {e}")
        return ""
def is_valid_contact(line):
    """Return True if an OCR output line looks like a contact name.

    A line is rejected when, after stripping surrounding whitespace, it is:

    * empty;
    * exactly one of the known non-contact labels listed below;
    * a single alphabetic character (index headers such as ``A``);
    * prefixed with ``>``, ``!``, a double quote, ``{``, ``[`` or a
      triple backtick (markup / JSON / code-fence residue in the OCR reply).

    Args:
        line: One line of OCR output.

    Returns:
        ``True`` when the line passes every filter, ``False`` otherwise.
    """
    line = line.strip()
    if not line:
        return False

    non_contact_labels = {
        "公众号", "服务号", "企业微信联系人", "我的企业", "联系人",
        "星标朋友", "新的朋友", "群聊", "标签", "仅聊天", "设备",
    }
    if line in non_contact_labels:
        return False

    # NOTE: str.isalpha() is also True for a single CJK character, so
    # one-character names are filtered here as well as A-Z headers.
    if len(line) == 1 and line.isalpha():
        return False

    # Markup and JSON-like residue. The former extra check for a leading
    # double quote combined with ':' was unreachable, because any line that
    # starts with a double quote is already rejected here.
    rejected_prefixes = (">", "!", '"', "{", "[", "```")
    if line.startswith(rejected_prefixes):
        return False

    return True
def clean_contact_name(name):
    """Strip quoting and trailing punctuation from a contact name.

    Processing order:

    1. remove leading/trailing double and single quotes;
    2. remove trailing commas, colons (both ASCII and full-width forms,
       U+FF0C and U+FF1A) and the ideographic full stop;
    3. remove surrounding whitespace.

    Args:
        name: Raw contact name taken from an OCR output line.

    Returns:
        The cleaned name; may be an empty string.
    """
    name = name.strip('"\'')
    # The full-width comma and colon are written as escapes so they cannot be
    # confused with (or collapsed into) their ASCII look-alikes; previously
    # the set only repeated the ASCII characters, leaving full-width trailing
    # punctuation in place.
    name = name.rstrip(',\uff0c。:\uff1a')
    return name.strip()
def main():
    """Batch-OCR all screenshots and export the de-duplicated contact names.

    Every ``*.png`` in the scroll directory is passed to ``ocr_image``; each
    returned line that ``is_valid_contact`` accepts is normalized with
    ``clean_contact_name`` and collected into a set. The sorted result is
    written to a JSON file and to a plain-text report.
    """
    banner = "=" * 60
    print(banner)
    print("批量OCR识别截图")
    print(banner)

    # Collect every screenshot in a stable (sorted) order.
    scroll_dir = r"D:\夏骥\微信研究\scroll"
    pattern = os.path.join(scroll_dir, "*.png")
    screenshots = sorted(glob.glob(pattern))
    total = len(screenshots)
    print(f"找到 {total} 张截图")

    all_contacts = set()
    for number, path in enumerate(screenshots, start=1):
        print(f"\n[{number}/{total}] {os.path.basename(path)}")
        text = ocr_image(path)
        added = 0
        for raw_line in text.strip().split('\n'):
            candidate = raw_line.strip()
            if not is_valid_contact(candidate):
                continue
            cleaned = clean_contact_name(candidate)
            if not cleaned or cleaned in all_contacts:
                continue
            added += 1
            print(f" + {cleaned}")
            all_contacts.add(cleaned)
        print(f" 本轮新增 {added},累计 {len(all_contacts)}")

    def sort_key(name):
        # Names starting with an alphabetic character sort first and
        # case-insensitively; all other names follow in plain string order.
        starts_alpha = bool(name) and name[0].isalpha()
        return (not starts_alpha, name.lower() if starts_alpha else name)

    ordered = sorted(all_contacts, key=sort_key)

    # Build the records consumed by the front end; ids follow sort position.
    contacts_json = [
        {
            "id": position,
            "name": name,
            "category": "",
            "blessing": "马年新春快乐!愿您在新的一年里,事业腾飞,马到成功!",
            "selected": False,
        }
        for position, name in enumerate(ordered, start=1)
        if name
    ]

    # JSON export for the front end.
    json_file = r"D:\夏骥\微信研究\contacts_data.json"
    with open(json_file, 'w', encoding='utf-8') as out:
        json.dump(contacts_json, out, ensure_ascii=False, indent=2)
    print(f"\nJSON数据已保存: {json_file}")

    # Plain-text report: short header followed by one name per line.
    print("\n" + banner)
    print("保存结果...")
    result_file = r"D:\夏骥\微信研究\ocr_result.txt"
    with open(result_file, 'w', encoding='utf-8') as out:
        out.write("微信通讯录OCR识别结果\n")
        out.write(f"共截图 {total}\n")
        out.write(f"共识别 {len(all_contacts)} 个联系人\n")
        out.write(banner + "\n\n")
        for name in ordered:
            if name:
                out.write(f"{name}\n")

    print(f"结果已保存: {result_file}")
    print(f"共识别到 {len(all_contacts)} 个不重复联系人")
# Run the batch OCR pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()