143 lines
4.4 KiB
Python
143 lines
4.4 KiB
Python
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
对已截图的图片进行OCR识别,并生成前端可用的JSON数据
|
|||
|
|
"""
|
|||
|
|
import os
|
|||
|
|
import requests
|
|||
|
|
import base64
|
|||
|
|
from PIL import Image
|
|||
|
|
import glob
|
|||
|
|
import json
|
|||
|
|
import re
|
|||
|
|
|
|||
|
|
|
|||
|
|
def ocr_image(image_path):
    """Run OCR on a single image and return the recognised text.

    The file is read, base64-encoded and posted as a non-streaming chat
    request (model ``glm-ocr``) to ``http://localhost:11434/api/chat``
    (presumably a local Ollama server -- confirm against the deployment).
    The prompt asks the model to output one contact name per line.

    Args:
        image_path: Path of the image file to recognise.

    Returns:
        The ``message.content`` string from the JSON reply, or ``""`` when
        the request fails, the server answers with an HTTP error status, or
        the reply carries no content.

    Raises:
        OSError: If ``image_path`` cannot be opened or read; the file is
            read before the guarded request.
    """
    with open(image_path, 'rb') as f:
        image_base64 = base64.b64encode(f.read()).decode('utf-8')

    url = "http://localhost:11434/api/chat"
    payload = {
        "model": "glm-ocr",
        "messages": [{
            "role": "user",
            "content": """识别图片中的所有联系人名称。要求:
1. 只输出联系人名称,每行一个
2. 忽略分组标题(如星号、字母A-Z等)
3. 忽略数字统计
4. 不要添加任何其他内容""",
            "images": [image_base64]
        }],
        "stream": False
    }

    # Deliberately best-effort: any failure is reported and mapped to "" so
    # that a single bad request does not abort a batch run over many images.
    try:
        response = requests.post(url, json=payload, timeout=60)
        # Without this, a 4xx/5xx reply would be parsed like a success and
        # silently produce an empty result with no diagnostic.
        response.raise_for_status()
        # `or` guards against JSON null for "message"/"content", which
        # dict.get() defaults do not cover.
        message = response.json().get('message') or {}
        return message.get('content') or ''
    except Exception as e:
        print(f"OCR失败: {e}")
        return ""
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Labels that are rejected when a line equals one of them exactly.
_NON_CONTACT_LABELS = frozenset((
    "公众号", "服务号", "企业微信联系人", "我的企业", "联系人",
    "星标朋友", "新的朋友", "群聊", "标签", "仅聊天", "设备",
))

# Leading markers of lines that are not names: quote/markup characters and
# JSON or code-fence fragments.
_REJECTED_PREFIXES = ('>', '!', '"', '{', '[', '```')


def is_valid_contact(line):
    """Return whether a line of OCR output should be kept as a contact name.

    Args:
        line: One line of recognised text; surrounding whitespace is ignored.

    Returns:
        ``False`` for blank lines, for lines equal to a known non-contact
        label, for a single alphabetic character, and for lines starting
        with ``>``, ``!``, ``"``, ``{``, ``[`` or three backticks.
        ``True`` otherwise.
    """
    line = line.strip()
    if not line:
        return False

    if line in _NON_CONTACT_LABELS:
        return False

    # A lone alphabetic character is dropped (index headers such as "A").
    # NOTE: str.isalpha() is also true for a single CJK character, so
    # one-character names are dropped as well.
    if len(line) == 1 and line.isalpha():
        return False

    # str.startswith accepts a tuple, so one call covers every marker. The
    # former extra check for `"` combined with `:` was unreachable because
    # any line starting with `"` is already rejected here.
    if line.startswith(_REJECTED_PREFIXES):
        return False

    return True
|
|||
|
|
|
|||
|
|
|
|||
|
|
def clean_contact_name(name):
    """Strip wrapping quotes, trailing punctuation and whitespace from a name.

    Stripping is repeated until the value stops changing. A single pass in a
    fixed order leaves residue when the decorations are interleaved: for
    example ``"张三",`` would keep its closing quote because the comma hides
    it from the quote strip, and a quoted name padded with spaces would keep
    both quotes.

    Args:
        name: Raw contact name text.

    Returns:
        The name without surrounding whitespace, without leading or trailing
        single/double quotes, and without trailing ASCII or full-width
        commas and colons or an ideographic full stop.
    """
    quotes = '"\''
    trailing_punct = ',,。::'

    previous = None
    # Each pass can only shorten the string, so the loop always terminates.
    while previous != name:
        previous = name
        name = name.strip().strip(quotes).rstrip(trailing_punct)
    return name
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _contact_sort_key(name):
    """Sort key: names starting with an alphabetic character come first.

    Those names are compared case-insensitively; all other names follow,
    compared as-is.
    """
    if name and name[0].isalpha():
        return (False, name.lower())
    return (True, name)


def main():
    """OCR every PNG in the screenshot folder and export the contact names.

    Each ``*.png`` in the hard-coded ``scroll`` directory is passed to
    ``ocr_image``; every returned line is filtered with ``is_valid_contact``
    and normalised with ``clean_contact_name``. The distinct names are then
    written, sorted, to ``contacts_data.json`` (a list of records for the
    front end) and to ``ocr_result.txt`` (a plain-text summary).
    """
    print("=" * 60)
    print("批量OCR识别截图")
    print("=" * 60)

    # Collect all screenshots.
    scroll_dir = r"D:\夏骥\微信研究\scroll"
    screenshots = sorted(glob.glob(os.path.join(scroll_dir, "*.png")))
    total = len(screenshots)

    print(f"找到 {total} 张截图")

    all_contacts = set()

    for number, path in enumerate(screenshots, start=1):
        print(f"\n[{number}/{total}] {os.path.basename(path)}")
        # `or ""` keeps the batch running if the OCR call yields None.
        result = ocr_image(path) or ""

        new_count = 0
        for line in result.strip().split('\n'):
            line = line.strip()
            if not is_valid_contact(line):
                continue
            cleaned = clean_contact_name(line)
            if cleaned and cleaned not in all_contacts:
                new_count += 1
                print(f" + {cleaned}")
                all_contacts.add(cleaned)

        print(f" 本轮新增 {new_count},累计 {len(all_contacts)}")

    # Sort once and reuse the order for both output files. Only non-empty
    # names are ever added to the set, so no emptiness filter is needed.
    ordered_contacts = sorted(all_contacts, key=_contact_sort_key)

    # Build the JSON records consumed by the front end.
    contacts_json = [
        {
            "id": contact_id,
            "name": name,
            "category": "",
            "blessing": "马年新春快乐!愿您在新的一年里,事业腾飞,马到成功!",
            "selected": False
        }
        for contact_id, name in enumerate(ordered_contacts, start=1)
    ]

    # Save the JSON file (for import by the front end).
    json_file = r"D:\夏骥\微信研究\contacts_data.json"
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(contacts_json, f, ensure_ascii=False, indent=2)

    print(f"\nJSON数据已保存: {json_file}")

    # Save the plain-text result.
    print("\n" + "=" * 60)
    print("保存结果...")

    result_file = r"D:\夏骥\微信研究\ocr_result.txt"
    with open(result_file, 'w', encoding='utf-8') as f:
        f.write("微信通讯录OCR识别结果\n")
        f.write(f"共截图 {total} 张\n")
        f.write(f"共识别 {len(all_contacts)} 个联系人\n")
        f.write("=" * 60 + "\n\n")
        f.writelines(f"{name}\n" for name in ordered_contacts)

    print(f"结果已保存: {result_file}")
    print(f"共识别到 {len(all_contacts)} 个不重复联系人")
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Script entry point: run the batch OCR only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|