Files
weixin-holiday-message/batch_ocr_parallel.py

161 lines
4.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
并行OCR识别 - 多进程加速
"""
import os
import sqlite3
import requests
import base64
from PIL import Image
import glob
import json
from multiprocessing import Pool, Manager
import time
def _extract_contact_names(text):
    """Turn raw OCR output into a list of cleaned contact names.

    Each non-blank line of ``text`` is treated as one candidate name. A line
    is kept only if it is 2-49 characters long, contains none of the known
    UI section labels, and is still at least two characters long after
    surrounding quotes and trailing punctuation have been stripped.

    Args:
        text: The text content returned by the OCR model.

    Returns:
        A list of cleaned names in the order they appeared.
    """
    # Substrings that identify section labels rather than real contacts.
    ui_labels = ('公众号', '服务号', '企业微信', '联系人', '星标朋友', '新的朋友')
    # NOTE(review): the original literal listed ',' and ':' twice, which
    # presumably means the full-width forms were intended; both widths are
    # stripped here (U+FF0C, U+FF1A) together with the ideographic full stop.
    trailing_punct = ',\uff0c\u3002:\uff1a'

    names = []
    for raw_line in text.strip().split('\n'):
        candidate = raw_line.strip()
        if not 2 <= len(candidate) < 50:
            continue
        if any(label in candidate for label in ui_labels):
            continue
        name = candidate.strip('"\'').rstrip(trailing_punct)
        # Stripping can shrink a candidate below the two-character minimum.
        if len(name) >= 2:
            names.append(name)
    return names


def ocr_image(args):
    """Run OCR on one screenshot and return the contact names found in it.

    The image is base64-encoded and posted to a local chat endpoint on port
    11434 (presumably an Ollama server - confirm) using the ``glm-ocr``
    model. Progress is printed to stdout.

    Args:
        args: An ``(image_path, idx, total)`` tuple, packed into a single
            argument so the function can be passed to ``Pool.map``. ``idx``
            is the zero-based position of the image and ``total`` the number
            of images; both are used only for the progress message.

    Returns:
        The list of contact names recognised in the image, or an empty list
        if reading the file, the HTTP request, or response parsing failed.
    """
    image_path, idx, total = args
    progress = f"[{idx+1}/{total}] {os.path.basename(image_path)}"

    prompt = (
        "识别图片中的所有联系人名称。要求:\n"
        "1. 只输出联系人名称,每行一个\n"
        "2. 忽略分组标题如星号、字母A-Z等\n"
        "3. 忽略数字统计\n"
        "4. 不要添加任何其他内容"
    )

    try:
        with open(image_path, 'rb') as f:
            image_base64 = base64.b64encode(f.read()).decode('utf-8')

        payload = {
            "model": "glm-ocr",
            "messages": [{
                "role": "user",
                "content": prompt,
                "images": [image_base64],
            }],
            "stream": False,
        }
        response = requests.post(
            "http://localhost:11434/api/chat", json=payload, timeout=60
        )
        # Without this an HTTP error body would be parsed as a normal reply
        # and reported as "0 contacts" instead of a failure.
        response.raise_for_status()

        # ``or`` guards against the server returning JSON null for a field.
        message = response.json().get('message') or {}
        contacts = _extract_contact_names(message.get('content') or '')
    except Exception as e:
        # Best-effort worker boundary: one bad image must not abort the pool.
        print(f"{progress}: 失败 - {e}")
        return []

    print(f"{progress}: 发现 {len(contacts)} 个联系人")
    return contacts
def get_existing_contacts(db_path=r'D:\夏骥\微信研究\contacts.db'):
    """Return the set of contact names already stored in the database.

    Args:
        db_path: Path of the SQLite file containing the ``contacts`` table.
            Defaults to the location this script has always used.

    Returns:
        A set with the ``name`` value of every row in ``contacts``.

    Raises:
        sqlite3.Error: If the query fails (for example the table is missing).
    """
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute('SELECT name FROM contacts').fetchall()
    finally:
        # Close even when the query raises so the file handle is not leaked.
        conn.close()
    return {name for (name,) in rows}
def add_new_contacts(new_contacts, db_path=r'D:\夏骥\微信研究\contacts.db'):
    """Insert new contact names into the ``contacts`` table.

    Each name gets the next free id (current ``MAX(id)`` plus one, counting
    upwards), an empty category, a fixed default blessing text and
    ``selected`` set to false. All rows are committed together.

    Args:
        new_contacts: Sequence of contact names to insert, in insertion order.
        db_path: Path of the SQLite file containing the ``contacts`` table.
            Defaults to the location this script has always used.

    Returns:
        The number of rows inserted; 0 if ``new_contacts`` is empty (the
        database is not opened in that case).

    Raises:
        sqlite3.Error: If the database cannot be read or written.
    """
    if not new_contacts:
        return 0

    default_blessing = '马年新春快乐!愿您在新的一年里,事业腾飞,马到成功!'

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        # MAX(id) is NULL for an empty table; start numbering from 1 then.
        cursor.execute('SELECT MAX(id) FROM contacts')
        max_id = cursor.fetchone()[0] or 0

        rows = [
            (new_id, name, '', default_blessing, False)
            for new_id, name in enumerate(new_contacts, start=max_id + 1)
        ]
        cursor.executemany('''
            INSERT INTO contacts (id, name, category, blessing, selected)
            VALUES (?, ?, ?, ?, ?)
        ''', rows)
        conn.commit()
    finally:
        # Close even on failure so the connection is not leaked; uncommitted
        # inserts are discarded when the connection closes.
        conn.close()
    return len(rows)
def _contact_sort_key(name):
    """Sort key placing names that start with an alphabetic character first.

    "Alphabetic" follows ``str.isalpha``. Names in the first group are
    ordered case-insensitively; the remaining names (and the empty string)
    are ordered by their raw value.
    """
    starts_alpha = name[0].isalpha() if name else False
    return (not starts_alpha, name.lower() if starts_alpha else name)


def _export_contacts_json(db_path, json_path):
    """Write every row of the ``contacts`` table to ``json_path``.

    Returns the number of contacts written.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Columns are named explicitly so the mapping below does not depend
        # on the physical column order of the table.
        rows = conn.execute(
            'SELECT id, name, category, blessing, selected '
            'FROM contacts ORDER BY id'
        ).fetchall()
    finally:
        conn.close()

    records = [
        {
            "id": row[0],
            "name": row[1],
            "category": row[2],
            "blessing": row[3],
            "selected": bool(row[4]),
        }
        for row in rows
    ]
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=2)
    return len(records)


def main():
    """OCR all screenshots in parallel and store newly found contacts.

    Steps:
      1. Collect the ``*.png`` screenshots from the scroll directory; stop
         early if there are none.
      2. Load the names already present in the database.
      3. Run ``ocr_image`` over all screenshots with a pool of 4 processes.
      4. Insert names not yet in the database, in sorted order.
      5. Rewrite the JSON snapshot from the database. This happens on every
         run that found screenshots, even when no new names were inserted,
         so the snapshot always mirrors the database.

    Returns:
        None. Progress and results are printed to stdout.
    """
    scroll_dir = r"D:\夏骥\微信研究\scroll_complete"
    db_path = r'D:\夏骥\微信研究\contacts.db'
    json_file = r"D:\夏骥\微信研究\contacts_data.json"

    print("=" * 60)
    print("并行OCR识别")
    print("=" * 60)

    screenshots = sorted(glob.glob(os.path.join(scroll_dir, "*.png")))
    if not screenshots:
        print("未找到截图文件!")
        return
    print(f"找到 {len(screenshots)} 张截图")

    existing_contacts = get_existing_contacts()
    print(f"数据库中已有 {len(existing_contacts)} 个联系人")

    # Pool.map passes a single argument, so each job is packed into a tuple.
    args_list = [(path, i, len(screenshots)) for i, path in enumerate(screenshots)]

    print("\n开始并行OCR识别...")
    with Pool(processes=4) as pool:
        results = pool.map(ocr_image, args_list)

    # A set removes names that appear on more than one screenshot.
    all_contacts = {
        name
        for contacts in results
        for name in contacts
        if name and len(name) >= 2 and name not in existing_contacts
    }

    print(f"\n{'='*60}")
    print("OCR完成")
    print(f"发现新联系人: {len(all_contacts)}")

    if all_contacts:
        added = add_new_contacts(sorted(all_contacts, key=_contact_sort_key))
        print(f"成功入库: {added}")

    total = _export_contacts_json(db_path, json_file)
    print(f"JSON数据已更新: {json_file}")
    print(f"数据库总联系人: {total}")
# Run only when executed as a script. The guard also keeps multiprocessing
# workers that re-import this module from starting the pipeline again.
if __name__ == "__main__":
    main()