161 lines
4.9 KiB
Python
161 lines
4.9 KiB
Python
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
并行OCR识别 - 多进程加速
|
|||
|
|
"""
|
|||
|
|
import os
|
|||
|
|
import sqlite3
|
|||
|
|
import requests
|
|||
|
|
import base64
|
|||
|
|
from PIL import Image
|
|||
|
|
import glob
|
|||
|
|
import json
|
|||
|
|
from multiprocessing import Pool, Manager
|
|||
|
|
import time
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Instruction sent to the OCR model together with each screenshot.
_OCR_PROMPT = (
    "识别图片中的所有联系人名称。要求:\n"
    "1. 只输出联系人名称,每行一个\n"
    "2. 忽略分组标题(如星号、字母A-Z等)\n"
    "3. 忽略数字统计\n"
    "4. 不要添加任何其他内容"
)

# A line containing any of these substrings is a UI label, not a contact.
_NON_CONTACT_KEYWORDS = ('公众号', '服务号', '企业微信', '联系人', '星标朋友', '新的朋友')

# Trailing punctuation removed from a recognized name (ASCII and full-width
# comma / colon, plus the ideographic full stop).
_TRAILING_PUNCTUATION = ',\uff0c。:\uff1a'


def _parse_contact_lines(text):
    """Turn the raw model reply into a list of cleaned contact names."""
    names = []
    for raw_line in text.strip().split('\n'):
        line = raw_line.strip()
        if any(keyword in line for keyword in _NON_CONTACT_KEYWORDS):
            continue
        # Clean first, then apply the length limits to what is actually kept,
        # so a quoted one-character or punctuation-only line is not returned.
        name = line.strip('"\'').rstrip(_TRAILING_PUNCTUATION)
        if 2 <= len(name) < 50:
            names.append(name)
    return names


def ocr_image(args):
    """Run OCR on one screenshot and return the contact names found in it.

    The image is base64-encoded and posted to the chat endpoint at
    http://localhost:11434/api/chat with the "glm-ocr" model; the reply text
    is then split into lines and filtered down to plausible contact names.

    Args:
        args: A ``(image_path, idx, total)`` tuple. A single tuple is used so
            the function can be passed directly to ``Pool.map``. ``idx`` is
            the zero-based position of the image and ``total`` the number of
            images; both are only used in the progress message.

    Returns:
        A list of cleaned contact names. An empty list is returned when
        anything fails (unreadable file, HTTP error, malformed reply); the
        failure is printed instead of raised so one bad image does not abort
        the whole batch.
    """
    image_path, idx, total = args
    label = f"[{idx+1}/{total}] {os.path.basename(image_path)}"

    try:
        with open(image_path, 'rb') as f:
            image_base64 = base64.b64encode(f.read()).decode('utf-8')

        url = "http://localhost:11434/api/chat"
        payload = {
            "model": "glm-ocr",
            "messages": [{
                "role": "user",
                "content": _OCR_PROMPT,
                "images": [image_base64],
            }],
            "stream": False,
        }

        response = requests.post(url, json=payload, timeout=60)
        # Without this check an HTTP error body (which has no "message" key)
        # would be reported as "0 contacts found" instead of as a failure.
        response.raise_for_status()
        # `or` guards against explicit nulls in the reply.
        message = response.json().get('message') or {}
        contacts = _parse_contact_lines(message.get('content') or '')

        print(f"{label}: 发现 {len(contacts)} 个联系人")
        return contacts
    except Exception as e:
        # Deliberately broad: this runs inside a pool worker and the batch
        # should continue with the remaining images.
        print(f"{label}: 失败 - {e}")
        return []
|
|||
|
|
|
|||
|
|
|
|||
|
|
def get_existing_contacts(db_path=r'D:\夏骥\微信研究\contacts.db'):
    """Return the set of contact names already stored in the database.

    Args:
        db_path: Path of the SQLite database file. Defaults to the path the
            script has always used, so zero-argument callers are unaffected.

    Returns:
        A set holding the ``name`` value of every row in ``contacts``.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    conn = sqlite3.connect(db_path)
    try:
        return {row[0] for row in conn.execute('SELECT name FROM contacts')}
    finally:
        # Close even when the query fails so the file handle is not leaked.
        conn.close()
|
|||
|
|
|
|||
|
|
|
|||
|
|
def add_new_contacts(new_contacts, db_path=r'D:\夏骥\微信研究\contacts.db'):
    """Insert new contact names into the ``contacts`` table.

    Each name gets the next free id (current ``MAX(id)`` + 1, + 2, ...), an
    empty category, the default blessing text and ``selected = False``.

    Args:
        new_contacts: Sequence of contact names to insert, in the order the
            ids should be assigned.
        db_path: Path of the SQLite database file. Defaults to the path the
            script has always used, so existing callers are unaffected.

    Returns:
        The number of rows inserted (0 when ``new_contacts`` is empty).

    Raises:
        sqlite3.Error: If the database cannot be opened or an insert fails;
            in that case nothing from this call is committed.
    """
    if not new_contacts:
        return 0

    default_blessing = '马年新春快乐!愿您在新的一年里,事业腾飞,马到成功!'

    conn = sqlite3.connect(db_path)
    try:
        # The connection context manager commits on success and rolls back if
        # any statement raises, so a failed batch leaves no partial insert.
        with conn:
            max_id = conn.execute('SELECT MAX(id) FROM contacts').fetchone()[0] or 0
            rows = [
                (new_id, name, '', default_blessing, False)
                for new_id, name in enumerate(new_contacts, start=max_id + 1)
            ]
            conn.executemany(
                'INSERT INTO contacts (id, name, category, blessing, selected) '
                'VALUES (?, ?, ?, ?, ?)',
                rows,
            )
        return len(rows)
    finally:
        # The context manager above does not close the connection.
        conn.close()
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _contact_sort_key(name):
    """Sort key: names starting with a letter first, compared case-insensitively."""
    starts_with_letter = name[:1].isalpha()
    return (not starts_with_letter, name.lower() if starts_with_letter else name)


def _load_all_contacts(db_path):
    """Read every row of the contacts table as a list of JSON-ready dicts."""
    conn = sqlite3.connect(db_path)
    try:
        # Columns are named explicitly so the mapping below does not depend
        # on the physical column order of the table.
        cursor = conn.execute(
            'SELECT id, name, category, blessing, selected '
            'FROM contacts ORDER BY id'
        )
        return [
            {
                "id": contact_id,
                "name": name,
                "category": category,
                "blessing": blessing,
                "selected": bool(selected),
            }
            for contact_id, name, category, blessing, selected in cursor
        ]
    finally:
        # Close even when the query fails so the connection is not leaked.
        conn.close()


def main():
    """Run OCR over all screenshots, store new contacts and refresh the JSON export.

    Steps:
        1. Collect the ``*.png`` files in the screenshot directory.
        2. Load the names already present in the database.
        3. OCR the screenshots in a pool of 4 worker processes.
        4. Insert the names that are at least two characters long and not in
           the database yet.
        5. Dump the whole ``contacts`` table to ``contacts_data.json``.

    Returns early, without touching the database, when no screenshot is found.
    """
    print("=" * 60)
    print("并行OCR识别")
    print("=" * 60)

    # Locate the screenshots.
    scroll_dir = r"D:\夏骥\微信研究\scroll_complete"
    screenshots = sorted(glob.glob(os.path.join(scroll_dir, "*.png")))

    if not screenshots:
        print("未找到截图文件!")
        return

    print(f"找到 {len(screenshots)} 张截图")

    # Names that are already stored and must not be inserted again.
    existing_contacts = get_existing_contacts()
    print(f"数据库中已有 {len(existing_contacts)} 个联系人")

    # One (path, index, total) tuple per screenshot, as ocr_image expects.
    total = len(screenshots)
    args_list = [(path, i, total) for i, path in enumerate(screenshots)]

    # Process the screenshots with 4 worker processes.
    print("\n开始并行OCR识别...")
    with Pool(processes=4) as pool:
        results = pool.map(ocr_image, args_list)

    # Merge the per-image results; the set removes names seen on several
    # overlapping screenshots.
    all_contacts = {
        name
        for contacts in results
        for name in contacts
        if len(name) >= 2 and name not in existing_contacts
    }

    print(f"\n{'='*60}")
    print("OCR完成!")
    print(f"发现新联系人: {len(all_contacts)} 个")

    # Store the new names.
    if all_contacts:
        added = add_new_contacts(sorted(all_contacts, key=_contact_sort_key))
        print(f"成功入库: {added} 个")

    # Refresh the JSON export from the database.
    # NOTE(review): the export is run unconditionally so the JSON file always
    # mirrors the database — confirm this matches the intended behavior when
    # no new contact was found.
    all_db_contacts = _load_all_contacts(r'D:\夏骥\微信研究\contacts.db')

    json_file = r"D:\夏骥\微信研究\contacts_data.json"
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(all_db_contacts, f, ensure_ascii=False, indent=2)

    print(f"JSON数据已更新: {json_file}")
    print(f"数据库总联系人: {len(all_db_contacts)} 个")
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Run the pipeline only when executed as a script. The guard also matters for
# multiprocessing: worker processes may re-import this module (e.g. with the
# "spawn" start method), and main() must not run again in them.
if __name__ == '__main__':
    main()
|