# weixin-holiday-message/batch_ocr_resume.py

# -*- coding: utf-8 -*-
"""
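Resumable batch OCR of contact-list screenshots.

Each screenshot is sent to a local OCR model; names not yet present in the
SQLite contacts table are collected, progress is checkpointed to a JSON file
so an interrupted run can continue, and finally the new names are inserted
and the whole table is re-exported as JSON.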
"""
import os
import sqlite3
import requests
import base64
from PIL import Image
import glob
import json
import time


def ocr_image(image_path):
    """Run OCR on one image and return the model's raw text reply.

    The file is base64-encoded and posted to the chat endpoint at
    ``http://localhost:11434/api/chat`` using the ``glm-ocr`` model, together
    with a prompt asking for one contact name per line.

    Args:
        image_path: Path of the image file to recognise.

    Returns:
        The ``message.content`` string of the JSON reply, or ``""`` when the
        request fails, the server answers with an HTTP error status, the body
        is not valid JSON, or the reply carries no text content.

    Raises:
        OSError: If the image file cannot be opened or read; the read happens
            before the guarded network call.
    """
    with open(image_path, 'rb') as f:
        image_base64 = base64.b64encode(f.read()).decode('utf-8')

    url = "http://localhost:11434/api/chat"
    payload = {
        "model": "glm-ocr",
        "messages": [{
            "role": "user",
            "content": """识别图片中的所有联系人名称。要求：
1. 只输出联系人名称，每行一个
2. 忽略分组标题（如星号、字母A-Z等）
3. 忽略数字统计
4. 不要添加任何其他内容""",
            "images": [image_base64]
        }],
        "stream": False
    }

    try:
        response = requests.post(url, json=payload, timeout=60)
        # Treat 4xx/5xx replies as failures so they are logged instead of
        # being mistaken for "no contacts on this image".
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"OCR失败: {e}")
        return ""

    # Parse defensively: a null/missing "message" or "content" must not
    # turn into None, because callers apply str methods to the result.
    message = data.get('message') if isinstance(data, dict) else None
    content = message.get('content') if isinstance(message, dict) else None
    return content if isinstance(content, str) else ""


def is_valid_contact(line):
    """Return whether an OCR output line looks like a contact name.

    After trimming surrounding whitespace a line is rejected when it

    * is shorter than two characters (this covers empty lines and
      single-letter index headers),
    * exactly equals one of the known non-contact labels, or
    * starts with ``>`` or ``!``.

    Args:
        line: One line of OCR text.

    Returns:
        True if the line passes every filter, otherwise False.
    """
    non_contact_labels = {
        "公众号", "服务号", "企业微信联系人", "我的企业", "联系人",
        "星标朋友", "新的朋友", "群聊", "标签", "仅聊天", "设备",
    }

    line = line.strip()
    # The length check already rejects single characters, so no separate
    # single-letter test is needed.
    if len(line) < 2:
        return False
    if line in non_contact_labels:
        return False
    return not line.startswith((">", "!"))


def clean_contact_name(name):
    """Strip OCR decoration from a contact name.

    Straight single/double quotes are removed from both ends first, then
    trailing commas, colons (ASCII or full-width) and ideographic full stops,
    and finally any surrounding whitespace.
    """
    quote_chars = '"\''
    trailing_punctuation = ',，。：:'

    unquoted = name.strip(quote_chars)
    unpunctuated = unquoted.rstrip(trailing_punctuation)
    return unpunctuated.strip()


def get_existing_contacts(db_path=r'D:\夏骥\微信研究\contacts.db'):
    """Return the set of contact names already stored in the database.

    Args:
        db_path: SQLite database file containing a ``contacts`` table with a
            ``name`` column. Defaults to the path this script has always used.

    Returns:
        A set with every value of ``contacts.name``.

    Raises:
        sqlite3.Error: If the query fails (for example, the table is missing).
    """
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute('SELECT name FROM contacts')
        return {name for (name,) in rows}
    finally:
        # Close even when the query raises so the file handle is not leaked.
        conn.close()


def add_new_contacts(new_contacts, db_path=r'D:\夏骥\微信研究\contacts.db'):
    """Insert new contact names into the ``contacts`` table.

    Ids continue from the current ``MAX(id)`` (starting at 1 for an empty
    table). Every row gets an empty category, the default blessing text and
    ``selected = False``. All rows are written in one transaction.

    Args:
        new_contacts: Sequence of names to insert, in the order their ids
            should be assigned.
        db_path: SQLite database file. Defaults to the path this script has
            always used.

    Returns:
        The number of rows inserted (0 when ``new_contacts`` is empty).

    Raises:
        sqlite3.Error: If reading the max id or inserting fails; nothing is
            committed in that case.
    """
    if not new_contacts:
        return 0

    default_blessing = '马年新春快乐！愿您在新的一年里，事业腾飞，马到成功！'

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # MAX(id) is NULL on an empty table; treat that as 0.
        cursor.execute('SELECT MAX(id) FROM contacts')
        max_id = cursor.fetchone()[0] or 0

        rows = [
            (idx, name, '', default_blessing, False)
            for idx, name in enumerate(new_contacts, start=max_id + 1)
        ]
        cursor.executemany('''
            INSERT INTO contacts (id, name, category, blessing, selected)
            VALUES (?, ?, ?, ?, ?)
        ''', rows)
        conn.commit()
    finally:
        # Closing without a commit discards a half-finished batch and
        # releases the file handle even when an insert fails.
        conn.close()

    return len(rows)


def save_progress(progress, path=r'D:\夏骥\微信研究\ocr_progress.json'):
    """Write the progress checkpoint to disk as UTF-8 JSON.

    The data is written to a sibling ``.tmp`` file first and then moved over
    the target with ``os.replace``, so an interruption in the middle of a
    write cannot leave a truncated checkpoint behind.

    Args:
        progress: JSON-serialisable progress state.
        path: Destination file. Defaults to the path this script has always
            used.

    Raises:
        OSError: If the temporary file cannot be written or moved.
        TypeError: If ``progress`` is not JSON-serialisable.
    """
    target = os.fspath(path)
    tmp_path = target + '.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(progress, f, ensure_ascii=False, indent=2)
    # Swap the complete file into place in a single step.
    os.replace(tmp_path, target)


def load_progress(path=r'D:\夏骥\微信研究\ocr_progress.json'):
    """Load the progress checkpoint, falling back to a fresh state.

    Args:
        path: Checkpoint file to read. Defaults to the path this script has
            always used.

    Returns:
        The dict stored in the file, or
        ``{"processed": 0, "total": 0, "new_contacts": []}`` when the file is
        missing, unreadable, not valid JSON, or does not contain a JSON
        object. Every fallback except a missing file prints a warning,
        because it means earlier progress is being discarded.
    """
    fresh = {"processed": 0, "total": 0, "new_contacts": []}
    try:
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        # No checkpoint yet: this is a first run, not an error.
        return fresh
    except (OSError, ValueError) as e:
        # ValueError covers malformed JSON and undecodable bytes.
        print(f"进度文件无法读取，将从头开始: {e}")
        return fresh

    if not isinstance(data, dict):
        # Callers use dict methods on the result.
        print("进度文件格式不正确，将从头开始")
        return fresh
    return data


def _contact_sort_key(name):
    """Sort key: names starting with an alphabetic character first (compared
    case-insensitively), every other name afterwards."""
    starts_alpha = bool(name) and name[0].isalpha()
    return (not starts_alpha, name.lower() if starts_alpha else name)


def _estimate_remaining(elapsed, done, left):
    """Seconds still needed for *left* items, given *done* items took *elapsed*."""
    return elapsed / done * left if done else 0


def _export_contacts_json(db_path, json_file):
    """Dump every row of the contacts table to *json_file*; return the row count."""
    conn = sqlite3.connect(db_path)
    try:
        # Name the columns so the export does not depend on table column order.
        rows = conn.execute(
            'SELECT id, name, category, blessing, selected '
            'FROM contacts ORDER BY id'
        ).fetchall()
    finally:
        conn.close()

    all_contacts = [
        {
            "id": contact_id,
            "name": name,
            "category": category,
            "blessing": blessing,
            "selected": bool(selected),
        }
        for contact_id, name, category, blessing, selected in rows
    ]
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(all_contacts, f, ensure_ascii=False, indent=2)
    return len(all_contacts)


def main():
    """Run the resumable OCR pipeline end to end.

    Steps:
      1. List the ``*.png`` screenshots in the hard-coded directory, sorted
         by path.
      2. Load the checkpoint and continue after the last processed image.
      3. OCR each remaining image; collect names that are neither in the
         database nor already collected. The checkpoint is written every 50
         images and again when the loop ends, including when it ends because
         of an exception or Ctrl+C.
      4. Insert collected names that are not already in the database.
      5. Re-export the whole contacts table to the JSON data file.

    Returns:
        None. Progress and results are printed to stdout.
    """
    print("=" * 60)
    print("断点续传OCR识别")
    print("=" * 60)

    # Locate the screenshots.
    scroll_dir = r"D:\夏骥\微信研究\scroll_complete"
    screenshots = sorted(glob.glob(os.path.join(scroll_dir, "*.png")))

    if not screenshots:
        print("未找到截图文件！")
        return

    total = len(screenshots)
    print(f"找到 {total} 张截图")

    # Restore the checkpoint.
    progress = load_progress()
    processed = progress.get("processed", 0)
    all_new_contacts = set(progress.get("new_contacts", []))

    print(f"已处理: {processed} 张，已发现新联系人: {len(all_new_contacts)} 个")

    # Names already stored in the database.
    existing_contacts = get_existing_contacts()
    print(f"数据库中已有 {len(existing_contacts)} 个联系人")

    skipped_count = 0
    batch_size = 50  # write the checkpoint every 50 images

    start_time = time.time()

    try:
        for i, path in enumerate(screenshots[processed:], start=processed):
            # At this point (i - processed) images are finished in this run
            # and (total - i) are still to do, the current one included.
            elapsed = time.time() - start_time
            remaining = _estimate_remaining(elapsed, i - processed, total - i)
            percent = (i + 1) / total * 100

            print(f"\n[{i+1}/{total}] {percent:.1f}% | 预计剩余: {remaining/60:.1f}分钟")
            print(f"  文件: {os.path.basename(path)}")

            # Guard against a None reply so the str methods below are safe.
            result = ocr_image(path) or ""

            new_in_this = 0
            for line in result.strip().split('\n'):
                line = line.strip()
                if not is_valid_contact(line):
                    continue
                cleaned = clean_contact_name(line)
                # Validate again after cleaning: a label that only differed
                # by quotes or trailing punctuation (e.g. "联系人：") must be
                # filtered too, and the cleaned name must still be long enough.
                if not is_valid_contact(cleaned):
                    continue
                if cleaned in existing_contacts:
                    skipped_count += 1
                elif cleaned not in all_new_contacts:
                    new_in_this += 1
                    all_new_contacts.add(cleaned)
                    print(f"  ✓ 新: {cleaned}")

            print(f"  本轮: +{new_in_this} | 累计新: {len(all_new_contacts)} | 跳过: {skipped_count}")

            # Record this image as done in the in-memory checkpoint.
            progress["processed"] = i + 1
            progress["total"] = total
            progress["new_contacts"] = list(all_new_contacts)

            if (i + 1) % batch_size == 0:
                save_progress(progress)
                print(f"  💾 进度已保存 ({i+1}/{total})")

            # Summary every 10 images, with timing refreshed to include the
            # image that has just been finished.
            if (i + 1) % 10 == 0:
                elapsed = time.time() - start_time
                remaining = _estimate_remaining(
                    elapsed, i - processed + 1, total - i - 1
                )
                print(f"\n{'='*60}")
                print(f"📊 进度汇总: {i+1}/{total} ({percent:.1f}%)")
                print(f"⏱️  已用时间: {elapsed/60:.1f}分钟 | 预计剩余: {remaining/60:.1f}分钟")
                print(f"👤 新联系人: {len(all_new_contacts)} | 跳过: {skipped_count}")
                print(f"{'='*60}\n")
    finally:
        # Final save; also runs when the loop is interrupted, so at most the
        # image in flight is lost instead of everything since the last batch.
        save_progress(progress)

    print(f"\n{'='*60}")
    print(f"OCR完成！")
    print(f"发现新联系人: {len(all_new_contacts)} 个")
    print(f"跳过已存在: {skipped_count} 个")

    # Insert into the database. Names restored from the checkpoint may have
    # been inserted by an earlier completed run; leave those out so a re-run
    # does not create duplicate rows.
    pending = all_new_contacts - existing_contacts
    if pending:
        added = add_new_contacts(sorted(pending, key=_contact_sort_key))
        print(f"成功入库: {added} 个")

    # Refresh the JSON export from the database.
    json_file = r"D:\夏骥\微信研究\contacts_data.json"
    contact_count = _export_contacts_json(r'D:\夏骥\微信研究\contacts.db', json_file)

    print(f"JSON数据已更新: {json_file}")
    print(f"数据库总联系人: {contact_count} 个")


# Script entry point: run the pipeline only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()