UmiOCR-data/py_src/ocr/output/output_pdf_layered.py

# 双层可搜索 searchable pdf
# https://github.com/pymupdf/PyMuPDF/discussions/2299

import os
import fitz  # PyMuPDF

from umi_log import logger
from .output import Output


class OutputPdfLayered(Output):
    def __init__(self, argd):
        self.dir = argd["outputDir"]  # 输出路径（文件夹）
        self.originPath = argd["originPath"]  # 原始文件路径
        self.fileName = argd["outputFileName"]  # 文件名
        self.password = argd["password"]  # 密码
        self.outputPath = f"{self.dir}/{self.fileName}.layered.pdf"  # 输出路径
        self.pdf = None
        self.existentPages = []  # 已处理的页数
        self.isInsertFont = False  # 是否有字体嵌入
        self.opacity = 0  # 文本透明度为0
        try:
            self.font = fitz.Font("cjk")  # 字体
        except Exception as e:
            raise Exception(f"Failed to load cjk font. {e}\n无法加载cjk字体。")
        try:
            self.pdf = self._getPDF(self.originPath)  # 加载pymupdf对象
        except Exception as e:
            raise Exception(
                f"Failed to load doc file. {e}\n无法加载文档。\n{self.originPath}"
            )

    # 获取pdf文档对象，或将其它类型的文档转为PDF对象
    def _getPDF(self, path):
        # https://github.com/pymupdf/PyMuPDF-Utilities/blob/master/examples/convert-document/convert.py
        doc = fitz.open(path)
        # 如果已加密，则尝试解密
        if doc.is_encrypted and not doc.authenticate(self.password):
            raise Exception(
                f'The document is encrypted, and the password "{self.password}" is incorrect.\n文档已加密，输入密码不正确。'
            )
        if doc.is_pdf:
            return doc
        b = doc.convert_to_pdf()  # 转换为PDF格式的二进制数据
        pdf = fitz.open("pdf", b)  # 创建PDF文档对象
        try:
            pdf.set_toc(doc.get_toc())  # 复制原始文档的目录
        except Exception:
            logger.warning("pdf.set_toc error", exc_info=True, stack_info=True)
        # 复制原始文档的元数据（如作者、标题等）
        meta = doc.metadata
        if not meta["producer"]:
            meta["producer"] = "Umi-OCR & PyMuPDF v" + fitz.VersionBind
        if not meta["creator"]:
            meta["creator"] = "Umi-OCR & PyMuPDF PDF converter"
        pdf.set_metadata(meta)
        # 复制原始文档的链接
        for pinput in doc:
            links = pinput.get_links()
            pout = pdf[pinput.number]
            for link in links:
                if link["kind"] == fitz.LINK_NAMED:  # 不处理 named links
                    continue
                pout.insert_link(link)  # 写入新文档
        doc.close()  # 释放原文档
        return pdf

    # 计算填满宽和高的一行字体大小
    def _calculateFontSize(self, text, w, h):
        if h > w:  # 竖排转为横排计算
            w, h = h, w
        fontsize = round(h)  # 字体大小初值，假设为行高
        minSize = 5  # 大小下限
        getLen = lambda text, s: self.font.text_length(text, fontsize=s)
        while getLen(text, fontsize) > w and fontsize >= minSize:
            fontsize -= 1  # 尝试减小字体，直到行宽刚好小于界限
        while getLen(text, fontsize) < w:
            fontsize += 1  # 尝试增大字体，直到行宽刚好超过界限
        while getLen(text, fontsize) > w and fontsize >= minSize:
            fontsize -= 0.1  # 再次减小字体，将精度提升到 0.1
        return fontsize

    def print(self, res):  # 输出图片结果
        if not self.pdf:
            logger.error("self.pdf 未初始化。")
            return
        pno = res["page"] - 1  # 当前页数
        self.existentPages.append(pno)  # 记录已处理的页面
        if not res["code"] == 100:
            return  # 忽略空白

        page = self.pdf[pno]  # 当前页对象
        page.clean_contents()  # 内容流清理、语法更正，减少错误
        protation = page.rotation  # 获取页面旋转角度
        isInsertFont = False  # 当前是否进行过字体注入
        # 插入文本，用shape.insert_text（可编辑）或page.insert_text（不可编辑）
        for tb in res["data"]:
            if self.opacity == 0 and "from" in tb and tb["from"] == "text":
                continue  # 双层（透明文字）模式下，跳过直接提取的文本，只写入OCR文本
            if not isInsertFont:  # 页面插入字体
                self.isInsertFont = isInsertFont = True
                page.insert_font(fontname="cjk", fontbuffer=self.font.buffer)
            text = tb["text"]
            box = tb["box"]
            x0, y0 = box[0]
            x2, y2 = box[2]
            w = x2 - x0
            h = y2 - y0
            fontsize = self._calculateFontSize(text, w, h)
            # 插入点的 旋转后的坐标
            point = fitz.Point(x0, y2) * page.derotation_matrix
            page.insert_text(
                point,
                text,
                fontsize,
                fontname="cjk",
                rotate=protation,  # 文本角度设定
                stroke_opacity=self.opacity,  # 描边透明度
                fill_opacity=self.opacity,  # 填充（字体）透明度
            )

    def onEnd(self):  # 结束时保存。
        if not self.pdf:
            return
        # 删除未处理的页数
        for i in range(len(self.pdf) - 1, -1, -1):
            if i not in self.existentPages:
                self.pdf.delete_page(i)
        logger.info(f"保存{len(self.pdf)}页PDF：{self.outputPath}")
        if self.isInsertFont:  # 有任意页面嵌入字体，则构建字体子集
            try:  # 对于部分PDF，如用txt直接打印的，构建字体子集会失败。
                self.pdf.subset_fonts()  # 构建字体子集，减小文件大小。需要 fontTools 库
            except Exception:  # TODO: 失败原因？可能文件中实际并没有字体？
                logger.error("构建字体子集失败。", exc_info=True, stack_info=True)
            # 保存：压缩并进行3级垃圾回收。等同 ez_save
            self.save(self.pdf, self.outputPath, deflate=True, garbage=3)
        else:
            # 无嵌入字体，则直接保存，不压缩
            self.save(self.pdf, self.outputPath)

    def save(self, pdf, path, **options):  # 保存并关闭 pdf 对象
        try:
            # 尝试保存到指定路径
            pdf.save(path, **options)
        except Exception:
            # 保存失败，尝试保存到 ".temp" 路径
            tempPath = self.outputPath + ".temp"
            logger.warning(f"保存PDF失败。 path: {path}", exc_info=True)
            try:
                pdf.save(tempPath, **options)
                pdf.close()
            except Exception as e1:
                logger.error(
                    f"保存PDF到临时路径失败。 tempPath: {tempPath}", exc_info=True
                )
                raise Exception(f"[Error] Unable to save PDF to [{tempPath}]: {e1}")
            # 已保存到 .temp 并 close 原对象，尝试替换文件
            try:
                if os.path.exists(path):
                    os.remove(path)
                os.rename(tempPath, path)
            except Exception as e2:
                logger.warning(
                    f"保存PDF文件替换失败。保存到临时文件: {tempPath}", exc_info=True
                )

                raise Exception(
                    f"[Warning] Unable to save PDF: [{path}]. Exception: {e2}. Saved to temporary path: [{tempPath}]."
                )
        else:  # 正常结束
            pdf.close()