docs: 添加涉密文件自检工具实施计划

2026-06-08 13:53:24 +08:00
commit 31161d9a5f
1838 changed files with 455407 additions and 0 deletions
--- a/UmiOCR-data/py_src/ocr/output/__init__.py
+++ b/UmiOCR-data/py_src/ocr/output/__init__.py
@@ -0,0 +1,35 @@
+from .output_txt import OutputTxt
+from .output_txt_plain import OutputTxtPlain
+from .output_txt_individual import OutputTxtIndividual
+from .output_md import OutputMD
+from .output_jsonl import OutputJsonl
+from .output_csv import OutputCsv
+from .output_pdf_layered import OutputPdfLayered
+from .output_pdf_one_layer import OutputPdfOneLayer
+
+# Registry of OCR outputters, keyed by output-format name.
+#
+# Every outputter is constructed with one argument dict. Not every outputter
+# reads every key:
+#     outputArgd = {
+#         "outputDir": "",  # output directory
+#         "outputDirType": "",  # "source" = directory of the original file, "specify" = given directory
+#         "outputFileName": "",  # output file name (prefix)
+#         "startDatetime": "",  # start date/time string (standard format)
+#         "ignoreBlank": True/False,  # skip blank files
+#     }
+# The PDF outputters additionally read:
+#         "originPath": "",  # path of the original document
+#         "password": "",  # password for an encrypted document
+Output = {
+    # Text outputters
+    "txt": OutputTxt,
+    "txtPlain": OutputTxtPlain,
+    "txtIndividual": OutputTxtIndividual,
+    "md": OutputMD,
+    "jsonl": OutputJsonl,
+    "csv": OutputCsv,
+    # PDF outputters; they need the extra "originPath" and "password" keys
+    "pdfLayered": OutputPdfLayered,
+    "pdfOneLayer": OutputPdfOneLayer,
+}
--- a/UmiOCR-data/py_src/ocr/output/output.py
+++ b/UmiOCR-data/py_src/ocr/output/output.py
@@ -0,0 +1,48 @@
+# Base class for OCR outputters: formats the results passed in and emits them.
+
+from .tools import getDataText
+from ...platform import Platform
+import os
+
+
+class Output:
+    """Base outputter; prints each OCR result to standard output."""
+
+    def __init__(self, argd):
+        """Store the output settings found in the argument dict ``argd``.
+
+        Keys read: "outputDir", "outputFileName", "ignoreBlank".
+        """
+        folder, stem = argd["outputDir"], argd["outputFileName"]
+        self.dir = folder  # output directory
+        self.fileName = stem  # output file name, without extension
+        self.outputPath = f"{folder}/{stem}.txt"  # path of the output file
+        self.ignoreBlank = argd["ignoreBlank"]  # skip results without text
+
+    def print(self, res):
+        """Print one result dict to standard output.
+
+        Reads ``res["code"]``, ``res["path"]`` and ``res["data"]``. When
+        ``ignoreBlank`` is truthy, every result whose code is not 100 is
+        skipped, which includes error results as well as blank images.
+        """
+        code = res["code"]
+        if code != 100 and self.ignoreBlank:
+            return
+        header = f"图片路径：{res['path']}\n代码：{code}\n"
+        if code == 100:  # success: "data" holds the text blocks
+            body = getDataText(res["data"])
+        elif code == 101:  # no text was found
+            body = "无文字"
+        else:  # any other code: "data" is shown as the error reason
+            body = f"错误原因：{res['data']}"
+        print(header + body)
+
+    def openOutputFile(self):
+        """Open the output file with ``Platform.startfile`` if it exists."""
+        target = self.outputPath
+        if target and os.path.exists(target):
+            Platform.startfile(target)
+
+    def onEnd(self):
+        """Called when output is finished; nothing to do in the base class."""
--- a/UmiOCR-data/py_src/ocr/output/output_csv.py
+++ b/UmiOCR-data/py_src/ocr/output/output_csv.py
@@ -0,0 +1,92 @@
+# Outputter that writes to a CSV table file.
+
+import csv
+
+from umi_log import logger
+from .output import Output
+from .tools import getDataText
+
+
+class OutputCsv(Output):
+    """Queue OCR results in memory and write them to one CSV file at the end."""
+
+    def __init__(self, argd):
+        """Store the output settings from ``argd`` and create the CSV file.
+
+        Keys read: "outputDir", "outputFileName", "ignoreBlank".
+
+        Raises:
+            Exception: if the output file cannot be created.
+        """
+        self.encodings = [  # candidate save encodings, in priority order
+            "ansi",  # local Windows encoding; raises on Linux and macOS
+            "ascii",  # English only
+            "gbk",  # Simplified Chinese
+            "big5",  # Traditional Chinese
+            "shift_jis",  # Japanese
+            "euc-kr",  # Korean
+            "utf-8",
+        ]
+        self.dir = argd["outputDir"]  # output directory
+        self.fileName = argd["outputFileName"]  # file name, without extension
+        self.outputPath = f"{self.dir}/{self.fileName}.csv"  # output file path
+        self.ignoreBlank = argd["ignoreBlank"]  # skip results without text
+        self.writeLists = []  # queued rows: [name, text, path]
+        self.writeText = ""  # concatenation of all queued OCR text
+        try:  # create the file now, truncating any existing one
+            with open(self.outputPath, "w", encoding="utf-8"):
+                pass
+        except Exception as e:
+            raise Exception(f"Failed to create csv file. {e}\n创建csv文件失败。") from e
+
+    def print(self, res):
+        """Queue one result dict as a CSV row (name, text, path).
+
+        When ``ignoreBlank`` is truthy, results whose code is not 100 are skipped.
+        """
+        if not res["code"] == 100 and self.ignoreBlank:
+            return
+        name = res["fileName"]
+        path = res["path"]
+        if res["code"] == 100:
+            textOut = getDataText(res["data"])  # joined text of all blocks
+        elif res["code"] == 101:
+            textOut = ""
+        else:
+            textOut = f'[Error] OCR failed. Code: {res["code"]}, Msg: {res["data"]} .\n'
+        self.writeLists.append([name, textOut, path])
+        self.writeText += textOut
+
+    def _pickEncoding(self):
+        """Return the first candidate encoding able to encode every queued cell."""
+        # Probe file names and paths as well as the OCR text: all three columns
+        # are written with the same encoding, so any of them can make the
+        # final write fail.
+        probe = "".join(str(cell) for row in self.writeLists for cell in row)
+        for e in self.encodings:
+            try:
+                probe.encode(e)
+            except Exception:
+                # Not encodable, or the codec does not exist on this platform:
+                # deliberately fall through to the next candidate.
+                continue
+            return e
+        return "utf-8"
+
+    def onEnd(self):
+        """Write the header row and all queued rows to the CSV file.
+
+        Raises:
+            Exception: if the file cannot be written.
+        """
+        encoding = self._pickEncoding()
+        logger.info(f"csv encoding: {encoding}")
+        headers = ["Name", "OCR", "Path"]
+        try:
+            # newline="" leaves line-ending handling to the csv module
+            with open(self.outputPath, "w", encoding=encoding, newline="") as f:
+                writer = csv.writer(f)
+                writer.writerow(headers)
+                writer.writerows(self.writeLists)
+        except Exception as e:
+            raise Exception(f"Failed to write csv file. {e}\n写入csv文件失败。") from e
--- a/UmiOCR-data/py_src/ocr/output/output_jsonl.py
+++ b/UmiOCR-data/py_src/ocr/output/output_jsonl.py
@@ -0,0 +1,32 @@
+# Outputter that writes to a jsonl file.
+
+from .output import Output
+
+import json
+
+
+class OutputJsonl(Output):
+    """Append every OCR result to a .jsonl file, one JSON document per line."""
+
+    def __init__(self, argd):
+        """Store the output settings from ``argd`` and create the output file.
+
+        Raises:
+            Exception: if the output file cannot be created.
+        """
+        folder, stem = argd["outputDir"], argd["outputFileName"]
+        self.dir = folder  # output directory
+        self.fileName = stem  # file name, without extension
+        self.outputPath = f"{folder}/{stem}.jsonl"  # output file path
+        self.ignoreBlank = argd["ignoreBlank"]  # stored; print() does not consult it
+        try:
+            # mode "w" creates the file or truncates an existing one
+            open(self.outputPath, "w", encoding="utf-8").close()
+        except Exception as err:
+            raise Exception(f"Failed to create jsonl file. {err}\n创建jsonl文件失败。")
+
+    def print(self, res):
+        """Append ``res`` as one line of JSON; blank results are not skipped."""
+        with open(self.outputPath, "a", encoding="utf-8") as out:
+            line = json.dumps(res, ensure_ascii=False)
+            out.write(line + "\n")
--- a/UmiOCR-data/py_src/ocr/output/output_md.py
+++ b/UmiOCR-data/py_src/ocr/output/output_md.py
@@ -0,0 +1,62 @@
+# Outputter that writes Markdown.
+
+from .output import Output
+from .tools import getDataText
+
+import os
+
+
+class OutputMD(Output):
+    """Append each OCR result to a Markdown file: image link, then quoted text."""
+
+    def __init__(self, argd):
+        """Store the output settings from ``argd`` and create the Markdown file.
+
+        Keys read: "outputDir", "outputFileName", "ignoreBlank", "startDatetime".
+
+        Raises:
+            Exception: if the output file cannot be created.
+        """
+        self.dir = argd["outputDir"]  # output directory
+        self.fileName = argd["outputFileName"]  # file name, without extension
+        self.outputPath = f"{self.dir}/{self.fileName}.md"  # output file path
+        self.ignoreBlank = argd["ignoreBlank"]  # skip results without text
+        try:
+            # create (or truncate) the file and write the start time as a quote
+            with open(self.outputPath, "w", encoding="utf-8") as f:
+                f.write(f'> {argd["startDatetime"]}\n\n')
+        except Exception as e:
+            raise Exception(f"Failed to create md file. {e}\n创建md文件失败。") from e
+
+    def print(self, res):
+        """Append one result dict to the Markdown file.
+
+        When ``ignoreBlank`` is truthy, results whose code is not 100 are skipped.
+        """
+        if not res["code"] == 100 and self.ignoreBlank:
+            return
+        name = res["fileName"]
+        try:
+            # relative path from the Markdown file's directory to the image
+            path = os.path.relpath(res["path"], os.path.dirname(self.outputPath))
+        except ValueError:
+            # On Windows relpath() raises ValueError when the two paths are on
+            # different drives; link to the path as given instead of failing.
+            path = res["path"]
+        path = path.replace(" ", "%20")  # encode spaces as %20
+        textOut = f"""
+---
+![{name}]({path})
+[{name}]({path})
+
+"""
+        if res["code"] == 100:
+            # each line of the joined text becomes its own quote line
+            texts = getDataText(res["data"]).split("\n")
+            textOut += "".join(f"> {t}  \n" for t in texts)
+        elif res["code"] == 101:
+            pass  # no text: only the image link is written
+        else:
+            textOut += f'> [Error] OCR failed. Code: {res["code"]}, Msg: {res["data"]}  \n> 【异常】OCR识别失败。  \n'
+        with open(self.outputPath, "a", encoding="utf-8") as f:
+            f.write(textOut)
--- a/UmiOCR-data/py_src/ocr/output/output_pdf_layered.py
+++ b/UmiOCR-data/py_src/ocr/output/output_pdf_layered.py
@@ -0,0 +1,217 @@
+# Two-layer searchable PDF
+# https://github.com/pymupdf/PyMuPDF/discussions/2299
+
+import os
+import fitz  # PyMuPDF
+
+from umi_log import logger
+from .output import Output
+
+
+class OutputPdfLayered(Output):
+    """Insert OCR text into the source document and save it as a new PDF.
+
+    Text is inserted with stroke and fill opacity ``self.opacity`` (0 here).
+    """
+
+    def __init__(self, argd):
+        """Store the settings from ``argd``, load the font and open the document.
+
+        Keys read: "outputDir", "originPath", "outputFileName", "password".
+
+        Raises:
+            Exception: if the cjk font or the document cannot be loaded.
+        """
+        self.dir = argd["outputDir"]  # output directory
+        self.originPath = argd["originPath"]  # path of the original document
+        self.fileName = argd["outputFileName"]  # file name, without extension
+        self.password = argd["password"]  # password for encrypted documents
+        self.outputPath = f"{self.dir}/{self.fileName}.layered.pdf"  # output path
+        self.pdf = None
+        self.existentPages = []  # 0-based numbers of the pages passed to print()
+        self.isInsertFont = False  # whether a font was embedded in any page
+        self.opacity = 0  # text opacity 0
+        try:
+            self.font = fitz.Font("cjk")
+        except Exception as e:
+            raise Exception(f"Failed to load cjk font. {e}\n无法加载cjk字体。") from e
+        try:
+            self.pdf = self._getPDF(self.originPath)  # PyMuPDF document object
+        except Exception as e:
+            raise Exception(
+                f"Failed to load doc file. {e}\n无法加载文档。\n{self.originPath}"
+            ) from e
+
+    def _getPDF(self, path):
+        """Open ``path`` and return it as a PDF document object.
+
+        A document that is not a PDF is converted; its table of contents,
+        metadata and links (except named links) are copied to the result.
+
+        Raises:
+            Exception: if the document is encrypted and the password is wrong.
+        """
+        # https://github.com/pymupdf/PyMuPDF-Utilities/blob/master/examples/convert-document/convert.py
+        doc = fitz.open(path)
+        # an encrypted document must be unlocked with the password first
+        if doc.is_encrypted and not doc.authenticate(self.password):
+            doc.close()  # do not leak a document that cannot be used
+            raise Exception(
+                f'The document is encrypted, and the password "{self.password}" is incorrect.\n文档已加密，输入密码不正确。'
+            )
+        if doc.is_pdf:
+            return doc
+        b = doc.convert_to_pdf()  # binary data in PDF format
+        pdf = fitz.open("pdf", b)  # PDF document object built from that data
+        try:
+            pdf.set_toc(doc.get_toc())  # copy the table of contents
+        except Exception:
+            logger.warning("pdf.set_toc error", exc_info=True, stack_info=True)
+        # copy the metadata (author, title, ...), filling in empty fields
+        meta = doc.metadata
+        if not meta["producer"]:
+            meta["producer"] = "Umi-OCR & PyMuPDF v" + fitz.VersionBind
+        if not meta["creator"]:
+            meta["creator"] = "Umi-OCR & PyMuPDF PDF converter"
+        pdf.set_metadata(meta)
+        # copy the links of every page
+        for pinput in doc:
+            links = pinput.get_links()
+            pout = pdf[pinput.number]
+            for link in links:
+                if link["kind"] == fitz.LINK_NAMED:  # named links are skipped
+                    continue
+                pout.insert_link(link)
+        doc.close()  # release the original document
+        return pdf
+
+    def _calculateFontSize(self, text, w, h):
+        """Return a font size at which one line of ``text`` about fills width ``w``.
+
+        ``w`` and ``h`` are the box width and height; for a box taller than
+        wide they are swapped so the longer side is used as the line width.
+        """
+        if h > w:  # vertical box: measure it as a horizontal line
+            w, h = h, w
+        fontsize = round(h)  # first guess: the line height
+        minSize = 5  # lower bound used by the shrinking loops
+
+        def getLen(s):
+            return self.font.text_length(text, fontsize=s)
+
+        if getLen(1) <= 0:
+            # Text that measures 0 (e.g. an empty string) never reaches a
+            # positive width w, so the growing loop below would not terminate.
+            return max(fontsize, minSize)
+        while getLen(fontsize) > w and fontsize >= minSize:
+            fontsize -= 1  # shrink until the line is no wider than the box
+        while getLen(fontsize) < w:
+            fontsize += 1  # grow until the line is at least as wide as the box
+        while getLen(fontsize) > w and fontsize >= minSize:
+            fontsize -= 0.1  # shrink again in steps of 0.1
+        return fontsize
+
+    def print(self, res):
+        """Insert the text blocks of one page result into that page.
+
+        ``res["page"]`` minus 1 is used as the page index. The page is recorded
+        as processed even when its code is not 100 (nothing is inserted then).
+        """
+        if not self.pdf:
+            logger.error("self.pdf 未初始化。")
+            return
+        pno = res["page"] - 1  # index of the current page
+        self.existentPages.append(pno)
+        if not res["code"] == 100:
+            return  # nothing to insert
+
+        page = self.pdf[pno]
+        page.clean_contents()  # clean and repair the content stream first
+        protation = page.rotation  # page rotation angle
+        isInsertFont = False  # whether the font was inserted into this page
+        # insert text with shape.insert_text (editable) or page.insert_text (not editable)
+        for tb in res["data"]:
+            if self.opacity == 0 and "from" in tb and tb["from"] == "text":
+                # with transparent text, blocks extracted directly from the
+                # document are skipped; only OCR text is written
+                continue
+            if not isInsertFont:  # insert the font once per page
+                self.isInsertFont = isInsertFont = True
+                page.insert_font(fontname="cjk", fontbuffer=self.font.buffer)
+            text = tb["text"]
+            box = tb["box"]
+            x0, y0 = box[0]
+            x2, y2 = box[2]
+            w = x2 - x0
+            h = y2 - y0
+            fontsize = self._calculateFontSize(text, w, h)
+            # insertion point, transformed with the page's derotation matrix
+            point = fitz.Point(x0, y2) * page.derotation_matrix
+            page.insert_text(
+                point,
+                text,
+                fontsize,
+                fontname="cjk",
+                rotate=protation,  # text angle
+                stroke_opacity=self.opacity,
+                fill_opacity=self.opacity,
+            )
+
+    def onEnd(self):
+        """Delete the pages that were never processed, then save the PDF."""
+        if not self.pdf:
+            return
+        processed = set(self.existentPages)  # set gives constant-time lookups
+        # walk backwards so deletions do not shift the indexes still to visit
+        for i in range(len(self.pdf) - 1, -1, -1):
+            if i not in processed:
+                self.pdf.delete_page(i)
+        logger.info(f"保存{len(self.pdf)}页PDF：{self.outputPath}")
+        if self.isInsertFont:  # a font was embedded: build font subsets
+            try:  # fails for some PDFs, e.g. ones printed directly from txt
+                self.pdf.subset_fonts()  # shrinks the file; needs fontTools
+            except Exception:  # TODO: cause unknown; maybe the file has no fonts?
+                logger.error("构建字体子集失败。", exc_info=True, stack_info=True)
+            # save compressed with level-3 garbage collection, like ez_save
+            self.save(self.pdf, self.outputPath, deflate=True, garbage=3)
+        else:
+            # no embedded font: save as is, without compression
+            self.save(self.pdf, self.outputPath)
+
+    def save(self, pdf, path, **options):
+        """Save ``pdf`` to ``path`` and close it.
+
+        If saving to ``path`` fails, the PDF is saved to
+        ``self.outputPath + ".temp"`` and that file is then moved over ``path``.
+
+        Raises:
+            Exception: if the fallback save or the final move fails.
+        """
+        try:
+            pdf.save(path, **options)
+        except Exception:
+            tempPath = self.outputPath + ".temp"
+            logger.warning(f"保存PDF失败。 path: {path}", exc_info=True)
+            try:
+                pdf.save(tempPath, **options)
+            except Exception as e1:
+                logger.error(
+                    f"保存PDF到临时路径失败。 tempPath: {tempPath}", exc_info=True
+                )
+                raise Exception(
+                    f"[Error] Unable to save PDF to [{tempPath}]: {e1}"
+                ) from e1
+            finally:
+                pdf.close()  # close the document whether or not the save worked
+            # the document is saved to .temp and closed: move it into place
+            try:
+                os.replace(tempPath, path)  # overwrites an existing file
+            except Exception as e2:
+                logger.warning(
+                    f"保存PDF文件替换失败。保存到临时文件: {tempPath}", exc_info=True
+                )
+                raise Exception(
+                    f"[Warning] Unable to save PDF: [{path}]. Exception: {e2}. Saved to temporary path: [{tempPath}]."
+                ) from e2
+        else:
+            pdf.close()
--- a/UmiOCR-data/py_src/ocr/output/output_pdf_one_layer.py
+++ b/UmiOCR-data/py_src/ocr/output/output_pdf_one_layer.py
@@ -0,0 +1,57 @@
+# Single-layer, text-only PDF
+
+import fitz  # PyMuPDF
+
+from umi_log import logger
+from .output_pdf_layered import OutputPdfLayered
+
+
+class OutputPdfOneLayer(OutputPdfLayered):
+    """Write the text into blank pages sized like the source pages."""
+
+    def __init__(self, argd):
+        """Initialise like the parent, then use opaque text and a ".text.pdf" path.
+
+        The parent constructor calls ``self._getPDF``, i.e. the override below.
+        """
+        super().__init__(argd)
+        self.opacity = 1  # opaque text
+        self.outputPath = f"{self.dir}/{self.fileName}.text.pdf"  # output path
+
+    def _getPDF(self, path):
+        """Return a new PDF with one blank page per page of the document at ``path``.
+
+        Metadata and, where possible, the table of contents are copied over.
+
+        Raises:
+            Exception: if the document is encrypted and the password is wrong.
+        """
+        source_doc = fitz.open(path)
+        try:
+            # an encrypted document must be unlocked with the password first
+            if source_doc.is_encrypted and not source_doc.authenticate(self.password):
+                raise Exception(
+                    f'The document is encrypted, and the password "{self.password}" is incorrect.\n文档已加密，输入密码不正确。'
+                )
+            pdf = fitz.open()  # new empty PDF document
+            # copy the metadata (author, title, ...), filling in empty fields
+            meta = source_doc.metadata
+            if not meta["producer"]:
+                meta["producer"] = "Umi-OCR & PyMuPDF v" + fitz.VersionBind
+            if not meta["creator"]:
+                meta["creator"] = "Umi-OCR & PyMuPDF PDF converter"
+            pdf.set_metadata(meta)
+            # add one blank page per source page, with the same size
+            for page in source_doc:
+                rect = page.rect
+                pdf.new_page(width=rect.width, height=rect.height)
+            # copying the table of contents is best effort
+            try:
+                pdf.set_toc(source_doc.get_toc())
+            except Exception:
+                logger.warning(
+                    f"pdf.set_toc error. path: {path}", exc_info=True, stack_info=True
+                )
+            return pdf
+        finally:
+            source_doc.close()  # always release the source, also on errors
--- a/UmiOCR-data/py_src/ocr/output/output_txt.py
+++ b/UmiOCR-data/py_src/ocr/output/output_txt.py
@@ -0,0 +1,46 @@
+# Outputter that writes to a txt file.
+
+from .output import Output
+from .tools import getDataText
+
+
+class OutputTxt(Output):
+    """Append each OCR result to one txt file, under a "≦ file name ≧" heading."""
+
+    def __init__(self, argd):
+        """Store the output settings from ``argd`` and start the txt file.
+
+        Raises:
+            Exception: if the output file cannot be created.
+        """
+        folder, stem = argd["outputDir"], argd["outputFileName"]
+        self.dir = folder  # output directory
+        self.fileName = stem  # file name, without extension
+        self.outputPath = f"{folder}/{stem}.txt"  # output file path
+        self.ignoreBlank = argd["ignoreBlank"]  # skip results without text
+        try:
+            # mode "w" creates or truncates; the first line is the start time
+            with open(self.outputPath, "w", encoding="utf-8") as out:
+                out.write(f'{argd["startDatetime"]}\n\n')
+        except Exception as err:
+            raise Exception(f"Failed to create txt file. {err}\n创建txt文件失败。")
+
+    def print(self, res):
+        """Append one result dict to the txt file.
+
+        When ``ignoreBlank`` is truthy, results whose code is not 100 are skipped.
+        """
+        code = res["code"]
+        if code != 100 and self.ignoreBlank:
+            return
+        parts = [f'≦ {res["fileName"]} ≧\n']
+        if code == 100:
+            parts.append(getDataText(res["data"]))
+            parts.append("\n")  # extra newline after the text
+        elif code != 101:  # code 101 adds nothing below the heading
+            parts.append(
+                f'[Error] OCR failed. Code: {code}, Msg: {res["data"]}\n【异常】OCR识别失败。\n'
+            )
+        parts.append("\n")  # blank line between entries
+        with open(self.outputPath, "a", encoding="utf-8") as out:
+            out.write("".join(parts))
--- a/UmiOCR-data/py_src/ocr/output/output_txt_individual.py
+++ b/UmiOCR-data/py_src/ocr/output/output_txt_individual.py
@@ -0,0 +1,43 @@
+# One txt file per input file
+
+import os
+from .output import Output
+from .tools import getDataText
+
+
+class OutputTxtIndividual(Output):
+    """Write the text of every OCR result to its own txt file."""
+
+    def __init__(self, argd):
+        """Initialise like the parent and note whether to write next to the source."""
+        super().__init__(argd)
+        wanted = argd["outputDirType"]
+        self.outputSource = wanted == "source"  # True: write beside the original
+
+    def openOutputFile(self):
+        """Do nothing; overrides the parent method."""
+
+    def print(self, res):
+        """Write one result dict to a txt file named after the input file.
+
+        When ``ignoreBlank`` is truthy, results whose code is not 100 are skipped.
+        """
+        code = res["code"]
+        if code != 100 and self.ignoreBlank:
+            return
+        if code == 100:
+            content = "" + getDataText(res["data"])
+        elif code == 101:
+            content = ""
+        else:
+            content = f'[Error] OCR failed. Code: {code}, Msg: {res["data"]}\n【异常】OCR识别失败。\n'
+        if self.outputSource:
+            # same directory and base name as the original file
+            stem = os.path.splitext(res["path"])[0]
+            target = stem + ".txt"
+        else:
+            # base name of the original file, inside the output directory
+            stem = os.path.splitext(res["fileName"])[0]
+            target = f"{self.dir}/{stem}.txt"
+        with open(target, "w", encoding="utf-8") as out:  # "w" overwrites
+            out.write(content)
--- a/UmiOCR-data/py_src/ocr/output/output_txt_plain.py
+++ b/UmiOCR-data/py_src/ocr/output/output_txt_plain.py
@@ -0,0 +1,39 @@
+# Plain (unformatted) txt file
+
+from .output import Output
+from .tools import getDataText
+
+
+class OutputTxtPlain(Output):
+    """Append only the recognised text of each result to a ".p.txt" file."""
+
+    def __init__(self, argd):
+        """Store the output settings from ``argd`` and create the output file.
+
+        Keys read: "outputDir", "outputFileName".
+
+        Raises:
+            Exception: if the output file cannot be created.
+        """
+        self.dir = argd["outputDir"]  # output directory
+        self.fileName = argd["outputFileName"]  # file name, without extension
+        self.outputPath = f"{self.dir}/{self.fileName}.p.txt"  # output file path
+        try:
+            # create the file, truncating any existing one
+            with open(self.outputPath, "w", encoding="utf-8"):
+                pass
+        except Exception as e:
+            raise Exception(
+                f"Failed to create plain txt file. {e}\n创建纯文本txt文件失败。"
+            ) from e
+
+    def print(self, res):
+        """Append the text of ``res``; results whose code is not 100 are skipped."""
+        if not res["code"] == 100:
+            return  # blank and failed results are always ignored here
+        textOut = getDataText(res["data"])
+        # endswith() is safe for empty text, where textOut[-1] would raise
+        if not textOut.endswith("\n"):
+            textOut += "\n"
+        with open(self.outputPath, "a", encoding="utf-8") as f:
+            f.write(textOut)
--- a/UmiOCR-data/py_src/ocr/output/tools.py
+++ b/UmiOCR-data/py_src/ocr/output/tools.py
@@ -0,0 +1,14 @@
+# Extract and join the text found in data
+def getDataText(data):
+    """Return the "text" of all blocks in ``data`` joined into one string.
+
+    After each block except the last, that block's "end" string is added;
+    the "end" of the last block is not read.
+    """
+    pieces = []
+    lastIndex = len(data) - 1
+    for idx, block in enumerate(data):
+        pieces.append(block["text"])
+        if idx < lastIndex:
+            pieces.append(block["end"])
+    return "".join(pieces)