docs: 添加涉密文件自检工具实施计划

2026-06-08 13:53:24 +08:00
commit 31161d9a5f
1838 changed files with 455407 additions and 0 deletions
--- a/UmiOCR-data/py_src/ocr/api/init.py
+++ b/UmiOCR-data/py_src/ocr/api/init.py
@@ -0,0 +1,42 @@
+# ===============================================
+# =============== OCR 插件接口管理 ===============
+# ===============================================
+
+from umi_log import logger
+
+# Registry: plugin key -> the plugin's "api_class" entry (filled by initOcrPlugins).
+ApiDict = {}
+# Registry: plugin key -> the whole plugin entry (filled by initOcrPlugins).
+AllDict = {}
+
+
+# TODO: static plugins
+# Called by the plugin controller to initialise the OCR plugin interfaces.
+# Receives the dynamic plugins.
+def initOcrPlugins(plugins):
+    """Register every entry of ``plugins`` in the module-level registries.
+
+    For each key, ``ApiDict`` receives the entry's ``"api_class"`` value and
+    ``AllDict`` receives the entry itself. Existing keys are overwritten.
+    """
+    global ApiDict, AllDict
+    for key, info in plugins.items():
+        ApiDict[key] = info["api_class"]
+        AllDict[key] = info
+
+
+# Create an OCR api instance. Returns the object on success, or a string
+# starting with "[Error]" on failure.
+def getApiOcr(apiKey, argd):
+    """Instantiate the api class registered under ``apiKey`` with ``argd``.
+
+    ``argd`` is modified in place first: float values that lie within 1e-7 of
+    an integer are replaced by that integer.
+    """
+    # Inspect argd and restore int values that arrived as floats
+    for name in argd:
+        value = argd[name]
+        if not isinstance(value, float):
+            continue
+        nearest = round(value)
+        if abs(value - nearest) <= 1e-7:
+            argd[name] = nearest
+    if apiKey not in ApiDict:
+        return f'[Error] "{apiKey}" not in ApiDict.'
+    try:
+        instance = ApiDict[apiKey](argd)  # instantiate
+    except Exception as e:
+        logger.error(f"生成api实例{apiKey}失败。", exc_info=True, stack_info=True)
+        return f"[Error] Failed to generate API instance {apiKey}: {e}"
+    return instance
+
+
+# Return the local options dict of one API
+def getLocalOptions(apiKey):
+    """Return the ``"local_options"`` entry of ``apiKey``, or ``{}`` if unknown."""
+    return AllDict[apiKey]["local_options"] if apiKey in AllDict else {}
--- a/UmiOCR-data/py_src/ocr/output/init.py
+++ b/UmiOCR-data/py_src/ocr/output/init.py
@@ -0,0 +1,30 @@
+from .output_txt import OutputTxt
+from .output_txt_plain import OutputTxtPlain
+from .output_txt_individual import OutputTxtIndividual
+from .output_md import OutputMD
+from .output_jsonl import OutputJsonl
+from .output_csv import OutputCsv
+from .output_pdf_layered import OutputPdfLayered
+from .output_pdf_one_layer import OutputPdfOneLayer
+
+"""纯文本输出器。初始化传入参数字典：
+    outputArgd = {
+        "outputDir": "",  # 输出路径
+        "outputDirType": "",  # 输出目录类型，"source" 为原文件目录，"specify"为指定目录
+        "outputFileName": "",  # 输出文件名（前缀）
+        "startDatetime": "",  # 开始日期字符串（标准格式）
+        "ignoreBlank": True/False,  # 忽略空白文件
+    }
+"""
+# Registry of output classes, keyed by output format name.
+Output = {
+    # Plain-text outputs
+    "txt": OutputTxt,
+    "txtPlain": OutputTxtPlain,
+    "txtIndividual": OutputTxtIndividual,
+    "md": OutputMD,
+    "jsonl": OutputJsonl,
+    "csv": OutputCsv,
+    # PDF outputs; they need the extra argument "originPath" (original file path)
+    "pdfLayered": OutputPdfLayered,
+    "pdfOneLayer": OutputPdfOneLayer,
+}
--- a/UmiOCR-data/py_src/ocr/output/output.py
+++ b/UmiOCR-data/py_src/ocr/output/output.py
@@ -0,0 +1,32 @@
+# OCR输出器的基类。按指定的格式，将传入的文本输出到指定地方。
+
+from .tools import getDataText
+from ...platform import Platform
+import os
+
+
+class Output:
+    """Base class of the OCR outputs; this default one prints to stdout."""
+
+    def __init__(self, argd):
+        """Read the output settings from the ``argd`` dict."""
+        self.dir = argd["outputDir"]  # output directory
+        self.fileName = argd["outputFileName"]  # file name (without extension)
+        self.ignoreBlank = argd["ignoreBlank"]  # skip results that are not code 100
+        self.outputPath = f"{self.dir}/{self.fileName}.txt"  # output file path
+
+    def print(self, res):
+        """Print the path, code and text (or failure reason) of one result."""
+        code = res["code"]
+        if code != 100 and self.ignoreBlank:
+            return  # ignore blank images
+        header = f"图片路径：{res['path']}\n代码：{code}\n"
+        if code == 100:
+            body = getDataText(res["data"])  # joined recognised text
+        elif code == 101:
+            body = "无文字"
+        else:
+            body = f"错误原因：{res['data']}"
+        print(header + body)
+
+    def openOutputFile(self):
+        """Open the output file through ``Platform.startfile`` if it exists."""
+        if not self.outputPath or not os.path.exists(self.outputPath):
+            return
+        Platform.startfile(self.outputPath)
+
+    def onEnd(self):
+        """Hook called when output is finished; nothing to do here."""
+        pass
--- a/UmiOCR-data/py_src/ocr/output/output_csv.py
+++ b/UmiOCR-data/py_src/ocr/output/output_csv.py
@@ -0,0 +1,70 @@
+# 输出到csv表格文件
+
+import csv
+
+from umi_log import logger
+from .output import Output
+from .tools import getDataText
+
+
+class OutputCsv(Output):
+    """Output that buffers OCR results and writes them to a csv file.
+
+    ``print`` only collects rows; ``onEnd`` writes the file with the first
+    encoding of ``self.encodings`` able to represent everything to be written.
+    """
+
+    def __init__(self, argd):
+        """Read the output settings from ``argd`` and create an empty csv file.
+
+        Raises:
+            Exception: if the csv file cannot be created.
+        """
+        self.encodings = [  # candidate encodings, in priority order
+            "ansi",  # local encoding on Windows; raises on linux and macos
+            "ascii",  # plain English
+            "gbk",  # Simplified Chinese
+            "big5",  # Traditional Chinese
+            "shift_jis",  # Japanese
+            "euc-kr",  # Korean
+            "utf-8",
+        ]
+        self.dir = argd["outputDir"]  # output directory
+        self.fileName = argd["outputFileName"]  # file name
+        self.outputPath = f"{self.dir}/{self.fileName}.csv"  # output file path
+        self.ignoreBlank = argd["ignoreBlank"]  # skip results that are not code 100
+        self.writeLists = []  # buffered rows: [name, text, path]
+        self.writeText = ""  # all buffered OCR text, concatenated
+        try:  # create (or truncate) the file up front
+            with open(self.outputPath, "w", encoding="utf-8") as f:
+                pass
+        except Exception as e:
+            raise Exception(
+                f"Failed to create csv file. {e}\n创建csv文件失败。"
+            ) from e
+
+    def print(self, res):
+        """Buffer one OCR result as a csv row."""
+        if not res["code"] == 100 and self.ignoreBlank:
+            return  # ignore blank images
+        name = res["fileName"]
+        path = res["path"]
+        if res["code"] == 100:
+            textOut = getDataText(res["data"])  # joined recognised text
+        elif res["code"] == 101:
+            textOut = ""
+        else:
+            textOut = f'[Error] OCR failed. Code: {res["code"]}, Msg: {res["data"]} .\n'
+        self.writeLists.append([name, textOut, path])
+        self.writeText += textOut
+
+    def onEnd(self):
+        """Pick an encoding and write all buffered rows to the csv file.
+
+        Raises:
+            Exception: if the csv file cannot be written.
+        """
+        # Probe with every cell that will be written. Checking the OCR text
+        # alone would accept an encoding that cannot represent a file name or
+        # path, and the write below would then fail.
+        probe = "".join(str(cell) for row in self.writeLists for cell in row)
+        encoding = "utf-8"
+        for candidate in self.encodings:
+            try:
+                probe.encode(candidate)
+            except (LookupError, UnicodeError):
+                # LookupError: codec unknown on this platform (e.g. "ansi");
+                # UnicodeError: the codec cannot represent the text.
+                continue
+            encoding = candidate
+            break
+        logger.info(f"csv encoding: {encoding}")
+        headers = ["Name", "OCR", "Path"]  # header row
+        try:
+            with open(
+                self.outputPath, "w", encoding=encoding, newline=""
+            ) as f:  # overwrite the file
+                writer = csv.writer(f)
+                writer.writerow(headers)
+                writer.writerows(self.writeLists)
+        except Exception as e:
+            raise Exception(
+                f"Failed to write csv file. {e}\n写入csv文件失败。"
+            ) from e
--- a/UmiOCR-data/py_src/ocr/output/output_jsonl.py
+++ b/UmiOCR-data/py_src/ocr/output/output_jsonl.py
@@ -0,0 +1,24 @@
+# 输出到jsonl文件
+
+from .output import Output
+
+import json
+
+
+class OutputJsonl(Output):
+    """Output that appends every OCR result as one JSON line."""
+
+    def __init__(self, argd):
+        """Read the output settings from ``argd`` and create an empty jsonl file.
+
+        Raises:
+            Exception: if the jsonl file cannot be created.
+        """
+        self.dir = argd["outputDir"]  # output directory
+        self.fileName = argd["outputFileName"]  # file name
+        self.ignoreBlank = argd["ignoreBlank"]  # stored, but print() keeps every entry
+        self.outputPath = f"{self.dir}/{self.fileName}.jsonl"  # output file path
+        try:
+            # Create (or truncate) the output file
+            open(self.outputPath, "w", encoding="utf-8").close()
+        except Exception as e:
+            raise Exception(f"Failed to create jsonl file. {e}\n创建jsonl文件失败。")
+
+    def print(self, res):
+        """Append ``res`` to the file as one JSON line; blank entries are kept."""
+        with open(self.outputPath, "a", encoding="utf-8") as out:
+            out.write(f"{json.dumps(res, ensure_ascii=False)}\n")
--- a/UmiOCR-data/py_src/ocr/output/output_md.py
+++ b/UmiOCR-data/py_src/ocr/output/output_md.py
@@ -0,0 +1,46 @@
+# 输出markdown格式
+
+from .output import Output
+from .tools import getDataText
+
+import os
+
+
+class OutputMD(Output):
+    """Output that appends OCR results to a markdown file."""
+
+    def __init__(self, argd):
+        """Read the output settings from ``argd`` and create the md file.
+
+        The file starts with the ``"startDatetime"`` value as a quote line.
+
+        Raises:
+            Exception: if the md file cannot be created.
+        """
+        self.dir = argd["outputDir"]  # output directory
+        self.fileName = argd["outputFileName"]  # file name
+        self.outputPath = f"{self.dir}/{self.fileName}.md"  # output file path
+        self.ignoreBlank = argd["ignoreBlank"]  # skip results that are not code 100
+        try:
+            with open(self.outputPath, "w", encoding="utf-8") as f:  # overwrite
+                f.write(f'> {argd["startDatetime"]}\n\n')
+        except Exception as e:
+            # The message names the md file (it used to say "jsonl").
+            raise Exception(
+                f"Failed to create md file. {e}\n创建md文件失败。"
+            ) from e
+
+    def print(self, res):
+        """Append one OCR result: image, link, then the quoted text."""
+        if not res["code"] == 100 and self.ignoreBlank:
+            return  # ignore blank images
+        name = res["fileName"]
+        try:
+            # Path of the image relative to the md file's directory
+            path = os.path.relpath(res["path"], os.path.dirname(self.outputPath))
+        except ValueError:
+            # No relative path exists (on Windows: the image and the md file
+            # are on different drives); fall back to the path as given.
+            path = res["path"]
+        path = path.replace(" ", "%20")  # spaces -> %20
+        textOut = f"\n---\n![{name}]({path})\n[{name}]({path})\n\n"
+        # Body
+        if res["code"] == 100:
+            for t in getDataText(res["data"]).split("\n"):
+                textOut += f"> {t}  \n"
+        elif res["code"] == 101:
+            pass
+        else:
+            textOut += f'> [Error] OCR failed. Code: {res["code"]}, Msg: {res["data"]}  \n> 【异常】OCR识别失败。  \n'
+        with open(self.outputPath, "a", encoding="utf-8") as f:  # append
+            f.write(textOut)
--- a/UmiOCR-data/py_src/ocr/output/output_pdf_layered.py
+++ b/UmiOCR-data/py_src/ocr/output/output_pdf_layered.py
@@ -0,0 +1,171 @@
+# 双层可搜索 searchable pdf
+# https://github.com/pymupdf/PyMuPDF/discussions/2299
+
+import os
+import fitz  # PyMuPDF
+
+from umi_log import logger
+from .output import Output
+
+
+class OutputPdfLayered(Output):
+    """Output that writes OCR text into the source document as a PDF.
+
+    With ``self.opacity == 0`` the inserted text is transparent, giving a
+    two-layer (searchable) PDF. Pages never passed to ``print`` are removed
+    when the file is saved in ``onEnd``.
+    """
+
+    def __init__(self, argd):
+        """Read the settings from ``argd``, load the font and the document.
+
+        Raises:
+            Exception: if the cjk font or the source document cannot be loaded.
+        """
+        self.dir = argd["outputDir"]  # output directory
+        self.originPath = argd["originPath"]  # path of the source document
+        self.fileName = argd["outputFileName"]  # file name
+        self.password = argd["password"]  # document password
+        self.outputPath = f"{self.dir}/{self.fileName}.layered.pdf"  # output path
+        self.pdf = None
+        self.existentPages = []  # page numbers (0-based) already processed
+        self.isInsertFont = False  # whether any page got the font embedded
+        self.opacity = 0  # text opacity; 0 = transparent
+        try:
+            self.font = fitz.Font("cjk")  # font used for inserted text
+        except Exception as e:
+            raise Exception(f"Failed to load cjk font. {e}\n无法加载cjk字体。") from e
+        try:
+            self.pdf = self._getPDF(self.originPath)  # PyMuPDF document object
+        except Exception as e:
+            raise Exception(
+                f"Failed to load doc file. {e}\n无法加载文档。\n{self.originPath}"
+            ) from e
+
+    # Get the pdf document object, or convert another document type to PDF
+    def _getPDF(self, path):
+        """Open ``path`` and return it as a PDF document object.
+
+        A non-PDF document is converted; its table of contents, metadata and
+        links (except named links) are copied to the converted document.
+
+        Raises:
+            Exception: if the document is encrypted and the password is wrong.
+        """
+        # https://github.com/pymupdf/PyMuPDF-Utilities/blob/master/examples/convert-document/convert.py
+        doc = fitz.open(path)
+        # If encrypted, try to decrypt
+        if doc.is_encrypted and not doc.authenticate(self.password):
+            raise Exception(
+                f'The document is encrypted, and the password "{self.password}" is incorrect.\n文档已加密，输入密码不正确。'
+            )
+        if doc.is_pdf:
+            return doc
+        b = doc.convert_to_pdf()  # binary data in PDF format
+        pdf = fitz.open("pdf", b)  # PDF document object
+        try:
+            pdf.set_toc(doc.get_toc())  # copy the table of contents
+        except Exception:
+            logger.warning("pdf.set_toc error", exc_info=True, stack_info=True)
+        # Copy the metadata (author, title, ...), filling in empty fields
+        meta = doc.metadata
+        if not meta["producer"]:
+            meta["producer"] = "Umi-OCR & PyMuPDF v" + fitz.VersionBind
+        if not meta["creator"]:
+            meta["creator"] = "Umi-OCR & PyMuPDF PDF converter"
+        pdf.set_metadata(meta)
+        # Copy the links
+        for pinput in doc:
+            links = pinput.get_links()
+            pout = pdf[pinput.number]
+            for link in links:
+                if link["kind"] == fitz.LINK_NAMED:  # named links are not handled
+                    continue
+                pout.insert_link(link)
+        doc.close()  # release the source document
+        return pdf
+
+    # Compute the font size with which one line fills the width and height
+    def _calculateFontSize(self, text, w, h):
+        """Return a font size at which ``text`` is about ``w`` wide.
+
+        The larger of ``w`` and ``h`` is used as the line width, so vertical
+        boxes are treated as horizontal ones. The search starts at the rounded
+        line height and refines the result in steps of 0.1.
+        """
+        if h > w:  # treat a vertical box as a horizontal one
+            w, h = h, w
+        fontsize = round(h)  # initial guess: the line height
+        minSize = 5  # lower bound of the search
+        getLen = lambda text, s: self.font.text_length(text, fontsize=s)
+        if getLen(text, 1) <= 0:
+            # Zero rendered width (e.g. empty text): no size ever reaches w,
+            # so the growing loop below would never end.
+            return max(fontsize, minSize)
+        while getLen(text, fontsize) > w and fontsize >= minSize:
+            fontsize -= 1  # shrink until the line is just below the limit
+        while getLen(text, fontsize) < w:
+            fontsize += 1  # grow until the line just reaches the limit
+        while getLen(text, fontsize) > w and fontsize >= minSize:
+            fontsize -= 0.1  # shrink again, with a precision of 0.1
+        return fontsize
+
+    def print(self, res):
+        """Insert the text blocks of one page result into the document."""
+        if not self.pdf:
+            logger.error("self.pdf 未初始化。")
+            return
+        pno = res["page"] - 1  # 0-based page number
+        self.existentPages.append(pno)  # remember the page as processed
+        if not res["code"] == 100:
+            return  # nothing to insert
+
+        page = self.pdf[pno]  # page object
+        page.clean_contents()  # clean the content stream to reduce errors
+        protation = page.rotation  # page rotation angle
+        isInsertFont = False  # whether the font was inserted into this page
+        # Insert the text with page.insert_text
+        for tb in res["data"]:
+            if self.opacity == 0 and "from" in tb and tb["from"] == "text":
+                # Transparent (two-layer) mode: skip text extracted directly
+                # from the document, write OCR text only
+                continue
+            if not isInsertFont:  # insert the font once per page
+                self.isInsertFont = isInsertFont = True
+                page.insert_font(fontname="cjk", fontbuffer=self.font.buffer)
+            text = tb["text"]
+            box = tb["box"]
+            x0, y0 = box[0]
+            x2, y2 = box[2]
+            w = x2 - x0
+            h = y2 - y0
+            fontsize = self._calculateFontSize(text, w, h)
+            # Insertion point, mapped through the page's derotation matrix
+            point = fitz.Point(x0, y2) * page.derotation_matrix
+            page.insert_text(
+                point,
+                text,
+                fontsize,
+                fontname="cjk",
+                rotate=protation,  # text angle
+                stroke_opacity=self.opacity,  # stroke opacity
+                fill_opacity=self.opacity,  # fill (glyph) opacity
+            )
+
+    def onEnd(self):
+        """Delete the unprocessed pages, then save and close the document."""
+        if not self.pdf:
+            return
+        # Delete pages that were never processed, from the back so that the
+        # remaining page numbers stay valid
+        processed = set(self.existentPages)
+        for i in range(len(self.pdf) - 1, -1, -1):
+            if i not in processed:
+                self.pdf.delete_page(i)
+        logger.info(f"保存{len(self.pdf)}页PDF：{self.outputPath}")
+        if self.isInsertFont:  # a font was embedded: build a font subset
+            try:  # fails for some PDFs, e.g. ones printed directly from txt
+                self.pdf.subset_fonts()  # reduces file size; needs fontTools
+            except Exception:  # TODO: cause unknown; maybe no font in the file?
+                logger.error("构建字体子集失败。", exc_info=True, stack_info=True)
+            # Save compressed, with level 3 garbage collection (like ez_save)
+            self.save(self.pdf, self.outputPath, deflate=True, garbage=3)
+        else:
+            # No embedded font: save as is, without compression
+            self.save(self.pdf, self.outputPath)
+
+    def save(self, pdf, path, **options):
+        """Save ``pdf`` to ``path`` and close it.
+
+        If saving to ``path`` fails, the document is saved to
+        ``self.outputPath + ".temp"`` and that file is then moved to ``path``.
+
+        Raises:
+            Exception: if the temporary save fails, or if the temporary file
+                cannot replace ``path`` (the temporary file is then kept).
+        """
+        try:
+            pdf.save(path, **options)
+        except Exception:
+            # Saving failed: try the ".temp" path
+            tempPath = self.outputPath + ".temp"
+            logger.warning(f"保存PDF失败。 path: {path}", exc_info=True)
+            try:
+                pdf.save(tempPath, **options)
+                pdf.close()
+            except Exception as e1:
+                logger.error(
+                    f"保存PDF到临时路径失败。 tempPath: {tempPath}", exc_info=True
+                )
+                raise Exception(
+                    f"[Error] Unable to save PDF to [{tempPath}]: {e1}"
+                ) from e1
+            # Saved to .temp and closed; now try to replace the target file
+            try:
+                if os.path.exists(path):
+                    os.remove(path)
+                os.rename(tempPath, path)
+            except Exception as e2:
+                logger.warning(
+                    f"保存PDF文件替换失败。保存到临时文件: {tempPath}", exc_info=True
+                )
+
+                raise Exception(
+                    f"[Warning] Unable to save PDF: [{path}]. Exception: {e2}. Saved to temporary path: [{tempPath}]."
+                ) from e2
+        else:  # normal end
+            pdf.close()
--- a/UmiOCR-data/py_src/ocr/output/output_pdf_one_layer.py
+++ b/UmiOCR-data/py_src/ocr/output/output_pdf_one_layer.py
@@ -0,0 +1,43 @@
+# 单层纯文本 PDF
+
+import fitz  # PyMuPDF
+
+from umi_log import logger
+from .output_pdf_layered import OutputPdfLayered
+
+
+class OutputPdfOneLayer(OutputPdfLayered):
+    """Single-layer PDF output: opaque text written on blank pages."""
+
+    def __init__(self, argd):
+        super().__init__(argd)
+        # Replace the values set by the parent constructor
+        self.outputPath = f"{self.dir}/{self.fileName}.text.pdf"  # output path
+        self.opacity = 1  # opaque text
+
+    # Create a blank PDF
+    def _getPDF(self, path):
+        """Return a new PDF with one blank page per page of the document at ``path``.
+
+        Metadata and, if possible, the table of contents are copied over.
+
+        Raises:
+            Exception: if the document is encrypted and the password is wrong.
+        """
+        origin = fitz.open(path)  # source document
+        # If encrypted, try to decrypt
+        if origin.is_encrypted:
+            if not origin.authenticate(self.password):
+                raise Exception(
+                    f'The document is encrypted, and the password "{self.password}" is incorrect.\n文档已加密，输入密码不正确。'
+                )
+        blank = fitz.open()  # new, empty PDF document
+        # Copy the metadata (author, title, ...), filling in empty fields
+        info = origin.metadata
+        if not info["producer"]:
+            info["producer"] = "Umi-OCR & PyMuPDF v" + fitz.VersionBind
+        if not info["creator"]:
+            info["creator"] = "Umi-OCR & PyMuPDF PDF converter"
+        blank.set_metadata(info)
+        # One blank page per source page, with the same size
+        for src_page in origin:
+            size = src_page.rect
+            blank.new_page(width=size.width, height=size.height)
+        # Try to copy the table of contents
+        try:
+            blank.set_toc(origin.get_toc())
+        except Exception:
+            logger.warning(
+                f"pdf.set_toc error. path: {path}", exc_info=True, stack_info=True
+            )
+        origin.close()  # release the source document
+        return blank
--- a/UmiOCR-data/py_src/ocr/output/output_txt.py
+++ b/UmiOCR-data/py_src/ocr/output/output_txt.py
@@ -0,0 +1,33 @@
+# 输出到txt文件
+
+from .output import Output
+from .tools import getDataText
+
+
+class OutputTxt(Output):
+    """Output that appends OCR results to one txt file."""
+
+    def __init__(self, argd):
+        """Read the output settings from ``argd`` and create the txt file.
+
+        The file starts with the ``"startDatetime"`` value.
+
+        Raises:
+            Exception: if the txt file cannot be created.
+        """
+        self.dir = argd["outputDir"]  # output directory
+        self.fileName = argd["outputFileName"]  # file name
+        self.ignoreBlank = argd["ignoreBlank"]  # skip results that are not code 100
+        self.outputPath = f"{self.dir}/{self.fileName}.txt"  # output file path
+        try:
+            with open(self.outputPath, "w", encoding="utf-8") as out:  # overwrite
+                out.write(f'{argd["startDatetime"]}\n\n')
+        except Exception as e:
+            raise Exception(f"Failed to create txt file. {e}\n创建txt文件失败。")
+
+    def print(self, res):
+        """Append one OCR result: a file-name header, then text or error."""
+        code = res["code"]
+        if code != 100 and self.ignoreBlank:
+            return  # ignore blank images
+        pieces = [f'≦ {res["fileName"]} ≧\n']
+        if code == 100:
+            pieces.append(getDataText(res["data"]))  # joined recognised text
+            pieces.append("\n")  # extra line break after the text
+        elif code != 101:
+            pieces.append(
+                f'[Error] OCR failed. Code: {res["code"]}, Msg: {res["data"]}\n【异常】OCR识别失败。\n'
+            )
+        pieces.append("\n")  # one more empty line between entries
+        with open(self.outputPath, "a", encoding="utf-8") as out:  # append
+            out.write("".join(pieces))
--- a/UmiOCR-data/py_src/ocr/output/output_txt_individual.py
+++ b/UmiOCR-data/py_src/ocr/output/output_txt_individual.py
@@ -0,0 +1,35 @@
+# 单独txt文件
+
+import os
+from .output import Output
+from .tools import getDataText
+
+
+class OutputTxtIndividual(Output):
+    """Output that writes one txt file per OCR result."""
+
+    def __init__(self, argd):
+        super().__init__(argd)
+        # True: write next to the source file; False: write into self.dir
+        self.outputSource = argd["outputDirType"] == "source"
+
+    def openOutputFile(self):
+        """Do nothing; overrides the parent method."""
+        pass
+
+    def print(self, res):
+        """Write the text (or error message) of one result to its own txt file."""
+        code = res["code"]
+        if code != 100 and self.ignoreBlank:
+            return  # ignore blank images
+        if code == 100:
+            textOut = getDataText(res["data"])  # joined recognised text
+        elif code == 101:
+            textOut = ""
+        else:
+            textOut = f'[Error] OCR failed. Code: {res["code"]}, Msg: {res["data"]}\n【异常】OCR识别失败。\n'
+        # Build the target path from the source name without its extension
+        if self.outputSource:
+            stem = os.path.splitext(res["path"])[0]
+            path = stem + ".txt"
+        else:
+            stem = os.path.splitext(res["fileName"])[0]
+            path = f"{self.dir}/{stem}.txt"
+        with open(path, "w", encoding="utf-8") as out:  # overwrite the file
+            out.write(textOut)
--- a/UmiOCR-data/py_src/ocr/output/output_txt_plain.py
+++ b/UmiOCR-data/py_src/ocr/output/output_txt_plain.py
@@ -0,0 +1,29 @@
+# 纯文本（无格式）txt文件
+
+from .output import Output
+from .tools import getDataText
+
+
+class OutputTxtPlain(Output):
+    """Output that appends only the recognised text to a ``.p.txt`` file."""
+
+    def __init__(self, argd):
+        """Read the output settings from ``argd`` and create an empty file.
+
+        Raises:
+            Exception: if the file cannot be created.
+        """
+        self.dir = argd["outputDir"]  # output directory
+        self.fileName = argd["outputFileName"]  # file name
+        self.outputPath = f"{self.dir}/{self.fileName}.p.txt"  # output file path
+        try:
+            open(self.outputPath, "w").close()  # create (or truncate) the file
+        except Exception as e:
+            raise Exception(
+                f"Failed to create plain txt file. {e}\n创建纯文本txt文件失败。"
+            ) from e
+
+    def print(self, res):
+        """Append the text of one result; results that are not code 100 are skipped."""
+        if not res["code"] == 100:
+            return  # blank images are always ignored
+        textOut = getDataText(res["data"])  # joined recognised text
+        # Make sure the entry ends with a line break. endswith() also copes
+        # with empty text, where indexing the last character would fail.
+        if not textOut.endswith("\n"):
+            textOut += "\n"
+        with open(self.outputPath, "a", encoding="utf-8") as f:  # append
+            f.write(textOut)
--- a/UmiOCR-data/py_src/ocr/output/tools.py
+++ b/UmiOCR-data/py_src/ocr/output/tools.py
@@ -0,0 +1,9 @@
+# Extract and join the text of data
+def getDataText(data):
+    """Join the ``"text"`` of every block, separated by the blocks' ``"end"``.
+
+    The ``"end"`` of the last block is not appended (and not read).
+    """
+    last = len(data) - 1
+    pieces = []
+    for idx, block in enumerate(data):
+        pieces.append(block["text"])
+        if idx != last:
+            pieces.append(block["end"])
+    return "".join(pieces)
--- a/UmiOCR-data/py_src/ocr/tbpu/init.py
+++ b/UmiOCR-data/py_src/ocr/tbpu/init.py
@@ -0,0 +1,32 @@
+# tbpu : text block processing unit 文本块后处理
+
+from .parser_none import ParserNone
+
+from .ignore_area import IgnoreArea
+from .parser_multi_para import MultiPara
+from .parser_multi_line import MultiLine
+from .parser_multi_none import MultiNone
+from .parser_single_para import SinglePara
+from .parser_single_line import SingleLine
+from .parser_single_none import SingleNone
+from .parser_single_code import SingleCode
+
+# Layout parsers, keyed by parser name
+Parser = {
+    "none": ParserNone,  # no processing
+    "multi_para": MultiPara,  # multi-column, natural paragraphs
+    "multi_line": MultiLine,  # multi-column, always break lines
+    "multi_none": MultiNone,  # multi-column, no line breaks
+    "single_para": SinglePara,  # single column, natural paragraphs
+    "single_line": SingleLine,  # single column, always break lines
+    "single_none": SingleNone,  # single column, no line breaks
+    "single_code": SingleCode,  # single column, code block
+}
+
+
+# Get a layout parser object
+def getParser(key):
+    """Return a new parser of the class registered under ``key``.
+
+    An unknown ``key`` yields an instance of the ``"none"`` parser.
+    """
+    parser_class = Parser[key] if key in Parser else Parser["none"]
+    return parser_class()
--- a/UmiOCR-data/py_src/ocr/tbpu/ignore_area.py
+++ b/UmiOCR-data/py_src/ocr/tbpu/ignore_area.py
@@ -0,0 +1,32 @@
+# 忽略区域
+
+from .tbpu import Tbpu
+
+
+class IgnoreArea(Tbpu):
+    """Drops the text blocks that lie completely inside an ignore area."""
+
+    def __init__(self, areaList):
+        self.tbpuName = "忽略区域"
+        self.areaList = areaList  # boxes; corners [0] and [2] are compared
+
+    def run(self, textBlocks):
+        """Return the blocks whose ``"box"`` is inside none of the areas."""
+
+        def contains(outer, inner):
+            # True if corner [0] and corner [2] of inner both lie within outer
+            return (
+                inner[0][0] >= outer[0][0]
+                and inner[0][1] >= outer[0][1]
+                and inner[2][0] <= outer[2][0]
+                and inner[2][1] <= outer[2][1]
+            )
+
+        return [
+            block
+            for block in textBlocks
+            if not any(contains(area, block["box"]) for area in self.areaList)
+        ]
--- a/UmiOCR-data/py_src/ocr/tbpu/parser_multi_line.py
+++ b/UmiOCR-data/py_src/ocr/tbpu/parser_multi_line.py
@@ -0,0 +1,22 @@
+# 排版解析-多栏-单行
+
+from .tbpu import Tbpu
+from .parser_tools.line_preprocessing import linePreprocessing  # 行预处理
+from .parser_tools.gap_tree import GapTree  # 间隙树排序算法
+
+
+class MultiLine(Tbpu):
+    """Layout parser: multi-column, every block ends with a line break."""
+
+    def __init__(self):
+        self.tbpuName = "排版解析-多栏-单行"
+
+        # Gap-tree sorter; the callback tells it where a block's bbox is stored
+        self.gtree = GapTree(lambda tb: tb["normalized_bbox"])
+
+    def run(self, textBlocks):
+        """Preprocess and sort the blocks, then end each one with a line break."""
+        ordered = self.gtree.sort(linePreprocessing(textBlocks))
+        for block in ordered:
+            block["end"] = "\n"
+            del block["normalized_bbox"]  # helper key, not part of the result
+        return ordered
--- a/UmiOCR-data/py_src/ocr/tbpu/parser_multi_none.py
+++ b/UmiOCR-data/py_src/ocr/tbpu/parser_multi_none.py
@@ -0,0 +1,29 @@
+# 排版解析-多栏-无换行
+
+from .tbpu import Tbpu
+from .parser_tools.line_preprocessing import linePreprocessing  # 行预处理
+from .parser_tools.gap_tree import GapTree  # 间隙树排序算法
+from .parser_tools.paragraph_parse import word_separator  # 上下句间隔符
+
+
+class MultiNone(Tbpu):
+    """Layout parser: multi-column, no line breaks between blocks."""
+
+    def __init__(self):
+        self.tbpuName = "排版解析-多栏-无换行"
+
+        # Gap-tree sorter; the callback tells it where a block's bbox is stored
+        self.gtree = GapTree(lambda tb: tb["normalized_bbox"])
+
+    def run(self, textBlocks):
+        """Preprocess and sort the blocks, then set each block's ``"end"``.
+
+        Every block but the last gets the separator chosen by
+        ``word_separator`` from its last character and the next block's first
+        character; the last block ends with a line break.
+        """
+        ordered = self.gtree.sort(linePreprocessing(textBlocks))
+        last = len(ordered) - 1
+        for idx, block in enumerate(ordered):
+            if idx < last:
+                tail = block["text"][-1]  # last character of this block
+                head = ordered[idx + 1]["text"][0]  # first character of the next
+                block["end"] = word_separator(tail, head)
+            else:
+                block["end"] = "\n"
+            del block["normalized_bbox"]  # helper key, not part of the result
+        return ordered
--- a/UmiOCR-data/py_src/ocr/tbpu/parser_multi_para.py
+++ b/UmiOCR-data/py_src/ocr/tbpu/parser_multi_para.py
@@ -0,0 +1,33 @@
+# 排版解析-多栏-自然段
+
+from .tbpu import Tbpu
+from .parser_tools.line_preprocessing import linePreprocessing  # 行预处理
+from .parser_tools.gap_tree import GapTree  # 间隙树排序算法
+from .parser_tools.paragraph_parse import ParagraphParse  # 段内分析器
+
+
+class MultiPara(Tbpu):
+    """Layout parser: multi-column, natural paragraphs."""
+
+    def __init__(self):
+        self.tbpuName = "排版解析-多栏-自然段"
+
+        # Gap-tree sorter
+        self.gtree = GapTree(lambda tb: tb["normalized_bbox"])
+
+        # Callbacks handed to the paragraph analyser
+        def get_info(tb):  # bbox and text of a block
+            return (tb["normalized_bbox"], tb["text"])
+
+        def set_end(tb, end):  # store the predicated block-end separator
+            tb["end"] = end
+
+        self.pp = ParagraphParse(get_info, set_end)
+
+    def run(self, textBlocks):
+        """Sort the blocks, then run the paragraph analysis on each tree node."""
+        ordered = self.gtree.sort(linePreprocessing(textBlocks))
+        # get_nodes_text_blocks() yields one list of blocks per tree node
+        for node_blocks in self.gtree.get_nodes_text_blocks():
+            self.pp.run(node_blocks)  # predict the end separators
+            for block in node_blocks:
+                del block["normalized_bbox"]  # helper key, not part of the result
+        return ordered
--- a/UmiOCR-data/py_src/ocr/tbpu/parser_none.py
+++ b/UmiOCR-data/py_src/ocr/tbpu/parser_none.py
@@ -0,0 +1,14 @@
+# 排版解析-不做处理
+
+from .tbpu import Tbpu
+
+
+class ParserNone(Tbpu):
+    """Layout parser that leaves the blocks as they are."""
+
+    def __init__(self):
+        self.tbpuName = "排版解析-不做处理"
+
+    def run(self, textBlocks):
+        """Give every block without an ``"end"`` key a line break as its end."""
+        for block in textBlocks:
+            block.setdefault("end", "\n")  # default separator: line break
+        return textBlocks
--- a/UmiOCR-data/py_src/ocr/tbpu/parser_single_code.py
+++ b/UmiOCR-data/py_src/ocr/tbpu/parser_single_code.py
@@ -0,0 +1,73 @@
+# 排版解析-单栏-代码段
+
+from .parser_single_line import SingleLine
+from .parser_tools.line_preprocessing import linePreprocessing  # 行预处理
+
+from bisect import bisect_left
+
+
+class SingleCode(SingleLine):
+    """Layout parser: single column, code block.
+
+    Every detected line is merged into one block, with spaces standing in for
+    horizontal gaps and for the indentation of the line start.
+    """
+
+    def __init__(self):
+        self.tbpuName = "排版解析-单栏-代码段"
+
+    def merge_line(self, line):
+        """Merge the blocks of one line into its first block and return it.
+
+        The first block is modified in place: text, box and score are merged,
+        ``"normalized_bbox"`` is removed and ``"end"`` is set to a line break.
+        """
+        A = line[0]
+        ba = A["box"]
+        ha = ba[3][1] - ba[0][1]  # line height of block A
+        score = A["score"]
+        for i in range(1, len(line)):
+            B = line[i]
+            bb = B["box"]
+            ha = (ha + bb[3][1] - bb[0][1]) / 2
+            # Merge the text; the horizontal gap, measured in line heights,
+            # becomes that many two-space units
+            space = 0
+            if bb[0][0] > ba[1][0]:
+                space = round((bb[0][0] - ba[1][0]) / ha)
+            A["text"] += "  " * space + B["text"]
+            # Merge the boxes into one axis-aligned rectangle
+            yTop = min(ba[0][1], ba[1][1], bb[0][1], bb[1][1])
+            yBottom = max(ba[2][1], ba[3][1], bb[2][1], bb[3][1])
+            xLeft = min(ba[0][0], ba[3][0], bb[0][0], bb[3][0])
+            xRight = max(ba[1][0], ba[2][0], bb[1][0], bb[2][0])
+            ba[0][1] = ba[1][1] = yTop  # top y
+            ba[2][1] = ba[3][1] = yBottom  # bottom y
+            ba[0][0] = ba[3][0] = xLeft  # left x
+            ba[1][0] = ba[2][0] = xRight  # right x
+            # Confidence: summed here, averaged below
+            score += B["score"]
+        A["score"] = score / len(line)
+        del A["normalized_bbox"]
+        A["end"] = "\n"
+        return A
+
+    def indent(self, tbs):
+        """Prefix every line with spaces according to its left position.
+
+        Indentation levels are spaced one average line height apart, starting
+        at the leftmost line start. All boxes are aligned to that left edge.
+        """
+        if not tbs:  # no lines: nothing to indent, and no average to compute
+            return
+        lh = 0  # average line height
+        xMin = float("inf")  # leftmost and rightmost x of the line starts
+        xMax = float("-inf")
+        for tb in tbs:
+            b = tb["box"]
+            lh += b[3][1] - b[0][1]
+            x = b[0][0]
+            xMin = min(xMin, x)
+            xMax = max(xMax, x)
+        lh /= len(tbs)
+        lh2 = lh / 2
+        # Build the list of indentation level positions
+        levelList = []
+        if lh > 0:  # a non-positive step would never reach xMax
+            x = xMin
+            while x < xMax:
+                levelList.append(x)
+                x += lh
+        # Add the spaces of its level to every line and adjust the box
+        for tb in tbs:
+            b = tb["box"]
+            level = bisect_left(levelList, b[0][0] + lh2) - 1  # binary search
+            tb["text"] = "  " * level + tb["text"]  # prepend the spaces
+            b[0][0] = b[3][0] = xMin  # align the left side
+
+    def run(self, textBlocks):
+        """Return one merged, indented block per detected line."""
+        textBlocks = linePreprocessing(textBlocks)  # preprocessing
+        lines = self.get_lines(textBlocks)  # group the blocks into lines
+        tbs = [self.merge_line(line) for line in lines]  # merge every line
+        self.indent(tbs)  # add the indentation of each line start
+        return tbs
--- a/UmiOCR-data/py_src/ocr/tbpu/parser_single_line.py
+++ b/UmiOCR-data/py_src/ocr/tbpu/parser_single_line.py
@@ -0,0 +1,73 @@
+# 排版解析-单栏-单行
+
+from .tbpu import Tbpu
+from .parser_tools.line_preprocessing import linePreprocessing  # 行预处理
+from .parser_tools.paragraph_parse import word_separator  # 上下句间隔符
+
+
+class SingleLine(Tbpu):
+    """Layout parser: single column, a line break after every detected line."""
+
+    def __init__(self):
+        self.tbpuName = "排版解析-单栏-单行"
+
+    # Find all lines in the list of text blocks
+    def get_lines(self, textBlocks):
+        """Group the blocks into lines and set each block's ``"end"``.
+
+        ``"normalized_bbox"`` is read as (left, top, right, bottom).
+        ``textBlocks`` is sorted in place, and its entries are replaced by
+        ``None`` as they are assigned to a line.
+
+        Returns:
+            A list of lines (each a list of blocks ordered by their left
+            edge), sorted by the top of each line's first block.
+        """
+        # Sort by x
+        textBlocks.sort(key=lambda tb: tb["normalized_bbox"][0])
+        lines = []
+        for i1, tb1 in enumerate(textBlocks):
+            if not tb1:  # already assigned to a line
+                continue
+            # The leftmost remaining block starts a new line
+            l1, top1, r1, bottom1 = tb1["normalized_bbox"]
+            h1 = bottom1 - top1
+            now_line = [tb1]
+            # Check which blocks to the right belong to the line
+            for i2 in range(i1 + 1, len(textBlocks)):
+                tb2 = textBlocks[i2]
+                if not tb2:
+                    continue
+                l2, top2, r2, bottom2 = tb2["normalized_bbox"]
+                h2 = bottom2 - top2
+                # Block 2 starts too far to the left
+                if l2 < r1 - h1:
+                    continue
+                # Vertical distance too large
+                if top2 < top1 - h1 * 0.5 or bottom2 > bottom1 + h1 * 0.5:
+                    continue
+                # Line heights differ too much
+                if abs(h1 - h2) > min(h1, h2) * 0.5:
+                    continue
+                # Block 2 belongs to the line
+                now_line.append(tb2)
+                textBlocks[i2] = None
+                # The line now extends to the right edge of block 2
+                r1 = r2
+            # The line is complete: choose the separators inside it
+            for i2 in range(len(now_line) - 1):
+                # Check the horizontal gap between neighbouring blocks
+                l1, t1, r1, b1 = now_line[i2]["normalized_bbox"]
+                l2, t2, r2, b2 = now_line[i2 + 1]["normalized_bbox"]
+                # Average height of the two blocks: ((b1 - t1) + (b2 - t2)) / 2
+                h = (b1 + b2 - t1 - t2) * 0.5
+                if l2 - r1 > h * 1.5:  # gap too large: force a space
+                    now_line[i2]["end"] = " "
+                    continue
+                letter1 = now_line[i2]["text"][-1]
+                letter2 = now_line[i2 + 1]["text"][0]
+                now_line[i2]["end"] = word_separator(letter1, letter2)
+            now_line[-1]["end"] = "\n"
+            lines.append(now_line)
+            textBlocks[i1] = None
+        # Sort all lines by y
+        lines.sort(key=lambda tbs: tbs[0]["normalized_bbox"][1])
+        return lines
+
+    def run(self, textBlocks):
+        """Return the blocks in reading order, without ``"normalized_bbox"``."""
+        textBlocks = linePreprocessing(textBlocks)  # preprocessing
+        lines = self.get_lines(textBlocks)  # group the blocks into lines
+        # Flatten the lines
+        textBlocks = []
+        for line in lines:
+            for tb in line:
+                del tb["normalized_bbox"]
+                textBlocks.append(tb)
+        return textBlocks
--- a/UmiOCR-data/py_src/ocr/tbpu/parser_single_none.py
+++ b/UmiOCR-data/py_src/ocr/tbpu/parser_single_none.py
@@ -0,0 +1,19 @@
+# 排版解析-单栏-无换行
+
+from .parser_single_line import SingleLine
+from .parser_tools.paragraph_parse import word_separator  # 上下句间隔符
+
+
+class SingleNone(SingleLine):
+    """Layout parser: single column, no line breaks.
+
+    Runs the single-line parser, then replaces every line-break separator
+    (except the one on the very last block) with a word separator.
+    """
+
+    def __init__(self):
+        self.tbpuName = "排版解析-单栏-无换行"
+
+    def run(self, textBlocks):
+        """Return the blocks from ``SingleLine.run`` with inner line breaks removed.
+
+        :param textBlocks: list of text block dicts
+        :return: the processed list; each block whose ``"end"`` was ``"\\n"``
+                 (other than the last block) now ends with the separator
+                 chosen by ``word_separator``
+        """
+        blocks = super().run(textBlocks)
+        # Walk over adjacent pairs; the last block has no successor and is left alone.
+        for current, following in zip(blocks, blocks[1:]):
+            if current["end"] != "\n":
+                continue
+            tail = current["text"][-1]  # last character of this block
+            head = following["text"][0]  # first character of the next block
+            current["end"] = word_separator(tail, head)
+        return blocks
--- a/UmiOCR-data/py_src/ocr/tbpu/parser_single_para.py
+++ b/UmiOCR-data/py_src/ocr/tbpu/parser_single_para.py
@@ -0,0 +1,49 @@
+# 排版解析-单栏-自然段
+
+from .parser_single_line import SingleLine
+from .parser_tools.line_preprocessing import linePreprocessing  # 行预处理
+from .parser_tools.paragraph_parse import ParagraphParse  # 段内分析器
+
+
+class SinglePara(SingleLine):
+    """Layout parser: single column, natural paragraphs.
+
+    Text blocks are grouped into lines, each line is wrapped into a temporary
+    line-level block, and ``ParagraphParse`` predicts the separator that
+    follows each line.
+    """
+
+    def __init__(self):
+        self.tbpuName = "排版解析-单栏-自然段"
+
+        # Adapters for the paragraph analyzer, which operates on the
+        # line-level wrappers built in run().
+        def get_info(tb):  # wrapper -> (bbox, text)
+            return tb["normalized_bbox"], tb["text"]
+
+        def set_end(tb, end):  # store the predicted separator on the line's last block
+            tb["line"][-1]["end"] = end
+
+        self.pp = ParagraphParse(get_info, set_end)
+
+    def run(self, textBlocks):
+        """Split the blocks into lines and predict paragraph separators.
+
+        :param textBlocks: list of text block dicts
+        :return: flat list of text blocks; the temporary ``"normalized_bbox"``
+                 key is removed from every returned block
+        """
+        textBlocks = linePreprocessing(textBlocks)  # preprocessing
+        lines = self.get_lines(textBlocks)  # blocks grouped per line
+        if not lines:  # nothing to analyze
+            return []
+        # Wrap every line into a single line-level block.
+        temp_tbs = []
+        for line in lines:
+            left, top, right, bottom = line[0]["normalized_bbox"]
+            # The line bbox is the union of the bboxes of all its blocks.
+            for tb in line[1:]:
+                b_left, b_top, b_right, b_bottom = tb["normalized_bbox"]
+                left = min(left, b_left)
+                top = min(top, b_top)
+                right = max(right, b_right)
+                bottom = max(bottom, b_bottom)
+            temp_tbs.append(
+                {
+                    "normalized_bbox": (left, top, right, bottom),
+                    # Only the first and last characters of the line are kept.
+                    "text": line[0]["text"][0] + line[-1]["text"][-1],
+                    "line": line,
+                }
+            )
+        # Predict the separator at the end of each line.
+        self.pp.run(temp_tbs)
+        # Unpack the wrappers back into a flat block list.
+        textBlocks = []
+        for t in temp_tbs:
+            for tb in t["line"]:
+                del tb["normalized_bbox"]
+                textBlocks.append(tb)
+        return textBlocks
--- a/UmiOCR-data/py_src/ocr/tbpu/parser_tools/init.py
+++ b/UmiOCR-data/py_src/ocr/tbpu/parser_tools/init.py
--- a/UmiOCR-data/py_src/ocr/tbpu/parser_tools/gap_tree.py
+++ b/UmiOCR-data/py_src/ocr/tbpu/parser_tools/gap_tree.py
@@ -0,0 +1,330 @@
+# 【间隙·树·排序算法】 GapTree_Sort_Algorithm
+# 对OCR结果或PDF提取的文本进行版面分析，按人类阅读顺序进行排序。
+# Author: hiroi-sora
+# https://github.com/hiroi-sora/GapTree_Sort_Algorithm
+
+from typing import Callable
+
+
+class GapTree:
+    """Gap-tree layout sorter: orders text blocks in human reading order.
+
+    Pipeline used by sort(): wrap blocks into units -> find rows and vertical
+    cuts -> build the layout tree -> pre-order traversal -> flatten.
+    """
+
+    def __init__(self, get_bbox: Callable):
+        """
+        :param get_bbox: function that takes a single text block and returns
+                        the coordinates of its top-left and bottom-right
+                        corners as a tuple (x0, y0, x1, y1)
+        """
+        self.get_bbox = get_bbox
+        # Intermediate results of the latest sort(), kept for debugging and
+        # for get_nodes_text_blocks(). Empty until sort() has been called.
+        self.current_rows = []
+        self.current_cuts = []
+        self.current_nodes = []
+
+    # ======================= Public interface =====================
+    # Sort a list of text blocks
+    def sort(self, text_blocks: list):
+        """
+        Sort a list of text blocks in human reading order.
+
+        :param text_blocks: list of text block objects
+        :return: the sorted list of text blocks (empty list for empty input)
+        """
+
+        if not text_blocks:
+            # Without blocks there are no cuts, and the layout tree needs at
+            # least one cut to define the root node.
+            self.current_rows = []
+            self.current_cuts = []
+            self.current_nodes = []
+            return []
+
+        # Wrap blocks into units and find the left/right page edges
+        units, page_l, page_r = self._get_units(text_blocks, self.get_bbox)
+        # Find the rows and the vertical cuts
+        cuts, rows = self._get_cuts_rows(units, page_l, page_r)
+        # Build the layout tree
+        root = self._get_layout_tree(cuts, rows)
+        # Get the sequence of tree nodes
+        nodes = self._preorder_traversal(root)
+        # Get the sorted sequence of original text blocks
+        new_text_blocks = self._get_text_blocks(nodes)
+
+        # Cache the intermediate values so they can be inspected for debugging
+        self.current_rows = rows
+        self.current_cuts = cuts
+        self.current_nodes = nodes
+
+        return new_text_blocks
+
+    # Get the text blocks as a two-level list grouped by layout region
+    def get_nodes_text_blocks(self):
+        """
+        Get the text blocks as a two-level list grouped by layout region.
+        Reflects the most recent sort(); returns an empty list before that.
+
+        :return: [ [text_blocks of region 1], [text_blocks of region 2]... ]
+        """
+        result = []
+        for node in self.current_nodes:
+            if node["units"]:  # skips nodes without units (e.g. the root)
+                result.append([unit[1] for unit in node["units"]])
+        return result
+
+    # ======================= Wrap blocks into units =====================
+    # Wrap each original text block as ( (x0,y0,x2,y2), original ) and find the page edges.
+    def _get_units(self, text_blocks, get_bbox):
+        # Unit list: units [ ( (x0,y0,x2,y2), original text block ), ... ]
+        units = []
+        page_l, page_r = float("inf"), -1  # left/right extremes of the blocks, used as page edges
+        for tb in text_blocks:
+            x0, y0, x2, y2 = get_bbox(tb)
+            units.append(((x0, y0, x2, y2), tb))
+            if x0 < page_l:
+                page_l = x0
+            if x2 > page_r:
+                page_r = x2
+        units.sort(key=lambda a: a[0][1])  # sort by top edge, top to bottom
+        return units, page_l, page_r
+
+    # ======================= Find rows and vertical cuts =====================
+    # Scan all text blocks and collect every row and every vertical cut.
+    # A row is a group of text blocks at a similar vertical position.
+    # A vertical cut is made of the gaps found at the same horizontal position
+    # in several consecutive rows. Gaps separate blocks of different columns
+    # within a row.
+    # Input: units=[ ( (x0,y0,x2,y2), _ ) ] of one page, sorted top to bottom.
+    # Returns:
+    #   cuts=[ ( left x, right x, first row index, last row index ) ], sorted left to right
+    #   rows=[ [unit...] ], rows top to bottom, units in a row left to right
+
+    def _get_cuts_rows(self, units, page_l, page_r):
+        # Update gaps1 with the gap group gaps2.
+        # Returns: the updated gaps1, and the gaps removed from gaps1.
+        def update_gaps(gaps1, gaps2):
+            flags1 = [True] * len(gaps1)  # whether gaps1[i] is removed entirely
+            flags2 = [True] * len(gaps2)  # whether gaps2[i] is newly added
+            new_gaps1 = []
+            for i1, g1 in enumerate(gaps1):
+                l1, r1, _ = g1
+                for i2, g2 in enumerate(gaps2):  # check every gap2 against this gap1
+                    l2, r2, _ = g2
+                    # Start and end of the intersection
+                    inter_l = max(l1, l2)
+                    inter_r = min(r1, r2)
+                    # The intersection is valid
+                    if inter_l <= inter_r:
+                        # Narrow gap1 to the intersection, keeping its start row
+                        new_gaps1.append((inter_l, inter_r, g1[2]))
+                        flags1[i1] = False  # the old gap1 must not be removed
+                        flags2[i2] = False  # the new gap2 must not be added
+            # Add the gaps of gaps2 that intersect nothing
+            new_gaps1.extend(g2 for g2, f2 in zip(gaps2, flags2) if f2)
+            # Collect the gaps of gaps1 that were removed entirely
+            del_gaps1 = [g1 for g1, f1 in zip(gaps1, flags1) if f1]
+
+            return new_gaps1, del_gaps1
+
+        # ========================================
+
+        page_l -= 1  # make sure the page edges do not touch any text block
+        page_r += 1
+        # All rows. A "row" is the set of units on one horizontal line
+        # (possibly belonging to several columns). [ [unit...] ]
+        rows = []
+        # Finished vertical cuts. [ ( left x, right x, first row, last row ) ]
+        completed_cuts = []
+        # Gaps still being tracked. [ ( left x, right x, first row ) ]
+        gaps = []
+        row_index = 0  # current row index
+        unit_index = 0  # current unit index
+        # Walk over all text rows from top to bottom
+        l_units = len(units)
+        while unit_index < l_units:
+            # ========== Find the current row ==========
+            unit = units[unit_index]  # topmost unit of the current row
+            u_bottom = unit[0][3]
+            row = [unit]  # current row
+            # Find the remaining units of the current row
+            for i in range(unit_index + 1, l_units):
+                next_u = units[i]
+                next_top = next_u[0][1]
+                if next_top > u_bottom:
+                    break  # the next unit starts below the current bottom: row ends
+                row.append(next_u)  # add the unit to the current row
+                unit_index = i  # advance the index of visited units
+            # ========== Find the gaps of the current row ==========
+            row.sort(key=lambda x: (x[0][0], x[0][2]))  # units of the row, left to right
+            row_gaps = []  # gaps of the current row [ ( left x, right x, first row ) ]
+            search_start = page_l  # the scan starts at the left page edge
+            for u in row:  # walk over the units of the row
+                u_left = u[0][0]  # left side of the unit
+                u_right = u[0][2]  # right side of the unit
+                # The unit starts after the scan position: that span is a gap
+                if u_left > search_start:
+                    row_gaps.append((search_start, u_left, row_index))
+                # The unit ends after the scan position: move the scan position
+                if u_right > search_start:
+                    search_start = u_right
+            # The last gap reaches the right page edge
+            row_gaps.append((search_start, page_r, row_index))
+            # ========== Update the tracked gaps ==========
+            gaps, del_gaps = update_gaps(gaps, row_gaps)
+            # Gaps removed from the tracked set become finished vertical cuts
+            row_max = row_index - 1  # last row of those cuts
+            for dg1 in del_gaps:
+                completed_cuts.append((*dg1, row_max))
+            # ========== End ==========
+            rows.append(row)  # add the current row to the row list
+            unit_index += 1
+            row_index += 1
+        # Scan finished: the remaining gaps become cuts reaching the last row
+        row_max = len(rows) - 1  # last row of those cuts
+        for g in gaps:
+            completed_cuts.append((*g, row_max))
+        completed_cuts.sort(key=lambda c: c[0])
+        return completed_cuts, rows
+
+    # ======================= Build the layout tree =====================
+    # A layout tree node represents one region. Definition:
+    # node = {
+    #     "x_left": left edge x of the node,
+    #     "x_right": right edge x,
+    #     "r_top": row index of the top,
+    #     "r_bottom": row index of the bottom,
+    #     "units": [],  # units inside the node (empty for the root, non-empty otherwise)
+    #     "children": [],  # child nodes, ordered
+    # }
+
+    def _get_layout_tree(self, cuts, rows):
+        # Vertical cuts split a horizontal row; the split-out spans are "gaps".
+        # Build, for each row, its list of gap (left, right) coordinates.
+        rows_gaps = [[] for _ in rows]
+        for cut in cuts:
+            for r_i in range(cut[2], cut[3] + 1):
+                rows_gaps[r_i].append((cut[0], cut[1]))
+
+        root = {  # root node
+            "x_left": cuts[0][0] - 1,
+            "x_right": cuts[-1][1] + 1,
+            "r_top": -1,
+            "r_bottom": -1,
+            "units": [],
+            "children": [],
+        }
+        completed_nodes = [root]  # nodes that are already finished
+        now_nodes = []  # nodes currently under consideration, unordered
+
+        # ========== Finish a node and attach it to the tree ==========
+        def complete(node):
+            node_r = node["x_right"] - 2  # right boundary of the current node
+            max_nodes = []  # lowest finished nodes that qualify as parent
+            max_r = -2  # lowest row index among qualifying parents
+            # Look for the parent among the finished nodes
+            for com_node in completed_nodes:
+                # The parent's vertical projection must contain the current right boundary
+                if node_r < com_node["x_left"] or node_r > com_node["x_right"] + 0.0001:
+                    continue
+                # The parent's bottom must be above the current node
+                if com_node["r_bottom"] >= node["r_top"]:
+                    continue
+                # Found a lower qualifying node
+                if com_node["r_bottom"] > max_r:
+                    max_r = com_node["r_bottom"]
+                    max_nodes = [com_node]
+                    continue
+                # Found an equally low qualifying node
+                if com_node["r_bottom"] == max_r:
+                    max_nodes.append(com_node)
+                    continue
+            # Among the lowest candidates, the rightmost one becomes the parent
+            max_node = max(max_nodes, key=lambda n: n["x_right"])
+            max_node["children"].append(node)  # attach to the parent
+            completed_nodes.append(node)  # add to the finished list
+
+        # ========== Walk over the rows and update the tree ==========
+        for r_i, row in enumerate(rows):
+            row_gaps = rows_gaps[r_i]  # gaps of the current row
+            u_i = g_i = 0  # indices of the unit / gap under consideration
+
+            # ========== Check whether nodes under consideration can be finished ==========
+            new_nodes = []
+            for node in now_nodes:  # walk over the nodes
+                l_flag = r_flag = False  # whether the left/right edge is continued
+                completed_flag = False  # whether the node can be finished
+                x_left = node["x_left"]  # left and right edge coordinates
+                x_right = node["x_right"]
+                for gap in row_gaps:  # walk over all gaps of this row
+                    if gap[1] == x_left:  # left edge continued by the right side of a gap
+                        l_flag = True
+                    if gap[0] == x_right:  # right edge continued by the left side of a gap
+                        r_flag = True
+                    # A gap edge strictly inside the node's span interrupts the node
+                    if x_left < gap[0] < x_right or x_left < gap[1] < x_right:
+                        completed_flag = True
+                        break
+                if not l_flag or not r_flag:  # one of the edges is not continued
+                    completed_flag = True
+                if completed_flag:  # node finished: attach it to the tree
+                    complete(node)
+                else:  # node continues
+                    node["r_bottom"] = r_i
+                    new_nodes.append(node)
+            now_nodes = new_nodes
+
+            # ========== Left to right: put each unit into the node of its column ==========
+            while u_i < len(row):
+                unit = row[u_i]  # current unit
+                # ========== The unit lies in the span between gap g_i and gap g_i+1 ==========
+                x_l = row_gaps[g_i][1]  # right boundary of the left gap g_i
+                x_r = row_gaps[g_i + 1][0]  # left boundary of the right gap g_i+1
+                # Check that the span is the right one
+                if unit[0][0] + 0.0001 > x_r:  # unit is right of the right gap: next span
+                    g_i += 1  # advance the gap, not the unit
+                    continue
+                # ========== Check whether the unit fits an existing node ==========
+                flag = False
+                for node in now_nodes:
+                    # A node with the same left and right coordinates takes the unit
+                    if node["x_left"] == x_l and node["x_right"] == x_r:
+                        node["units"].append(unit)
+                        flag = True
+                        break
+                if flag:
+                    u_i += 1  # advance the unit
+                    continue
+                # ========== Create a new node for the unit and start considering it ==========
+                now_nodes.append(
+                    {
+                        "x_left": x_l,
+                        "x_right": x_r,
+                        "r_top": r_i,
+                        "r_bottom": r_i,
+                        "units": [unit],
+                        "children": [],
+                    }
+                )
+                u_i += 1  # advance the unit
+        # Attach the remaining nodes to the tree as well
+        for node in now_nodes:
+            complete(node)
+        # Tidy up all nodes
+        for node in completed_nodes:
+            # Children ordered left to right
+            node["children"].sort(key=lambda n: n["x_left"])
+            # Units ordered top to bottom
+            node["units"].sort(key=lambda u: u[0][1])
+        return root
+
+    # ======================= Pre-order traversal of the layout tree =====================
+    def _preorder_traversal(self, root):
+        if not root:
+            return []
+        stack = [root]
+        result = []
+        while stack:
+            node = stack.pop()
+            result.append(node)
+            # Push the children in reverse so the left child is handled before the right one
+            stack += reversed(node["children"])
+        return result
+
+    # ======================= Extract the original text blocks from the node sequence =====================
+    def _get_text_blocks(self, nodes):
+        result = []
+        for node in nodes:
+            for unit in node["units"]:
+                result.append(unit[1])
+        return result
--- a/UmiOCR-data/py_src/ocr/tbpu/parser_tools/line_preprocessing.py
+++ b/UmiOCR-data/py_src/ocr/tbpu/parser_tools/line_preprocessing.py
@@ -0,0 +1,98 @@
+# =========================================
+# =============== 按行预处理 ===============
+# =========================================
+
+from statistics import median  # 中位数
+from math import atan2, cos, sin, sqrt, pi, radians, degrees
+
+from umi_log import logger
+
+angle_threshold = 3  # minimum angle threshold, in degrees, for certain operations (e.g. rotating boxes)
+angle_threshold_rad = radians(angle_threshold)  # the same threshold in radians
+
+
+# 计算两点之间的距离
+def _distance(point1, point2):
+    """Return the Euclidean distance between two (x, y) points."""
+    dx = point2[0] - point1[0]
+    dy = point2[1] - point1[1]
+    return sqrt(dx**2 + dy**2)
+
+
+# 计算一个box的旋转角度
+def _calculateAngle(box):
+    """Return the rotation angle of one box, in radians.
+
+    The angle is measured along the longer of the two edges box[0]->box[1]
+    and box[1]->box[2], then shifted by pi where needed so that it lies in
+    [-pi/2 + angle_threshold_rad, pi/2 + angle_threshold_rad).
+
+    :param box: sequence of (x, y) vertices; the first three are used
+    :return: normalized angle in radians
+    """
+    first_edge = _distance(box[0], box[1])
+    second_edge = _distance(box[1], box[2])
+    # Measure the angle along the longer edge (ties go to the first edge).
+    if first_edge < second_edge:
+        start, end = box[1], box[2]
+    else:
+        start, end = box[0], box[1]
+    angle_rad = atan2(end[1] - start[1], end[0] - start[0])
+    # Normalize into a half-turn window shifted by the threshold.
+    lower_bound = -pi / 2 + angle_threshold_rad
+    upper_bound = pi / 2 + angle_threshold_rad
+    if angle_rad < lower_bound:
+        return angle_rad + pi
+    if angle_rad >= upper_bound:
+        return angle_rad - pi
+    return angle_rad
+
+
+# 估计一组文本块的旋转角度
+def _estimateRotation(textBlocks):
+    """Estimate the rotation angle of a group of text blocks.
+
+    :param textBlocks: text blocks whose "box" entry is
+                       [top-left, top-right, bottom-right, bottom-left]
+    :return: median of the per-block angles in radians, or 0.0 when there
+             are no blocks (``median`` would raise StatisticsError)
+    """
+    angle_rads = [_calculateAngle(block["box"]) for block in textBlocks]
+    if not angle_rads:  # nothing to estimate from: treat as not rotated
+        return 0.0
+    return median(angle_rads)
+
+
+# 获取旋转后的标准bbox。angle_threshold为执行旋转的阈值（最小角度值）。
+def _getBboxes(textBlocks, rotation_rad):
+    """Return one axis-aligned bbox (x0, y0, x1, y1) per text block.
+
+    When ``abs(rotation_rad)`` does not exceed ``angle_threshold_rad`` the
+    bbox is taken directly from the vertices of ``tb["box"]``. Otherwise
+    every vertex is rotated by ``-rotation_rad`` first, and the bboxes are
+    shifted if any rotated coordinate became negative.
+
+    :param textBlocks: text blocks with a "box" list of (x, y) vertices
+    :param rotation_rad: estimated rotation in radians
+    :return: list of bbox tuples, in the same order as textBlocks
+    """
+    # Angle close to 0: skip the rotation for performance.
+    if abs(rotation_rad) <= angle_threshold_rad:
+        direct = []
+        for block in textBlocks:
+            points = block["box"]
+            direct.append(
+                (
+                    min(px for px, py in points),
+                    min(py for px, py in points),
+                    max(px for px, py in points),
+                    max(py for px, py in points),
+                )
+            )
+        return direct
+
+    # Otherwise rotate every box back by the estimated angle.
+    logger.debug(f"文本块预处理旋转 {degrees(rotation_rad):.2f} °")
+    cos_angle = cos(-rotation_rad)  # cosine of the inverse rotation
+    sin_angle = sin(-rotation_rad)  # sine of the inverse rotation
+    rotated = []
+    lowest_x = lowest_y = float("inf")  # smallest x and y seen so far
+    for block in textBlocks:
+        # Rotate each vertex, then split the result into x and y values.
+        xs, ys = zip(
+            *[
+                (cos_angle * px - sin_angle * py, sin_angle * px + cos_angle * py)
+                for px, py in block["box"]
+            ]
+        )
+        # Standard bbox (top-left x, top-left y, bottom-right x, bottom-right y)
+        bbox = (min(xs), min(ys), max(xs), max(ys))
+        rotated.append(bbox)
+        lowest_x = min(lowest_x, bbox[0])
+        lowest_y = min(lowest_y, bbox[1])
+    # If the rotation produced negative coordinates, translate all bboxes so
+    # that the smallest x and y become 0.
+    if lowest_x < 0 or lowest_y < 0:
+        rotated = [
+            (x0 - lowest_x, y0 - lowest_y, x1 - lowest_x, y1 - lowest_y)
+            for (x0, y0, x1, y1) in rotated
+        ]
+    return rotated
+
+
+# 预处理 textBlocks ，将包围盒 ["box"] 转为标准化 bbox ，同时去除 ["text"] 不完整的项
+def linePreprocessing(textBlocks):
+    """Preprocess text blocks before line grouping.
+
+    Drops blocks whose "text" is missing or empty, stores a normalized
+    bounding box under ``"normalized_bbox"`` on each remaining block
+    (derived from its ``"box"``), and sorts the blocks by the top edge of
+    that bbox.
+
+    :param textBlocks: iterable of text block dicts
+    :return: new list of the remaining blocks, sorted top to bottom; an
+             empty list when no block carries text
+    """
+    textBlocks = [tb for tb in textBlocks if tb.get("text", False)]
+    if not textBlocks:
+        # Nothing left: the median used by the rotation estimate would
+        # raise on empty data.
+        return textBlocks
+    # Estimate the rotation angle
+    rotation_rad = _estimateRotation(textBlocks)
+    # Compute the normalized bboxes
+    bboxes = _getBboxes(textBlocks, rotation_rad)
+    # Write them into the blocks
+    for tb, bbox in zip(textBlocks, bboxes):
+        tb["normalized_bbox"] = bbox
+    # Sort by y
+    textBlocks.sort(key=lambda tb: tb["normalized_bbox"][1])
+    return textBlocks
--- a/UmiOCR-data/py_src/ocr/tbpu/parser_tools/paragraph_parse.py
+++ b/UmiOCR-data/py_src/ocr/tbpu/parser_tools/paragraph_parse.py
@@ -0,0 +1,173 @@
+# 段落分析器
+# 对已经是一个列区块之内的文本块，判断其段落关系。
+
+from typing import Callable
+import unicodedata
+
+
+# 传入前句尾字符和后句首字符，返回分隔符
+def word_separator(letter1, letter2):
+    """Return the separator to put between two adjacent pieces of text.
+
+    :param letter1: last character of the preceding text
+    :param letter2: first character of the following text
+    :return: "" when both characters fall in the CJK / full-width ranges
+             below, when letter1 is a hyphen, or when letter2 is a
+             punctuation character; otherwise a single space
+    """
+    cjk_blocks = (
+        (0x4E00, 0x9FFF),  # Chinese
+        (0x3040, 0x30FF),  # Japanese
+        (0x1100, 0x11FF),  # Korean
+        (0x3130, 0x318F),  # Korean compatibility letters
+        (0xAC00, 0xD7AF),  # Korean syllables
+        # Full-width symbols
+        (0x3000, 0x303F),  # Chinese symbols and punctuation
+        (0xFE30, 0xFE4F),  # Chinese compatibility-form punctuation
+        (0xFF00, 0xFFEF),  # half-width and full-width forms
+    )
+
+    # Whether a character belongs to one of the ranges above
+    def in_cjk(ch):
+        code = ord(ch)
+        for low, high in cjk_blocks:
+            if low <= code <= high:
+                return True
+        return False
+
+    no_space = (
+        (in_cjk(letter1) and in_cjk(letter2))
+        or letter1 == "-"  # special case: preceding text ends with a hyphen
+        or unicodedata.category(letter2).startswith("P")  # following text starts with punctuation
+    )
+    # Every other case gets a space
+    return "" if no_space else " "
+
+
+TH = 1.2  # threshold, as a multiple of the line height, used in comparisons
+
+
+class ParagraphParse:
+    """Paragraph analyzer for text blocks that belong to one column region.
+
+    Groups the blocks into paragraphs and reports, through the ``set_end``
+    callback, the separator that should follow each block.
+    """
+
+    def __init__(self, get_info: Callable, set_end: Callable) -> None:
+        """
+        :param get_info: function that takes a single text block and returns
+                its info tuple: ( (x0, y0, x1, y1), "text" )
+        :param set_end: function that takes a single text block and the
+                separator following its text; it must store the separator.
+        """
+        self.get_info = get_info
+        self.set_end = set_end
+
+    # ======================= Public interface: predict the trailing separators =====================
+    def run(self, text_blocks: list):
+        """
+        Analyze the paragraphs of the text blocks of one region and predict
+        the separator at the end of every block.
+
+        :param text_blocks: list of text block objects; each text must be
+                non-empty because its first and last characters are read
+        :return: the same text_blocks list, order unchanged; the separators
+                are delivered through ``set_end``
+        """
+        if not text_blocks:  # nothing to analyze
+            return text_blocks
+        # Wrap the blocks into units
+        units = self._get_units(text_blocks, self.get_info)
+        # Run the analysis
+        self._parse(units)
+        return text_blocks
+
+    # ======================= Wrap blocks into units =====================
+    # Wrap each original block as ( (x0,y0,x2,y2), ("first char","last char"), original ).
+    def _get_units(self, text_blocks, get_info):
+        units = []
+        for tb in text_blocks:
+            bbox, text = get_info(tb)
+            units.append((bbox, (text[0], text[-1]), tb))
+        return units
+
+    # ======================= Analysis =====================
+
+    # Run the analysis
+    def _parse(self, units):
+        if not units:  # no units: there is no first line to start a paragraph from
+            return units
+        units.sort(key=lambda a: a[0][1])  # make sure the units are ordered top to bottom
+        para_l, para_top, para_r, para_bottom = units[0][0]  # left/right of the current paragraph
+        para_line_h = para_bottom - para_top  # line height of the current paragraph
+        para_line_s = None  # line spacing of the current paragraph
+        now_para = [units[0]]  # units of the current paragraph
+        paras = []  # all paragraphs
+        paras_line_space = []  # line spacing of each paragraph
+        # Lines with equal left and right edges form the body of a paragraph
+        for i in range(1, len(units)):
+            l, top, r, bottom = units[i][0]  # edges of the current unit
+            h = bottom - top
+            ls = top - para_bottom  # line spacing
+            # Check whether the unit belongs to the same paragraph
+            if (  # left and right edges both match
+                abs(para_l - l) <= para_line_h * TH
+                and abs(para_r - r) <= para_line_h * TH
+                # the line spacing is not too large
+                and (para_line_s is None or ls < para_line_s + para_line_h * 0.5)
+            ):
+                # Update the running values
+                para_l = (para_l + l) / 2
+                para_r = (para_r + r) / 2
+                para_line_h = (para_line_h + h) / 2
+                para_line_s = ls if para_line_s is None else (para_line_s + ls) / 2
+                # Add to the current paragraph
+                now_para.append(units[i])
+            else:  # different paragraph: archive the previous one, start a new one
+                paras.append(now_para)
+                paras_line_space.append(para_line_s)
+                now_para = [units[i]]
+                para_l, para_r, para_line_h = l, r, bottom - top
+                para_line_s = None
+            para_bottom = bottom
+        # Archive the last paragraph
+        paras.append(now_para)
+        paras_line_space.append(para_line_s)
+
+        # Merge single-line paragraphs into the previous/next paragraph as its
+        # last/first line. Iterating in reverse keeps lower indices valid
+        # when an entry is deleted.
+        for i1 in reversed(range(len(paras))):
+            para = paras[i1]
+            if len(para) == 1:
+                l, top, r, bottom = para[0][0]
+                up_flag = down_flag = False
+                # End of the previous paragraph: left aligned, right edge not
+                # exceeding, line spacing small enough
+                if i1 > 0:
+                    # Check left and right
+                    up_l, up_top, up_r, up_bottom = paras[i1 - 1][-1][0]
+                    up_dist, up_h = abs(up_l - l), up_bottom - up_top
+                    up_flag = up_dist <= up_h * TH and r <= up_r + up_h * TH
+                    # Check the line spacing
+                    if (
+                        paras_line_space[i1 - 1] is not None
+                        and top - up_bottom > paras_line_space[i1 - 1] + up_h * 0.5
+                    ):
+                        up_flag = False
+                # Start of the next paragraph: right aligned / single line may
+                # exceed, left indented
+                if i1 < len(paras) - 1:
+                    down_l, down_top, down_r, down_bottom = paras[i1 + 1][0][0]
+                    down_h = down_bottom - down_top
+                    # Left aligned or indented
+                    if down_l - down_h * TH <= l <= down_l + down_h * (1 + TH):
+                        if len(paras[i1 + 1]) > 1:  # multiple lines: right aligned
+                            down_flag = abs(down_r - r) <= down_h * TH
+                        else:  # single line: the right edge may exceed
+                            down_flag = down_r - down_h * TH < r
+                    # Check the line spacing
+                    if (
+                        paras_line_space[i1 + 1] is not None
+                        and down_top - bottom > paras_line_space[i1 + 1] + down_h * 0.5
+                    ):
+                        down_flag = False
+
+                # Choose between the previous and the next paragraph
+                if up_flag and down_flag:  # both fit: take the vertically closer one
+                    if top - up_bottom < down_top - bottom:
+                        paras[i1 - 1].append(para[0])
+                    else:
+                        paras[i1 + 1].insert(0, para[0])
+                elif up_flag:  # only one fits: take it
+                    paras[i1 - 1].append(para[0])
+                elif down_flag:
+                    paras[i1 + 1].insert(0, para[0])
+                if up_flag or down_flag:
+                    del paras[i1]
+                    del paras_line_space[i1]
+
+        # Go through all paragraphs and set the separators
+        for para in paras:
+            for i1 in range(len(para) - 1):
+                letter1 = para[i1][1][1]  # last character of line 1
+                letter2 = para[i1 + 1][1][0]  # first character of line 2
+                sep = word_separator(letter1, letter2)
+                self.set_end(para[i1][2], sep)
+            self.set_end(para[-1][2], "\n")  # a paragraph ends with a line break
+        return units
--- a/UmiOCR-data/py_src/ocr/tbpu/tbpu.py
+++ b/UmiOCR-data/py_src/ocr/tbpu/tbpu.py
@@ -0,0 +1,22 @@
+# tbpu : text block processing unit
+# 文块处理器的基类。
+# OCR返回的结果中，一项包含文字、包围盒、置信度的元素，称为一个“文块” - text block 。
+# 文块不一定是完整的一句话或一个段落。反之，一般是零散的文字。
+# 一个OCR结果常由多个文块组成。
+# 文块处理器就是：将传入的多个文块进行处理，比如合并、排序、删除文块。
+
+
+class Tbpu:
+    """Base class for text block processing units (tbpu)."""
+
+    def __init__(self):
+        self.tbpuName = "文块处理单元-未知"
+
+    def run(self, textBlocks):
+        """Process a list of text blocks.
+
+        Input: list of text blocks, for example
+        [
+            {'box': [[29, 19], [172, 19], [172, 44], [29, 44]], 'score': 0.89, 'text': '文本111'},
+            {'box': [[29, 60], [161, 60], [161, 86], [29, 86]], 'score': 0.75, 'text': '文本222'},
+        ]
+        Output: the sorted list of text blocks, where every block gains the key
+        'end' (trailing separator).
+
+        This base implementation returns the input unchanged.
+        """
+        return textBlocks