docs: 添加涉密文件自检工具实施计划
This commit is contained in:
42
UmiOCR-data/py_src/ocr/api/__init__.py
Normal file
42
UmiOCR-data/py_src/ocr/api/__init__.py
Normal file
@@ -0,0 +1,42 @@
|
||||
# ===============================================
|
||||
# =============== OCR 插件接口管理 ===============
|
||||
# ===============================================
|
||||
|
||||
from umi_log import logger
|
||||
|
||||
ApiDict = {}
|
||||
AllDict = {}
|
||||
|
||||
|
||||
# TODO: 静态插件
|
||||
# 由插件控制器调用,初始化OCR插件的接口。传入动态插件
|
||||
def initOcrPlugins(plugins):
    """Register the given OCR plugins in the module-level registries.

    For every key in ``plugins`` the plugin's ``"api_class"`` entry is stored
    in ``ApiDict`` and the whole plugin entry is stored in ``AllDict``.
    Existing entries with the same key are overwritten.
    """
    # Only item assignment is performed, so no ``global`` declaration is needed.
    for key in plugins:
        entry = plugins[key]
        ApiDict[key] = entry["api_class"]
        AllDict[key] = entry
|
||||
|
||||
|
||||
# 生成一个ocr api实例,成功返回对象,失败返回 [Error] 开头的字符串
|
||||
def getApiOcr(apiKey, argd):
    """Create an OCR API instance for the plugin registered under ``apiKey``.

    Args:
        apiKey: Key of the plugin in ``ApiDict``.
        argd: Argument dict passed to the plugin's API class. It is modified
            in place: float values that are whole numbers (within 1e-7) are
            replaced by the corresponding ``int``.

    Returns:
        The new API instance on success, or a string starting with
        ``"[Error]"`` when the key is unknown or the constructor raises.
    """
    import math  # local import so the block does not depend on file-level imports

    # Restore the int type of whole-valued floats. Non-finite values
    # (inf / nan) are left untouched: round() would raise on them.
    for k, n in argd.items():
        if isinstance(n, float) and math.isfinite(n):
            rounded = round(n)
            if abs(n - rounded) <= 1e-7:
                argd[k] = rounded  # same key set, safe while iterating

    if apiKey not in ApiDict:
        return f'[Error] "{apiKey}" not in ApiDict.'
    try:
        return ApiDict[apiKey](argd)  # instantiate and return
    except Exception as e:
        # Deliberately broad: any constructor failure is reported to the
        # caller as an error string rather than propagated.
        logger.error(f"生成api实例{apiKey}失败。", exc_info=True, stack_info=True)
        return f"[Error] Failed to generate API instance {apiKey}: {e}"
|
||||
|
||||
|
||||
# 返回一个API的局部配置字典
|
||||
def getLocalOptions(apiKey):
    """Return the ``"local_options"`` entry of the plugin registered as ``apiKey``.

    An empty dict is returned when ``apiKey`` is not present in ``AllDict``.
    """
    if apiKey not in AllDict:
        return {}
    return AllDict[apiKey]["local_options"]
|
||||
30
UmiOCR-data/py_src/ocr/output/__init__.py
Normal file
30
UmiOCR-data/py_src/ocr/output/__init__.py
Normal file
@@ -0,0 +1,30 @@
|
||||
from .output_txt import OutputTxt
|
||||
from .output_txt_plain import OutputTxtPlain
|
||||
from .output_txt_individual import OutputTxtIndividual
|
||||
from .output_md import OutputMD
|
||||
from .output_jsonl import OutputJsonl
|
||||
from .output_csv import OutputCsv
|
||||
from .output_pdf_layered import OutputPdfLayered
|
||||
from .output_pdf_one_layer import OutputPdfOneLayer
|
||||
|
||||
"""纯文本输出器。初始化传入参数字典:
|
||||
outputArgd = {
|
||||
"outputDir": "", # 输出路径
|
||||
"outputDirType": "", # 输出目录类型,"source" 为原文件目录,"specify"为指定目录
|
||||
"outputFileName": "", # 输出文件名(前缀)
|
||||
"startDatetime": "", # 开始日期字符串(标准格式)
|
||||
"ignoreBlank": True/False, # 忽略空白文件
|
||||
}
|
||||
"""
|
||||
# Maps an output-format key to the class that writes that format.
Output = {
    # Plain-text style outputs
    "txt": OutputTxt,
    "txtPlain": OutputTxtPlain,
    "txtIndividual": OutputTxtIndividual,
    "md": OutputMD,
    "jsonl": OutputJsonl,
    "csv": OutputCsv,
    # PDF outputs; these need the extra argument "originPath" (path of the original file)
    "pdfLayered": OutputPdfLayered,
    "pdfOneLayer": OutputPdfOneLayer,
}
|
||||
32
UmiOCR-data/py_src/ocr/output/output.py
Normal file
32
UmiOCR-data/py_src/ocr/output/output.py
Normal file
@@ -0,0 +1,32 @@
|
||||
# OCR输出器的基类。按指定的格式,将传入的文本输出到指定地方。
|
||||
|
||||
from .tools import getDataText
|
||||
from ...platform import Platform
|
||||
import os
|
||||
|
||||
|
||||
class Output:
    """Base class of the OCR result writers.

    This base implementation only echoes each result to stdout; the format
    specific subclasses override ``print`` and, where needed, ``onEnd``.
    """

    def __init__(self, argd):
        """Store the common settings read from the argument dict ``argd``.

        Keys read: ``"outputDir"``, ``"outputFileName"``, ``"ignoreBlank"``.
        """
        folder = argd["outputDir"]
        stem = argd["outputFileName"]
        self.dir = folder  # output directory
        self.fileName = stem  # output file name, without extension
        self.outputPath = f"{folder}/{stem}.txt"  # full output path
        self.ignoreBlank = argd["ignoreBlank"]  # skip results whose code is not 100

    def print(self, res):
        """Print one result dict (keys ``code``, ``path``, ``data``) to stdout."""
        code = res["code"]
        if self.ignoreBlank and code != 100:
            return  # non-successful result and blanks are ignored
        header = f"图片路径:{res['path']}\n代码:{code}\n"
        if code == 100:
            body = getDataText(res["data"])  # joined text of all blocks
        elif code == 101:
            body = "无文字"
        else:
            body = f"错误原因:{res['data']}"
        # Inside a method the bare name ``print`` is still the builtin.
        print(header + body)

    def openOutputFile(self):
        """Open the output file via ``Platform.startfile`` if it exists."""
        target = self.outputPath
        if not target:
            return
        if os.path.exists(target):
            Platform.startfile(target)

    def onEnd(self):
        """Hook called when output is finished; nothing to do in the base class."""
        return None
|
||||
70
UmiOCR-data/py_src/ocr/output/output_csv.py
Normal file
70
UmiOCR-data/py_src/ocr/output/output_csv.py
Normal file
@@ -0,0 +1,70 @@
|
||||
# 输出到csv表格文件
|
||||
|
||||
import csv
|
||||
|
||||
from umi_log import logger
|
||||
from .output import Output
|
||||
from .tools import getDataText
|
||||
|
||||
|
||||
class OutputCsv(Output):
    """Collect OCR results in memory and write them to a CSV file in ``onEnd``.

    Each row holds the file name, the OCR text and the file path. The file
    encoding is the first entry of ``self.encodings`` able to encode every
    cell that will be written.
    """

    def __init__(self, argd):
        """Read settings from ``argd`` and create (truncate) the output file.

        Raises:
            Exception: if the output file cannot be created.
        """
        self.encodings = [  # candidate encodings, in order of preference
            "ansi",  # Windows local code page; raises on Linux and macOS
            "ascii",  # English only
            "gbk",  # Simplified Chinese
            "big5",  # Traditional Chinese
            "shift_jis",  # Japanese
            "euc-kr",  # Korean
            "utf-8",
        ]
        self.dir = argd["outputDir"]  # output directory
        self.fileName = argd["outputFileName"]  # file name
        self.outputPath = f"{self.dir}/{self.fileName}.csv"  # output path
        self.ignoreBlank = argd["ignoreBlank"]  # skip non-successful results
        self.writeLists = []  # rows to write: [name, text, path]
        self.writeText = ""  # concatenated OCR text (kept for compatibility)
        try:  # create / truncate the file early so path problems show up at once
            with open(self.outputPath, "w", encoding="utf-8"):
                pass
        except Exception as e:
            raise Exception(f"Failed to create csv file. {e}\n创建csv文件失败。") from e

    def print(self, res):
        """Queue one result dict as a CSV row; nothing is written yet."""
        if not res["code"] == 100 and self.ignoreBlank:
            return  # ignore blank / failed images
        name = res["fileName"]
        path = res["path"]
        if res["code"] == 100:
            textOut = getDataText(res["data"])  # joined text of all blocks
        elif res["code"] == 101:
            textOut = ""
        else:
            textOut = f'[Error] OCR failed. Code: {res["code"]}, Msg: {res["data"]} .\n'
        self.writeLists.append([name, textOut, path])
        self.writeText += textOut

    def onEnd(self):
        """Pick an encoding and write the header plus all queued rows.

        Raises:
            Exception: if the CSV file cannot be written.
        """
        # Probe with every cell that will be written, not only the OCR text:
        # a file name or path may need a wider encoding than the text does.
        probe = "".join(str(cell) for row in self.writeLists for cell in row)
        encoding = "utf-8"
        for candidate in self.encodings:
            try:
                probe.encode(candidate)
            except (UnicodeError, LookupError):
                # UnicodeError: text not representable in this encoding.
                # LookupError: encoding unknown on this platform (e.g. "ansi").
                continue
            encoding = candidate
            break
        logger.info(f"csv encoding: {encoding}")
        headers = ["Name", "OCR", "Path"]  # header row
        try:
            with open(
                self.outputPath, "w", encoding=encoding, newline=""
            ) as f:  # overwrite the placeholder file
                writer = csv.writer(f)
                writer.writerow(headers)
                writer.writerows(self.writeLists)
        except Exception as e:
            raise Exception(f"Failed to write csv file. {e}\n写入csv文件失败。") from e
|
||||
24
UmiOCR-data/py_src/ocr/output/output_jsonl.py
Normal file
24
UmiOCR-data/py_src/ocr/output/output_jsonl.py
Normal file
@@ -0,0 +1,24 @@
|
||||
# 输出到jsonl文件
|
||||
|
||||
from .output import Output
|
||||
|
||||
import json
|
||||
|
||||
|
||||
class OutputJsonl(Output):
    """Append every OCR result as one JSON object per line (.jsonl)."""

    def __init__(self, argd):
        """Read settings from ``argd`` and create (truncate) the output file.

        Raises:
            Exception: if the output file cannot be created.
        """
        folder = argd["outputDir"]
        stem = argd["outputFileName"]
        self.dir = folder  # output directory
        self.fileName = stem  # file name
        self.outputPath = f"{folder}/{stem}.jsonl"  # output path
        self.ignoreBlank = argd["ignoreBlank"]  # stored, but not used by print()
        try:
            # Opening in "w" mode and closing right away leaves an empty file.
            open(self.outputPath, "w", encoding="utf-8").close()
        except Exception as e:
            raise Exception(f"Failed to create jsonl file. {e}\n创建jsonl文件失败。")

    def print(self, res):
        """Append ``res`` as one JSON line; blank results are not skipped."""
        with open(self.outputPath, "a", encoding="utf-8") as out:
            out.write(f"{json.dumps(res, ensure_ascii=False)}\n")
|
||||
46
UmiOCR-data/py_src/ocr/output/output_md.py
Normal file
46
UmiOCR-data/py_src/ocr/output/output_md.py
Normal file
@@ -0,0 +1,46 @@
|
||||
# 输出markdown格式
|
||||
|
||||
from .output import Output
|
||||
from .tools import getDataText
|
||||
|
||||
import os
|
||||
|
||||
|
||||
class OutputMD(Output):
    """Append OCR results to a Markdown file, one section per image."""

    def __init__(self, argd):
        """Read settings from ``argd`` and create the file with a date header.

        Raises:
            Exception: if the output file cannot be created.
        """
        self.dir = argd["outputDir"]  # output directory
        self.fileName = argd["outputFileName"]  # file name
        self.outputPath = f"{self.dir}/{self.fileName}.md"  # output path
        self.ignoreBlank = argd["ignoreBlank"]  # skip non-successful results
        try:
            with open(self.outputPath, "w", encoding="utf-8") as f:  # overwrite
                f.write(f'> {argd["startDatetime"]}\n\n')
        except Exception as e:
            # The message used to say "jsonl"; this writer creates an md file.
            raise Exception(f"Failed to create md file. {e}\n创建md文件失败。") from e

    def print(self, res):
        """Append one result dict as a Markdown section."""
        if not res["code"] == 100 and self.ignoreBlank:
            return  # ignore blank / failed images
        name = res["fileName"]
        try:
            # Path from the md file's directory to the image.
            path = os.path.relpath(res["path"], os.path.dirname(self.outputPath))
        except ValueError:
            # On Windows relpath raises when both paths are on different
            # drives; fall back to the path as given.
            path = res["path"]
        path = path.replace(" ", "%20")  # spaces are not allowed in md link targets
        # NOTE(review): the line after the rule is blank in the reviewed copy
        # of this file - confirm nothing (e.g. an image embed) was lost there.
        textOut = f"\n---\n\n[{name}]({path})\n\n"
        # Body
        if res["code"] == 100:
            for t in getDataText(res["data"]).split("\n"):
                textOut += f"> {t} \n"  # every text line becomes a quote line
        elif res["code"] == 101:
            pass  # no text: heading only
        else:
            textOut += f'> [Error] OCR failed. Code: {res["code"]}, Msg: {res["data"]} \n> 【异常】OCR识别失败。 \n'
        with open(self.outputPath, "a", encoding="utf-8") as f:  # append
            f.write(textOut)
|
||||
171
UmiOCR-data/py_src/ocr/output/output_pdf_layered.py
Normal file
171
UmiOCR-data/py_src/ocr/output/output_pdf_layered.py
Normal file
@@ -0,0 +1,171 @@
|
||||
# 双层可搜索 searchable pdf
|
||||
# https://github.com/pymupdf/PyMuPDF/discussions/2299
|
||||
|
||||
import os
|
||||
import fitz # PyMuPDF
|
||||
|
||||
from umi_log import logger
|
||||
from .output import Output
|
||||
|
||||
|
||||
class OutputPdfLayered(Output):
    """Write a searchable PDF: the original pages with OCR text drawn on top.

    The text is inserted with ``self.opacity`` as stroke and fill opacity;
    this class uses 0 (fully transparent text). Pages that never receive a
    result via ``print`` are removed when the file is saved in ``onEnd``.
    """

    def __init__(self, argd):
        """Read settings from ``argd``, load the CJK font and open the document.

        Keys read: ``"outputDir"``, ``"originPath"``, ``"outputFileName"``,
        ``"password"``.

        Raises:
            Exception: if the font or the source document cannot be loaded.
        """
        self.dir = argd["outputDir"]  # output directory
        self.originPath = argd["originPath"]  # path of the source document
        self.fileName = argd["outputFileName"]  # file name
        self.password = argd["password"]  # password for encrypted documents
        self.outputPath = f"{self.dir}/{self.fileName}.layered.pdf"  # output path
        self.pdf = None
        self.existentPages = []  # 0-based numbers of the pages already handled
        self.isInsertFont = False  # True once any page got the font embedded
        self.opacity = 0  # text opacity: 0 means invisible text
        try:
            self.font = fitz.Font("cjk")
        except Exception as e:
            raise Exception(f"Failed to load cjk font. {e}\n无法加载cjk字体。") from e
        try:
            self.pdf = self._getPDF(self.originPath)  # PyMuPDF document object
        except Exception as e:
            raise Exception(
                f"Failed to load doc file. {e}\n无法加载文档。\n{self.originPath}"
            ) from e

    def _getPDF(self, path):
        """Open ``path`` and return it as a PDF document object.

        A PDF is returned as opened. Any other document type is converted to
        PDF; its table of contents, metadata and links (except named links)
        are copied over and the source document is closed.

        Raises:
            Exception: if the document is encrypted and the password is wrong.
        """
        # https://github.com/pymupdf/PyMuPDF-Utilities/blob/master/examples/convert-document/convert.py
        doc = fitz.open(path)
        # Encrypted document: try to unlock it with the stored password.
        if doc.is_encrypted and not doc.authenticate(self.password):
            doc.close()  # do not leave the unusable document open
            raise Exception(
                f'The document is encrypted, and the password "{self.password}" is incorrect.\n文档已加密,输入密码不正确。'
            )
        if doc.is_pdf:
            return doc
        b = doc.convert_to_pdf()  # PDF data of the converted document
        pdf = fitz.open("pdf", b)  # new PDF document from that data
        try:
            pdf.set_toc(doc.get_toc())  # copy the table of contents
        except Exception:
            logger.warning("pdf.set_toc error", exc_info=True, stack_info=True)
        # Copy the metadata (author, title, ...), filling in empty fields.
        meta = doc.metadata
        if not meta["producer"]:
            meta["producer"] = "Umi-OCR & PyMuPDF v" + fitz.VersionBind
        if not meta["creator"]:
            meta["creator"] = "Umi-OCR & PyMuPDF PDF converter"
        pdf.set_metadata(meta)
        # Copy the links page by page.
        for pinput in doc:
            links = pinput.get_links()
            pout = pdf[pinput.number]
            for link in links:
                if link["kind"] == fitz.LINK_NAMED:  # named links are not handled
                    continue
                pout.insert_link(link)
        doc.close()  # release the source document
        return pdf

    def _calculateFontSize(self, text, w, h):
        """Return a font size at which one line of ``text`` about fills the box.

        ``w`` and ``h`` are the box width and height; a box that is taller
        than wide is treated as vertical text and measured as if horizontal.
        """
        if h > w:  # vertical text: swap so that w is the running direction
            w, h = h, w
        fontsize = round(h)  # initial guess: the line height
        minSize = 5  # lower bound used by the shrinking loops

        def getLen(s):
            return self.font.text_length(text, fontsize=s)

        # Text without measurable width (e.g. an empty string) can never
        # reach w, so the growing loop below would never terminate.
        if w > 0 and getLen(1) <= 0:
            return max(fontsize, minSize)
        while getLen(fontsize) > w and fontsize >= minSize:
            fontsize -= 1  # shrink until the line fits
        while getLen(fontsize) < w:
            fontsize += 1  # grow until the line just exceeds the width
        while getLen(fontsize) > w and fontsize >= minSize:
            fontsize -= 0.1  # shrink again in 0.1 steps for precision
        return fontsize

    def print(self, res):
        """Insert the text blocks of one page result into the document.

        ``res["page"]`` is the 1-based page number. Results whose code is
        not 100 only mark the page as handled.
        """
        if not self.pdf:
            logger.error("self.pdf 未初始化。")
            return
        pno = res["page"] - 1  # 0-based page number
        self.existentPages.append(pno)  # remember the page as handled
        if not res["code"] == 100:
            return  # nothing to insert

        page = self.pdf[pno]
        page.clean_contents()  # clean up the content stream to reduce errors
        protation = page.rotation  # page rotation angle
        isInsertFont = False  # whether the font was embedded into this page
        for tb in res["data"]:
            if self.opacity == 0 and "from" in tb and tb["from"] == "text":
                continue  # transparent mode: skip directly extracted text, keep OCR text only
            if not isInsertFont:  # embed the font once per page
                self.isInsertFont = isInsertFont = True
                page.insert_font(fontname="cjk", fontbuffer=self.font.buffer)
            text = tb["text"]
            box = tb["box"]
            x0, y0 = box[0]
            x2, y2 = box[2]
            w = x2 - x0
            h = y2 - y0
            fontsize = self._calculateFontSize(text, w, h)
            # Insertion point (x0, y2) mapped through the page's derotation matrix.
            point = fitz.Point(x0, y2) * page.derotation_matrix
            page.insert_text(
                point,
                text,
                fontsize,
                fontname="cjk",
                rotate=protation,  # text angle
                stroke_opacity=self.opacity,  # outline opacity
                fill_opacity=self.opacity,  # fill (glyph) opacity
            )

    def onEnd(self):
        """Remove unhandled pages and save the document to ``self.outputPath``."""
        if not self.pdf:
            return
        # Delete the pages that were never handled, back to front so the
        # remaining page numbers stay valid.
        handled = set(self.existentPages)
        for i in range(len(self.pdf) - 1, -1, -1):
            if i not in handled:
                self.pdf.delete_page(i)
        logger.info(f"保存{len(self.pdf)}页PDF:{self.outputPath}")
        if self.isInsertFont:  # a font was embedded: build a font subset
            try:  # fails for some PDFs, e.g. ones printed directly from txt
                self.pdf.subset_fonts()  # reduces file size; needs the fontTools library
            except Exception:  # TODO: cause unknown; maybe the file has no fonts at all?
                logger.error("构建字体子集失败。", exc_info=True, stack_info=True)
            # Save compressed with level-3 garbage collection (same as ez_save).
            self.save(self.pdf, self.outputPath, deflate=True, garbage=3)
        else:
            # No embedded font: save as is, without compression.
            self.save(self.pdf, self.outputPath)

    def save(self, pdf, path, **options):
        """Save ``pdf`` to ``path`` and close it.

        If saving to ``path`` fails, the document is saved to
        ``self.outputPath + ".temp"`` instead and that file is then moved
        over ``path``.

        Raises:
            Exception: if the temporary file cannot be written, or if it
                cannot replace ``path`` (the temporary file is kept then).
        """
        try:
            pdf.save(path, **options)
        except Exception:
            # Saving to the target failed: try the ".temp" path.
            tempPath = self.outputPath + ".temp"
            logger.warning(f"保存PDF失败。 path: {path}", exc_info=True)
            try:
                pdf.save(tempPath, **options)
                pdf.close()
            except Exception as e1:
                logger.error(
                    f"保存PDF到临时路径失败。 tempPath: {tempPath}", exc_info=True
                )
                try:
                    pdf.close()  # release the document even though saving failed
                except Exception:
                    logger.warning("pdf.close error", exc_info=True)
                raise Exception(
                    f"[Error] Unable to save PDF to [{tempPath}]: {e1}"
                ) from e1
            # Saved to .temp and the document is closed: replace the target file.
            try:
                if os.path.exists(path):
                    os.remove(path)
                os.rename(tempPath, path)
            except Exception as e2:
                logger.warning(
                    f"保存PDF文件替换失败。保存到临时文件: {tempPath}", exc_info=True
                )
                raise Exception(
                    f"[Warning] Unable to save PDF: [{path}]. Exception: {e2}. Saved to temporary path: [{tempPath}]."
                ) from e2
        else:  # normal case
            pdf.close()
|
||||
43
UmiOCR-data/py_src/ocr/output/output_pdf_one_layer.py
Normal file
43
UmiOCR-data/py_src/ocr/output/output_pdf_one_layer.py
Normal file
@@ -0,0 +1,43 @@
|
||||
# 单层纯文本 PDF
|
||||
|
||||
import fitz # PyMuPDF
|
||||
|
||||
from umi_log import logger
|
||||
from .output_pdf_layered import OutputPdfLayered
|
||||
|
||||
|
||||
class OutputPdfOneLayer(OutputPdfLayered):
    """Write a text-only PDF: blank pages of the original size with visible text."""

    def __init__(self, argd):
        """Initialise like the layered writer, then switch to opaque text."""
        super().__init__(argd)
        self.opacity = 1  # text is fully opaque
        self.outputPath = f"{self.dir}/{self.fileName}.text.pdf"  # output path

    def _getPDF(self, path):
        """Return a new PDF with one blank page per page of the document at ``path``.

        Metadata and, if possible, the table of contents are copied from the
        source document, which is closed before returning.

        Raises:
            Exception: if the document is encrypted and the password is wrong.
        """
        src = fitz.open(path)  # source document
        # Encrypted document: try to unlock it with the stored password.
        locked = src.is_encrypted and not src.authenticate(self.password)
        if locked:
            raise Exception(
                f'The document is encrypted, and the password "{self.password}" is incorrect.\n文档已加密,输入密码不正确。'
            )
        blank = fitz.open()  # new, empty PDF document
        # Copy the metadata (author, title, ...), filling in empty fields.
        info = src.metadata
        info["producer"] = info["producer"] or (
            "Umi-OCR & PyMuPDF v" + fitz.VersionBind
        )
        info["creator"] = info["creator"] or "Umi-OCR & PyMuPDF PDF converter"
        blank.set_metadata(info)
        # One blank page per source page, using the source page's rect size.
        for src_page in src:
            size = src_page.rect
            blank.new_page(width=size.width, height=size.height)
        # Copying the table of contents is best effort.
        try:
            blank.set_toc(src.get_toc())
        except Exception:
            logger.warning(
                f"pdf.set_toc error. path: {path}", exc_info=True, stack_info=True
            )
        src.close()  # release the source document
        return blank
|
||||
33
UmiOCR-data/py_src/ocr/output/output_txt.py
Normal file
33
UmiOCR-data/py_src/ocr/output/output_txt.py
Normal file
@@ -0,0 +1,33 @@
|
||||
# 输出到txt文件
|
||||
|
||||
from .output import Output
|
||||
from .tools import getDataText
|
||||
|
||||
|
||||
class OutputTxt(Output):
    """Append OCR results to one txt file, each preceded by a file-name header."""

    def __init__(self, argd):
        """Read settings from ``argd`` and create the file with the start date.

        Raises:
            Exception: if the output file cannot be created.
        """
        folder = argd["outputDir"]
        stem = argd["outputFileName"]
        self.dir = folder  # output directory
        self.fileName = stem  # file name
        self.outputPath = f"{folder}/{stem}.txt"  # output path
        self.ignoreBlank = argd["ignoreBlank"]  # skip non-successful results
        try:
            with open(self.outputPath, "w", encoding="utf-8") as out:  # overwrite
                out.write(f'{argd["startDatetime"]}\n\n')  # start date and time
        except Exception as e:
            raise Exception(f"Failed to create txt file. {e}\n创建txt文件失败。")

    def print(self, res):
        """Append one result dict to the txt file."""
        code = res["code"]
        if self.ignoreBlank and code != 100:
            return  # ignore blank / failed images
        parts = [f'≦ {res["fileName"]} ≧\n']
        if code == 100:
            parts.append(getDataText(res["data"]))  # joined text of all blocks
            parts.append("\n")  # extra newline after the text
        elif code != 101:  # 101 (no text) adds nothing
            parts.append(
                f'[Error] OCR failed. Code: {res["code"]}, Msg: {res["data"]}\n【异常】OCR识别失败。\n'
            )
        parts.append("\n")  # blank line between entries
        with open(self.outputPath, "a", encoding="utf-8") as out:  # append
            out.write("".join(parts))
|
||||
35
UmiOCR-data/py_src/ocr/output/output_txt_individual.py
Normal file
35
UmiOCR-data/py_src/ocr/output/output_txt_individual.py
Normal file
@@ -0,0 +1,35 @@
|
||||
# 单独txt文件
|
||||
|
||||
import os
|
||||
from .output import Output
|
||||
from .tools import getDataText
|
||||
|
||||
|
||||
class OutputTxtIndividual(Output):
    """Write the text of every image to its own txt file."""

    def __init__(self, argd):
        """Initialise the base settings and remember the target directory mode."""
        super().__init__(argd)
        # True: write next to the source image; False: write into self.dir.
        self.outputSource = argd["outputDirType"] == "source"

    def openOutputFile(self):
        """Do nothing; overrides the base-class behaviour."""
        return None

    def print(self, res):
        """Write the text of one result dict to a txt named after the image."""
        code = res["code"]
        if self.ignoreBlank and code != 100:
            return  # ignore blank / failed images
        if code == 100:
            content = getDataText(res["data"])  # joined text of all blocks
        elif code == 101:
            content = ""
        else:
            content = f'[Error] OCR failed. Code: {res["code"]}, Msg: {res["data"]}\n【异常】OCR识别失败。\n'
        # Pick the target file.
        if self.outputSource:
            # Source path with its extension replaced by .txt.
            target = os.path.splitext(res["path"])[0] + ".txt"
        else:
            # Source file name without extension, inside the output directory.
            stem = os.path.splitext(res["fileName"])[0]
            target = f"{self.dir}/{stem}.txt"
        with open(target, "w", encoding="utf-8") as out:  # overwrites an existing file
            out.write(content)
|
||||
29
UmiOCR-data/py_src/ocr/output/output_txt_plain.py
Normal file
29
UmiOCR-data/py_src/ocr/output/output_txt_plain.py
Normal file
@@ -0,0 +1,29 @@
|
||||
# 纯文本(无格式)txt文件
|
||||
|
||||
from .output import Output
|
||||
from .tools import getDataText
|
||||
|
||||
|
||||
class OutputTxtPlain(Output):
    """Append only the recognised text (no headers, no errors) to one txt file."""

    def __init__(self, argd):
        """Read settings from ``argd`` and create (truncate) the output file.

        Raises:
            Exception: if the output file cannot be created.
        """
        self.dir = argd["outputDir"]  # output directory
        self.fileName = argd["outputFileName"]  # file name
        self.outputPath = f"{self.dir}/{self.fileName}.p.txt"  # output path
        try:
            # Same encoding as the later appends; "with" closes the file
            # even if an error occurs.
            with open(self.outputPath, "w", encoding="utf-8"):
                pass
        except Exception as e:
            raise Exception(
                f"Failed to create plain txt file. {e}\n创建纯文本txt文件失败。"
            ) from e

    def print(self, res):
        """Append the text of one result dict; non-successful results are always skipped."""
        if not res["code"] == 100:
            return  # blank / failed images are always ignored
        textOut = getDataText(res["data"])  # joined text of all blocks
        # endswith() also copes with empty text, where indexing [-1] would raise.
        if not textOut.endswith("\n"):
            textOut += "\n"  # make sure the entry ends with a newline
        with open(self.outputPath, "a", encoding="utf-8") as f:  # append
            f.write(textOut)
|
||||
9
UmiOCR-data/py_src/ocr/output/tools.py
Normal file
9
UmiOCR-data/py_src/ocr/output/tools.py
Normal file
@@ -0,0 +1,9 @@
|
||||
# 从data中提取、拼接文本
|
||||
def getDataText(data):
    """Join the text of all text blocks in ``data`` into one string.

    Every block contributes its ``"text"``; every block except the last one
    also contributes its ``"end"`` separator. The ``"end"`` key of the last
    block is never read.
    """
    last = len(data) - 1
    pieces = []
    for idx, block in enumerate(data):
        pieces.append(block["text"])
        if idx != last:  # idx never exceeds last, so this equals idx < last
            pieces.append(block["end"])
    return "".join(pieces)
|
||||
32
UmiOCR-data/py_src/ocr/tbpu/__init__.py
Normal file
32
UmiOCR-data/py_src/ocr/tbpu/__init__.py
Normal file
@@ -0,0 +1,32 @@
|
||||
# tbpu : text block processing unit 文本块后处理
|
||||
|
||||
from .parser_none import ParserNone
|
||||
|
||||
from .ignore_area import IgnoreArea
|
||||
from .parser_multi_para import MultiPara
|
||||
from .parser_multi_line import MultiLine
|
||||
from .parser_multi_none import MultiNone
|
||||
from .parser_single_para import SinglePara
|
||||
from .parser_single_line import SingleLine
|
||||
from .parser_single_none import SingleNone
|
||||
from .parser_single_code import SingleCode
|
||||
|
||||
# 排版解析
|
||||
# Maps a layout-parser key to the class implementing it.
Parser = {
    "none": ParserNone,  # no processing
    "multi_para": MultiPara,  # multi-column, natural paragraphs
    "multi_line": MultiLine,  # multi-column, always break lines
    "multi_none": MultiNone,  # multi-column, no line breaks
    "single_para": SinglePara,  # single-column, natural paragraphs
    "single_line": SingleLine,  # single-column, always break lines
    "single_none": SingleNone,  # single-column, no line breaks
    "single_code": SingleCode,  # single-column, code block
}
|
||||
|
||||
|
||||
# 获取排版解析器对象
|
||||
def getParser(key):
    """Return a new layout-parser instance for ``key``.

    Keys missing from ``Parser`` fall back to the ``"none"`` parser.
    """
    parser_cls = Parser[key] if key in Parser else Parser["none"]
    return parser_cls()
|
||||
32
UmiOCR-data/py_src/ocr/tbpu/ignore_area.py
Normal file
32
UmiOCR-data/py_src/ocr/tbpu/ignore_area.py
Normal file
@@ -0,0 +1,32 @@
|
||||
# 忽略区域
|
||||
|
||||
from .tbpu import Tbpu
|
||||
|
||||
|
||||
class IgnoreArea(Tbpu):
    """Drop the text blocks that lie completely inside one of the ignore areas."""

    def __init__(self, areaList):
        """Store ``areaList``, the boxes whose content is to be ignored."""
        self.tbpuName = "忽略区域"
        self.areaList = areaList

    def run(self, textBlocks):
        """Return a new list with the text blocks that are not ignored."""

        def covers(area, box):
            # True if ``area`` contains ``box``; compares point [0] and point [2].
            if not area[0][0] <= box[0][0]:
                return False
            if not area[0][1] <= box[0][1]:
                return False
            if not area[2][0] >= box[2][0]:
                return False
            return area[2][1] >= box[2][1]

        # Keep a block unless some area contains its box.
        return [
            tb
            for tb in textBlocks
            if not any(covers(area, tb["box"]) for area in self.areaList)
        ]
|
||||
22
UmiOCR-data/py_src/ocr/tbpu/parser_multi_line.py
Normal file
22
UmiOCR-data/py_src/ocr/tbpu/parser_multi_line.py
Normal file
@@ -0,0 +1,22 @@
|
||||
# 排版解析-多栏-单行
|
||||
|
||||
from .tbpu import Tbpu
|
||||
from .parser_tools.line_preprocessing import linePreprocessing # 行预处理
|
||||
from .parser_tools.gap_tree import GapTree # 间隙树排序算法
|
||||
|
||||
|
||||
class MultiLine(Tbpu):
    """Layout parser: multi-column, every text block ends with a line break."""

    def __init__(self):
        self.tbpuName = "排版解析-多栏-单行"

        def bbox_of(tb):
            # Tells the sorter where a block keeps its bounding box.
            return tb["normalized_bbox"]

        self.gtree = GapTree(bbox_of)  # gap-tree sorting algorithm

    def run(self, textBlocks):
        """Preprocess and sort the blocks, then end each one with a newline."""
        ordered = self.gtree.sort(linePreprocessing(textBlocks))
        for tb in ordered:
            tb["end"] = "\n"  # separator after this block
            tb.pop("normalized_bbox")  # drop the helper key again
        return ordered
|
||||
29
UmiOCR-data/py_src/ocr/tbpu/parser_multi_none.py
Normal file
29
UmiOCR-data/py_src/ocr/tbpu/parser_multi_none.py
Normal file
@@ -0,0 +1,29 @@
|
||||
# 排版解析-多栏-无换行
|
||||
|
||||
from .tbpu import Tbpu
|
||||
from .parser_tools.line_preprocessing import linePreprocessing # 行预处理
|
||||
from .parser_tools.gap_tree import GapTree # 间隙树排序算法
|
||||
from .parser_tools.paragraph_parse import word_separator # 上下句间隔符
|
||||
|
||||
|
||||
class MultiNone(Tbpu):
    """Layout parser: multi-column, blocks are joined without line breaks."""

    def __init__(self):
        self.tbpuName = "排版解析-多栏-无换行"

        def bbox_of(tb):
            # Tells the sorter where a block keeps its bounding box.
            return tb["normalized_bbox"]

        self.gtree = GapTree(bbox_of)  # gap-tree sorting algorithm

    def run(self, textBlocks):
        """Preprocess and sort the blocks, then set the separator after each one."""
        ordered = self.gtree.sort(linePreprocessing(textBlocks))
        last = len(ordered) - 1
        for idx, tb in enumerate(ordered):
            if idx < last:
                # Separator chosen from the last character of this block and
                # the first character of the next one.
                tail = tb["text"][-1]
                head = ordered[idx + 1]["text"][0]
                tb["end"] = word_separator(tail, head)
            else:
                tb["end"] = "\n"  # the final block ends with a newline
            del tb["normalized_bbox"]  # drop the helper key again
        return ordered
|
||||
33
UmiOCR-data/py_src/ocr/tbpu/parser_multi_para.py
Normal file
33
UmiOCR-data/py_src/ocr/tbpu/parser_multi_para.py
Normal file
@@ -0,0 +1,33 @@
|
||||
# 排版解析-多栏-自然段
|
||||
|
||||
from .tbpu import Tbpu
|
||||
from .parser_tools.line_preprocessing import linePreprocessing # 行预处理
|
||||
from .parser_tools.gap_tree import GapTree # 间隙树排序算法
|
||||
from .parser_tools.paragraph_parse import ParagraphParse # 段内分析器
|
||||
|
||||
|
||||
class MultiPara(Tbpu):
    """Layout parser: multi-column, separators predicted per natural paragraph."""

    def __init__(self):
        self.tbpuName = "排版解析-多栏-自然段"

        def bbox_of(tb):
            return tb["normalized_bbox"]

        self.gtree = GapTree(bbox_of)  # gap-tree object

        def get_info(tb):
            # Bounding box and text of a block, as read by the paragraph parser.
            return (tb["normalized_bbox"], tb["text"])

        def set_end(tb, end):
            # Receives the predicted separator for the end of a block.
            tb["end"] = end

        self.pp = ParagraphParse(get_info, set_end)  # paragraph analyser

    def run(self, textBlocks):
        """Preprocess and sort the blocks, then predict separators per tree node."""
        ordered = self.gtree.sort(linePreprocessing(textBlocks))
        # Analyse every tree node (a sequence of blocks) on its own.
        for node_blocks in self.gtree.get_nodes_text_blocks():
            self.pp.run(node_blocks)  # predict the end separators
            for tb in node_blocks:
                del tb["normalized_bbox"]  # drop the helper key again
        return ordered
|
||||
14
UmiOCR-data/py_src/ocr/tbpu/parser_none.py
Normal file
14
UmiOCR-data/py_src/ocr/tbpu/parser_none.py
Normal file
@@ -0,0 +1,14 @@
|
||||
# 排版解析-不做处理
|
||||
|
||||
from .tbpu import Tbpu
|
||||
|
||||
|
||||
class ParserNone(Tbpu):
    """Layout parser that leaves the blocks as they are."""

    def __init__(self):
        self.tbpuName = "排版解析-不做处理"

    def run(self, textBlocks):
        """Give every block lacking an ``"end"`` key a newline separator."""
        for block in textBlocks:
            block.setdefault("end", "\n")  # an existing separator is kept
        return textBlocks
|
||||
73
UmiOCR-data/py_src/ocr/tbpu/parser_single_code.py
Normal file
73
UmiOCR-data/py_src/ocr/tbpu/parser_single_code.py
Normal file
@@ -0,0 +1,73 @@
|
||||
# 排版解析-单栏-代码段
|
||||
|
||||
from .parser_single_line import SingleLine
|
||||
from .parser_tools.line_preprocessing import linePreprocessing # 行预处理
|
||||
|
||||
from bisect import bisect_left
|
||||
|
||||
|
||||
class SingleCode(SingleLine):
    """Layout parser: single column, code block.

    The blocks of a line are merged into one block, with spaces inserted for
    horizontal gaps, and every line is prefixed with spaces according to its
    indentation level.
    """

    def __init__(self):
        self.tbpuName = "排版解析-单栏-代码段"

    def merge_line(self, line):
        """Merge all blocks of ``line`` into its first block and return it.

        The first block is modified in place: texts are concatenated, the box
        becomes the union of all boxes, the score becomes the mean score and
        ``"end"`` is set to a newline.
        """
        A = line[0]
        ba = A["box"]
        ha = ba[3][1] - ba[0][1]  # line height of block A
        score = A["score"]
        for i in range(1, len(line)):
            B = line[i]
            bb = B["box"]
            ha = (ha + bb[3][1] - bb[0][1]) / 2
            # Join the texts, adding as many spaces as the gap is wide,
            # measured in line heights.
            space = 0
            if bb[0][0] > ba[1][0] and ha > 0:  # ha > 0 avoids a division by zero
                space = round((bb[0][0] - ba[1][0]) / ha)
            A["text"] += " " * space + B["text"]
            # Merge the bounding boxes.
            yTop = min(ba[0][1], ba[1][1], bb[0][1], bb[1][1])
            yBottom = max(ba[2][1], ba[3][1], bb[2][1], bb[3][1])
            xLeft = min(ba[0][0], ba[3][0], bb[0][0], bb[3][0])
            xRight = max(ba[1][0], ba[2][0], bb[1][0], bb[2][0])
            ba[0][1] = ba[1][1] = yTop  # top y
            ba[2][1] = ba[3][1] = yBottom  # bottom y
            ba[0][0] = ba[3][0] = xLeft  # left x
            ba[1][0] = ba[2][0] = xRight  # right x
            # Confidence
            score += B["score"]
        A["score"] = score / len(line)
        del A["normalized_bbox"]
        A["end"] = "\n"
        return A

    def indent(self, tbs):
        """Prefix every line with spaces for its indentation level, in place.

        Levels are one average line height apart, starting at the leftmost
        line start. The left edge of every box is moved to that leftmost x.
        """
        if not tbs:
            return  # nothing to indent; also avoids dividing by zero below
        lh = 0  # average line height
        xMin = float("inf")  # leftmost / rightmost x of the line starts
        xMax = float("-inf")
        for tb in tbs:
            b = tb["box"]
            lh += b[3][1] - b[0][1]
            x = b[0][0]
            xMin = min(xMin, x)
            xMax = max(xMax, x)
        lh /= len(tbs)
        lh2 = lh / 2
        # Build the list of indentation level positions.
        levelList = []
        if lh > 0:  # with a step <= 0 the loop below would never end
            x = xMin
            while x < xMax:
                levelList.append(x)
                x += lh
        # Prefix each line with spaces for its level and adjust its box.
        for tb in tbs:
            b = tb["box"]
            level = bisect_left(levelList, b[0][0] + lh2) - 1  # binary search
            tb["text"] = " " * level + tb["text"]  # a negative level adds nothing
            b[0][0] = b[3][0] = xMin  # align the left edge

    def run(self, textBlocks):
        """Return one merged, indented block per detected line."""
        textBlocks = linePreprocessing(textBlocks)  # preprocessing
        lines = self.get_lines(textBlocks)  # group the blocks into lines
        tbs = [self.merge_line(line) for line in lines]  # merge every line
        self.indent(tbs)  # add the leading indentation
        return tbs
|
||||
73
UmiOCR-data/py_src/ocr/tbpu/parser_single_line.py
Normal file
73
UmiOCR-data/py_src/ocr/tbpu/parser_single_line.py
Normal file
@@ -0,0 +1,73 @@
|
||||
# 排版解析-单栏-单行
|
||||
|
||||
from .tbpu import Tbpu
|
||||
from .parser_tools.line_preprocessing import linePreprocessing # 行预处理
|
||||
from .parser_tools.paragraph_parse import word_separator # 上下句间隔符
|
||||
|
||||
|
||||
class SingleLine(Tbpu):
    """Layout parser: single column, every detected line ends with a line break."""

    def __init__(self):
        self.tbpuName = "排版解析-单栏-单行"

    def get_lines(self, textBlocks):
        """Group text blocks into lines and set the separator after each block.

        ``textBlocks`` is sorted in place and emptied during the process
        (all entries end up as ``None``). Each block must carry a
        ``"normalized_bbox"`` of the form (left, top, right, bottom).

        Returns:
            A list of lines sorted by the top y of their first block; each
            line is a list of blocks ordered left to right. Within a line the
            blocks get a space or the ``word_separator`` result as ``"end"``;
            the last block of a line gets a newline.
        """
        # Sort by x.
        textBlocks.sort(key=lambda tb: tb["normalized_bbox"][0])
        lines = []
        for i1, tb1 in enumerate(textBlocks):
            if not tb1:
                continue  # already assigned to a line
            # Leftmost remaining block starts a new line.
            l1, top1, r1, bottom1 = tb1["normalized_bbox"]
            h1 = bottom1 - top1
            now_line = [tb1]
            # Check which blocks further right belong to this line.
            for i2 in range(i1 + 1, len(textBlocks)):
                tb2 = textBlocks[i2]
                if not tb2:
                    continue
                l2, top2, r2, bottom2 = tb2["normalized_bbox"]
                h2 = bottom2 - top2
                # Block 2 starts too far to the left.
                if l2 < r1 - h1:
                    continue
                # Too far apart vertically.
                if top2 < top1 - h1 * 0.5 or bottom2 > bottom1 + h1 * 0.5:
                    continue
                # Line heights differ too much.
                if abs(h1 - h2) > min(h1, h2) * 0.5:
                    continue
                # Block 2 belongs to the line.
                now_line.append(tb2)
                textBlocks[i2] = None
                # Continue the search from the right edge of block 2.
                r1 = r2
            # The line is complete: set the separators inside it.
            for i2 in range(len(now_line) - 1):
                # Horizontal gap between neighbouring blocks of the line.
                l1, t1, r1, b1 = now_line[i2]["normalized_bbox"]
                l2, t2, r2, b2 = now_line[i2 + 1]["normalized_bbox"]
                # Mean height of both blocks. The second top (t2) has to be
                # subtracted here, not the second left edge (l2).
                h = ((b1 - t1) + (b2 - t2)) * 0.5
                if l2 - r1 > h * 1.5:  # large gap: always a space
                    now_line[i2]["end"] = " "
                    continue
                letter1 = now_line[i2]["text"][-1]
                letter2 = now_line[i2 + 1]["text"][0]
                now_line[i2]["end"] = word_separator(letter1, letter2)
            now_line[-1]["end"] = "\n"
            lines.append(now_line)
            textBlocks[i1] = None
        # Sort all lines by y.
        lines.sort(key=lambda tbs: tbs[0]["normalized_bbox"][1])
        return lines

    def run(self, textBlocks):
        """Return the blocks in reading order with their separators set."""
        textBlocks = linePreprocessing(textBlocks)  # preprocessing
        lines = self.get_lines(textBlocks)  # group the blocks into lines
        # Flatten the lines into one list.
        textBlocks = []
        for line in lines:
            for tb in line:
                del tb["normalized_bbox"]  # drop the helper key again
                textBlocks.append(tb)
        return textBlocks
|
||||
19
UmiOCR-data/py_src/ocr/tbpu/parser_single_none.py
Normal file
19
UmiOCR-data/py_src/ocr/tbpu/parser_single_none.py
Normal file
@@ -0,0 +1,19 @@
|
||||
# 排版解析-单栏-无换行
|
||||
|
||||
from .parser_single_line import SingleLine
|
||||
from .parser_tools.paragraph_parse import word_separator # 上下句间隔符
|
||||
|
||||
|
||||
class SingleNone(SingleLine):
    """Layout parser: single column, lines are joined without line breaks."""

    def __init__(self):
        self.tbpuName = "排版解析-单栏-无换行"

    def run(self, textBlocks):
        """Run the single-line parser, then replace the line breaks between blocks."""
        blocks = super().run(textBlocks)
        # Every newline separator except the one after the final block is
        # replaced by the separator fitting the neighbouring characters.
        for current, following in zip(blocks, blocks[1:]):
            if current["end"] != "\n":
                continue
            tail = current["text"][-1]
            head = following["text"][0]
            current["end"] = word_separator(tail, head)
        return blocks
|
||||
49
UmiOCR-data/py_src/ocr/tbpu/parser_single_para.py
Normal file
49
UmiOCR-data/py_src/ocr/tbpu/parser_single_para.py
Normal file
@@ -0,0 +1,49 @@
|
||||
# 排版解析-单栏-自然段
|
||||
|
||||
from .parser_single_line import SingleLine
|
||||
from .parser_tools.line_preprocessing import linePreprocessing # 行预处理
|
||||
from .parser_tools.paragraph_parse import ParagraphParse # 段内分析器
|
||||
|
||||
|
||||
class SinglePara(SingleLine):
    """Layout parser: single column, natural paragraphs."""

    def __init__(self):
        self.tbpuName = "排版解析-单栏-自然段"

        # Callbacks for the paragraph analyser. It works on temporary
        # "line" blocks built in run(); each wraps the real blocks of one
        # line under the "line" key.
        def get_info(tb):
            return tb["normalized_bbox"], tb["text"]

        def set_end(tb, end):
            # Store the predicted separator on the last real block of the line.
            tb["line"][-1]["end"] = end

        self.pp = ParagraphParse(get_info, set_end)

    def run(self, textBlocks):
        """Group blocks into lines, then predict paragraph-aware separators.

        :param textBlocks: list of text block dicts.
        :return: flat list of text blocks; the last block of every line has
            its ``"end"`` set by the paragraph analyser.
        """
        textBlocks = linePreprocessing(textBlocks)  # preprocessing
        lines = self.get_lines(textBlocks)  # blocks grouped per line
        if not lines:
            # Nothing to analyse; the paragraph analyser needs >= 1 line.
            return []
        # Wrap every line into one temporary block.
        temp_tbs = []
        for line in lines:
            b0, b1, b2, b3 = line[0]["normalized_bbox"]
            # Union of the bounding boxes of all blocks in the line.
            # Each edge must be compared with its own running value
            # (the previous code compared right/bottom against the top).
            for tb in line[1:]:
                bb = tb["normalized_bbox"]
                b0 = min(b0, bb[0])
                b1 = min(b1, bb[1])
                b2 = max(b2, bb[2])
                b3 = max(b3, bb[3])
            temp_tbs.append(
                {
                    "normalized_bbox": (b0, b1, b2, b3),
                    # Only the first and last character of the line are
                    # needed by the paragraph analyser.
                    "text": line[0]["text"][0] + line[-1]["text"][-1],
                    "line": line,
                }
            )
        # Predict the trailing separator of every line.
        self.pp.run(temp_tbs)
        # Unpack the temporary line blocks back into a flat list.
        textBlocks = []
        for t in temp_tbs:
            for tb in t["line"]:
                del tb["normalized_bbox"]
                textBlocks.append(tb)
        return textBlocks
|
||||
330
UmiOCR-data/py_src/ocr/tbpu/parser_tools/gap_tree.py
Normal file
330
UmiOCR-data/py_src/ocr/tbpu/parser_tools/gap_tree.py
Normal file
@@ -0,0 +1,330 @@
|
||||
# 【间隙·树·排序算法】 GapTree_Sort_Algorithm
|
||||
# 对OCR结果或PDF提取的文本进行版面分析,按人类阅读顺序进行排序。
|
||||
# Author: hiroi-sora
|
||||
# https://github.com/hiroi-sora/GapTree_Sort_Algorithm
|
||||
|
||||
from typing import Callable
|
||||
|
||||
|
||||
class GapTree:
    """Gap-tree sort: order the text blocks of a page in reading order.

    The page is scanned row by row; horizontal gaps that persist over
    consecutive rows become vertical cuts, the cuts split the page into
    column regions (tree nodes), and a pre-order walk of the resulting
    layout tree yields the final block order.
    """

    def __init__(self, get_bbox: Callable):
        """
        :param get_bbox: callable taking one text block and returning the
            coordinates of its top-left and bottom-right corners as a tuple
            (x0, y0, x1, y1).
        """
        self.get_bbox = get_bbox
        # Intermediate results of the latest sort() call, kept for
        # debugging and for get_nodes_text_blocks(). Initialised here so
        # that they exist even before the first sort().
        self.current_rows = []
        self.current_cuts = []
        self.current_nodes = []

    # ======================= Public interface =====================
    def sort(self, text_blocks: list):
        """
        Sort a list of text blocks in human reading order.

        :param text_blocks: list of text block objects.
        :return: new list with the same text block objects, sorted.
            An empty input yields an empty list.
        """
        # Wrap the blocks into units and find the left/right page edges.
        units, page_l, page_r = self._get_units(text_blocks, self.get_bbox)
        if not units:
            # Without blocks there are no cuts, and the layout tree cannot
            # be built (it reads cuts[0]); return early instead of crashing.
            self.current_rows = []
            self.current_cuts = []
            self.current_nodes = []
            return []
        # Rows and vertical cuts.
        cuts, rows = self._get_cuts_rows(units, page_l, page_r)
        # Layout tree.
        root = self._get_layout_tree(cuts, rows)
        # Node sequence of the tree.
        nodes = self._preorder_traversal(root)
        # Original text blocks in sorted order.
        new_text_blocks = self._get_text_blocks(nodes)

        # Cache intermediate values for debugging output.
        self.current_rows = rows
        self.current_cuts = cuts
        self.current_nodes = nodes

        return new_text_blocks

    def get_nodes_text_blocks(self):
        """
        Return the text blocks grouped by layout region. Reflects the most
        recent sort() call; nodes without blocks (e.g. the root) are skipped.

        :return: [ [text_blocks of region 1], [text_blocks of region 2], ... ]
        """
        result = []
        for node in self.current_nodes:
            if node["units"]:
                result.append([unit[1] for unit in node["units"]])
        return result

    # ======================= Wrap blocks into units =====================
    def _get_units(self, text_blocks, get_bbox):
        """Wrap every block as ( (x0, y0, x2, y2), original block ).

        :return: (units sorted by top edge, leftmost x0, rightmost x2).
            For empty input the edges keep their initial values
            (float("inf"), -1).
        """
        units = []
        page_l, page_r = float("inf"), -1  # extreme x values = page edges
        for tb in text_blocks:
            x0, y0, x2, y2 = get_bbox(tb)
            units.append(((x0, y0, x2, y2), tb))
            if x0 < page_l:
                page_l = x0
            if x2 > page_r:
                page_r = x2
        units.sort(key=lambda a: a[0][1])  # top to bottom
        return units, page_l, page_r

    # ======================= Rows and vertical cuts =====================
    def _get_cuts_rows(self, units, page_l, page_r):
        """Scan all units and collect rows and vertical cuts.

        A row is a group of units whose vertical positions are close.
        A vertical cut is formed by gaps at the same horizontal position in
        several consecutive rows; gaps separate the columns within a row.

        :param units: [ ( (x0, y0, x2, y2), _ ) ], sorted top to bottom.
        :return: (cuts, rows) where
            cuts = [ (left x, right x, first row index, last row index) ],
            sorted left to right, and
            rows = [ [unit, ...] ], top to bottom, each row left to right.
        """

        def update_gaps(gaps1, gaps2):
            """Update gap group gaps1 with gaps2.

            :return: (updated gaps1, gaps of gaps1 that were dropped).
            """
            flags1 = [True for _ in gaps1]  # gaps1[i] is dropped entirely
            flags2 = [True for _ in gaps2]  # gaps2[i] is newly added
            new_gaps1 = []
            for i1, g1 in enumerate(gaps1):
                l1, r1, _ = g1
                for i2, g2 in enumerate(gaps2):  # check every gap2 per gap1
                    l2, r2, _ = g2
                    # Intersection of the two gaps.
                    inter_l = max(l1, l2)
                    inter_r = min(r1, r2)
                    if inter_l <= inter_r:  # intersection is valid
                        # Narrow gap1 to the intersection, keep its start row.
                        new_gaps1.append((inter_l, inter_r, g1[2]))
                        flags1[i1] = False  # old gap1 survives
                        flags2[i2] = False  # gap2 is not a new gap
            # Gaps of gaps2 that met no existing gap are new.
            for i2, f2 in enumerate(flags2):
                if f2:
                    new_gaps1.append(gaps2[i2])
            # Gaps of gaps1 that met nothing are finished.
            del_gaps1 = [gaps1[i1] for i1, f1 in enumerate(flags1) if f1]
            return new_gaps1, del_gaps1

        # ========================================

        page_l -= 1  # keep the page edges clear of the text blocks
        page_r += 1
        # All rows; a row holds the units on one horizontal band
        # (they may belong to several columns). [ [unit, ...] ]
        rows = []
        # Finished cuts. [ (left x, right x, first row, last row) ]
        completed_cuts = []
        # Gaps still under consideration. [ (left x, right x, first row) ]
        gaps = []
        row_index = 0  # current row number
        unit_index = 0  # current unit number
        l_units = len(units)
        while unit_index < l_units:
            # ========== Collect the current row ==========
            unit = units[unit_index]  # topmost unit of the row
            u_bottom = unit[0][3]
            row = [unit]
            for i in range(unit_index + 1, l_units):
                next_u = units[i]
                next_top = next_u[0][1]
                if next_top > u_bottom:
                    break  # next unit starts below this row
                row.append(next_u)
                unit_index = i  # advance past the consumed unit
            # ========== Gaps of the current row ==========
            row.sort(key=lambda x: (x[0][0], x[0][2]))  # left to right
            row_gaps = []  # [ (left edge, right edge, first row) ]
            search_start = page_l  # scanning starts at the left page edge
            for u in row:
                left = u[0][0]
                right = u[0][2]
                # Free space before the unit is a gap.
                if left > search_start:
                    row_gaps.append((search_start, left, row_index))
                # Move the scan position behind the unit.
                if right > search_start:
                    search_start = right
            # Last gap reaches the right page edge.
            row_gaps.append((search_start, page_r, row_index))
            # ========== Update the gaps under consideration ==========
            gaps, del_gaps = update_gaps(gaps, row_gaps)
            # Dropped gaps become finished cuts ending at the previous row.
            row_max = row_index - 1
            for dg1 in del_gaps:
                completed_cuts.append((*dg1, row_max))
            # ========== End ==========
            rows.append(row)
            unit_index += 1
            row_index += 1
        # Remaining gaps form cuts that extend to the last row.
        row_max = len(rows) - 1
        for g in gaps:
            completed_cuts.append((*g, row_max))
        completed_cuts.sort(key=lambda c: c[0])
        return completed_cuts, rows

    # ======================= Layout tree =====================
    def _get_layout_tree(self, cuts, rows):
        """Build the layout tree from cuts and rows.

        A tree node represents one region:
            node = {
                "x_left": left edge x,
                "x_right": right edge x,
                "r_top": index of the top row,
                "r_bottom": index of the bottom row,
                "units": [],  # units inside the node (empty for the root)
                "children": [],  # child nodes, ordered
            }

        :param cuts: non-empty cut list from _get_cuts_rows.
        :param rows: row list from _get_cuts_rows.
        :return: the root node.
        """
        # A cut slices a row; the sliced-out part is a "gap".
        # Build the (left, right) gap list of every row.
        rows_gaps = [[] for _ in rows]
        for cut in cuts:
            for r_i in range(cut[2], cut[3] + 1):
                rows_gaps[r_i].append((cut[0], cut[1]))

        root = {
            "x_left": cuts[0][0] - 1,
            "x_right": cuts[-1][1] + 1,
            "r_top": -1,
            "r_bottom": -1,
            "units": [],
            "children": [],
        }
        completed_nodes = [root]  # nodes that are finished
        now_nodes = []  # nodes still growing, unordered

        # ========== Finish a node and attach it to the tree ==========
        def complete(node):
            # x position used to look for a parent above the node.
            # NOTE(review): the offset 2 presumably compensates the +-1
            # margins added to the page and root edges - confirm.
            node_r = node["x_right"] - 2
            max_nodes = []  # lowest finished nodes that qualify as parent
            max_r = -2  # bottom row of those candidates
            for com_node in completed_nodes:
                # The parent's horizontal extent must contain node_r.
                if node_r < com_node["x_left"] or node_r > com_node["x_right"] + 0.0001:
                    continue
                # The parent must end above the current node.
                if com_node["r_bottom"] >= node["r_top"]:
                    continue
                # A lower candidate replaces the previous ones.
                if com_node["r_bottom"] > max_r:
                    max_r = com_node["r_bottom"]
                    max_nodes = [com_node]
                    continue
                # An equally low candidate joins them.
                if com_node["r_bottom"] == max_r:
                    max_nodes.append(com_node)
                    continue
            # Among the lowest candidates the rightmost one is the parent.
            max_node = max(max_nodes, key=lambda n: n["x_right"])
            max_node["children"].append(node)
            completed_nodes.append(node)

        # ========== Walk the rows and grow the tree ==========
        for r_i, row in enumerate(rows):
            row_gaps = rows_gaps[r_i]  # gaps of this row
            u_i = g_i = 0  # index of the current unit / gap

            # ========== Finish nodes that cannot continue ==========
            new_nodes = []
            for node in now_nodes:
                l_flag = r_flag = False  # left / right edge continues
                completed_flag = False  # node has to be finished
                x_left = node["x_left"]
                x_right = node["x_right"]
                for gap in row_gaps:
                    if gap[1] == x_left:  # left edge continued by a gap
                        l_flag = True
                    if gap[0] == x_right:  # right edge continued by a gap
                        r_flag = True
                    # A gap edge strictly inside the node interrupts it.
                    if x_left < gap[0] < x_right or x_left < gap[1] < x_right:
                        completed_flag = True
                        break
                if not l_flag or not r_flag:  # an edge does not continue
                    completed_flag = True
                if completed_flag:
                    complete(node)
                else:
                    node["r_bottom"] = r_i
                    new_nodes.append(node)
            now_nodes = new_nodes

            # ========== Assign the units of the row to column nodes ==========
            while u_i < len(row):
                unit = row[u_i]
                # The unit is expected between gap g_i and gap g_i + 1.
                x_l = row_gaps[g_i][1]  # right edge of the left gap
                x_r = row_gaps[g_i + 1][0]  # left edge of the right gap
                if unit[0][0] + 0.0001 > x_r:
                    # Unit lies right of the right gap: next interval.
                    g_i += 1
                    continue
                # ========== Try to join an existing node ==========
                flag = False
                for node in now_nodes:
                    # A node with exactly this interval takes the unit.
                    if node["x_left"] == x_l and node["x_right"] == x_r:
                        node["units"].append(unit)
                        flag = True
                        break
                if flag:
                    u_i += 1
                    continue
                # ========== Otherwise open a new node ==========
                now_nodes.append(
                    {
                        "x_left": x_l,
                        "x_right": x_r,
                        "r_top": r_i,
                        "r_bottom": r_i,
                        "units": [unit],
                        "children": [],
                    }
                )
                u_i += 1
        # Attach the nodes that are still open.
        for node in now_nodes:
            complete(node)
        # Tidy up all nodes.
        for node in completed_nodes:
            node["children"].sort(key=lambda n: n["x_left"])  # left to right
            node["units"].sort(key=lambda u: u[0][1])  # top to bottom
        return root

    # ======================= Pre-order traversal =====================
    def _preorder_traversal(self, root):
        """Return all nodes of the layout tree in pre-order."""
        if not root:
            return []
        stack = [root]
        result = []
        while stack:
            node = stack.pop()
            result.append(node)
            # Push children reversed so the left child is handled first.
            stack += reversed(node["children"])
        return result

    # ======================= Extract the original blocks =====================
    def _get_text_blocks(self, nodes):
        """Return the original text blocks of all nodes, in node order."""
        result = []
        for node in nodes:
            for unit in node["units"]:
                result.append(unit[1])
        return result
|
||||
@@ -0,0 +1,98 @@
|
||||
# =========================================
|
||||
# =============== 按行预处理 ===============
|
||||
# =========================================
|
||||
|
||||
from statistics import median # 中位数
|
||||
from math import atan2, cos, sin, sqrt, pi, radians, degrees
|
||||
|
||||
from umi_log import logger
|
||||
|
||||
angle_threshold = 3 # 进行一些操作的最小角度阈值
|
||||
angle_threshold_rad = radians(angle_threshold)
|
||||
|
||||
|
||||
# 计算两点之间的距离
|
||||
def _distance(point1, point2):
|
||||
return sqrt((point2[0] - point1[0]) ** 2 + (point2[1] - point1[1]) ** 2)
|
||||
|
||||
|
||||
# 计算一个box的旋转角度
|
||||
def _calculateAngle(box):
    """Return the rotation angle of one box, in radians.

    The angle is taken along the longer of the two edges box[0]-box[1] and
    box[1]-box[2]. It is then folded by pi so that it lies in
    [-pi/2 + angle_threshold_rad, pi/2 + angle_threshold_rad).

    :param box: sequence of (x, y) corner points; only the first three
        are read.
    """
    p0, p1, p2 = box[0], box[1], box[2]
    width = _distance(p0, p1)
    height = _distance(p1, p2)
    # Measure along the longer edge; on a tie the first edge wins.
    start, end = (p1, p2) if width < height else (p0, p1)
    angle_rad = atan2(end[1] - start[1], end[0] - start[0])
    # Fold the angle into the half-turn window shifted by the threshold.
    lower_bound = -pi / 2 + angle_threshold_rad
    upper_bound = pi / 2 + angle_threshold_rad
    if angle_rad < lower_bound:
        return angle_rad + pi
    if angle_rad >= upper_bound:
        return angle_rad - pi
    return angle_rad
|
||||
|
||||
|
||||
# 估计一组文本块的旋转角度
|
||||
def _estimateRotation(textBlocks):
    """Estimate the rotation of a group of text blocks, in radians.

    The estimate is the median of the per-block angles computed from
    block["box"] = [top-left, top-right, bottom-right, bottom-left].
    """
    angles = [_calculateAngle(tb["box"]) for tb in textBlocks]
    return median(angles)
|
||||
|
||||
|
||||
# 获取旋转后的标准bbox。angle_threshold为执行旋转的阈值(最小角度值)。
|
||||
def _getBboxes(textBlocks, rotation_rad):
    """Return one axis-aligned bbox (x0, y0, x1, y1) per text block.

    When abs(rotation_rad) does not exceed angle_threshold_rad the bbox is
    built directly from tb["box"]. Otherwise every corner is rotated by
    -rotation_rad first, and if the rotation produced a negative minimum
    x or y, all boxes are shifted by the minimum x and y.

    :param textBlocks: list of text block dicts with a "box" corner list.
    :param rotation_rad: estimated rotation in radians.
    :return: list of bbox tuples, in the order of textBlocks.
    """
    # Near 0 degrees: skip the rotation to save work.
    if abs(rotation_rad) <= angle_threshold_rad:
        plain = []
        for tb in textBlocks:
            xs = [x for x, _ in tb["box"]]
            ys = [y for _, y in tb["box"]]
            plain.append((min(xs), min(ys), max(xs), max(ys)))
        return plain

    logger.debug(f"文本块预处理旋转 {degrees(rotation_rad):.2f} °")
    cos_angle = cos(-rotation_rad)  # cosine of the counter-rotation
    sin_angle = sin(-rotation_rad)  # sine of the counter-rotation
    rotated = []
    for tb in textBlocks:
        xs, ys = [], []
        # Rotate every corner of the box.
        for x, y in tb["box"]:
            xs.append(cos_angle * x - sin_angle * y)
            ys.append(sin_angle * x + cos_angle * y)
        rotated.append((min(xs), min(ys), max(xs), max(ys)))
    # Smallest top-left coordinates over all rotated boxes.
    min_x = min((bbox[0] for bbox in rotated), default=float("inf"))
    min_y = min((bbox[1] for bbox in rotated), default=float("inf"))
    if min_x >= 0 and min_y >= 0:
        return rotated
    # Negative coordinates exist: shift every box by the minimum x and y.
    return [
        (x0 - min_x, y0 - min_y, x1 - min_x, y1 - min_y)
        for (x0, y0, x1, y1) in rotated
    ]
|
||||
|
||||
|
||||
# 预处理 textBlocks ,将包围盒 ["box"] 转为标准化 bbox ,同时去除 ["text"] 不完整的项
|
||||
def linePreprocessing(textBlocks):
    """Preprocess text blocks for the line/paragraph parsers.

    Blocks whose "text" is missing or empty are dropped. Every remaining
    block gets a "normalized_bbox" key computed from its "box", and the
    result is sorted by the top edge of that bbox.

    :param textBlocks: list of text block dicts.
    :return: new list with the kept blocks (the dicts themselves are
        modified in place). Empty when no block carries text.
    """
    textBlocks = [tb for tb in textBlocks if tb.get("text", False)]
    if not textBlocks:
        # The rotation estimate is a median, which fails on empty data.
        return textBlocks
    # Estimate the rotation.
    rotation_rad = _estimateRotation(textBlocks)
    # Normalised bboxes, one per block.
    bboxes = _getBboxes(textBlocks, rotation_rad)
    # Store them on the blocks.
    for tb, bbox in zip(textBlocks, bboxes):
        tb["normalized_bbox"] = bbox
    # Sort top to bottom.
    textBlocks.sort(key=lambda tb: tb["normalized_bbox"][1])
    return textBlocks
|
||||
173
UmiOCR-data/py_src/ocr/tbpu/parser_tools/paragraph_parse.py
Normal file
173
UmiOCR-data/py_src/ocr/tbpu/parser_tools/paragraph_parse.py
Normal file
@@ -0,0 +1,173 @@
|
||||
# 段落分析器
|
||||
# 对已经是一个列区块之内的文本块,判断其段落关系。
|
||||
|
||||
from typing import Callable
|
||||
import unicodedata
|
||||
|
||||
|
||||
# 传入前句尾字符和后句首字符,返回分隔符
|
||||
def word_separator(letter1, letter2):
    """Return the separator to put between two adjacent pieces of text.

    :param letter1: last character of the preceding text.
    :param letter2: first character of the following text.
    :return: "" when both characters fall into the CJK / full-width ranges
        below, when letter1 is a hyphen, or when letter2 is punctuation
        (Unicode category starting with "P"); otherwise a single space.
    """
    cjk_blocks = (
        (0x4E00, 0x9FFF),  # Chinese
        (0x3040, 0x30FF),  # Japanese
        (0x1100, 0x11FF),  # Korean
        (0x3130, 0x318F),  # Korean compatibility letters
        (0xAC00, 0xD7AF),  # Korean syllables
        # Full-width symbols
        (0x3000, 0x303F),  # Chinese symbols and punctuation
        (0xFE30, 0xFE4F),  # Chinese compatibility-form punctuation
        (0xFF00, 0xFFEF),  # half-width and full-width forms
    )

    def in_cjk(character):
        code_point = ord(character)
        for low, high in cjk_blocks:
            if low <= code_point <= high:
                return True
        return False

    if in_cjk(letter1) and in_cjk(letter2):
        return ""
    # Special cases: a leading hyphen, or any punctuation that follows.
    if letter1 == "-" or unicodedata.category(letter2).startswith("P"):
        return ""
    # Everything else is separated by a space.
    return " "
|
||||
|
||||
|
||||
TH = 1.2 # 行高用作对比的阈值
|
||||
|
||||
|
||||
class ParagraphParse:
    """Paragraph analyser for the text blocks of one column region.

    It decides which lines belong to the same paragraph and reports, via
    the ``set_end`` callback, the separator that follows every block.
    """

    def __init__(self, get_info: Callable, set_end: Callable) -> None:
        """
        :param get_info: callable taking one text block and returning the
            tuple ( (x0, y0, x1, y1), "text" ).
        :param set_end: callable taking one text block and the separator
            that follows its text; it is expected to store the separator.
        """
        self.get_info = get_info
        self.set_end = set_end

    # ======================= Public interface =====================
    def run(self, text_blocks: list):
        """
        Analyse the paragraphs of the text blocks of one region and predict
        the separator at the end of every block.

        :param text_blocks: list of text block objects.
        :return: the same ``text_blocks`` list; its order is not changed.
            Separators are delivered through ``set_end``.
        """
        if not text_blocks:
            # Nothing to analyse; _parse needs at least one unit.
            return text_blocks
        # Wrap the blocks into units.
        units = self._get_units(text_blocks, self.get_info)
        # Run the analysis.
        self._parse(units)
        return text_blocks

    # ======================= Wrap blocks into units =====================
    def _get_units(self, text_blocks, get_info):
        """Wrap every block as ( bbox, (first char, last char), block )."""
        units = []
        for tb in text_blocks:
            bbox, text = get_info(tb)
            units.append((bbox, (text[0], text[-1]), tb))
        return units

    # ======================= Analysis =====================
    def _parse(self, units):
        """Split the units into paragraphs and set every block's separator.

        :param units: list built by _get_units; it is sorted in place by
            the top edge.
        :return: the (sorted) units list.
        """
        if not units:
            return units
        units.sort(key=lambda a: a[0][1])  # make sure of top-to-bottom order
        para_l, para_top, para_r, para_bottom = units[0][0]  # current paragraph
        para_line_h = para_bottom - para_top  # line height of the paragraph
        para_line_s = None  # line spacing of the paragraph
        now_para = [units[0]]  # units of the current paragraph
        paras = []  # all paragraphs
        paras_line_space = []  # line spacing of every paragraph
        # Lines with equal left and right edges form a paragraph body.
        for i in range(1, len(units)):
            l, top, r, bottom = units[i][0]  # edges of the current unit
            h = bottom - top
            ls = top - para_bottom  # spacing to the previous line
            if (
                # left and right edges both match
                abs(para_l - l) <= para_line_h * TH
                and abs(para_r - r) <= para_line_h * TH
                # line spacing is not too large
                and (para_line_s is None or ls < para_line_s + para_line_h * 0.5)
            ):
                # Same paragraph: average the running values.
                para_l = (para_l + l) / 2
                para_r = (para_r + r) / 2
                para_line_h = (para_line_h + h) / 2
                para_line_s = ls if para_line_s is None else (para_line_s + ls) / 2
                now_para.append(units[i])
            else:
                # Different paragraph: archive the old one, start a new one.
                paras.append(now_para)
                paras_line_space.append(para_line_s)
                now_para = [units[i]]
                para_l, para_r, para_line_h = l, r, bottom - top
                para_line_s = None
            para_bottom = bottom
        # Archive the last paragraph.
        paras.append(now_para)
        paras_line_space.append(para_line_s)

        # Merge single-line paragraphs into the previous / next paragraph
        # as its last / first line.
        for i1 in reversed(range(len(paras))):
            para = paras[i1]
            if len(para) == 1:
                l, top, r, bottom = para[0][0]
                up_flag = down_flag = False
                # End of the previous paragraph: left aligned, right edge
                # not beyond, line spacing small enough.
                if i1 > 0:
                    up_l, up_top, up_r, up_bottom = paras[i1 - 1][-1][0]
                    up_dist, up_h = abs(up_l - l), up_bottom - up_top
                    up_flag = up_dist <= up_h * TH and r <= up_r + up_h * TH
                    # Check the line spacing.
                    if (
                        paras_line_space[i1 - 1] is not None
                        and top - up_bottom > paras_line_space[i1 - 1] + up_h * 0.5
                    ):
                        up_flag = False
                # Start of the next paragraph: right aligned (or, for a
                # single line, allowed to exceed), left edge indented.
                if i1 < len(paras) - 1:
                    down_l, down_top, down_r, down_bottom = paras[i1 + 1][0][0]
                    down_h = down_bottom - down_top
                    # Left aligned or indented.
                    if down_l - down_h * TH <= l <= down_l + down_h * (1 + TH):
                        if len(paras[i1 + 1]) > 1:  # several lines: right aligned
                            down_flag = abs(down_r - r) <= down_h * TH
                        else:  # single line: right edge may exceed
                            down_flag = down_r - down_h * TH < r
                    # Check the line spacing.
                    if (
                        paras_line_space[i1 + 1] is not None
                        and down_top - bottom > paras_line_space[i1 + 1] + down_h * 0.5
                    ):
                        down_flag = False

                # Choose the previous or the next paragraph.
                if up_flag and down_flag:  # both fit: take the closer one
                    if top - up_bottom < down_top - bottom:
                        paras[i1 - 1].append(para[0])
                    else:
                        paras[i1 + 1].insert(0, para[0])
                elif up_flag:  # only one fits
                    paras[i1 - 1].append(para[0])
                elif down_flag:
                    paras[i1 + 1].insert(0, para[0])
                if up_flag or down_flag:
                    del paras[i1]
                    del paras_line_space[i1]

        # Write the separators of all paragraphs.
        for para in paras:
            for i1 in range(len(para) - 1):
                letter1 = para[i1][1][1]  # last character of line 1
                letter2 = para[i1 + 1][1][0]  # first character of line 2
                sep = word_separator(letter1, letter2)
                self.set_end(para[i1][2], sep)
            self.set_end(para[-1][2], "\n")
        return units
|
||||
22
UmiOCR-data/py_src/ocr/tbpu/tbpu.py
Normal file
22
UmiOCR-data/py_src/ocr/tbpu/tbpu.py
Normal file
@@ -0,0 +1,22 @@
|
||||
# tbpu : text block processing unit
|
||||
# 文块处理器的基类。
|
||||
# OCR返回的结果中,一项包含文字、包围盒、置信度的元素,称为一个“文块” - text block 。
|
||||
# 文块不一定是完整的一句话或一个段落。反之,一般是零散的文字。
|
||||
# 一个OCR结果常由多个文块组成。
|
||||
# 文块处理器就是:将传入的多个文块进行处理,比如合并、排序、删除文块。
|
||||
|
||||
|
||||
class Tbpu:
    """Base class of the text block processing units (tbpu)."""

    def __init__(self):
        self.tbpuName = "文块处理单元-未知"

    def run(self, textBlocks):
        """Process a list of text blocks; the base class returns it as is.

        Input: list of text blocks, for example
        [
            {'box': [[29, 19], [172, 19], [172, 44], [29, 44]], 'score': 0.89, 'text': '...'},
            {'box': [[29, 60], [161, 60], [161, 86], [29, 86]], 'score': 0.75, 'text': '...'},
        ]
        Output: the sorted list of text blocks, where every block gains the
        key 'end' holding its trailing separator. This base implementation
        performs no processing and adds no key.
        """
        return textBlocks
|
||||
Reference in New Issue
Block a user