Files
work-secretfile-selfcheck/UmiOCR-data/py_src/mission/mission_doc.py

373 lines
16 KiB
Python
Raw Normal View History

# ===============================================
# =============== 文档 - 任务管理器 ===============
# ===============================================
# API所有页数page 均为1开始
import fitz # PyMuPDF
import time
import math
from PIL import Image
from io import BytesIO
from umi_log import logger
from .mission import Mission
from .mission_ocr import MissionOCR
from ..ocr.tbpu import getParser
from ..ocr.tbpu import IgnoreArea
from ..ocr.tbpu.parser_tools.paragraph_parse import word_separator # 上下句间隔符
# Minimum rendering resolution.
MinSize = 1080

# File suffixes accepted as valid documents.
DocSuf = [".pdf", ".xps", ".epub", ".mobi", ".fb2", ".cbz"]
class FitzOpen:
    """Context manager that opens a document with ``fitz.open`` and closes it on exit.

    Usage: ``with FitzOpen(path) as doc: ...`` -- ``doc`` is whatever
    ``fitz.open(path)`` returns.
    """

    def __init__(self, path):
        # The document is opened lazily in __enter__, not here.
        self._doc = None
        self._path = path

    def __enter__(self):
        opened = fitz.open(self._path)
        self._doc = opened
        return opened

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Returns None, so an exception raised inside the ``with`` body propagates.
        self._doc.close()
# https://pymupdf.readthedocs.io/en/latest/matrix.html#matrix
def transform_to_rotation(matrix):
    """Extract the rotation angle from a transformation matrix.

    Args:
        matrix: Six-element sequence ``(a, b, c, d, e, f)``. A pure rotation
            has the form ``[cos(deg), sin(deg), -sin(deg), cos(deg), 0, 0]``;
            the identity is ``[1, 0, 0, 1, 0, 0]``.

    Returns:
        The angle in whole degrees as an integer in ``0..359``. Returns 0 when
        the first column ``(a, b)`` is (almost) zero-length.
    """
    a, b, c, d, _e, _f = matrix
    # Length of the first column; dividing by it removes the scaling factor.
    norm = math.sqrt(a**2 + b**2)
    if norm < 1e-6:
        return 0
    cos_t = a / norm
    sin_t = b / norm
    # A negative determinant means the matrix contains a reflection;
    # the cosine is mirrored before computing the angle.
    mirrored = (a * d - b * c) < 0
    if mirrored:
        cos_t = -cos_t
    angle = math.degrees(math.atan2(sin_t, cos_t))
    return round(angle) % 360
class _MissionDocClass(Mission):
    """Mission manager for documents (PDF, XPS, EPUB, ...).

    One mission is one document; each queued item of that mission is one page
    index, which ``msnTask`` turns into a list of text blocks by copying the
    embedded text and/or running OCR on images.
    """

    def __init__(self):
        super().__init__()
        self._schedulingMode = "1234"  # scheduling mode: sequential
        self._minInterval = 0.1  # minimum interval between msnTask completions (seconds)
        self._lastCallTime = 0  # time.time() at which the previous msnTask finished

    def addMission(self, msnInfo, msnPath, pageRange=None, pageList=None, password=""):
        """Open a document and queue its pages as one mission.

        Args:
            msnInfo: Mission dict: callbacks ("onXX") plus "argd", the argument
                dict holding "tbpu.xx" / "ocr.xx" / "doc.xx" options. On
                success this method adds the keys "doc", "path",
                "sourceOnEnd", "pageList", "tbpu", "ignoreArea" and replaces
                "onEnd" with ``self._preOnEnd``. On failure it is left
                untouched, so the same dict can be reused for a retry.
            msnPath: Path of a single document.
            pageRange: ``None`` for all pages, or ``[start, end]`` (1-based,
                inclusive). Negative values count from the end.
                Only used when ``pageList`` is empty.
            pageList: Explicit pages. Entries are validated against
                ``0 <= p < page_count`` and used directly as page indices.
                NOTE(review): the file header says API pages are 1-based --
                confirm that callers convert before passing this list.
            password: Document password (optional).

        Returns:
            An ``"[Error] ..."`` string on failure, otherwise whatever
            ``self.addMissionList`` returns.
        """
        # =============== Load the document ===============
        try:
            doc = fitz.open(msnPath)
        except Exception as e:
            return f"[Error] fitz.open error: {msnPath} {e}"

        queued = False  # becomes True once the mission owns the document
        try:
            if doc.is_encrypted and not doc.authenticate(password):
                if password:
                    return f"[Error] Incorrect password. 文档已加密,密码错误。 [{password}]"
                return "[Error] Doc encrypted. 文档已加密,请提供密码。"

            # =============== Page selection ===============
            page_count = doc.page_count
            pageList, err = self._resolvePageList(pageRange, pageList, page_count)
            if err:
                return err

            # =============== tbpu: text-block post-processing ===============
            argd = msnInfo["argd"]
            tbpu = []
            ignoreArea = {}
            if "tbpu.ignoreArea" in argd:
                iArea = argd["tbpu.ignoreArea"]
                if isinstance(iArea, list) and len(iArea) > 0:
                    ignoreArea["obj"] = IgnoreArea(iArea)
                    # Negative range values count from the last page.
                    igStart = argd.get("tbpu.ignoreRangeStart", 1)
                    igEnd = argd.get("tbpu.ignoreRangeEnd", page_count)
                    if igStart < 0:
                        igStart += page_count + 1
                    if igEnd < 0:
                        igEnd += page_count + 1
                    ignoreArea["start"] = igStart - 1  # 1-based -> 0-based
                    ignoreArea["end"] = igEnd - 1
                    logger.debug(f"忽略区域范围: {igStart} ~ {igEnd}")
            # Layout parser.
            if "tbpu.parser" in argd:
                tbpu.append(getParser(argd["tbpu.parser"]))

            # =============== Commit to msnInfo ===============
            # Done only after every check passed: a failed call must not leave
            # "onEnd" pointing at _preOnEnd, otherwise a retry with the same
            # dict would record _preOnEnd as its own "sourceOnEnd".
            msnInfo["doc"] = doc
            msnInfo["path"] = msnPath
            # Intercept onEnd so the document is closed before the caller's hook runs.
            msnInfo["sourceOnEnd"] = msnInfo.get("onEnd")
            msnInfo["onEnd"] = self._preOnEnd
            msnInfo["pageList"] = pageList
            msnInfo["tbpu"] = tbpu
            msnInfo["ignoreArea"] = ignoreArea

            result = self.addMissionList(msnInfo, pageList)
            queued = True
            return result
        finally:
            # Every error return / exception above would otherwise leak the
            # open document handle.
            if not queued:
                doc.close()

    @staticmethod
    def _resolvePageList(pageRange, pageList, page_count):
        """Validate the page selection. Returns ``(pageList, error)``; error is "" on success."""
        if not pageList:
            if isinstance(pageRange, (tuple, list)) and len(pageRange) == 2:
                a, b = pageRange[0], pageRange[1]
                if a < 0:
                    a += page_count + 1
                if b < 0:
                    b += page_count + 1
                if a < 1:
                    return None, f"[Error] pageRange {pageRange} 范围起始不能小于1"
                if b > page_count:
                    return None, f"[Error] pageRange {pageRange} 范围结束不能大于页数 {page_count}"
                if a > b:
                    return None, f"[Error] pageRange {pageRange} 范围错误"
                pageList = list(range(a - 1, b))
            else:
                pageList = list(range(page_count))
        # Validate the final list.
        if not pageList:
            return None, "[Error] 页数列表为空"
        for p in pageList:
            if not isinstance(p, int):
                return None, "[Error] 页数列表内容非整数"
            if not 0 <= p < page_count:
                return None, f"[Error] 页数列表超出 1~{page_count} 范围"
        return pageList, ""

    def msnTask(self, msnInfo, pno):
        """Process one page of a mission.

        Args:
            msnInfo: Mission dict prepared by ``addMission``.
            pno: Page index used as ``doc[pno]``.

        Returns:
            ``{"code": 100, "data": [text blocks]}`` when text was found,
            ``{"code": 102, "data": error message}`` when there is no text but
            OCR reported errors, ``{"code": 101, "data": ""}`` when there is
            neither text nor error.
        """
        doc = msnInfo["doc"]
        page = doc[pno]
        argd = msnInfo["argd"]
        # Extraction mode:
        #   mixed     - OCR images and copy existing text
        #   fullPage  - render the whole page and force OCR
        #   imageOnly - OCR images only
        #   textOnly  - copy existing text only
        extractionMode = argd["doc.extractionMode"]
        errMsg = ""  # error messages collected while processing this page

        # =============== Collect images and existing text ===============
        imgs = []  # images waiting for OCR
        tbs = []  # text blocks
        page_rotation = page.rotation
        if extractionMode == "fullPage":
            imgs.append(self._renderFullPage(page))
        else:
            wantImages = extractionMode in ("imageOnly", "mixed")
            wantText = extractionMode in ("textOnly", "mixed")
            # Structure: https://pymupdf.readthedocs.io/en/latest/textpage.html#structure-of-dictionary-outputs
            # Infinite clip so that images outside the page bounds are still
            # collected: https://github.com/pymupdf/PyMuPDF/issues/3171
            pageDict = page.get_text("dict", clip=fitz.INFINITE_RECT())
            for block in pageDict["blocks"]:
                if block["type"] == 1 and wantImages:
                    img = self._extractImage(
                        block, page_rotation, f"P{pno}-{len(imgs)}"
                    )
                    if img is not None:
                        imgs.append(img)
                elif block["type"] == 0 and wantText:
                    tbs.extend(self._extractTextLines(block, page, page_rotation))

        # Fill in the separator between consecutive copied lines that do not
        # already carry one. Only text-sourced blocks are in tbs at this point.
        for cur, nxt in zip(tbs, tbs[1:]):
            if cur["end"]:
                continue
            cur["end"] = word_separator(cur["text"][-1], nxt["text"][0])

        # =============== OCR the images and append results to tbs ===============
        if imgs:
            # Forward only the "ocr."-prefixed arguments.
            ocrArgd = {k: v for k, v in argd.items() if k.startswith("ocr.")}
            # Blocks until the OCR mission has finished.
            ocrList = MissionOCR.addMissionWait(ocrArgd, imgs)
            for o in ocrList:
                res = o["result"]
                if res["code"] == 100:
                    x, y = o["xy"]
                    scale_w = o["scale_w"]
                    scale_h = o["scale_h"]
                    for r in res["data"]:
                        # Image-relative coordinates -> page coordinates.
                        for corner in r["box"]:
                            corner[0] = corner[0] * scale_w + x
                            corner[1] = corner[1] * scale_h + y
                        r["from"] = "ocr"  # source: OCR
                        tbs.append(r)
                elif res["code"] != 101:
                    errMsg += f'[Error] OCR code:{res["code"]} msg:{res["data"]}\n'

        # =============== tbpu: text-block post-processing ===============
        ignoreArea = msnInfo["ignoreArea"]
        if ignoreArea and tbs:
            if ignoreArea["start"] <= pno <= ignoreArea["end"]:
                tbs = ignoreArea["obj"].run(tbs)
        if msnInfo["tbpu"] and tbs:
            for tbpu in msnInfo["tbpu"]:
                tbs = tbpu.run(tbs)

        # =============== Build the result dict ===============
        if errMsg:
            logger.error(f"文档识别异常。P{pno}, errMsg: {errMsg}")
            errMsg = f"[Error] Doc P{pno}\n" + errMsg
        if tbs:  # has text
            resDict = {"code": 100, "data": tbs}
        elif errMsg:  # no text, has errors
            resDict = {"code": 102, "data": errMsg}
        else:  # no text, no errors
            resDict = {"code": 101, "data": ""}

        self._throttle()
        return resDict

    @staticmethod
    def _renderFullPage(page):
        """Render the whole page to PNG and return it as an OCR image entry."""
        rect = page.rect
        w, h = abs(rect[2] - rect[0]), abs(rect[3] - rect[1])
        shortSide = min(w, h)
        # If the shorter side is below the threshold, zoom in so the rendered
        # image is sharper.
        if shortSide < MinSize:
            zoom = MinSize / max(shortSide, 1)
            matrix = fitz.Matrix(zoom, zoom)
        else:
            zoom = 1
            matrix = fitz.Identity
        pixmap = page.get_pixmap(matrix=matrix)
        scale = 1 / zoom  # maps rendered pixels back to page units
        return {
            "bytes": pixmap.tobytes("png"),
            "xy": (0, 0),
            "scale_w": scale,
            "scale_h": scale,
        }

    @staticmethod
    def _extractImage(block, page_rotation, tag):
        """Turn an image block into an OCR image entry, or None if it has no pixel size.

        ``tag`` is only used as the prefix of the debug log line.
        """
        # Image rotation relative to the page plus the page rotation gives the
        # absolute rotation.
        img_rotation = transform_to_rotation(block["transform"])
        abs_rotation = round(page_rotation + img_rotation) % 360
        img_bytes = block["image"]
        bbox = block["bbox"]
        # Size on the page vs. size in pixels.
        w1, h1 = bbox[2] - bbox[0], bbox[3] - bbox[1]
        w2, h2 = block["width"], block["height"]
        if w2 <= 0 or h2 <= 0:  # degenerate image
            return None
        # Width and height are scaled independently.
        # NOTE(review): these use the pixel size before rotation; after a
        # 90/270 degree rotation the image's width and height are swapped --
        # verify whether the scale factors should be swapped as well.
        scale_w = w1 / w2
        scale_h = h1 / h2
        if page_rotation != 0 or img_rotation != 0:
            logger.debug(
                f"{tag} 旋转:页面{page_rotation}°,图片{img_rotation}°,绝对{abs_rotation}°"
            )
        if abs_rotation != 0:
            try:
                with Image.open(BytesIO(img_bytes)) as pimg:
                    # Re-encode in the original format, PNG if unknown.
                    img_format = pimg.format or "PNG"
                    rotated = pimg.rotate(-abs_rotation, expand=True)
                    buffered = BytesIO()
                    rotated.save(buffered, format=img_format)
                    img_bytes = buffered.getvalue()
            except Exception:
                # Best effort: fall back to the unrotated bytes.
                logger.error("旋转文档图片异常。", exc_info=True, stack_info=True)
        return {
            "bytes": img_bytes,
            "xy": (bbox[0], bbox[1]),
            "scale_w": scale_w,
            "scale_h": scale_h,
        }

    @staticmethod
    def _extractTextLines(block, page, page_rotation):
        """Convert every non-empty line of a text block into an OCR-style text block."""
        lines = block["lines"]
        lastIndex = len(lines) - 1
        result = []
        for index, line in enumerate(lines):
            # Concatenate all spans of the line.
            text = "".join(span["text"] for span in line["spans"])
            if not text:
                continue
            b = line["bbox"]
            if page_rotation == 0:
                x0, y0, x1, y1 = b[0], b[1], b[2], b[3]
            else:
                # Rotated page: map both bbox corners through the page's
                # rotation matrix and take the axis-aligned bounds (the line
                # itself is assumed to have no rotation of its own).
                rotation_matrix = page.rotation_matrix
                p0 = fitz.Point(b[0], b[1]) * rotation_matrix
                p1 = fitz.Point(b[2], b[3]) * rotation_matrix
                x0, x1 = min(p0.x, p1.x), max(p0.x, p1.x)
                y0, y1 = min(p0.y, p1.y), max(p0.y, p1.y)
            result.append(
                {
                    "box": [[x0, y0], [x1, y0], [x1, y1], [x0, y1]],
                    "text": text,
                    "score": 1,
                    # The last line of a block ends the paragraph.
                    "end": "\n" if index == lastIndex else "",
                    "from": "text",  # source: copied text
                }
            )
        return result

    def _throttle(self):
        """Sleep so that consecutive msnTask completions are at least _minInterval apart.

        Text-only extraction is fast enough that back-to-back callbacks can
        freeze the UI, hence the deliberate delay.
        """
        now = time.time()
        wait = self._minInterval - (now - self._lastCallTime)
        if wait > 0:
            # Clamp: if the wall clock jumped backwards, ``wait`` could be huge.
            time.sleep(min(wait, self._minInterval))
            # Record the time *after* sleeping; storing the pre-sleep time
            # makes the next call count the sleep as elapsed time and skip
            # throttling every other call.
            now = time.time()
        self._lastCallTime = now

    def getDocInfo(self, path):
        """Return basic information about a document.

        Returns ``{"path", "page_count", "is_encrypted"}`` on success, or
        ``{"path", "error": exception}`` if opening or reading it fails.
        """
        try:
            with FitzOpen(path) as doc:
                return {
                    "path": path,
                    "page_count": doc.page_count,
                    "is_encrypted": doc.is_encrypted,
                }
        except Exception as e:
            return {"path": path, "error": e}

    def _preOnEnd(self, msnInfo, msg):
        """onEnd interceptor: close the document, then call the caller's original onEnd."""
        # Close first so the caller's hook can save a new document to the
        # original path without conflict.
        msnInfo["doc"].close()
        sourceOnEnd = msnInfo["sourceOnEnd"]
        if sourceOnEnd:
            sourceOnEnd(msnInfo, msg)
# Global DOC mission manager: the module-level shared instance of _MissionDocClass.
MissionDOC = _MissionDocClass()