373 lines
16 KiB
Python
373 lines
16 KiB
Python
|
|
# ===============================================
|
|||
|
|
# =============== 文档 - 任务管理器 ===============
|
|||
|
|
# ===============================================
|
|||
|
|
|
|||
|
|
# API所有页数page 均为1开始
|
|||
|
|
|
|||
|
|
import fitz # PyMuPDF
|
|||
|
|
import time
|
|||
|
|
import math
|
|||
|
|
from PIL import Image
|
|||
|
|
from io import BytesIO
|
|||
|
|
|
|||
|
|
from umi_log import logger
|
|||
|
|
from .mission import Mission
|
|||
|
|
from .mission_ocr import MissionOCR
|
|||
|
|
from ..ocr.tbpu import getParser
|
|||
|
|
from ..ocr.tbpu import IgnoreArea
|
|||
|
|
from ..ocr.tbpu.parser_tools.paragraph_parse import word_separator # 上下句间隔符
|
|||
|
|
|
|||
|
|
MinSize = 1080 # 最小渲染分辨率
|
|||
|
|
|
|||
|
|
# 合法文件后缀
|
|||
|
|
DocSuf = [
|
|||
|
|
".pdf",
|
|||
|
|
".xps",
|
|||
|
|
".epub",
|
|||
|
|
".mobi",
|
|||
|
|
".fb2",
|
|||
|
|
".cbz",
|
|||
|
|
]
|
|||
|
|
|
|||
|
|
|
|||
|
|
class FitzOpen:
|
|||
|
|
def __init__(self, path):
|
|||
|
|
self._path = path
|
|||
|
|
self._doc = None
|
|||
|
|
|
|||
|
|
def __enter__(self):
|
|||
|
|
self._doc = fitz.open(self._path)
|
|||
|
|
return self._doc
|
|||
|
|
|
|||
|
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
|||
|
|
self._doc.close()
|
|||
|
|
|
|||
|
|
|
|||
|
|
# https://pymupdf.readthedocs.io/en/latest/matrix.html#matrix
|
|||
|
|
# 从变换矩阵中提取角度值,返回0~359整数
|
|||
|
|
def transform_to_rotation(matrix):
|
|||
|
|
# [1, 0, 0, 1, 0, 0] -> [cos(deg), sin(deg), -sin(deg), cos(deg), 0, 0].
|
|||
|
|
a, b, c, d, _, _ = matrix
|
|||
|
|
# 处理缩放和反射
|
|||
|
|
scale = math.sqrt(a**2 + b**2)
|
|||
|
|
if scale < 1e-6:
|
|||
|
|
return 0
|
|||
|
|
# 归一化以消除缩放影响
|
|||
|
|
cos_theta = a / scale
|
|||
|
|
sin_theta = b / scale
|
|||
|
|
# 检查反射
|
|||
|
|
determinant = a * d - b * c
|
|||
|
|
if determinant < 0:
|
|||
|
|
# 反射情况,调整角度计算
|
|||
|
|
cos_theta = -cos_theta
|
|||
|
|
theta_rad = math.atan2(sin_theta, cos_theta)
|
|||
|
|
theta_deg = math.degrees(theta_rad)
|
|||
|
|
rounded_deg = round(theta_deg) % 360
|
|||
|
|
return rounded_deg
|
|||
|
|
|
|||
|
|
|
|||
|
|
class _MissionDocClass(Mission):
|
|||
|
|
def __init__(self):
|
|||
|
|
super().__init__()
|
|||
|
|
self._schedulingMode = "1234" # 调度方式:顺序
|
|||
|
|
self._minInterval = 0.1 # msnTask最短调用间隔
|
|||
|
|
self._lastCallTime = 0 # 上一次调用时间
|
|||
|
|
|
|||
|
|
# 添加一个文档任务
|
|||
|
|
# msnInfo: { 回调函数"onXX", 参数"argd":{"tbpu.xx", "ocr.xx"} }
|
|||
|
|
# msnPath: 单个文档路径
|
|||
|
|
# pageRange: 页数范围。可选: None 全部页 , [1,3] 页面范围(含开头结束)。
|
|||
|
|
# pageList: 指定多个页数。可选: [] 使用pageRange设置 , [1,2,3] 指定页数
|
|||
|
|
# password: 密码(非必填)
|
|||
|
|
def addMission(self, msnInfo, msnPath, pageRange=None, pageList=[], password=""):
|
|||
|
|
# =============== 加载文档,获取文档操作对象 ===============
|
|||
|
|
try:
|
|||
|
|
doc = fitz.open(msnPath)
|
|||
|
|
except Exception as e:
|
|||
|
|
return f"[Error] fitz.open error: {msnPath} {e}"
|
|||
|
|
if doc.is_encrypted and not doc.authenticate(password):
|
|||
|
|
if password:
|
|||
|
|
msg = f"[Error] Incorrect password. 文档已加密,密码错误。 [{password}]"
|
|||
|
|
else:
|
|||
|
|
msg = "[Error] Doc encrypted. 文档已加密,请提供密码。"
|
|||
|
|
return msg
|
|||
|
|
msnInfo["doc"] = doc
|
|||
|
|
msnInfo["path"] = msnPath
|
|||
|
|
# =============== 拦截 onEnd ===============
|
|||
|
|
msnInfo["sourceOnEnd"] = msnInfo["onEnd"] if "onEnd" in msnInfo else None
|
|||
|
|
msnInfo["onEnd"] = self._preOnEnd
|
|||
|
|
# =============== pageRange 页面范围 ===============
|
|||
|
|
page_count = doc.page_count
|
|||
|
|
if len(pageList) == 0:
|
|||
|
|
if isinstance(pageRange, (tuple, list)) and len(pageRange) == 2:
|
|||
|
|
a, b = pageRange[0], pageRange[1]
|
|||
|
|
if a < 0:
|
|||
|
|
a += page_count + 1
|
|||
|
|
if b < 0:
|
|||
|
|
b += page_count + 1
|
|||
|
|
if a < 1:
|
|||
|
|
return f"[Error] pageRange {pageRange} 范围起始不能小于1"
|
|||
|
|
if b > page_count:
|
|||
|
|
return f"[Error] pageRange {pageRange} 范围结束不能大于页数 {doc.page_count}"
|
|||
|
|
if a > b:
|
|||
|
|
return f"[Error] pageRange {pageRange} 范围错误"
|
|||
|
|
pageList = list(range(a - 1, b))
|
|||
|
|
else:
|
|||
|
|
pageList = list(range(0, page_count))
|
|||
|
|
# 检查页数列表合法性
|
|||
|
|
if len(pageList) == 0:
|
|||
|
|
return "[Error] 页数列表为空"
|
|||
|
|
for p in pageList:
|
|||
|
|
if not isinstance(p, int):
|
|||
|
|
return "[Error] 页数列表内容非整数"
|
|||
|
|
if not 0 <= p < page_count:
|
|||
|
|
return f"[Error] 页数列表超出 1~{page_count} 范围"
|
|||
|
|
msnInfo["pageList"] = pageList
|
|||
|
|
# =============== tbpu文本块后处理 msnInfo["tbpu"] ===============
|
|||
|
|
argd = msnInfo["argd"] # 参数
|
|||
|
|
msnInfo["tbpu"] = []
|
|||
|
|
msnInfo["ignoreArea"] = {}
|
|||
|
|
# 忽略区域
|
|||
|
|
if "tbpu.ignoreArea" in argd:
|
|||
|
|
iArea = argd["tbpu.ignoreArea"]
|
|||
|
|
if isinstance(iArea, list) and len(iArea) > 0:
|
|||
|
|
msnInfo["ignoreArea"]["obj"] = IgnoreArea(iArea)
|
|||
|
|
# 范围,负数转为倒数第x页
|
|||
|
|
igStart = argd.get("tbpu.ignoreRangeStart", 1)
|
|||
|
|
igEnd = argd.get("tbpu.ignoreRangeEnd", page_count)
|
|||
|
|
if igStart < 0:
|
|||
|
|
igStart += page_count + 1
|
|||
|
|
if igEnd < 0:
|
|||
|
|
igEnd += page_count + 1
|
|||
|
|
msnInfo["ignoreArea"]["start"] = igStart - 1 # -1是将起始1页转为起始0页
|
|||
|
|
msnInfo["ignoreArea"]["end"] = igEnd - 1
|
|||
|
|
logger.debug(f"忽略区域范围: {igStart} ~ {igEnd} 。")
|
|||
|
|
# 获取排版解析器对象
|
|||
|
|
if "tbpu.parser" in argd:
|
|||
|
|
msnInfo["tbpu"].append(getParser(argd["tbpu.parser"]))
|
|||
|
|
return self.addMissionList(msnInfo, pageList)
|
|||
|
|
|
|||
|
|
def msnTask(self, msnInfo, pno): # 执行msn。pno为当前页数
|
|||
|
|
doc = msnInfo["doc"] # 文档对象
|
|||
|
|
page = doc[pno] # 页面对象
|
|||
|
|
argd = msnInfo["argd"] # 参数
|
|||
|
|
extractionMode = argd["doc.extractionMode"] # OCR内容模式
|
|||
|
|
""" mixed - 混合OCR/拷贝文本
|
|||
|
|
fullPage - 整页强制OCR
|
|||
|
|
imageOnly - 仅OCR图片
|
|||
|
|
textOnly - 仅拷贝原有文本 """
|
|||
|
|
errMsg = "" # 本次任务流程的异常信息
|
|||
|
|
|
|||
|
|
# =============== 提取图片和原文本 ===============
|
|||
|
|
imgs = [] # 待OCR的图片列表
|
|||
|
|
tbs = [] # text box 文本块列表
|
|||
|
|
page_rotation = page.rotation # 获取页面的旋转角度
|
|||
|
|
if extractionMode == "fullPage": # 模式:整页强制OCR
|
|||
|
|
# 检查页面边长,如果低于阈值,则增加放大系数,以提高渲染清晰度
|
|||
|
|
rect = page.rect
|
|||
|
|
w, h = abs(rect[2] - rect[0]), abs(rect[3] - rect[1])
|
|||
|
|
m = min(w, h)
|
|||
|
|
if m < MinSize:
|
|||
|
|
zoom = MinSize / max(m, 1)
|
|||
|
|
matrix = fitz.Matrix(zoom, zoom)
|
|||
|
|
else:
|
|||
|
|
zoom = 1
|
|||
|
|
matrix = fitz.Identity
|
|||
|
|
p = page.get_pixmap(matrix=matrix)
|
|||
|
|
bytes = p.tobytes("png")
|
|||
|
|
scale = 1 / zoom
|
|||
|
|
imgs.append(
|
|||
|
|
{"bytes": bytes, "xy": (0, 0), "scale_w": scale, "scale_h": scale}
|
|||
|
|
)
|
|||
|
|
else:
|
|||
|
|
# 获取元素 https://pymupdf.readthedocs.io/en/latest/_images/img-textpage.png
|
|||
|
|
# https://pymupdf.readthedocs.io/en/latest/textpage.html#structure-of-dictionary-outputs
|
|||
|
|
# 确保越界图像能被采集 https://github.com/pymupdf/PyMuPDF/issues/3171
|
|||
|
|
p = page.get_text("dict", clip=fitz.INFINITE_RECT())
|
|||
|
|
for t in p["blocks"]: # 遍历区块(段落)
|
|||
|
|
# ========== 获取图片 ==========
|
|||
|
|
if t["type"] == 1 and (
|
|||
|
|
extractionMode == "imageOnly" or extractionMode == "mixed"
|
|||
|
|
):
|
|||
|
|
# 提取图片相对旋转角,加上页面旋转角,得到图片绝对旋转角
|
|||
|
|
transform = t["transform"]
|
|||
|
|
img_rotation = transform_to_rotation(transform)
|
|||
|
|
abs_rotation = round(page_rotation+img_rotation) % 360
|
|||
|
|
img_bytes = t["image"] # 图片字节
|
|||
|
|
bbox = t["bbox"] # 图片包围盒
|
|||
|
|
# 图片视觉大小、原始大小、缩放比例
|
|||
|
|
w1, h1 = bbox[2] - bbox[0], bbox[3] - bbox[1]
|
|||
|
|
w2, h2 = t["width"], t["height"]
|
|||
|
|
# 特殊情况:图片宽高为0
|
|||
|
|
if w2 <= 0 or h2 <= 0:
|
|||
|
|
continue
|
|||
|
|
# 单独计算宽高的缩放比例
|
|||
|
|
scale_w = w1 / w2
|
|||
|
|
scale_h = h1 / h2
|
|||
|
|
# 如果图片有绝对旋转,则逆向旋转图片字节
|
|||
|
|
if page_rotation != 0 or img_rotation != 0:
|
|||
|
|
logger.debug(f"P{pno}-{len(imgs)} 旋转:页面{page_rotation}°,图片{img_rotation}°,绝对{abs_rotation}°")
|
|||
|
|
if abs_rotation != 0:
|
|||
|
|
try:
|
|||
|
|
with Image.open(BytesIO(img_bytes)) as pimg:
|
|||
|
|
# 记录原图格式
|
|||
|
|
format = pimg.format
|
|||
|
|
if not format:
|
|||
|
|
format = "PNG"
|
|||
|
|
# PDF的旋转是顺时针,需要逆时针旋转图片
|
|||
|
|
pimg = pimg.rotate(-abs_rotation, expand=True)
|
|||
|
|
# 将旋转后的图片转回bytes
|
|||
|
|
buffered = BytesIO()
|
|||
|
|
pimg.save(buffered, format=format)
|
|||
|
|
img_bytes = buffered.getvalue()
|
|||
|
|
except Exception:
|
|||
|
|
logger.error(
|
|||
|
|
"旋转文档图片异常。", exc_info=True, stack_info=True
|
|||
|
|
)
|
|||
|
|
# 记录图片
|
|||
|
|
imgs.append(
|
|||
|
|
{
|
|||
|
|
"bytes": img_bytes,
|
|||
|
|
"xy": (bbox[0], bbox[1]),
|
|||
|
|
"scale_w": scale_w,
|
|||
|
|
"scale_h": scale_h,
|
|||
|
|
}
|
|||
|
|
)
|
|||
|
|
# ========== 获取文本块 ==========
|
|||
|
|
elif t["type"] == 0 and (
|
|||
|
|
extractionMode == "textOnly" or extractionMode == "mixed"
|
|||
|
|
):
|
|||
|
|
l = len(t["lines"]) - 1
|
|||
|
|
for index, line in enumerate(t["lines"]): # 遍历每一行
|
|||
|
|
# 拼接该行所有子文本块的内容
|
|||
|
|
text = ""
|
|||
|
|
for span in line["spans"]:
|
|||
|
|
text += span["text"]
|
|||
|
|
# 提取其他信息,组装为OCR文本块格式
|
|||
|
|
if text:
|
|||
|
|
# 获取该行的的包围盒
|
|||
|
|
b = line["bbox"]
|
|||
|
|
if page_rotation == 0: # 页面没有旋转,直接提取
|
|||
|
|
box = [
|
|||
|
|
[b[0], b[1]],
|
|||
|
|
[b[2], b[1]],
|
|||
|
|
[b[2], b[3]],
|
|||
|
|
[b[0], b[3]],
|
|||
|
|
]
|
|||
|
|
else: # 页面有旋转,默认文本行无相对旋转,则反向消除文本的绝对旋转
|
|||
|
|
# https://pymupdf.readthedocs.io/en/latest/page.html#Page.derotation_matrix
|
|||
|
|
rotation_matrix = page.rotation_matrix
|
|||
|
|
b01 = fitz.Point(b[0], b[1]) * rotation_matrix
|
|||
|
|
b23 = fitz.Point(b[2], b[3]) * rotation_matrix
|
|||
|
|
x0 = min(b01.x, b23.x)
|
|||
|
|
x1 = max(b01.x, b23.x)
|
|||
|
|
y0 = min(b01.y, b23.y)
|
|||
|
|
y1 = max(b01.y, b23.y)
|
|||
|
|
box = [
|
|||
|
|
[x0, y0],
|
|||
|
|
[x1, y0],
|
|||
|
|
[x1, y1],
|
|||
|
|
[x0, y1],
|
|||
|
|
]
|
|||
|
|
# 组装文本块
|
|||
|
|
tb = {
|
|||
|
|
"box": box,
|
|||
|
|
"text": text,
|
|||
|
|
"score": 1,
|
|||
|
|
"end": "\n" if index == l else "", # 结尾符
|
|||
|
|
"from": "text", # 来源:直接提取文本
|
|||
|
|
}
|
|||
|
|
tbs.append(tb)
|
|||
|
|
# 补充结尾符
|
|||
|
|
for i1 in range(len(tbs) - 1):
|
|||
|
|
if tbs[i1]["end"]: # 跳过已有结尾符的
|
|||
|
|
continue
|
|||
|
|
i2 = i1 + 1
|
|||
|
|
sep = word_separator(tbs[i1]["text"][-1], tbs[i2]["text"][0])
|
|||
|
|
tbs[i1]["end"] = sep
|
|||
|
|
|
|||
|
|
# =============== 调用OCR,将 imgs 的内容提取出来放入 tbs ===============
|
|||
|
|
if imgs:
|
|||
|
|
# 提取 "ocr." 开头的参数,组装OCR参数字典
|
|||
|
|
ocrArgd = {}
|
|||
|
|
for k in argd:
|
|||
|
|
if k.startswith("ocr."):
|
|||
|
|
ocrArgd[k] = argd[k]
|
|||
|
|
# 调用OCR,堵塞等待任务完成
|
|||
|
|
ocrList = MissionOCR.addMissionWait(ocrArgd, imgs)
|
|||
|
|
# 整理OCR结果
|
|||
|
|
for o in ocrList:
|
|||
|
|
res = o["result"]
|
|||
|
|
if res["code"] == 100:
|
|||
|
|
x, y = o["xy"]
|
|||
|
|
scale_w = o["scale_w"]
|
|||
|
|
scale_h = o["scale_h"]
|
|||
|
|
for r in res["data"]:
|
|||
|
|
# 将所有文本块的坐标,从图片相对坐标系,转为页面绝对坐标系
|
|||
|
|
for bi in range(4):
|
|||
|
|
r["box"][bi][0] = r["box"][bi][0] * scale_w + x
|
|||
|
|
r["box"][bi][1] = r["box"][bi][1] * scale_h + y
|
|||
|
|
r["from"] = "ocr" # 来源:OCR
|
|||
|
|
tbs.append(r)
|
|||
|
|
elif res["code"] != 101:
|
|||
|
|
errMsg += f'[Error] OCR code:{res["code"]} msg:{res["data"]}\n'
|
|||
|
|
|
|||
|
|
# =============== tbpu文本块后处理 ===============
|
|||
|
|
# 忽略区域
|
|||
|
|
if msnInfo["ignoreArea"] and tbs:
|
|||
|
|
# 检查范围
|
|||
|
|
igStart = msnInfo["ignoreArea"]["start"]
|
|||
|
|
igEnd = msnInfo["ignoreArea"]["end"]
|
|||
|
|
if pno >= igStart and pno <= igEnd:
|
|||
|
|
tbs = msnInfo["ignoreArea"]["obj"].run(tbs)
|
|||
|
|
# 其他tbpu
|
|||
|
|
if msnInfo["tbpu"] and tbs:
|
|||
|
|
for tbpu in msnInfo["tbpu"]:
|
|||
|
|
tbs = tbpu.run(tbs)
|
|||
|
|
|
|||
|
|
# =============== 组装结果字典 resDict ===============
|
|||
|
|
if errMsg:
|
|||
|
|
logger.error(f"文档识别异常。P{pno}, errMsg: {errMsg}")
|
|||
|
|
errMsg = f"[Error] Doc P{pno}\n" + errMsg
|
|||
|
|
|
|||
|
|
if tbs: # 有文本
|
|||
|
|
resDict = {"code": 100, "data": tbs}
|
|||
|
|
elif errMsg: # 无文本,有异常
|
|||
|
|
resDict = {"code": 102, "data": errMsg}
|
|||
|
|
else: # 无文本,无异常
|
|||
|
|
resDict = {"code": 101, "data": ""}
|
|||
|
|
|
|||
|
|
# ===== 仅提取文本时任务速度过快,频繁回调会导致UI卡死,因此故意引入延迟 =====
|
|||
|
|
currentTime = time.time()
|
|||
|
|
elapsedTime = currentTime - self._lastCallTime
|
|||
|
|
# 如果与上一次调用的时间差小于最短间隔,则睡至满足最短间隔
|
|||
|
|
if elapsedTime < self._minInterval:
|
|||
|
|
t = self._minInterval - elapsedTime
|
|||
|
|
time.sleep(t)
|
|||
|
|
self._lastCallTime = currentTime
|
|||
|
|
return resDict
|
|||
|
|
|
|||
|
|
# 获取一个文档的信息,如页数
|
|||
|
|
def getDocInfo(self, path):
|
|||
|
|
try:
|
|||
|
|
with FitzOpen(path) as doc:
|
|||
|
|
info = {
|
|||
|
|
"path": path,
|
|||
|
|
"page_count": doc.page_count,
|
|||
|
|
"is_encrypted": doc.is_encrypted,
|
|||
|
|
}
|
|||
|
|
return info
|
|||
|
|
except Exception as e:
|
|||
|
|
return {"path": path, "error": e}
|
|||
|
|
|
|||
|
|
# 结束前的处理
|
|||
|
|
def _preOnEnd(self, msnInfo, msg):
|
|||
|
|
# 先关闭文档对象,再触发原本的 onEnd ,防止新文档保存到原路径时的冲突
|
|||
|
|
msnInfo["doc"].close()
|
|||
|
|
if msnInfo["sourceOnEnd"]:
|
|||
|
|
msnInfo["sourceOnEnd"](msnInfo, msg)
|
|||
|
|
|
|||
|
|
|
|||
|
|
# 全局 DOC 任务管理器
|
|||
|
|
MissionDOC = _MissionDocClass()
|