Files
work-secretfile-selfcheck/UmiOCR-data/py_src/ocr/output/output_pdf_layered.py

172 lines
7.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# 双层可搜索 searchable pdf
# https://github.com/pymupdf/PyMuPDF/discussions/2299
import os
import fitz # PyMuPDF
from umi_log import logger
from .output import Output
class OutputPdfLayered(Output):
def __init__(self, argd):
self.dir = argd["outputDir"] # 输出路径(文件夹)
self.originPath = argd["originPath"] # 原始文件路径
self.fileName = argd["outputFileName"] # 文件名
self.password = argd["password"] # 密码
self.outputPath = f"{self.dir}/{self.fileName}.layered.pdf" # 输出路径
self.pdf = None
self.existentPages = [] # 已处理的页数
self.isInsertFont = False # 是否有字体嵌入
self.opacity = 0 # 文本透明度为0
try:
self.font = fitz.Font("cjk") # 字体
except Exception as e:
raise Exception(f"Failed to load cjk font. {e}\n无法加载cjk字体。")
try:
self.pdf = self._getPDF(self.originPath) # 加载pymupdf对象
except Exception as e:
raise Exception(
f"Failed to load doc file. {e}\n无法加载文档。\n{self.originPath}"
)
# 获取pdf文档对象或将其它类型的文档转为PDF对象
def _getPDF(self, path):
# https://github.com/pymupdf/PyMuPDF-Utilities/blob/master/examples/convert-document/convert.py
doc = fitz.open(path)
# 如果已加密,则尝试解密
if doc.is_encrypted and not doc.authenticate(self.password):
raise Exception(
f'The document is encrypted, and the password "{self.password}" is incorrect.\n文档已加密,输入密码不正确。'
)
if doc.is_pdf:
return doc
b = doc.convert_to_pdf() # 转换为PDF格式的二进制数据
pdf = fitz.open("pdf", b) # 创建PDF文档对象
try:
pdf.set_toc(doc.get_toc()) # 复制原始文档的目录
except Exception:
logger.warning("pdf.set_toc error", exc_info=True, stack_info=True)
# 复制原始文档的元数据(如作者、标题等)
meta = doc.metadata
if not meta["producer"]:
meta["producer"] = "Umi-OCR & PyMuPDF v" + fitz.VersionBind
if not meta["creator"]:
meta["creator"] = "Umi-OCR & PyMuPDF PDF converter"
pdf.set_metadata(meta)
# 复制原始文档的链接
for pinput in doc:
links = pinput.get_links()
pout = pdf[pinput.number]
for link in links:
if link["kind"] == fitz.LINK_NAMED: # 不处理 named links
continue
pout.insert_link(link) # 写入新文档
doc.close() # 释放原文档
return pdf
# 计算填满宽和高的一行字体大小
def _calculateFontSize(self, text, w, h):
if h > w: # 竖排转为横排计算
w, h = h, w
fontsize = round(h) # 字体大小初值,假设为行高
minSize = 5 # 大小下限
getLen = lambda text, s: self.font.text_length(text, fontsize=s)
while getLen(text, fontsize) > w and fontsize >= minSize:
fontsize -= 1 # 尝试减小字体,直到行宽刚好小于界限
while getLen(text, fontsize) < w:
fontsize += 1 # 尝试增大字体,直到行宽刚好超过界限
while getLen(text, fontsize) > w and fontsize >= minSize:
fontsize -= 0.1 # 再次减小字体,将精度提升到 0.1
return fontsize
def print(self, res): # 输出图片结果
if not self.pdf:
logger.error("self.pdf 未初始化。")
return
pno = res["page"] - 1 # 当前页数
self.existentPages.append(pno) # 记录已处理的页面
if not res["code"] == 100:
return # 忽略空白
page = self.pdf[pno] # 当前页对象
page.clean_contents() # 内容流清理、语法更正,减少错误
protation = page.rotation # 获取页面旋转角度
isInsertFont = False # 当前是否进行过字体注入
# 插入文本用shape.insert_text可编辑或page.insert_text不可编辑
for tb in res["data"]:
if self.opacity == 0 and "from" in tb and tb["from"] == "text":
continue # 双层透明文字模式下跳过直接提取的文本只写入OCR文本
if not isInsertFont: # 页面插入字体
self.isInsertFont = isInsertFont = True
page.insert_font(fontname="cjk", fontbuffer=self.font.buffer)
text = tb["text"]
box = tb["box"]
x0, y0 = box[0]
x2, y2 = box[2]
w = x2 - x0
h = y2 - y0
fontsize = self._calculateFontSize(text, w, h)
# 插入点的 旋转后的坐标
point = fitz.Point(x0, y2) * page.derotation_matrix
page.insert_text(
point,
text,
fontsize,
fontname="cjk",
rotate=protation, # 文本角度设定
stroke_opacity=self.opacity, # 描边透明度
fill_opacity=self.opacity, # 填充(字体)透明度
)
def onEnd(self): # 结束时保存。
if not self.pdf:
return
# 删除未处理的页数
for i in range(len(self.pdf) - 1, -1, -1):
if i not in self.existentPages:
self.pdf.delete_page(i)
logger.info(f"保存{len(self.pdf)}页PDF{self.outputPath}")
if self.isInsertFont: # 有任意页面嵌入字体,则构建字体子集
try: # 对于部分PDF如用txt直接打印的构建字体子集会失败。
self.pdf.subset_fonts() # 构建字体子集,减小文件大小。需要 fontTools 库
except Exception: # TODO: 失败原因?可能文件中实际并没有字体?
logger.error("构建字体子集失败。", exc_info=True, stack_info=True)
# 保存压缩并进行3级垃圾回收。等同 ez_save
self.save(self.pdf, self.outputPath, deflate=True, garbage=3)
else:
# 无嵌入字体,则直接保存,不压缩
self.save(self.pdf, self.outputPath)
def save(self, pdf, path, **options): # 保存并关闭 pdf 对象
try:
# 尝试保存到指定路径
pdf.save(path, **options)
except Exception:
# 保存失败,尝试保存到 ".temp" 路径
tempPath = self.outputPath + ".temp"
logger.warning(f"保存PDF失败。 path: {path}", exc_info=True)
try:
pdf.save(tempPath, **options)
pdf.close()
except Exception as e1:
logger.error(
f"保存PDF到临时路径失败。 tempPath: {tempPath}", exc_info=True
)
raise Exception(f"[Error] Unable to save PDF to [{tempPath}]: {e1}")
# 已保存到 .temp 并 close 原对象,尝试替换文件
try:
if os.path.exists(path):
os.remove(path)
os.rename(tempPath, path)
except Exception as e2:
logger.warning(
f"保存PDF文件替换失败。保存到临时文件: {tempPath}", exc_info=True
)
raise Exception(
f"[Warning] Unable to save PDF: [{path}]. Exception: {e2}. Saved to temporary path: [{tempPath}]."
)
else: # 正常结束
pdf.close()