docs: 添加涉密文件自检工具实施计划
This commit is contained in:
30
UmiOCR-data/py_src/ocr/output/__init__.py
Normal file
30
UmiOCR-data/py_src/ocr/output/__init__.py
Normal file
@@ -0,0 +1,30 @@
|
||||
from .output_txt import OutputTxt
|
||||
from .output_txt_plain import OutputTxtPlain
|
||||
from .output_txt_individual import OutputTxtIndividual
|
||||
from .output_md import OutputMD
|
||||
from .output_jsonl import OutputJsonl
|
||||
from .output_csv import OutputCsv
|
||||
from .output_pdf_layered import OutputPdfLayered
|
||||
from .output_pdf_one_layer import OutputPdfOneLayer
|
||||
|
||||
"""纯文本输出器。初始化传入参数字典:
|
||||
outputArgd = {
|
||||
"outputDir": "", # 输出路径
|
||||
"outputDirType": "", # 输出目录类型,"source" 为原文件目录,"specify"为指定目录
|
||||
"outputFileName": "", # 输出文件名(前缀)
|
||||
"startDatetime": "", # 开始日期字符串(标准格式)
|
||||
"ignoreBlank": True/False, # 忽略空白文件
|
||||
}
|
||||
"""
|
||||
# Registry: maps an output-format key to the outputter class that implements it.
Output = {
    # Plain-text outputters
    "txt": OutputTxt,
    "txtPlain": OutputTxtPlain,
    "txtIndividual": OutputTxtIndividual,
    "md": OutputMD,
    "jsonl": OutputJsonl,
    "csv": OutputCsv,
    # PDF outputters; they need the extra argument "originPath" (original file path)
    "pdfLayered": OutputPdfLayered,
    "pdfOneLayer": OutputPdfOneLayer,
}
|
||||
32
UmiOCR-data/py_src/ocr/output/output.py
Normal file
32
UmiOCR-data/py_src/ocr/output/output.py
Normal file
@@ -0,0 +1,32 @@
|
||||
# OCR输出器的基类。按指定的格式,将传入的文本输出到指定地方。
|
||||
|
||||
from .tools import getDataText
|
||||
from ...platform import Platform
|
||||
import os
|
||||
|
||||
|
||||
class Output:
    """Base OCR outputter; this implementation prints each result to stdout."""

    def __init__(self, argd):
        """Read the output settings from the argument dict.

        Args:
            argd: dict providing "outputDir", "outputFileName" and "ignoreBlank".
        """
        self.dir = argd["outputDir"]  # output directory
        self.fileName = argd["outputFileName"]  # file name (without extension)
        self.outputPath = f"{self.dir}/{self.fileName}.txt"  # full output path
        self.ignoreBlank = argd["ignoreBlank"]  # skip results that carry no text

    def print(self, res):
        """Print one OCR result (path, code and text or failure reason)."""
        code = res["code"]
        if code != 100 and self.ignoreBlank:
            return  # blank or failed image, and the caller asked to skip those
        header = f"图片路径:{res['path']}\n代码:{res['code']}\n"
        if code == 100:
            body = getDataText(res["data"])  # joined recognition text
        elif code == 101:
            body = "无文字"
        else:
            body = f"错误原因:{res['data']}"
        print(header + body)

    def openOutputFile(self):
        """Open the output file through Platform.startfile, if it exists."""
        target = self.outputPath
        if not target or not os.path.exists(target):
            return
        Platform.startfile(target)

    def onEnd(self):
        """Hook called when output is finished; nothing to do in the base class."""
|
||||
70
UmiOCR-data/py_src/ocr/output/output_csv.py
Normal file
70
UmiOCR-data/py_src/ocr/output/output_csv.py
Normal file
@@ -0,0 +1,70 @@
|
||||
# 输出到csv表格文件
|
||||
|
||||
import csv
|
||||
|
||||
from umi_log import logger
|
||||
from .output import Output
|
||||
from .tools import getDataText
|
||||
|
||||
|
||||
class OutputCsv(Output):
    """Outputter that buffers OCR results and writes them to one CSV file.

    print() only collects rows; onEnd() chooses an encoding from
    ``self.encodings`` and writes the whole table.
    """

    def __init__(self, argd):
        """Store the output settings and create (truncate) the target file.

        Args:
            argd: dict providing "outputDir", "outputFileName" and "ignoreBlank".

        Raises:
            Exception: if the CSV file cannot be created.
        """
        # Candidate encodings, tried in this order by onEnd().
        self.encodings = [
            "ansi",  # Windows local code page; raises on Linux and macOS
            "ascii",  # plain English
            "gbk",  # Simplified Chinese
            "big5",  # Traditional Chinese
            "shift_jis",  # Japanese
            "euc-kr",  # Korean
            "utf-8",
        ]
        self.dir = argd["outputDir"]  # output directory
        self.fileName = argd["outputFileName"]  # file name (without extension)
        self.outputPath = f"{self.dir}/{self.fileName}.csv"  # full output path
        self.ignoreBlank = argd["ignoreBlank"]  # skip results that carry no text
        self.writeLists = []  # buffered rows: [name, text, path]
        self.writeText = ""  # all buffered OCR text, concatenated
        try:
            # Create/overwrite the file now so an unwritable path fails early.
            with open(self.outputPath, "w", encoding="utf-8"):
                pass
        except Exception as e:
            raise Exception(f"Failed to create csv file. {e}\n创建csv文件失败。") from e

    def print(self, res):
        """Buffer one OCR result as a CSV row."""
        code = res["code"]
        if code != 100 and self.ignoreBlank:
            return  # blank or failed image, and the caller asked to skip those
        name = res["fileName"]
        path = res["path"]
        if code == 100:
            textOut = getDataText(res["data"])  # joined recognition text
        elif code == 101:
            textOut = ""
        else:
            textOut = f'[Error] OCR failed. Code: {res["code"]}, Msg: {res["data"]} .\n'
        self.writeLists.append([name, textOut, path])
        self.writeText += textOut

    def onEnd(self):
        """Pick an encoding and write the header plus all buffered rows.

        Raises:
            Exception: if the CSV file cannot be written.
        """
        # The encoding must be able to represent every cell that will be written:
        # file names and paths as well as the OCR text. Probing only the text
        # could select e.g. "ascii" and then fail on a non-ASCII file name.
        probe = "".join(str(cell) for row in self.writeLists for cell in row)
        encoding = "utf-8"
        for candidate in self.encodings:
            try:
                probe.encode(candidate)
            except Exception:
                # Deliberately broad: covers both "cannot encode this text" and
                # "encoding unknown on this platform"; just try the next one.
                continue
            encoding = candidate
            break
        logger.info(f"csv encoding: {encoding}")
        headers = ["Name", "OCR", "Path"]
        try:
            # newline="" lets the csv module control line endings itself.
            with open(self.outputPath, "w", encoding=encoding, newline="") as f:
                writer = csv.writer(f)
                writer.writerow(headers)
                writer.writerows(self.writeLists)
        except Exception as e:
            raise Exception(f"Failed to write csv file. {e}\n写入csv文件失败。") from e
|
||||
24
UmiOCR-data/py_src/ocr/output/output_jsonl.py
Normal file
24
UmiOCR-data/py_src/ocr/output/output_jsonl.py
Normal file
@@ -0,0 +1,24 @@
|
||||
# 输出到jsonl文件
|
||||
|
||||
from .output import Output
|
||||
|
||||
import json
|
||||
|
||||
|
||||
class OutputJsonl(Output):
    """Outputter that appends every OCR result to a .jsonl file, one JSON per line."""

    def __init__(self, argd):
        """Store the output settings and create (truncate) the target file.

        Args:
            argd: dict providing "outputDir", "outputFileName" and "ignoreBlank".

        Raises:
            Exception: if the jsonl file cannot be created.
        """
        self.dir = argd["outputDir"]  # output directory
        self.fileName = argd["outputFileName"]  # file name (without extension)
        self.outputPath = f"{self.dir}/{self.fileName}.jsonl"  # full output path
        self.ignoreBlank = argd["ignoreBlank"]  # stored, but not consulted by print()
        try:
            # Opening in "w" mode creates the file or empties an existing one.
            with open(self.outputPath, "w", encoding="utf-8"):
                pass
        except Exception as e:
            raise Exception(f"Failed to create jsonl file. {e}\n创建jsonl文件失败。")

    def print(self, res):
        """Append the raw result dict as one JSON line; blank results are kept."""
        with open(self.outputPath, "a", encoding="utf-8") as fh:
            line = json.dumps(res, ensure_ascii=False)
            fh.write(line + "\n")
|
||||
46
UmiOCR-data/py_src/ocr/output/output_md.py
Normal file
46
UmiOCR-data/py_src/ocr/output/output_md.py
Normal file
@@ -0,0 +1,46 @@
|
||||
# 输出markdown格式
|
||||
|
||||
from .output import Output
|
||||
from .tools import getDataText
|
||||
|
||||
import os
|
||||
|
||||
|
||||
class OutputMD(Output):
    """Outputter that appends OCR results to a Markdown file as block quotes."""

    def __init__(self, argd):
        """Store the output settings and create the file with a date header.

        Args:
            argd: dict providing "outputDir", "outputFileName", "ignoreBlank"
                and "startDatetime".

        Raises:
            Exception: if the Markdown file cannot be created.
        """
        self.dir = argd["outputDir"]  # output directory
        self.fileName = argd["outputFileName"]  # file name (without extension)
        self.outputPath = f"{self.dir}/{self.fileName}.md"  # full output path
        self.ignoreBlank = argd["ignoreBlank"]  # skip results that carry no text
        try:
            # "w" mode: create the file or overwrite an existing one.
            with open(self.outputPath, "w", encoding="utf-8") as f:
                f.write(f'> {argd["startDatetime"]}\n\n')
        except Exception as e:
            # The message previously said "jsonl" (copy-paste slip); this is the md file.
            raise Exception(f"Failed to create md file. {e}\n创建md文件失败。") from e

    def print(self, res):
        """Append one OCR result: a separator, a link to the image, then the text."""
        code = res["code"]
        if code != 100 and self.ignoreBlank:
            return  # blank or failed image, and the caller asked to skip those
        name = res["fileName"]
        try:
            # Path of the image relative to the directory holding the md file.
            path = os.path.relpath(res["path"], os.path.dirname(self.outputPath))
        except ValueError:
            # On Windows relpath() raises when both paths are on different drives;
            # fall back to the path as given.
            path = res["path"]
        path = path.replace(" ", "%20")  # spaces are not allowed in a link target
        # NOTE(review): the line after "---" is blank in the reviewed copy; confirm
        # against the upstream template that no line (e.g. an image embed) was lost.
        textOut = f"""
---

[{name}]({path})

"""
        if code == 100:
            # Two trailing spaces form a Markdown hard line break, so every OCR
            # line stays on its own line inside the block quote.
            lines = getDataText(res["data"]).split("\n")
            textOut += "".join(f"> {t}  \n" for t in lines)
        elif code != 101:  # 101 means "no text": write the heading only
            textOut += f'> [Error] OCR failed. Code: {res["code"]}, Msg: {res["data"]}  \n> 【异常】OCR识别失败。  \n'
        with open(self.outputPath, "a", encoding="utf-8") as f:
            f.write(textOut)
|
||||
171
UmiOCR-data/py_src/ocr/output/output_pdf_layered.py
Normal file
171
UmiOCR-data/py_src/ocr/output/output_pdf_layered.py
Normal file
@@ -0,0 +1,171 @@
|
||||
# 双层可搜索 searchable pdf
|
||||
# https://github.com/pymupdf/PyMuPDF/discussions/2299
|
||||
|
||||
import os
|
||||
import fitz # PyMuPDF
|
||||
|
||||
from umi_log import logger
|
||||
from .output import Output
|
||||
|
||||
|
||||
class OutputPdfLayered(Output):
    """Outputter that writes OCR text into the document as a searchable PDF layer.

    The source document is opened (and converted to PDF if needed), OCR text is
    inserted page by page with opacity ``self.opacity`` (0 here), and the result
    is saved by onEnd().
    """

    def __init__(self, argd):
        """Store the settings, load the CJK font and open the source document.

        Args:
            argd: dict providing "outputDir", "originPath", "outputFileName"
                and "password".

        Raises:
            Exception: if the font or the document cannot be loaded.
        """
        self.dir = argd["outputDir"]  # output directory
        self.originPath = argd["originPath"]  # path of the original document
        self.fileName = argd["outputFileName"]  # file name (without extension)
        self.password = argd["password"]  # password for encrypted documents
        self.outputPath = f"{self.dir}/{self.fileName}.layered.pdf"  # output path
        self.pdf = None
        self.existentPages = []  # zero-based numbers of the pages already handled
        self.isInsertFont = False  # whether any page got the font embedded
        self.opacity = 0  # text opacity 0: the inserted text is invisible
        try:
            self.font = fitz.Font("cjk")
        except Exception as e:
            raise Exception(f"Failed to load cjk font. {e}\n无法加载cjk字体。") from e
        try:
            self.pdf = self._getPDF(self.originPath)
        except Exception as e:
            raise Exception(
                f"Failed to load doc file. {e}\n无法加载文档。\n{self.originPath}"
            ) from e

    def _getPDF(self, path):
        """Open ``path`` and return it as a PDF document object.

        A PDF is returned as opened. Any other document type is converted to
        PDF, copying the table of contents, metadata and links.

        Raises:
            Exception: if the document is encrypted and the password is wrong.
        """
        # https://github.com/pymupdf/PyMuPDF-Utilities/blob/master/examples/convert-document/convert.py
        doc = fitz.open(path)
        # Encrypted document: try to unlock it with the supplied password.
        if doc.is_encrypted and not doc.authenticate(self.password):
            doc.close()  # do not leave the file handle open on failure
            # NOTE(review): this message echoes the supplied password; confirm
            # that is acceptable wherever the exception text ends up.
            raise Exception(
                f'The document is encrypted, and the password "{self.password}" is incorrect.\n文档已加密,输入密码不正确。'
            )
        if doc.is_pdf:
            return doc
        b = doc.convert_to_pdf()  # binary PDF data of the converted document
        pdf = fitz.open("pdf", b)
        try:
            pdf.set_toc(doc.get_toc())  # copy the table of contents
        except Exception:
            logger.warning("pdf.set_toc error", exc_info=True, stack_info=True)
        # Copy the metadata (author, title, ...), filling empty producer/creator.
        meta = doc.metadata
        if not meta["producer"]:
            meta["producer"] = "Umi-OCR & PyMuPDF v" + fitz.VersionBind
        if not meta["creator"]:
            meta["creator"] = "Umi-OCR & PyMuPDF PDF converter"
        pdf.set_metadata(meta)
        # Copy the links of every page.
        for pinput in doc:
            links = pinput.get_links()
            pout = pdf[pinput.number]
            for link in links:
                if link["kind"] == fitz.LINK_NAMED:  # named links are not handled
                    continue
                pout.insert_link(link)
        doc.close()  # release the original document
        return pdf

    def _calculateFontSize(self, text, w, h):
        """Return the font size at which ``text`` fills one line of the w x h box.

        The size is searched in whole steps first and then refined in steps of
        0.1; shrinking stops once the size drops below ``minSize``.
        """
        if h > w:  # vertical text: measure it as if it were horizontal
            w, h = h, w
        fontsize = round(h)  # initial guess: the line height
        minSize = 5  # lower bound for shrinking

        def getLen(size):
            return self.font.text_length(text, fontsize=size)

        # Zero-width text (e.g. an empty string) measures 0 at every size, so
        # the growth loop below could never reach w and would never terminate.
        if getLen(minSize) <= 0:
            return fontsize
        while getLen(fontsize) > w and fontsize >= minSize:
            fontsize -= 1  # shrink until the line fits
        while getLen(fontsize) < w:
            fontsize += 1  # grow until the line just exceeds the width
        while getLen(fontsize) > w and fontsize >= minSize:
            fontsize -= 0.1  # shrink again with 0.1 precision
        return fontsize

    def print(self, res):
        """Insert the text blocks of one page result into the matching PDF page."""
        if not self.pdf:
            logger.error("self.pdf 未初始化。")
            return
        pno = res["page"] - 1  # res["page"] is one-based
        self.existentPages.append(pno)  # remember the page even if it has no text
        if not res["code"] == 100:
            return  # nothing to insert for blank or failed pages

        page = self.pdf[pno]
        page.clean_contents()  # clean/repair the content stream to reduce errors
        protation = page.rotation  # page rotation angle
        isInsertFont = False  # whether this page already got the font
        for tb in res["data"]:
            if self.opacity == 0 and "from" in tb and tb["from"] == "text":
                # Invisible-layer mode: text extracted directly from the document
                # is already there, only OCR text is written.
                continue
            if not isInsertFont:
                self.isInsertFont = isInsertFont = True
                page.insert_font(fontname="cjk", fontbuffer=self.font.buffer)
            text = tb["text"]
            box = tb["box"]
            x0, y0 = box[0]
            x2, y2 = box[2]
            w = x2 - x0
            h = y2 - y0
            fontsize = self._calculateFontSize(text, w, h)
            # Insertion point, mapped through the page's derotation matrix.
            point = fitz.Point(x0, y2) * page.derotation_matrix
            page.insert_text(
                point,
                text,
                fontsize,
                fontname="cjk",
                rotate=protation,  # text angle
                stroke_opacity=self.opacity,  # outline opacity
                fill_opacity=self.opacity,  # fill (glyph) opacity
            )

    def onEnd(self):
        """Drop the pages that were never processed and save the PDF."""
        if not self.pdf:
            return
        processed = set(self.existentPages)  # O(1) membership tests
        # Delete from the back so the remaining page numbers stay valid.
        for i in range(len(self.pdf) - 1, -1, -1):
            if i not in processed:
                self.pdf.delete_page(i)
        logger.info(f"保存{len(self.pdf)}页PDF:{self.outputPath}")
        if self.isInsertFont:
            # A font was embedded: subset it to reduce the file size.
            try:
                self.pdf.subset_fonts()  # needs the fontTools library
            except Exception:  # TODO: cause unknown; maybe the file holds no font
                logger.error("构建字体子集失败。", exc_info=True, stack_info=True)
            # Compress and run level-3 garbage collection (same as ez_save).
            self.save(self.pdf, self.outputPath, deflate=True, garbage=3)
        else:
            # No embedded font: save directly, without compression.
            self.save(self.pdf, self.outputPath)

    def save(self, pdf, path, **options):
        """Save ``pdf`` to ``path`` and close it.

        If the direct save fails, the document is saved to a ".temp" file next
        to ``self.outputPath`` and that file is then moved over ``path``.

        Raises:
            Exception: if the temporary save fails, or if the temporary file
                cannot replace ``path``.
        """
        try:
            pdf.save(path, **options)
        except Exception:
            tempPath = self.outputPath + ".temp"
            logger.warning(f"保存PDF失败。 path: {path}", exc_info=True)
            try:
                pdf.save(tempPath, **options)
                pdf.close()
            except Exception as e1:
                logger.error(
                    f"保存PDF到临时路径失败。 tempPath: {tempPath}", exc_info=True
                )
                raise Exception(
                    f"[Error] Unable to save PDF to [{tempPath}]: {e1}"
                ) from e1
            # Saved to the temp file and the document is closed: swap the files.
            try:
                if os.path.exists(path):
                    os.remove(path)
                os.rename(tempPath, path)
            except Exception as e2:
                logger.warning(
                    f"保存PDF文件替换失败。保存到临时文件: {tempPath}", exc_info=True
                )
                raise Exception(
                    f"[Warning] Unable to save PDF: [{path}]. Exception: {e2}. Saved to temporary path: [{tempPath}]."
                ) from e2
        else:
            pdf.close()
|
||||
43
UmiOCR-data/py_src/ocr/output/output_pdf_one_layer.py
Normal file
43
UmiOCR-data/py_src/ocr/output/output_pdf_one_layer.py
Normal file
@@ -0,0 +1,43 @@
|
||||
# 单层纯文本 PDF
|
||||
|
||||
import fitz # PyMuPDF
|
||||
|
||||
from umi_log import logger
|
||||
from .output_pdf_layered import OutputPdfLayered
|
||||
|
||||
|
||||
class OutputPdfOneLayer(OutputPdfLayered):
    """Outputter producing a text-only PDF: blank pages carrying visible OCR text."""

    def __init__(self, argd):
        """Initialise like the parent, then switch to opaque text and a ".text.pdf" path."""
        super().__init__(argd)
        self.opacity = 1  # text is fully opaque
        self.outputPath = f"{self.dir}/{self.fileName}.text.pdf"  # output path

    def _getPDF(self, path):
        """Build a blank PDF with the same page sizes as the document at ``path``.

        Metadata and, when possible, the table of contents are copied over.

        Raises:
            Exception: if the document is encrypted and the password is wrong.
        """
        source_doc = fitz.open(path)
        try:
            # Encrypted document: try to unlock it with the supplied password.
            if source_doc.is_encrypted and not source_doc.authenticate(self.password):
                raise Exception(
                    f'The document is encrypted, and the password "{self.password}" is incorrect.\n文档已加密,输入密码不正确。'
                )
            pdf = fitz.open()  # new, empty PDF document
            # Copy the metadata (author, title, ...), filling empty producer/creator.
            meta = source_doc.metadata
            if not meta["producer"]:
                meta["producer"] = "Umi-OCR & PyMuPDF v" + fitz.VersionBind
            if not meta["creator"]:
                meta["creator"] = "Umi-OCR & PyMuPDF PDF converter"
            pdf.set_metadata(meta)
            # One blank page per source page, with the source page's size.
            for page in source_doc:
                rect = page.rect
                pdf.new_page(width=rect.width, height=rect.height)
            # Copying the table of contents is best effort.
            try:
                pdf.set_toc(source_doc.get_toc())
            except Exception:
                logger.warning(
                    f"pdf.set_toc error. path: {path}", exc_info=True, stack_info=True
                )
            return pdf
        finally:
            # Release the original document on success and on every failure path.
            source_doc.close()
|
||||
33
UmiOCR-data/py_src/ocr/output/output_txt.py
Normal file
33
UmiOCR-data/py_src/ocr/output/output_txt.py
Normal file
@@ -0,0 +1,33 @@
|
||||
# 输出到txt文件
|
||||
|
||||
from .output import Output
|
||||
from .tools import getDataText
|
||||
|
||||
|
||||
class OutputTxt(Output):
    """Outputter that appends every OCR result to one shared .txt file."""

    def __init__(self, argd):
        """Store the output settings and create the file with a date header.

        Args:
            argd: dict providing "outputDir", "outputFileName", "ignoreBlank"
                and "startDatetime".

        Raises:
            Exception: if the txt file cannot be created.
        """
        self.dir = argd["outputDir"]  # output directory
        self.fileName = argd["outputFileName"]  # file name (without extension)
        self.outputPath = f"{self.dir}/{self.fileName}.txt"  # full output path
        self.ignoreBlank = argd["ignoreBlank"]  # skip results that carry no text
        try:
            # "w" mode: create the file or overwrite an existing one.
            with open(self.outputPath, "w", encoding="utf-8") as fh:
                fh.write(f'{argd["startDatetime"]}\n\n')  # start date/time header
        except Exception as e:
            raise Exception(f"Failed to create txt file. {e}\n创建txt文件失败。")

    def print(self, res):
        """Append one OCR result: a file-name heading followed by text or error."""
        code = res["code"]
        if code != 100 and self.ignoreBlank:
            return  # blank or failed image, and the caller asked to skip those
        pieces = [f'≦ {res["fileName"]} ≧\n']
        if code == 100:
            pieces.append(getDataText(res["data"]))  # joined recognition text
            pieces.append("\n")  # extra newline after the text
        elif code != 101:  # 101 means "no text": heading only
            pieces.append(
                f'[Error] OCR failed. Code: {res["code"]}, Msg: {res["data"]}\n【异常】OCR识别失败。\n'
            )
        pieces.append("\n")  # blank line between entries
        with open(self.outputPath, "a", encoding="utf-8") as fh:
            fh.write("".join(pieces))
|
||||
35
UmiOCR-data/py_src/ocr/output/output_txt_individual.py
Normal file
35
UmiOCR-data/py_src/ocr/output/output_txt_individual.py
Normal file
@@ -0,0 +1,35 @@
|
||||
# 单独txt文件
|
||||
|
||||
import os
|
||||
from .output import Output
|
||||
from .tools import getDataText
|
||||
|
||||
|
||||
class OutputTxtIndividual(Output):
    """Outputter that writes each OCR result to its own .txt file."""

    def __init__(self, argd):
        """Initialise the base settings and read "outputDirType" from ``argd``."""
        super().__init__(argd)
        # True when each txt goes next to its source file.
        self.outputSource = argd["outputDirType"] == "source"

    def openOutputFile(self):
        """Override the parent method with a no-op."""

    def print(self, res):
        """Write the text (or error message) of one result to its own txt file."""
        code = res["code"]
        if code != 100 and self.ignoreBlank:
            return  # blank or failed image, and the caller asked to skip those
        if code == 100:
            content = getDataText(res["data"])  # joined recognition text
        elif code == 101:
            content = ""
        else:
            content = f'[Error] OCR failed. Code: {res["code"]}, Msg: {res["data"]}\n【异常】OCR识别失败。\n'
        if self.outputSource:
            # Next to the source: original path with the extension replaced.
            stem = os.path.splitext(res["path"])[0]
            target = stem + ".txt"
        else:
            # Specified directory: original file name with the extension replaced.
            stem = os.path.splitext(res["fileName"])[0]
            target = f"{self.dir}/{stem}.txt"
        # "w" mode: an existing file of the same name is overwritten.
        with open(target, "w", encoding="utf-8") as fh:
            fh.write(content)
|
||||
29
UmiOCR-data/py_src/ocr/output/output_txt_plain.py
Normal file
29
UmiOCR-data/py_src/ocr/output/output_txt_plain.py
Normal file
@@ -0,0 +1,29 @@
|
||||
# 纯文本(无格式)txt文件
|
||||
|
||||
from .output import Output
|
||||
from .tools import getDataText
|
||||
|
||||
|
||||
class OutputTxtPlain(Output):
    """Outputter that appends only the recognised text to a plain ".p.txt" file."""

    def __init__(self, argd):
        """Store the output settings and create (truncate) the target file.

        Args:
            argd: dict providing "outputDir" and "outputFileName".

        Raises:
            Exception: if the plain txt file cannot be created.
        """
        self.dir = argd["outputDir"]  # output directory
        self.fileName = argd["outputFileName"]  # file name (without extension)
        self.outputPath = f"{self.dir}/{self.fileName}.p.txt"  # full output path
        try:
            # "w" mode: create the file or overwrite an existing one. The
            # encoding matches the one used when appending in print().
            with open(self.outputPath, "w", encoding="utf-8"):
                pass
        except Exception as e:
            raise Exception(
                f"Failed to create plain txt file. {e}\n创建纯文本txt文件失败。"
            ) from e

    def print(self, res):
        """Append the text of a successful result; other results are always skipped."""
        if res["code"] != 100:
            return  # blank and failed images are ignored unconditionally
        textOut = getDataText(res["data"])  # joined recognition text
        # endswith() also copes with empty text, where indexing [-1] would raise.
        if not textOut.endswith("\n"):
            textOut += "\n"  # make sure the entry ends with a newline
        with open(self.outputPath, "a", encoding="utf-8") as f:
            f.write(textOut)
|
||||
9
UmiOCR-data/py_src/ocr/output/tools.py
Normal file
9
UmiOCR-data/py_src/ocr/output/tools.py
Normal file
@@ -0,0 +1,9 @@
|
||||
# 从data中提取、拼接文本
|
||||
def getDataText(data):
    """Join the text of all text blocks in ``data`` into one string.

    Each block contributes its "text"; every block except the last one is
    followed by its own "end" separator.

    Args:
        data: sized sequence of dicts with the keys "text" and "end" (the
            "end" of the last block is never read).

    Returns:
        The concatenated text, or "" when ``data`` is empty.
    """
    last = len(data) - 1
    parts = []  # collected pieces, joined once instead of repeated string +=
    for i, tb in enumerate(data):
        parts.append(tb["text"])
        if i < last:
            parts.append(tb["end"])
    return "".join(parts)
|
||||
Reference in New Issue
Block a user