Files
work-secretfile-selfcheck/src/inspect/docx_inspector.rs

82 lines
3.6 KiB
Rust
Raw Normal View History

// DOCX text spot check: read paragraphs with docx-rs 0.4, then run keyword matching.
use std::future::Future;
use std::path::Path;
use std::pin::Pin;
use std::sync::atomic::AtomicBool;
use docx_rs::{DocumentChild, ParagraphChild, RunChild, TableCellContent, TableChild, TableRowChild};
use crate::config::AppConfig;
use crate::inspect::{make_hit, Finding, Inspector};
use crate::matcher::keywords::{keywords_for, Matcher};
/// Stateless unit struct that provides the `Inspector` implementation for DOCX files.
pub struct DocxInspector;
impl Inspector for DocxInspector {
fn inspect<'a>(
&'a self,
path: &'a Path,
cfg: &'a AppConfig,
_cancel: &'a AtomicBool,
log: &'a (dyn Fn(&str) + Send + Sync),
) -> Pin<Box<dyn Future<Output = anyhow::Result<Finding>> + Send + 'a>> {
Box::pin(async move {
log(" 解析 DOCX 文本……");
let bytes = std::fs::read(path)?;
// docx-rs 0.4:使用 read_docx 解析整个 zip
let doc = docx_rs::read_docx(&bytes).map_err(|e| anyhow::anyhow!("docx-rs 解析失败:{:?}", e))?;
let mut text = String::new();
for d in doc.document.children.iter() {
match d {
DocumentChild::Paragraph(p) => {
// p: &Box<Paragraph>
for pc in p.children.iter() {
if let ParagraphChild::Run(r) = pc {
for rc in r.children.iter() {
if let RunChild::Text(t) = rc {
text.push_str(&t.text);
}
}
}
}
text.push('\n');
}
DocumentChild::Table(t) => {
// t: &Box<Table>t.rows: Vec<TableChild>
for tc in t.rows.iter() {
if let TableChild::TableRow(row) = tc {
for rc in row.cells.iter() {
if let TableRowChild::TableCell(cell) = rc {
for cc in cell.children.iter() {
if let TableCellContent::Paragraph(p) = cc {
for pc in p.children.iter() {
if let ParagraphChild::Run(r) = pc {
for rcc in r.children.iter() {
if let RunChild::Text(t) = rcc {
text.push_str(&t.text);
}
}
}
}
text.push('\n');
}
}
}
text.push('\t');
}
text.push('\n');
}
}
}
_ => {}
}
}
let kws = keywords_for("docx", &cfg.keyword);
let m = Matcher::new(kws, &cfg.keyword);
let hits = m.find(&text);
Ok(make_hit(path, "docx", hits, text, None))
})
}
}