82 lines
3.6 KiB
Rust
82 lines
3.6 KiB
Rust
|
|
// DOCX 文本抽检:docx-rs 0.4 读段落,关键词匹配
|
|||
|
|
use std::future::Future;
|
|||
|
|
use std::path::Path;
|
|||
|
|
use std::pin::Pin;
|
|||
|
|
use std::sync::atomic::AtomicBool;
|
|||
|
|
|
|||
|
|
use docx_rs::{DocumentChild, ParagraphChild, RunChild, TableCellContent, TableChild, TableRowChild};
|
|||
|
|
|
|||
|
|
use crate::config::AppConfig;
|
|||
|
|
use crate::inspect::{make_hit, Finding, Inspector};
|
|||
|
|
use crate::matcher::keywords::{keywords_for, Matcher};
|
|||
|
|
|
|||
|
|
/// Inspector for DOCX files.
///
/// This is a stateless unit type: it carries no fields, so it is trivially
/// copyable and default-constructible. All work happens in its `Inspector`
/// implementation.
#[derive(Debug, Clone, Copy, Default)]
pub struct DocxInspector;
|
|||
|
|
|
|||
|
|
impl Inspector for DocxInspector {
|
|||
|
|
fn inspect<'a>(
|
|||
|
|
&'a self,
|
|||
|
|
path: &'a Path,
|
|||
|
|
cfg: &'a AppConfig,
|
|||
|
|
_cancel: &'a AtomicBool,
|
|||
|
|
log: &'a (dyn Fn(&str) + Send + Sync),
|
|||
|
|
) -> Pin<Box<dyn Future<Output = anyhow::Result<Finding>> + Send + 'a>> {
|
|||
|
|
Box::pin(async move {
|
|||
|
|
log(" 解析 DOCX 文本……");
|
|||
|
|
let bytes = std::fs::read(path)?;
|
|||
|
|
// docx-rs 0.4:使用 read_docx 解析整个 zip
|
|||
|
|
let doc = docx_rs::read_docx(&bytes).map_err(|e| anyhow::anyhow!("docx-rs 解析失败:{:?}", e))?;
|
|||
|
|
|
|||
|
|
let mut text = String::new();
|
|||
|
|
for d in doc.document.children.iter() {
|
|||
|
|
match d {
|
|||
|
|
DocumentChild::Paragraph(p) => {
|
|||
|
|
// p: &Box<Paragraph>
|
|||
|
|
for pc in p.children.iter() {
|
|||
|
|
if let ParagraphChild::Run(r) = pc {
|
|||
|
|
for rc in r.children.iter() {
|
|||
|
|
if let RunChild::Text(t) = rc {
|
|||
|
|
text.push_str(&t.text);
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
text.push('\n');
|
|||
|
|
}
|
|||
|
|
DocumentChild::Table(t) => {
|
|||
|
|
// t: &Box<Table>;t.rows: Vec<TableChild>
|
|||
|
|
for tc in t.rows.iter() {
|
|||
|
|
if let TableChild::TableRow(row) = tc {
|
|||
|
|
for rc in row.cells.iter() {
|
|||
|
|
if let TableRowChild::TableCell(cell) = rc {
|
|||
|
|
for cc in cell.children.iter() {
|
|||
|
|
if let TableCellContent::Paragraph(p) = cc {
|
|||
|
|
for pc in p.children.iter() {
|
|||
|
|
if let ParagraphChild::Run(r) = pc {
|
|||
|
|
for rcc in r.children.iter() {
|
|||
|
|
if let RunChild::Text(t) = rcc {
|
|||
|
|
text.push_str(&t.text);
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
text.push('\n');
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
text.push('\t');
|
|||
|
|
}
|
|||
|
|
text.push('\n');
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
_ => {}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
let kws = keywords_for("docx", &cfg.keyword);
|
|||
|
|
let m = Matcher::new(kws, &cfg.keyword);
|
|||
|
|
let hits = m.find(&text);
|
|||
|
|
Ok(make_hit(path, "docx", hits, text, None))
|
|||
|
|
})
|
|||
|
|
}
|
|||
|
|
}
|