// DOCX 文本抽检:docx-rs 0.4 读段落,关键词匹配 use std::future::Future; use std::path::Path; use std::pin::Pin; use std::sync::atomic::AtomicBool; use docx_rs::{DocumentChild, ParagraphChild, RunChild, TableCellContent, TableChild, TableRowChild}; use crate::config::AppConfig; use crate::inspect::{make_hit, Finding, Inspector}; use crate::matcher::keywords::{keywords_for, Matcher}; pub struct DocxInspector; impl Inspector for DocxInspector { fn inspect<'a>( &'a self, path: &'a Path, cfg: &'a AppConfig, _cancel: &'a AtomicBool, log: &'a (dyn Fn(&str) + Send + Sync), ) -> Pin> + Send + 'a>> { Box::pin(async move { log(" 解析 DOCX 文本……"); let bytes = std::fs::read(path)?; // docx-rs 0.4:使用 read_docx 解析整个 zip let doc = docx_rs::read_docx(&bytes).map_err(|e| anyhow::anyhow!("docx-rs 解析失败:{:?}", e))?; let mut text = String::new(); for d in doc.document.children.iter() { match d { DocumentChild::Paragraph(p) => { // p: &Box for pc in p.children.iter() { if let ParagraphChild::Run(r) = pc { for rc in r.children.iter() { if let RunChild::Text(t) = rc { text.push_str(&t.text); } } } } text.push('\n'); } DocumentChild::Table(t) => { // t: &Box;t.rows: Vec for tc in t.rows.iter() { if let TableChild::TableRow(row) = tc { for rc in row.cells.iter() { if let TableRowChild::TableCell(cell) = rc { for cc in cell.children.iter() { if let TableCellContent::Paragraph(p) = cc { for pc in p.children.iter() { if let ParagraphChild::Run(r) = pc { for rcc in r.children.iter() { if let RunChild::Text(t) = rcc { text.push_str(&t.text); } } } } text.push('\n'); } } } text.push('\t'); } text.push('\n'); } } } _ => {} } } let kws = keywords_for("docx", &cfg.keyword); let m = Matcher::new(kws, &cfg.keyword); let hits = m.find(&text); Ok(make_hit(path, "docx", hits, text, None)) }) } }