fn decode_entities(text: &str) -> String { let mut result = String::with_capacity(text.len()); let mut chars = text.chars(); while let Some(c) = chars.next() { if c == '&' { let mut entity = String::new(); for ec in &mut chars { if ec == ';' { break; } entity.push(ec); } let decoded = match entity.as_str() { "amp" => "&", "lt" => "<", "gt" => ">", "quot" => "\"", "nbsp" => " ", "emsp" => " ", "ensp" => " ", "mdash" => "—", "ndash" => "–", "ldquo" => "\"", "rdquo" => "\"", "lsquo" => "'", "rsquo" => "'", "hellip" => "…", _ => "", }; result.push_str(decoded); } else { result.push(c); } } result } fn tag_name_from(tag_content: &str) -> &str { tag_content .split_whitespace() .next() .unwrap_or("") .trim_end_matches('/') } pub fn strip_html(input: &str) -> String { let mut out = String::with_capacity(input.len()); let mut pos = 0; let mut heading_level: Option = None; while pos < input.len() { // Find next '<' let remaining = &input[pos..]; let tag_start = remaining.find('<'); let tag_start = match tag_start { Some(s) => pos + s, None => { // No more tags, emit remaining text out.push_str(&decode_entities(&input[pos..])); break; } }; // Emit text before the tag if tag_start > pos { let text = decode_entities(&input[pos..tag_start]); if heading_level.is_some() { out.push_str("\x01"); out.push_str(&text); out.push_str("\x02"); } else { out.push_str(&text); } } // Find '>' to close the tag let tag_end = match input[tag_start..].find('>') { Some(i) => tag_start + i, None => { // Unclosed tag, emit rest as text out.push_str(&decode_entities(&input[tag_start..])); break; } }; let tag_content = &input[tag_start + 1..tag_end]; let name = tag_name_from(tag_content); match name { "script" | "style" => { // Skip content until closing tag let close_tag = format!("'); if let Some(ce) = close_tag_end { pos = tag_end + cs + ce + 1; continue; } } pos = tag_end + 1; } "br" => { if !out.is_empty() { out.push('\n'); } pos = tag_end + 1; } "hr" => { if !out.is_empty() { out.push_str("\n\n"); } out.push_str("---\n\n"); pos = tag_end + 1; } "li" => { if !out.is_empty() && !out.ends_with('\n') { out.push('\n'); } out.push_str("- "); pos = tag_end + 1; } "/li" | "/dd" | "/dt" | "/ol" | "/ul" => { pos = tag_end + 1; } "h1" => { heading_level = Some(1); pos = tag_end + 1; } "h2" => { heading_level = Some(2); pos = tag_end + 1; } "h3" => { heading_level = Some(3); pos = tag_end + 1; } "h4" => { heading_level = Some(4); pos = tag_end + 1; } "h5" => { heading_level = Some(5); pos = tag_end + 1; } "h6" => { heading_level = Some(6); pos = tag_end + 1; } "p" | "div" | "blockquote" => { pos = tag_end + 1; } "/p" | "/div" | "/blockquote" => { if !out.is_empty() && !out.ends_with('\n') { out.push('\n'); } out.push('\n'); pos = tag_end + 1; } "/h1" | "/h2" | "/h3" | "/h4" | "/h5" | "/h6" => { heading_level = None; if !out.is_empty() && !out.ends_with('\n') { out.push('\n'); } out.push('\n'); pos = tag_end + 1; } _ => { pos = tag_end + 1; } } } // Collapse 3+ consecutive newlines into 2 let out = out.trim(); let mut final_out = String::with_capacity(out.len()); let mut nl_count = 0usize; for c in out.chars() { if c == '\n' { nl_count += 1; if nl_count <= 2 { final_out.push(c); } } else { nl_count = 0; final_out.push(c); } } final_out } #[derive(Debug, Clone)] pub struct TocEntry { pub label: String, pub section: usize, pub children: Vec, } #[derive(Debug, Clone)] pub struct ContentBlock { pub text: String, pub is_heading: bool, } #[derive(Debug, Clone)] pub struct Section { pub title: String, pub content: String, pub blocks: Vec, pub page_block_ranges: Vec<(usize, usize)>, } #[derive(Debug, Clone, Copy, PartialEq)] pub enum BookLayout { Reflowable, FixedLayout, } impl BookLayout { pub fn label(&self) -> &str { match self { BookLayout::Reflowable => "重排", BookLayout::FixedLayout => "固定", } } } #[derive(Debug, Clone)] pub struct Book { pub title: String, pub author: String, pub cover: Option>, pub layout: BookLayout, pub sections: Vec
, pub toc: Vec, } use std::path::Path; fn detect_layout(doc: &mut epub::doc::EpubDoc>) -> BookLayout { if let Some(vals) = doc.metadata.get("rendition:layout") { if vals.iter().any(|v| v == "pre-paginated") { return BookLayout::FixedLayout; } } if let Ok(opf) = doc.get_resource_str_by_path(&doc.root_file.clone()) { if opf.contains("rendition:layout") && opf.contains("pre-paginated") { return BookLayout::FixedLayout; } if opf.contains("rendition:layout-pre-paginated") { return BookLayout::FixedLayout; } } BookLayout::Reflowable } pub fn load_epub(path: impl AsRef) -> Result { let path = path.as_ref(); let mut doc = epub::doc::EpubDoc::new(path) .map_err(|e| format!("无法打开文件: {}", e))?; let layout = detect_layout(&mut doc); let title = doc.mdata("title").unwrap_or_else(|| "未知标题".to_string()); let author = doc.mdata("creator").unwrap_or_else(|| "未知作者".to_string()); let cover = doc.get_cover().ok(); let spine = doc.spine.clone(); let mut sections = Vec::new(); for (i, href) in spine.iter().enumerate() { let raw_html = doc.get_resource_str(href) .map_err(|e| format!("读取章节失败: {}", e))?; let text = strip_html(&raw_html); let title = extract_title(&raw_html) .unwrap_or_else(|| format!("第{}章", i + 1)); sections.push(Section { title, content: text, blocks: Vec::new(), page_block_ranges: Vec::new(), }); } let raw_toc = std::mem::take(&mut doc.toc); let toc = build_toc(&raw_toc, &spine); Ok(Book { title, author, cover, layout, sections, toc }) } fn extract_title(html: &str) -> Option { if let Some(start) = html.find("") { let rest = &html[start + 7..]; if let Some(end) = rest.find("") { return Some(strip_html(&rest[..end]).trim().to_string()); } } if let Some(start) = html.find("') { let inner = &rest[content_start + 1..]; if let Some(end) = inner.find("") { return Some(strip_html(&inner[..end]).trim().to_string()); } } } None } fn build_toc( entries: &[epub::doc::NavPoint], spine: &[String], ) -> Vec { entries .iter() .map(|e| { let content_str = e.content.to_string_lossy(); let section = spine .iter() .position(|s| content_str.contains(s.trim_end_matches('/'))) // unwrap_or(0) is safe: a real TOC entry should always match a spine item .unwrap_or(0); TocEntry { label: e.label.clone(), section, children: build_toc(&e.children, spine), } }) .collect() } #[cfg(test)] mod tests { use super::*; #[test] fn test_epub_loader_nonexistent_file() { let result = load_epub("nonexistent.epub"); assert!(result.is_err()); } #[test] fn test_strip_html_plain_text() { assert_eq!(strip_html("Hello World"), "Hello World"); } #[test] fn test_strip_html_simple_tags() { assert_eq!(strip_html("

Hello

"), "Hello"); } #[test] fn test_strip_html_nested_tags() { assert_eq!( strip_html("

Hello World

"), "Hello World" ); } #[test] fn test_strip_html_html_entities() { assert_eq!(strip_html("Hello & World"), "Hello & World"); assert_eq!(strip_html("Hello World"), "Hello World"); } #[test] fn test_strip_html_empty() { assert_eq!(strip_html(""), ""); } #[test] fn test_extract_title_from_title_tag() { let html = "My Book Title"; assert_eq!(extract_title(html), Some("My Book Title".to_string())); } #[test] fn test_extract_title_from_h1() { let html = "

Chapter One

text

"; assert_eq!(extract_title(html), Some("Chapter One".to_string())); } #[test] fn test_extract_title_prefers_title() { let html = "Book

Chapter

"; assert_eq!(extract_title(html), Some("Book".to_string())); } #[test] fn test_extract_title_missing() { assert_eq!(extract_title("

no title

"), None); } #[test] fn test_extract_title_empty() { assert_eq!(extract_title(""), None); } #[test] fn test_html_to_plain_paragraphs() { let html = "

第一段

第二段

"; let result = strip_html(html); assert!(result.contains("第一段")); assert!(result.contains("第二段")); assert!(result.contains('\n')); assert!(result.ends_with("第二段")); } #[test] fn test_html_to_plain_heading() { let html = "

标题

正文

"; let result = strip_html(html); assert!(result.contains("标题")); assert!(result.contains("正文")); assert!(result.contains('\n')); } #[test] fn test_html_to_plain_list() { let html = "
  • 项目一
  • 项目二
"; let result = strip_html(html); assert!(result.starts_with("- ")); assert!(result.contains("项目一")); assert!(result.contains("项目二")); } #[test] fn test_html_to_plain_br() { let html = "第一行
第二行"; let result = strip_html(html); assert_eq!(result, "第一行\n第二行"); } #[test] fn test_html_to_plain_skip_script() { let html = "

正文

更多正文

"; let result = strip_html(html); assert!(result.contains("正文")); assert!(result.contains("更多正文")); assert!(!result.contains("var x=1")); } #[test] fn test_html_to_plain_line_break_collapse() { let html = "

段一

段二

段三

"; let result = strip_html(html); let non_empty: Vec<&str> = result.lines().filter(|l| !l.is_empty()).collect(); assert_eq!(non_empty.len(), 3); assert_eq!(non_empty[0], "段一"); assert_eq!(non_empty[1], "段二"); assert_eq!(non_empty[2], "段三"); } #[test] fn test_build_toc_empty() { let toc = build_toc(&[], &[]); assert!(toc.is_empty()); } }