// epub-read/src/book.rs
//
// EPUB loading: entity decoding, HTML-to-text conversion, section extraction
// and table-of-contents mapping.
/// Decodes HTML/XML character references in `text`.
///
/// Recognised forms:
/// * the named entities listed in `resolve` below (typographic quotes are
///   flattened to their ASCII equivalents, special spaces to a plain space);
/// * decimal (`&#8212;`) and hexadecimal (`&#x2014;`) numeric references.
///
/// Anything that is not a recognised, `;`-terminated reference is copied to
/// the output unchanged, so a bare ampersand ("Tom & Jerry") or an unknown
/// entity never causes surrounding text to be lost.
fn decode_entities(text: &str) -> String {
    /// Longest reference body (the part between `&` and `;`) that is considered.
    /// Bounding the look-ahead keeps the scan linear on ampersand-heavy input.
    const MAX_REFERENCE_LEN: usize = 32;

    /// Maps the body of a reference (without `&` and `;`) to its character.
    fn resolve(name: &str) -> Option<char> {
        if let Some(numeric) = name.strip_prefix('#') {
            let (digits, radix) = match numeric.strip_prefix(|c: char| c == 'x' || c == 'X') {
                Some(hex) => (hex, 16),
                None => (numeric, 10),
            };
            // `from_str_radix` would accept a leading sign, so validate digits first.
            if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
                return None;
            }
            return u32::from_str_radix(digits, radix)
                .ok()
                .and_then(char::from_u32)
                // Control characters are rejected (except common whitespace) so a
                // reference cannot inject low control bytes into the output.
                .filter(|c| !c.is_control() || matches!(*c, '\t' | '\n' | '\r'));
        }
        let ch = match name {
            "amp" => '&',
            "lt" => '<',
            "gt" => '>',
            "quot" | "ldquo" | "rdquo" => '"',
            "apos" | "lsquo" | "rsquo" => '\'',
            "nbsp" | "emsp" | "ensp" => ' ',
            "mdash" => '\u{2014}',
            "ndash" => '\u{2013}',
            "hellip" => '\u{2026}',
            _ => return None,
        };
        Some(ch)
    }

    let mut result = String::with_capacity(text.len());
    let mut rest = text;
    while let Some(amp) = rest.find('&') {
        result.push_str(&rest[..amp]);
        let after_amp = &rest[amp + 1..];
        // `;` is ASCII, so its byte position is always a valid char boundary.
        let resolved = after_amp
            .bytes()
            .take(MAX_REFERENCE_LEN + 1)
            .position(|b| b == b';')
            .and_then(|semi| resolve(&after_amp[..semi]).map(|ch| (ch, semi)));
        match resolved {
            Some((ch, semi)) => {
                result.push(ch);
                rest = &after_amp[semi + 1..];
            }
            None => {
                // Not a reference we understand: keep the ampersand literally and
                // continue scanning right after it.
                result.push('&');
                rest = after_amp;
            }
        }
    }
    result.push_str(rest);
    result
}
/// Returns the tag name from the text found between `<` and `>`.
///
/// The name is the first whitespace-delimited token with any trailing `/`
/// removed, so `br/` yields `br`. A leading `/` (closing tag) is kept, and
/// blank input yields an empty string.
fn tag_name_from(tag_content: &str) -> &str {
    let first_token = match tag_content.split_whitespace().next() {
        Some(token) => token,
        None => "",
    };
    first_token.trim_end_matches('/')
}
/// Extracts the value of the `id` attribute from the text between `<` and `>`.
///
/// Double-quoted values are searched first, then single-quoted ones. A match
/// only counts when `id=` starts an attribute, i.e. it sits at the very start
/// of `tag_content` or is preceded by whitespace; this keeps attributes that
/// merely end in "id" (`data-id`, `valid`, ...) from being mistaken for it.
///
/// Returns `None` when there is no `id` attribute or its value is empty.
fn extract_id_from_tag(tag_content: &str) -> Option<String> {
    // (text that opens the attribute value, quote character that closes it)
    const FORMS: [(&str, char); 2] = [("id=\"", '"'), ("id='", '\'')];

    for &(needle, quote) in FORMS.iter() {
        let mut search_from = 0;
        while let Some(rel) = tag_content[search_from..].find(needle) {
            let id_pos = search_from + rel;
            let value_start = id_pos + needle.len();
            let starts_attribute = tag_content[..id_pos]
                .chars()
                .next_back()
                .map_or(true, char::is_whitespace);
            if starts_attribute {
                let value = &tag_content[value_start..];
                if let Some(end_quote) = value.find(quote) {
                    if end_quote > 0 {
                        return Some(value[..end_quote].to_string());
                    }
                }
            }
            // Keep looking: an earlier hit may have been part of another attribute.
            search_from = value_start;
        }
    }
    None
}
/// Converts an HTML/XHTML string into plain text with lightweight structure.
///
/// The input is scanned tag by tag:
/// * text between tags is entity-decoded and appended to the output;
/// * `<script>` / `<style>` elements are skipped up to their closing tag;
/// * `<br>` becomes a newline, `<hr>` becomes `---`, `<li>` starts a `- ` line;
/// * closing `</p>`, `</div>`, `</blockquote>` and `</h1>`..`</h6>` end the
///   current line and add a blank line;
/// * every text run inside a heading is wrapped as
///   `\x01` + level digit + optional (`\x03` anchor `\x04`) + text + `\x02`.
///
/// The result is trimmed and runs of three or more newlines are collapsed to two.
pub fn strip_html(input: &str) -> String {
let mut out = String::with_capacity(input.len());
// Byte offset into `input` of the next unprocessed character.
let mut pos = 0;
// `Some(level)` while inside an open <h1>..<h6> element.
let mut heading_level: Option<u32> = None;
// Most recently captured `id` attribute, attached to the next heading text run.
let mut pending_anchor: Option<String> = None;
while pos < input.len() {
// Find the next '<'
let remaining = &input[pos..];
let tag_start = remaining.find('<');
let tag_start = match tag_start {
Some(s) => pos + s,
None => {
// No more tags: emit the remaining text (no heading markers are added here)
out.push_str(&decode_entities(&input[pos..]));
break;
}
};
// Emit the text that precedes the tag
if tag_start > pos {
let text = decode_entities(&input[pos..tag_start]);
if let Some(level) = heading_level {
// Heading text run: \x01 <level> [\x03 <anchor> \x04] <text> \x02
out.push('\x01');
out.push(char::from_digit(level, 10).unwrap_or('1'));
if let Some(ref anchor) = pending_anchor {
out.push('\x03');
out.push_str(anchor);
out.push('\x04');
}
out.push_str(&text);
out.push('\x02');
// The anchor is consumed by the first text run of the heading.
pending_anchor = None;
} else {
out.push_str(&text);
}
}
// Find the '>' that closes the tag
let tag_end = match input[tag_start..].find('>') {
Some(i) => tag_start + i,
None => {
// Unclosed tag: emit the rest as text
out.push_str(&decode_entities(&input[tag_start..]));
break;
}
};
let tag_content = &input[tag_start + 1..tag_end];
let name = tag_name_from(tag_content);
match name {
"script" | "style" => {
// Skip everything up to and including the matching closing tag
let close_tag = format!("</{}", name);
if let Some(cs) = input[tag_end..].find(&close_tag) {
let close_tag_end = input[tag_end + cs..].find('>');
if let Some(ce) = close_tag_end {
pos = tag_end + cs + ce + 1;
continue;
}
}
// No complete closing tag found: just step past the opening tag
pos = tag_end + 1;
}
"a" => {
// Capture the anchor id; outside a heading this replaces any pending
// anchor, including with `None` when the <a> has no id
if heading_level.is_none() {
pending_anchor = extract_id_from_tag(tag_content);
}
pos = tag_end + 1;
}
"br" => {
if !out.is_empty() {
out.push('\n');
}
pos = tag_end + 1;
}
"hr" => {
// Horizontal rule rendered as a `---` line surrounded by blank lines
if !out.is_empty() {
out.push_str("\n\n");
}
out.push_str("---\n\n");
pos = tag_end + 1;
}
"li" => {
// Each list item starts on its own line with a "- " bullet
if !out.is_empty() && !out.ends_with('\n') {
out.push('\n');
}
out.push_str("- ");
pos = tag_end + 1;
}
"/li" | "/dd" | "/dt" | "/ol" | "/ul" => {
// List closers produce no output
pos = tag_end + 1;
}
"h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
// The heading level is the digit after the 'h'
let level = name[1..2].parse::<u32>().unwrap_or(1);
heading_level = Some(level);
// Use the heading's own id only when no earlier anchor is pending
if pending_anchor.is_none() {
pending_anchor = extract_id_from_tag(tag_content);
}
pos = tag_end + 1;
}
"p" | "div" | "blockquote" => {
if pending_anchor.is_none() {
pending_anchor = extract_id_from_tag(tag_content);
}
pos = tag_end + 1;
}
"/p" | "/div" | "/blockquote" => {
// End the current line (if any) and add a blank line after the block
if !out.is_empty() && !out.ends_with('\n') {
out.push('\n');
}
out.push('\n');
pending_anchor = None;
pos = tag_end + 1;
}
"/h1" | "/h2" | "/h3" | "/h4" | "/h5" | "/h6" => {
// Leave heading mode, then end the line and add a blank line
heading_level = None;
pending_anchor = None;
if !out.is_empty() && !out.ends_with('\n') {
out.push('\n');
}
out.push('\n');
pos = tag_end + 1;
}
_ => {
// Any other tag: drop it, but remember its id if no anchor is pending
if pending_anchor.is_none() {
pending_anchor = extract_id_from_tag(tag_content);
}
pos = tag_end + 1;
}
}
}
// Trim, then collapse 3+ consecutive newlines into 2
let out = out.trim();
let mut final_out = String::with_capacity(out.len());
let mut nl_count = 0usize;
for c in out.chars() {
if c == '\n' {
nl_count += 1;
if nl_count <= 2 {
final_out.push(c);
}
} else {
nl_count = 0;
final_out.push(c);
}
}
final_out
}
/// One node of the book's table of contents, as produced by `build_toc`.
#[derive(Debug, Clone)]
pub struct TocEntry {
/// Display label, copied from the source navigation point.
pub label: String,
/// Index of the matching spine entry; `build_toc` uses 0 when nothing matches.
pub section: usize,
/// Text after `#` in the navigation target, when present and non-empty.
pub anchor: Option<String>,
/// Nested entries, converted recursively from the navigation point's children.
pub children: Vec<TocEntry>,
}
/// A unit of section content: a run of text plus its heading level and anchor.
///
/// NOTE(review): nothing in this part of the file constructs a `ContentBlock`;
/// presumably it is built elsewhere from `Section::content` — confirm.
#[derive(Debug, Clone)]
pub struct ContentBlock {
/// Text of the block.
pub text: String,
/// Heading level of the block.
pub heading_level: u8, // 0 = body, 1-6 = h1-h6
/// Anchor associated with the block, if any.
pub anchor: Option<String>,
}
/// One spine document of the book, as loaded by `load_epub`.
#[derive(Debug, Clone)]
pub struct Section {
/// Title taken from the document's `<title>` or first `<h1>`; `load_epub`
/// falls back to the 1-based section number.
pub title: String,
/// Plain-text rendering of the document produced by `strip_html`.
pub content: String,
/// Left empty by `load_epub`.
/// NOTE(review): presumably filled in later by layout code — confirm.
pub blocks: Vec<ContentBlock>,
/// Left empty by `load_epub`.
/// NOTE(review): looks like per-page (start, end) ranges into `blocks` — confirm.
pub page_block_ranges: Vec<(usize, usize)>,
}
/// Layout mode of a book, as determined by `detect_layout`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum BookLayout {
    /// Default when no pre-paginated marker is found.
    Reflowable,
    /// The package declares a pre-paginated (fixed) layout.
    FixedLayout,
}

impl BookLayout {
    /// Returns the short user-facing label for this layout.
    pub fn label(&self) -> &str {
        match *self {
            Self::FixedLayout => "固定",
            Self::Reflowable => "重排",
        }
    }
}
/// A fully loaded book, as returned by `load_epub`.
#[derive(Debug, Clone)]
pub struct Book {
/// Value of the `title` metadata entry, or a placeholder when it is missing.
pub title: String,
/// Value of the `creator` metadata entry, or a placeholder when it is missing.
pub author: String,
/// Raw cover bytes, or `None` when the cover could not be read.
pub cover: Option<Vec<u8>>,
/// Layout mode reported by `detect_layout`.
pub layout: BookLayout,
/// Spine documents in reading order, with navigation documents filtered out.
pub sections: Vec<Section>,
/// Table of contents whose entries point into `sections` by index.
pub toc: Vec<TocEntry>,
}
use std::path::Path;
/// Decides whether the book declares a fixed (pre-paginated) layout.
///
/// Two sources are consulted in order:
/// 1. the parsed metadata entry `rendition:layout`, looking for the value
///    `pre-paginated`;
/// 2. the raw text of the package document (`doc.root_file`), which counts as
///    fixed layout when it mentions both `rendition:layout` and
///    `pre-paginated`. That test also covers the item-level
///    `rendition:layout-pre-paginated` property, since it contains both substrings.
///
/// Anything else, including a package document that cannot be read, is
/// reported as reflowable.
fn detect_layout(doc: &mut epub::doc::EpubDoc<std::io::BufReader<std::fs::File>>) -> BookLayout {
    let declared_in_metadata = doc
        .metadata
        .get("rendition:layout")
        .map_or(false, |vals| vals.iter().any(|v| v == "pre-paginated"));
    if declared_in_metadata {
        return BookLayout::FixedLayout;
    }

    // The path is cloned because reading a resource needs `&mut doc`.
    let package_path = doc.root_file.clone();
    let declared_in_package = match doc.get_resource_str_by_path(&package_path) {
        Ok(opf) => opf.contains("rendition:layout") && opf.contains("pre-paginated"),
        Err(_) => false,
    };

    if declared_in_package {
        BookLayout::FixedLayout
    } else {
        BookLayout::Reflowable
    }
}
pub fn load_epub(path: impl AsRef<Path>) -> Result<Book, String> {
let path = path.as_ref();
let mut doc = epub::doc::EpubDoc::new(path)
.map_err(|e| format!("无法打开文件: {}", e))?;
let layout = detect_layout(&mut doc);
let title = doc.mdata("title").unwrap_or_else(|| "未知标题".to_string());
let author = doc.mdata("creator").unwrap_or_else(|| "未知作者".to_string());
let cover = doc.get_cover().ok();
let spine: Vec<String> = doc.spine.iter()
.filter(|id| {
if id.as_str() == "nav" { return false; }
if let Some((path, _)) = doc.resources.get(*id) {
let path_str = path.to_string_lossy().to_lowercase();
if path_str.ends_with("nav.xhtml") || path_str.ends_with("nav.html") {
return false;
}
}
true
})
.cloned()
.collect();
let spine_paths: Vec<String> = spine.iter().map(|id| {
doc.resources.get(id)
.map(|(path, _)| path.to_string_lossy().to_string())
.unwrap_or_else(|| id.clone())
}).collect();
let mut sections = Vec::new();
for (i, href) in spine.iter().enumerate() {
let raw_html = doc.get_resource_str(href)
.map_err(|e| format!("读取章节失败: {}", e))?;
let text = strip_html(&raw_html);
let title = extract_title(&raw_html)
.unwrap_or_else(|| format!("{}", i + 1));
sections.push(Section {
title,
content: text,
blocks: Vec::new(),
page_block_ranges: Vec::new(),
});
}
let raw_toc = std::mem::take(&mut doc.toc);
let toc = build_toc(&raw_toc, &spine, &spine_paths);
Ok(Book { title, author, cover, layout, sections, toc })
}
fn extract_title(html: &str) -> Option<String> {
if let Some(start) = html.find("<title>") {
let rest = &html[start + 7..];
if let Some(end) = rest.find("</title>") {
return Some(strip_html(&rest[..end]).trim().to_string());
}
}
if let Some(start) = html.find("<h1") {
let rest = &html[start..];
if let Some(content_start) = rest.find('>') {
let inner = &rest[content_start + 1..];
if let Some(end) = inner.find("</h1>") {
return Some(strip_html(&inner[..end]).trim().to_string());
}
}
}
None
}
/// Returns the last component of `path`, treating both `/` and `\` as separators.
///
/// Trailing `/` characters, then trailing `\` characters, are removed first, so
/// `"dir/"` yields `"dir"`. A path without separators is returned as is.
fn extract_filename(path: &str) -> &str {
    let trimmed = path.trim_end_matches('/').trim_end_matches('\\');
    match trimmed.rfind(|c: char| c == '/' || c == '\\') {
        // Both separators are one byte long, so `sep + 1` is a char boundary.
        Some(sep) => &trimmed[sep + 1..],
        None => trimmed,
    }
}
/// Returns the part of `path` after the first `#`, if there is one and it is
/// not empty.
fn extract_fragment(path: &str) -> Option<String> {
    path.find('#')
        .map(|hash_pos| &path[hash_pos + 1..])
        .filter(|fragment| !fragment.is_empty())
        .map(str::to_string)
}
/// Converts the navigation points of an EPUB into [`TocEntry`] values.
///
/// For every navigation point the target is split into a document path and an
/// optional `#fragment`. The fragment becomes the entry's anchor; the document
/// path is matched against `spine_paths` first and against the spine ids in
/// `spine` second. A candidate matches when its file name equals the target's
/// file name, or when one of the two strings contains the other. Entries that
/// match nothing are mapped to section 0. Children are converted recursively.
///
/// `spine` and `spine_paths` are expected to be index-aligned, so the returned
/// `section` values index the sections built from the same spine.
fn build_toc(
    entries: &[epub::doc::NavPoint],
    spine: &[String],
    spine_paths: &[String],
) -> Vec<TocEntry> {
    entries
        .iter()
        .map(|e| {
            let content_str = e.content.to_string_lossy();
            let anchor = extract_fragment(&content_str);
            // Match on the document part only: with the `#fragment` still
            // attached, "ch2.xhtml#s1" would never equal the spine file
            // "ch2.xhtml" and the entry could fall through to section 0.
            let content_path = content_str.split('#').next().unwrap_or("");
            let content_file = extract_filename(content_path);

            let matches_target = |candidate: &str| {
                extract_filename(candidate) == content_file
                    || content_path.contains(candidate)
                    || candidate.contains(content_path)
            };

            let section = spine_paths
                .iter()
                .position(|s| matches_target(s.as_str()))
                .or_else(|| spine.iter().position(|s| matches_target(s.as_str())))
                .unwrap_or(0);

            TocEntry {
                label: e.label.clone(),
                section,
                anchor,
                children: build_toc(&e.children, spine, spine_paths),
            }
        })
        .collect()
}
// Unit tests for the loader, the HTML-to-text conversion, title extraction and
// table-of-contents mapping.
#[cfg(test)]
mod tests {
use super::*;
// Opening a path that does not exist must produce an error, not a panic.
#[test]
fn test_epub_loader_nonexistent_file() {
let result = load_epub("nonexistent.epub");
assert!(result.is_err());
}
// --- strip_html: basic tag and entity handling ---
#[test]
fn test_strip_html_plain_text() {
assert_eq!(strip_html("Hello World"), "Hello World");
}
#[test]
fn test_strip_html_simple_tags() {
assert_eq!(strip_html("<p>Hello</p>"), "Hello");
}
#[test]
fn test_strip_html_nested_tags() {
assert_eq!(
strip_html("<div><p>Hello <b>World</b></p></div>"),
"Hello World"
);
}
#[test]
fn test_strip_html_html_entities() {
assert_eq!(strip_html("Hello &amp; World"), "Hello & World");
assert_eq!(strip_html("Hello&nbsp;World"), "Hello World");
}
#[test]
fn test_strip_html_empty() {
assert_eq!(strip_html(""), "");
}
// --- extract_title: <title> is preferred, <h1> is the fallback ---
#[test]
fn test_extract_title_from_title_tag() {
let html = "<html><head><title>My Book Title</title></head><body></body></html>";
assert_eq!(extract_title(html), Some("My Book Title".to_string()));
}
#[test]
fn test_extract_title_from_h1() {
let html = "<html><body><h1>Chapter One</h1><p>text</p></body></html>";
assert_eq!(extract_title(html), Some("Chapter One".to_string()));
}
#[test]
fn test_extract_title_prefers_title() {
let html = "<html><head><title>Book</title></head><body><h1>Chapter</h1></body></html>";
assert_eq!(extract_title(html), Some("Book".to_string()));
}
#[test]
fn test_extract_title_missing() {
assert_eq!(extract_title("<html><body><p>no title</p></body></html>"), None);
}
#[test]
fn test_extract_title_empty() {
assert_eq!(extract_title(""), None);
}
// --- strip_html: block structure (paragraphs, headings, lists, breaks) ---
#[test]
fn test_html_to_plain_paragraphs() {
let html = "<p>第一段</p><p>第二段</p>";
let result = strip_html(html);
assert!(result.contains("第一段"));
assert!(result.contains("第二段"));
assert!(result.contains('\n'));
assert!(result.ends_with("第二段"));
}
#[test]
fn test_html_to_plain_heading() {
let html = "<h1>标题</h1><p>正文</p>";
let result = strip_html(html);
assert!(result.contains("标题"));
assert!(result.contains("正文"));
assert!(result.contains('\n'));
}
#[test]
fn test_html_to_plain_list() {
let html = "<ul><li>项目一</li><li>项目二</li></ul>";
let result = strip_html(html);
assert!(result.starts_with("- "));
assert!(result.contains("项目一"));
assert!(result.contains("项目二"));
}
#[test]
fn test_html_to_plain_br() {
let html = "第一行<br>第二行";
let result = strip_html(html);
assert_eq!(result, "第一行\n第二行");
}
// Script bodies must not leak into the extracted text.
#[test]
fn test_html_to_plain_skip_script() {
let html = "<p>正文</p><script>var x=1;</script><p>更多正文</p>";
let result = strip_html(html);
assert!(result.contains("正文"));
assert!(result.contains("更多正文"));
assert!(!result.contains("var x=1"));
}
// Consecutive paragraphs yield exactly one non-empty line each.
#[test]
fn test_html_to_plain_line_break_collapse() {
let html = "<p>段一</p><p>段二</p><p>段三</p>";
let result = strip_html(html);
let non_empty: Vec<&str> = result.lines().filter(|l| !l.is_empty()).collect();
assert_eq!(non_empty.len(), 3);
assert_eq!(non_empty[0], "段一");
assert_eq!(non_empty[1], "段二");
assert_eq!(non_empty[2], "段三");
}
// --- build_toc and end-to-end loading ---
#[test]
fn test_build_toc_empty() {
let toc = build_toc(&[], &[], &[]);
assert!(toc.is_empty());
}
// Loads `sample-short.epub` via a relative path.
#[test]
fn test_load_sample_epub_nav_filtered() {
let book = load_epub("sample-short.epub").expect("Failed to load sample epub");
// Nav document is filtered out, leaving only chapter_0 as the single section
assert_eq!(book.sections.len(), 1);
assert_eq!(book.sections[0].title, "Understanding Digital Formats");
}
#[test]
fn test_toc_section_bounds() {
let book = load_epub("sample-short.epub").expect("Failed to load sample epub");
// All TOC section indices should be within sections range
fn check_bounds(entries: &[TocEntry], max: usize) {
for e in entries {
assert!(e.section < max, "TOC entry '{}' maps to section {} but only {} sections exist", e.label, e.section, max);
check_bounds(&e.children, max);
}
}
check_bounds(&book.toc, book.sections.len());
}
// A target with a different directory prefix still matches by file name.
#[test]
fn test_build_toc_filename_matching() {
use epub::doc::NavPoint;
let spine = vec![
"OEBPS/Text/chapter1.xhtml".to_string(),
"OEBPS/Text/chapter2.xhtml".to_string(),
];
let nav_points = vec![
NavPoint {
label: "Chapter 2".to_string(),
content: std::path::PathBuf::from("Text/chapter2.xhtml"),
play_order: 1,
children: vec![],
},
];
let toc = build_toc(&nav_points, &spine, &spine);
assert_eq!(toc.len(), 1);
assert_eq!(toc[0].section, 1); // maps to spine index 1
assert_eq!(toc[0].label, "Chapter 2");
}
// Identical target and spine paths map to the same index.
#[test]
fn test_build_toc_exact_path_match() {
use epub::doc::NavPoint;
let spine = vec![
"chapter1.xhtml".to_string(),
"chapter2.xhtml".to_string(),
];
let nav_points = vec![
NavPoint {
label: "Chapter 1".to_string(),
content: std::path::PathBuf::from("chapter1.xhtml"),
play_order: 1,
children: vec![],
},
];
let toc = build_toc(&nav_points, &spine, &spine);
assert_eq!(toc.len(), 1);
assert_eq!(toc[0].section, 0);
}
}