Files
epub-read/src/book.rs

443 lines
13 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/// Replaces HTML character references in `text` with the characters they denote.
///
/// Supported forms:
/// * a small table of named references (`&amp;`, `&lt;`, `&mdash;`, ...);
/// * decimal (`&#8212;`) and hexadecimal (`&#x2014;`) numeric references.
///
/// Typographic spaces (`nbsp`, `emsp`, `ensp`) are normalised to a plain space and
/// curly quotes to their ASCII counterparts.
///
/// Anything that is not a recognised, `;`-terminated reference is copied through
/// verbatim. In particular a bare `&` (as in `AT&T`) no longer swallows the text
/// that follows it, and unknown references such as `&foo;` are kept as written
/// instead of being silently dropped.
fn decode_entities(text: &str) -> String {
    // Longest reference body (the part between `&` and `;`) that is considered.
    // Bounding the look-ahead keeps the scan linear even for input full of bare `&`.
    const MAX_REFERENCE_LEN: usize = 10;

    let mut result = String::with_capacity(text.len());
    let mut rest = text;
    while let Some(amp) = rest.find('&') {
        result.push_str(&rest[..amp]);
        let after = &rest[amp + 1..];
        // `;` is ASCII, so its byte position is always a valid char boundary.
        let resolved = after
            .bytes()
            .take(MAX_REFERENCE_LEN + 1)
            .position(|b| b == b';')
            .and_then(|len| resolve_entity(&after[..len]).map(|ch| (ch, len)));
        match resolved {
            Some((ch, len)) => {
                result.push(ch);
                rest = &after[len + 1..];
            }
            None => {
                // Not a reference we understand: keep the ampersand literally and
                // continue scanning right after it.
                result.push('&');
                rest = after;
            }
        }
    }
    result.push_str(rest);
    result
}

/// Resolves the body of a character reference (without `&` and `;`) to a character.
///
/// Returns `None` for unknown names and for malformed or out-of-range numeric
/// references.
fn resolve_entity(name: &str) -> Option<char> {
    if let Some(number) = name.strip_prefix('#') {
        let (digits, radix) = match number.strip_prefix(|c: char| c == 'x' || c == 'X') {
            Some(hex) => (hex, 16),
            None => (number, 10),
        };
        // `from_str_radix` would accept a leading `+`; require pure digits instead.
        if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
            return None;
        }
        return u32::from_str_radix(digits, radix)
            .ok()
            .and_then(char::from_u32);
    }

    let ch = match name {
        "amp" => '&',
        "lt" => '<',
        "gt" => '>',
        "quot" | "ldquo" | "rdquo" => '"',
        "apos" | "lsquo" | "rsquo" => '\'',
        "nbsp" | "emsp" | "ensp" => ' ',
        "mdash" => '\u{2014}',
        "ndash" => '\u{2013}',
        "hellip" => '\u{2026}',
        _ => return None,
    };
    Some(ch)
}
/// Extracts the tag name from the text found between `<` and `>`.
///
/// The name is the first whitespace-delimited token with any trailing `/`
/// removed, so `br/` and `br /` both yield `br`. A leading `/` (closing tag) is
/// kept, e.g. `/div`. Empty or whitespace-only input yields `""`.
fn tag_name_from(tag_content: &str) -> &str {
    match tag_content.split_whitespace().next() {
        Some(first_token) => first_token.trim_end_matches('/'),
        None => "",
    }
}
/// Converts an (X)HTML fragment into plain text with light structural markup.
///
/// * Tags are removed and character references in text are decoded.
/// * `<script>` / `<style>` elements are dropped together with their content.
///   Self-closing forms (`<script src="..."/>`) have no content and are simply
///   removed, so text up to a later, unrelated closing tag is not lost.
/// * Comments (`<!-- ... -->`) are skipped as a whole, even when they contain
///   `<` or `>`.
/// * `<br>` becomes a newline, `<hr>` becomes a `---` rule, `<li>` starts a
///   `- ` bullet on its own line.
/// * Closing `p`, `div`, `blockquote` and heading tags end the current block
///   with a blank line.
/// * Each run of text inside `<h1>`..`<h6>` is wrapped as
///   `'\x01'`, the level digit, the text, `'\x02'`.
///
/// The result is trimmed and runs of three or more newlines are collapsed to two.
pub fn strip_html(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    let mut pos = 0;
    let mut heading_level: Option<u32> = None;

    while pos < input.len() {
        // Locate the next tag; without one the remainder is plain text.
        let tag_start = match input[pos..].find('<') {
            Some(offset) => pos + offset,
            None => {
                out.push_str(&decode_entities(&input[pos..]));
                break;
            }
        };

        // Emit the text that precedes the tag.
        if tag_start > pos {
            let text = decode_entities(&input[pos..tag_start]);
            match heading_level {
                Some(level) => {
                    out.push('\x01');
                    out.push(char::from_digit(level, 10).unwrap_or('1'));
                    out.push_str(&text);
                    out.push('\x02');
                }
                None => out.push_str(&text),
            }
        }

        // A comment ends at `-->`, not at the first `>`. An unterminated comment
        // falls through to the generic tag handling below.
        if input[tag_start..].starts_with("<!--") {
            let body_start = tag_start + 4;
            if let Some(end) = input[body_start..].find("-->") {
                pos = body_start + end + 3;
                continue;
            }
        }

        // Find the `>` that closes the tag; an unclosed tag is emitted as text.
        let tag_end = match input[tag_start..].find('>') {
            Some(offset) => tag_start + offset,
            None => {
                out.push_str(&decode_entities(&input[tag_start..]));
                break;
            }
        };
        let tag_content = &input[tag_start + 1..tag_end];
        let name = tag_name_from(tag_content);

        // Default: resume right after this tag. Only the script/style arm may
        // move further ahead.
        pos = tag_end + 1;

        match name {
            "script" | "style" => {
                // A self-closing element has no body to skip.
                if !tag_content.ends_with('/') {
                    let close_tag = format!("</{}", name);
                    let close_end = input[tag_end..].find(close_tag.as_str()).and_then(|open| {
                        let close_start = tag_end + open;
                        input[close_start..].find('>').map(|gt| close_start + gt + 1)
                    });
                    // Without a closing tag only the opening tag itself is dropped.
                    if let Some(end) = close_end {
                        pos = end;
                    }
                }
            }
            "br" => {
                if !out.is_empty() {
                    out.push('\n');
                }
            }
            "hr" => {
                if !out.is_empty() {
                    out.push_str("\n\n");
                }
                out.push_str("---\n\n");
            }
            "li" => {
                if !out.is_empty() && !out.ends_with('\n') {
                    out.push('\n');
                }
                out.push_str("- ");
            }
            "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
                // The single digit after `h` is the heading level.
                heading_level = name[1..].parse().ok();
            }
            "/h1" | "/h2" | "/h3" | "/h4" | "/h5" | "/h6" => {
                heading_level = None;
                push_block_break(&mut out);
            }
            "/p" | "/div" | "/blockquote" => push_block_break(&mut out),
            // Every other tag (opening blocks, list closers, inline markup, ...)
            // contributes nothing to the output.
            _ => {}
        }
    }

    collapse_newlines(out.trim())
}

/// Ends the current line if needed, then appends one more newline so that a
/// blank line separates the finished block from what follows.
fn push_block_break(out: &mut String) {
    if !out.is_empty() && !out.ends_with('\n') {
        out.push('\n');
    }
    out.push('\n');
}

/// Copies `text`, keeping at most two consecutive newlines in any run.
fn collapse_newlines(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    let mut run = 0usize;
    for c in text.chars() {
        if c == '\n' {
            run += 1;
            if run <= 2 {
                collapsed.push(c);
            }
        } else {
            run = 0;
            collapsed.push(c);
        }
    }
    collapsed
}
/// One node of the book's table of contents; entries nest through `children`.
#[derive(Debug, Clone)]
pub struct TocEntry {
/// Label of the entry (`build_toc` copies it from the source navigation point).
pub label: String,
/// Index into the spine slice given to `build_toc`; 0 when no spine item matched.
pub section: usize,
/// Nested entries, built recursively from the navigation point's children.
pub children: Vec<TocEntry>,
}
/// A piece of section text tagged with its heading level.
// NOTE(review): nothing in this part of the file constructs a `ContentBlock`;
// presumably it is produced elsewhere from `Section::content` — confirm.
#[derive(Debug, Clone)]
pub struct ContentBlock {
/// Text of the block.
pub text: String,
pub heading_level: u8, // 0 = body, 1-6 = h1-h6
}
/// One spine item of the book after conversion to plain text.
#[derive(Debug, Clone)]
pub struct Section {
/// Title from `extract_title`, or the 1-based spine position when none is found.
pub title: String,
/// Output of `strip_html` for the section's raw HTML.
pub content: String,
/// Left empty by `load_epub`.
// NOTE(review): presumably filled in later by layout code — confirm.
pub blocks: Vec<ContentBlock>,
/// Left empty by `load_epub`.
// NOTE(review): looks like per-page (start, end) ranges into `blocks` — confirm
// the exact bounds convention against the code that fills it.
pub page_block_ranges: Vec<(usize, usize)>,
}
/// Layout mode of a book, as determined by `detect_layout`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum BookLayout {
    /// Default layout, used when no pre-paginated marker is found.
    Reflowable,
    /// The package declares a `pre-paginated` rendition layout.
    FixedLayout,
}

impl BookLayout {
    /// Returns the short display label for this layout.
    pub fn label(&self) -> &str {
        match *self {
            Self::Reflowable => "重排",
            Self::FixedLayout => "固定",
        }
    }
}
/// A fully loaded book, as returned by `load_epub`.
#[derive(Debug, Clone)]
pub struct Book {
/// Title metadata; `load_epub` substitutes a placeholder when it is missing.
pub title: String,
/// Creator metadata; `load_epub` substitutes a placeholder when it is missing.
pub author: String,
/// Raw cover bytes, or `None` when the cover could not be obtained.
pub cover: Option<Vec<u8>>,
/// Layout mode reported by `detect_layout`.
pub layout: BookLayout,
/// One entry per spine item, in spine order.
pub sections: Vec<Section>,
/// Table of contents built by `build_toc`.
pub toc: Vec<TocEntry>,
}
use std::path::Path;
/// Decides whether the book is fixed-layout or reflowable.
///
/// The parsed metadata is consulted first: a `rendition:layout` value of
/// `pre-paginated` means fixed layout. Otherwise the raw package document
/// (`doc.root_file`) is scanned for the same markers as plain substrings.
/// If neither check succeeds, or the package document cannot be read, the
/// book is treated as reflowable.
fn detect_layout(doc: &mut epub::doc::EpubDoc<std::io::BufReader<std::fs::File>>) -> BookLayout {
    let declared_fixed = doc
        .metadata
        .get("rendition:layout")
        .map_or(false, |values| values.iter().any(|v| v == "pre-paginated"));
    if declared_fixed {
        return BookLayout::FixedLayout;
    }

    // Clone the path first: reading a resource needs `doc` mutably.
    let root_file = doc.root_file.clone();
    let opf = match doc.get_resource_str_by_path(&root_file) {
        Ok(opf) => opf,
        Err(_) => return BookLayout::Reflowable,
    };

    let mentions_fixed = (opf.contains("rendition:layout") && opf.contains("pre-paginated"))
        || opf.contains("rendition:layout-pre-paginated");
    if mentions_fixed {
        BookLayout::FixedLayout
    } else {
        BookLayout::Reflowable
    }
}
pub fn load_epub(path: impl AsRef<Path>) -> Result<Book, String> {
let path = path.as_ref();
let mut doc = epub::doc::EpubDoc::new(path)
.map_err(|e| format!("无法打开文件: {}", e))?;
let layout = detect_layout(&mut doc);
let title = doc.mdata("title").unwrap_or_else(|| "未知标题".to_string());
let author = doc.mdata("creator").unwrap_or_else(|| "未知作者".to_string());
let cover = doc.get_cover().ok();
let spine = doc.spine.clone();
let mut sections = Vec::new();
for (i, href) in spine.iter().enumerate() {
let raw_html = doc.get_resource_str(href)
.map_err(|e| format!("读取章节失败: {}", e))?;
let text = strip_html(&raw_html);
let title = extract_title(&raw_html)
.unwrap_or_else(|| format!("{}", i + 1));
sections.push(Section {
title,
content: text,
blocks: Vec::new(),
page_block_ranges: Vec::new(),
});
}
let raw_toc = std::mem::take(&mut doc.toc);
let toc = build_toc(&raw_toc, &spine);
Ok(Book { title, author, cover, layout, sections, toc })
}
fn extract_title(html: &str) -> Option<String> {
if let Some(start) = html.find("<title>") {
let rest = &html[start + 7..];
if let Some(end) = rest.find("</title>") {
return Some(strip_html(&rest[..end]).trim().to_string());
}
}
if let Some(start) = html.find("<h1") {
let rest = &html[start..];
if let Some(content_start) = rest.find('>') {
let inner = &rest[content_start + 1..];
if let Some(end) = inner.find("</h1>") {
return Some(strip_html(&inner[..end]).trim().to_string());
}
}
}
None
}
/// Converts the EPUB navigation tree into [`TocEntry`] nodes.
///
/// Each entry is mapped to the first spine item whose string (with trailing
/// `/` removed) occurs as a substring of the entry's content path. Entries
/// that match no spine item fall back to section 0. Children are converted
/// recursively.
fn build_toc(
    entries: &[epub::doc::NavPoint],
    spine: &[String],
) -> Vec<TocEntry> {
    let mut toc = Vec::with_capacity(entries.len());
    for nav in entries {
        let target = nav.content.to_string_lossy();

        // First matching spine position wins; 0 is the fallback for "no match".
        let mut section = 0;
        for (index, item) in spine.iter().enumerate() {
            if target.contains(item.trim_end_matches('/')) {
                section = index;
                break;
            }
        }

        toc.push(TocEntry {
            label: nav.label.clone(),
            section,
            children: build_toc(&nav.children, spine),
        });
    }
    toc
}
// Unit tests for the HTML-to-text helpers, title extraction and the loader's
// error path. None of them needs a real EPUB file on disk.
#[cfg(test)]
mod tests {
use super::*;
// Opening a path that does not exist must surface as `Err`, not a panic.
#[test]
fn test_epub_loader_nonexistent_file() {
let result = load_epub("nonexistent.epub");
assert!(result.is_err());
}
// --- strip_html: basic tag removal and entity decoding ---
#[test]
fn test_strip_html_plain_text() {
assert_eq!(strip_html("Hello World"), "Hello World");
}
// Trailing block separators are trimmed from the result.
#[test]
fn test_strip_html_simple_tags() {
assert_eq!(strip_html("<p>Hello</p>"), "Hello");
}
#[test]
fn test_strip_html_nested_tags() {
assert_eq!(
strip_html("<div><p>Hello <b>World</b></p></div>"),
"Hello World"
);
}
#[test]
fn test_strip_html_html_entities() {
assert_eq!(strip_html("Hello &amp; World"), "Hello & World");
assert_eq!(strip_html("Hello&nbsp;World"), "Hello World");
}
#[test]
fn test_strip_html_empty() {
assert_eq!(strip_html(""), "");
}
// --- extract_title: <title> is preferred, <h1> is the fallback ---
#[test]
fn test_extract_title_from_title_tag() {
let html = "<html><head><title>My Book Title</title></head><body></body></html>";
assert_eq!(extract_title(html), Some("My Book Title".to_string()));
}
#[test]
fn test_extract_title_from_h1() {
let html = "<html><body><h1>Chapter One</h1><p>text</p></body></html>";
assert_eq!(extract_title(html), Some("Chapter One".to_string()));
}
#[test]
fn test_extract_title_prefers_title() {
let html = "<html><head><title>Book</title></head><body><h1>Chapter</h1></body></html>";
assert_eq!(extract_title(html), Some("Book".to_string()));
}
#[test]
fn test_extract_title_missing() {
assert_eq!(extract_title("<html><body><p>no title</p></body></html>"), None);
}
#[test]
fn test_extract_title_empty() {
assert_eq!(extract_title(""), None);
}
// --- strip_html: block structure (paragraphs, headings, lists, breaks) ---
#[test]
fn test_html_to_plain_paragraphs() {
let html = "<p>第一段</p><p>第二段</p>";
let result = strip_html(html);
assert!(result.contains("第一段"));
assert!(result.contains("第二段"));
assert!(result.contains('\n'));
assert!(result.ends_with("第二段"));
}
// Only containment is checked: heading text is wrapped in marker characters.
#[test]
fn test_html_to_plain_heading() {
let html = "<h1>标题</h1><p>正文</p>";
let result = strip_html(html);
assert!(result.contains("标题"));
assert!(result.contains("正文"));
assert!(result.contains('\n'));
}
#[test]
fn test_html_to_plain_list() {
let html = "<ul><li>项目一</li><li>项目二</li></ul>";
let result = strip_html(html);
assert!(result.starts_with("- "));
assert!(result.contains("项目一"));
assert!(result.contains("项目二"));
}
#[test]
fn test_html_to_plain_br() {
let html = "第一行<br>第二行";
let result = strip_html(html);
assert_eq!(result, "第一行\n第二行");
}
// Script bodies must not leak into the extracted text.
#[test]
fn test_html_to_plain_skip_script() {
let html = "<p>正文</p><script>var x=1;</script><p>更多正文</p>";
let result = strip_html(html);
assert!(result.contains("正文"));
assert!(result.contains("更多正文"));
assert!(!result.contains("var x=1"));
}
// Three paragraphs yield exactly three non-empty lines.
#[test]
fn test_html_to_plain_line_break_collapse() {
let html = "<p>段一</p><p>段二</p><p>段三</p>";
let result = strip_html(html);
let non_empty: Vec<&str> = result.lines().filter(|l| !l.is_empty()).collect();
assert_eq!(non_empty.len(), 3);
assert_eq!(non_empty[0], "段一");
assert_eq!(non_empty[1], "段二");
assert_eq!(non_empty[2], "段三");
}
// --- build_toc ---
#[test]
fn test_build_toc_empty() {
let toc = build_toc(&[], &[]);
assert!(toc.is_empty());
}
}