// epub-read/src/book.rs

/// Replaces HTML/XML character references in `text` with the text they stand for.
///
/// Recognised forms are the named entities known to [`named_entity`] and
/// decimal (`&#8212;`) or hexadecimal (`&#x2014;`) numeric references.
///
/// Anything else that begins with `&` — a bare ampersand, an unknown entity
/// name, a malformed or out-of-range numeric reference — is copied through
/// unchanged, so ordinary prose such as `AT&T rocks; yes` is never truncated.
/// Decoding is single-pass: `&amp;lt;` yields `&lt;`, not `<`.
fn decode_entities(text: &str) -> String {
    // Upper bound, in bytes, on the reference body between `&` and `;`.
    // Bounding the scan keeps a stray `&` from pairing with a distant `;`
    // and keeps the whole function linear in the input length.
    const MAX_ENTITY_LEN: usize = 32;

    let mut result = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(amp) = rest.find('&') {
        result.push_str(&rest[..amp]);
        let after = &rest[amp + 1..];

        // `;` is ASCII, so its byte offset is always a valid `str` boundary.
        let semi = after
            .bytes()
            .take(MAX_ENTITY_LEN + 1)
            .position(|b| b == b';');

        let consumed = match semi {
            Some(end) => {
                let name = &after[..end];
                if let Some(replacement) = named_entity(name) {
                    result.push_str(replacement);
                    Some(end)
                } else if let Some(ch) = numeric_entity(name) {
                    result.push(ch);
                    Some(end)
                } else {
                    None
                }
            }
            None => None,
        };

        rest = match consumed {
            Some(end) => &after[end + 1..],
            None => {
                // Not a reference we understand: keep the ampersand literally
                // and resume scanning right after it.
                result.push('&');
                after
            }
        };
    }

    result.push_str(rest);
    result
}

/// Looks up a named entity (without the surrounding `&` and `;`).
///
/// Typographic quotes are flattened to their ASCII equivalents and the
/// space-like entities to plain spaces.
fn named_entity(name: &str) -> Option<&'static str> {
    let replacement = match name {
        "amp" => "&",
        "lt" => "<",
        "gt" => ">",
        "quot" => "\"",
        "apos" => "'",
        "nbsp" => " ",
        "emsp" => "  ",
        "ensp" => " ",
        "mdash" => "—",
        "ndash" => "–",
        "ldquo" => "\"",
        "rdquo" => "\"",
        "lsquo" => "'",
        "rsquo" => "'",
        "hellip" => "…",
        "copy" => "©",
        "reg" => "®",
        "trade" => "™",
        "laquo" => "«",
        "raquo" => "»",
        _ => return None,
    };
    Some(replacement)
}

/// Parses the body of a numeric character reference (`#NNN` or `#xHHH`).
///
/// Returns `None` for malformed digits, values that are not Unicode scalar
/// values, and non-whitespace control characters. The latter are rejected
/// because they are never meaningful text and this file uses `\x01`/`\x02`
/// as in-band heading markers.
fn numeric_entity(name: &str) -> Option<char> {
    let body = name.strip_prefix('#')?;
    let (digits, radix) = match body.strip_prefix(|c: char| c == 'x' || c == 'X') {
        Some(hex) => (hex, 16),
        None => (body, 10),
    };
    // `from_str_radix` would accept a leading `+`; insist on pure digits.
    if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
        return None;
    }
    u32::from_str_radix(digits, radix)
        .ok()
        .and_then(char::from_u32)
        .filter(|c| !c.is_control() || c.is_whitespace())
}

/// Extracts the element name from the text found between a tag's `<` and `>`.
///
/// The name is the first whitespace-delimited token with any trailing `/`
/// characters removed, so both `br/` and `br /` give `br`. A closing tag keeps
/// its leading slash (`/p`). Empty or whitespace-only input gives `""`.
fn tag_name_from(tag_content: &str) -> &str {
    match tag_content.split_whitespace().next() {
        Some(first_token) => first_token.trim_end_matches('/'),
        None => "",
    }
}

/// Converts an (X)HTML fragment into plain text with lightweight structure cues.
///
/// * Text between tags is passed through `decode_entities`.
/// * `<!-- ... -->` comments are dropped whole, even when they contain `>`.
/// * The contents of `<script>` / `<style>` elements are skipped; self-closing
///   forms (`<script src="x"/>`) are treated as empty elements.
/// * `<br>` emits a newline, `<hr>` a `---` rule, `<li>` a `- ` bullet on its
///   own line.
/// * `</p>`, `</div>`, `</blockquote>` and `</h1>`..`</h6>` end the current
///   line and add a blank line.
/// * Every run of text inside `<h1>`..`<h6>` is wrapped as
///   `\x01` + level digit + text + `\x02`.
/// * All other tags are removed without emitting anything.
///
/// The result is trimmed, and runs of three or more consecutive newlines are
/// collapsed to two. Tag names are matched case-sensitively (lowercase).
pub fn strip_html(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    let mut pos = 0;
    let mut heading_level: Option<u32> = None;

    while pos < input.len() {
        // Locate the next tag; without one the remainder is plain text.
        let tag_start = match input[pos..].find('<') {
            Some(offset) => pos + offset,
            None => {
                out.push_str(&decode_entities(&input[pos..]));
                break;
            }
        };

        // Emit the text that precedes the tag.
        if tag_start > pos {
            let text = decode_entities(&input[pos..tag_start]);
            match heading_level {
                Some(level) => {
                    out.push('\x01');
                    out.push(char::from_digit(level, 10).unwrap_or('1'));
                    out.push_str(&text);
                    out.push('\x02');
                }
                None => out.push_str(&text),
            }
        }

        // Comments end at `-->`, not at the first `>`. Searching from just
        // after `<!` also accepts the degenerate `<!-->` and `<!--->` forms.
        // An unterminated comment falls through to generic tag handling.
        if input[tag_start..].starts_with("<!--") {
            if let Some(end) = input[tag_start + 2..].find("-->") {
                pos = tag_start + 2 + end + 3;
                continue;
            }
        }

        // Find the '>' that closes the tag.
        let tag_end = match input[tag_start..].find('>') {
            Some(i) => tag_start + i,
            None => {
                // Unclosed tag: emit the rest as text.
                out.push_str(&decode_entities(&input[tag_start..]));
                break;
            }
        };

        let tag_content = &input[tag_start + 1..tag_end];
        let name = tag_name_from(tag_content);
        // Default: resume right after this tag. Only script/style override it.
        let mut next_pos = tag_end + 1;

        match name {
            "script" | "style" => {
                // A self-closing element has no body, so there is nothing to
                // skip; searching for a closing tag would swallow unrelated
                // content up to some later `</script>`.
                let self_closing = tag_content.trim_end().ends_with('/');
                if !self_closing {
                    let close_tag = format!("</{}", name);
                    if let Some(cs) = input[tag_end..].find(&close_tag) {
                        let close_start = tag_end + cs;
                        if let Some(ce) = input[close_start..].find('>') {
                            next_pos = close_start + ce + 1;
                        }
                    }
                }
            }
            "br" => {
                if !out.is_empty() {
                    out.push('\n');
                }
            }
            "hr" => {
                if !out.is_empty() {
                    out.push_str("\n\n");
                }
                out.push_str("---\n\n");
            }
            "li" => {
                if !out.is_empty() && !out.ends_with('\n') {
                    out.push('\n');
                }
                out.push_str("- ");
            }
            "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
                // The name is `h` followed by a single digit 1-6.
                heading_level = name[1..].parse().ok();
            }
            "/p" | "/div" | "/blockquote" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5" | "/h6" => {
                if name.starts_with("/h") {
                    heading_level = None;
                }
                // Finish the current line, then add one blank line.
                if !out.is_empty() && !out.ends_with('\n') {
                    out.push('\n');
                }
                out.push('\n');
            }
            // Opening block tags, list/definition closers and every other
            // tag contribute nothing to the output.
            _ => {}
        }

        pos = next_pos;
    }

    // Collapse 3+ consecutive newlines into 2.
    let out = out.trim();
    let mut final_out = String::with_capacity(out.len());
    let mut nl_count = 0usize;
    for c in out.chars() {
        if c == '\n' {
            nl_count += 1;
            if nl_count <= 2 {
                final_out.push(c);
            }
        } else {
            nl_count = 0;
            final_out.push(c);
        }
    }

    final_out
}

/// One node of the table-of-contents tree.
#[derive(Debug, Clone)]
pub struct TocEntry {
    /// Text shown for this entry.
    pub label: String,
    /// Index of the spine item this entry points at. `load_epub` builds one
    /// `Section` per spine item in spine order, so this also indexes
    /// `Book::sections`.
    pub section: usize,
    /// Nested sub-entries, in document order.
    pub children: Vec<TocEntry>,
}

/// A run of text tagged with the heading level it belongs to.
#[derive(Debug, Clone)]
pub struct ContentBlock {
    /// The block's text.
    pub text: String,
    pub heading_level: u8, // 0 = body, 1-6 = h1-h6
}

/// One spine item of a book, converted to plain text.
#[derive(Debug, Clone)]
pub struct Section {
    /// Title taken from the document's `<title>` or first `<h1>`, or a
    /// generated "第N章" fallback (see `load_epub`).
    pub title: String,
    /// Output of `strip_html` for the section's markup.
    pub content: String,
    /// Left empty by `load_epub`.
    /// NOTE(review): presumably filled in by a later layout pass — confirm.
    pub blocks: Vec<ContentBlock>,
    /// Left empty by `load_epub`.
    /// NOTE(review): looks like per-page index ranges into `blocks` — confirm
    /// the bounds convention (inclusive/exclusive) with the code that fills it.
    pub page_block_ranges: Vec<(usize, usize)>,
}

/// How a book's content is laid out.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BookLayout {
    /// Content reflows to fit the viewport.
    Reflowable,
    /// Pre-paginated content (`rendition:layout` = `pre-paginated`).
    FixedLayout,
}

impl BookLayout {
    /// Short user-facing (Chinese) label for this layout.
    ///
    /// The labels are string literals, so the returned slice is `'static`
    /// and may outlive the `BookLayout` value it was obtained from.
    pub fn label(&self) -> &'static str {
        match self {
            BookLayout::Reflowable => "重排",
            BookLayout::FixedLayout => "固定",
        }
    }
}

/// A fully loaded book, as produced by `load_epub`.
#[derive(Debug, Clone)]
pub struct Book {
    /// Title from the package metadata, or a placeholder when absent.
    pub title: String,
    /// Creator from the package metadata, or a placeholder when absent.
    pub author: String,
    /// Raw cover bytes as returned by the EPUB library; `None` when no cover
    /// could be read.
    pub cover: Option<Vec<u8>>,
    /// Layout detected by `detect_layout`.
    pub layout: BookLayout,
    /// One entry per spine item, in spine order.
    pub sections: Vec<Section>,
    /// Table-of-contents tree; entries refer to `sections` by index.
    pub toc: Vec<TocEntry>,
}

use std::path::Path;

/// Decides whether the book is fixed-layout (pre-paginated) or reflowable.
///
/// Two sources are consulted, in order:
/// 1. the parsed metadata: a `rendition:layout` value equal to `pre-paginated`;
/// 2. the raw package document (`doc.root_file`): it must contain both the
///    substring `rendition:layout` and the substring `pre-paginated`.
///
/// The substring test also covers the per-item property
/// `rendition:layout-pre-paginated`, because that string contains both
/// needles; a separate check for it would be unreachable.
///
/// NOTE(review): the substring test is a heuristic. A package that declares a
/// reflowable global layout but overrides a single spine item to
/// pre-paginated is still reported as `FixedLayout`.
///
/// Takes `&mut` because reading a resource from the archive requires mutable
/// access to the document.
fn detect_layout(doc: &mut epub::doc::EpubDoc<std::io::BufReader<std::fs::File>>) -> BookLayout {
    if let Some(vals) = doc.metadata.get("rendition:layout") {
        if vals.iter().any(|v| v == "pre-paginated") {
            return BookLayout::FixedLayout;
        }
    }

    // Clone the path first: the lookup below borrows `doc` mutably.
    let root_file = doc.root_file.clone();
    match doc.get_resource_str_by_path(&root_file) {
        Ok(opf) if opf.contains("rendition:layout") && opf.contains("pre-paginated") => {
            BookLayout::FixedLayout
        }
        // Unreadable package document or no fixed-layout marker.
        _ => BookLayout::Reflowable,
    }
}

pub fn load_epub(path: impl AsRef<Path>) -> Result<Book, String> {
    let path = path.as_ref();
    let mut doc = epub::doc::EpubDoc::new(path)
        .map_err(|e| format!("无法打开文件: {}", e))?;

    let layout = detect_layout(&mut doc);

    let title = doc.mdata("title").unwrap_or_else(|| "未知标题".to_string());
    let author = doc.mdata("creator").unwrap_or_else(|| "未知作者".to_string());
    let cover = doc.get_cover().ok();
    let spine = doc.spine.clone();

    let mut sections = Vec::new();
    for (i, href) in spine.iter().enumerate() {
        let raw_html = doc.get_resource_str(href)
            .map_err(|e| format!("读取章节失败: {}", e))?;
        let text = strip_html(&raw_html);
        let title = extract_title(&raw_html)
            .unwrap_or_else(|| format!("第{}章", i + 1));
        sections.push(Section {
            title,
            content: text,
            blocks: Vec::new(),
            page_block_ranges: Vec::new(),
        });
    }

    let raw_toc = std::mem::take(&mut doc.toc);
    let toc = build_toc(&raw_toc, &spine);

    Ok(Book { title, author, cover, layout, sections, toc })
}

/// Picks a human-readable title for a section from its markup.
///
/// Candidates are tried in order:
/// 1. the contents of the first `<title>...</title>` pair;
/// 2. the contents of the first `<h1 ...>...</h1>` element.
///
/// Each candidate is run through `strip_html` and trimmed. A candidate that
/// is empty after that is skipped, so an empty `<title></title>` no longer
/// yields `Some("")` and the caller's fallback title can take effect.
/// Returns `None` when no non-empty candidate exists.
///
/// Matching is literal and case-sensitive; a `<title>` tag carrying
/// attributes is not recognised.
fn extract_title(html: &str) -> Option<String> {
    const TITLE_OPEN: &str = "<title>";

    if let Some(start) = html.find(TITLE_OPEN) {
        let rest = &html[start + TITLE_OPEN.len()..];
        if let Some(end) = rest.find("</title>") {
            let text = strip_html(&rest[..end]);
            let text = text.trim();
            if !text.is_empty() {
                return Some(text.to_string());
            }
        }
    }

    if let Some(start) = html.find("<h1") {
        let rest = &html[start..];
        // Skip past the opening tag, whatever attributes it carries.
        if let Some(content_start) = rest.find('>') {
            let inner = &rest[content_start + 1..];
            if let Some(end) = inner.find("</h1>") {
                let text = strip_html(&inner[..end]);
                let text = text.trim();
                if !text.is_empty() {
                    return Some(text.to_string());
                }
            }
        }
    }

    None
}

/// Converts the EPUB navigation tree into [`TocEntry`] nodes whose `section`
/// is an index into `spine`.
///
/// The spine index is chosen by [`spine_index_for`]. An entry that matches no
/// spine item falls back to section 0; this is a best-effort default, not a
/// guarantee that the entry really belongs to the first section.
fn build_toc(
    entries: &[epub::doc::NavPoint],
    spine: &[String],
) -> Vec<TocEntry> {
    entries
        .iter()
        .map(|entry| {
            let content = entry.content.to_string_lossy();
            TocEntry {
                label: entry.label.clone(),
                section: spine_index_for(&content, spine).unwrap_or(0),
                children: build_toc(&entry.children, spine),
            }
        })
        .collect()
}

/// Finds the spine item a navigation target refers to.
///
/// `content` is the target as written in the navigation document, possibly
/// with a `#fragment`. Spine items are compared with trailing `/` removed;
/// empty items are ignored, since an empty needle would match everything.
///
/// Exact matches are preferred, most specific first: the whole path, then
/// the file name, then the file name without its extension. Only when none
/// of those hits does the search fall back to the first spine item that
/// occurs anywhere in `content`. Without the exact pass, a target such as
/// `ch10.xhtml` would be attributed to an earlier spine item named `ch1`.
fn spine_index_for(content: &str, spine: &[String]) -> Option<usize> {
    let path = content.split('#').next().unwrap_or(content);
    let file_name = path
        .rsplit(|c: char| c == '/' || c == '\\')
        .next()
        .unwrap_or(path);
    let stem = match file_name.rfind('.') {
        Some(dot) if dot > 0 => &file_name[..dot],
        _ => file_name,
    };

    let keys = || spine.iter().map(|item| item.trim_end_matches('/'));

    [path, file_name, stem]
        .iter()
        .find_map(|candidate| keys().position(|key| !key.is_empty() && key == *candidate))
        .or_else(|| keys().position(|key| !key.is_empty() && content.contains(key)))
}

/// Unit tests for the HTML stripping helpers, title extraction and the
/// loader's error path.
#[cfg(test)]
mod tests {
    use super::*;

    // --- load_epub ---

    // Opening a path that does not exist must surface as `Err`, not a panic.
    #[test]
    fn test_epub_loader_nonexistent_file() {
        let result = load_epub("nonexistent.epub");
        assert!(result.is_err());
    }

    // --- strip_html: basic tag and entity handling ---

    #[test]
    fn test_strip_html_plain_text() {
        assert_eq!(strip_html("Hello World"), "Hello World");
    }

    // Trailing blank lines produced by `</p>` are trimmed from the result.
    #[test]
    fn test_strip_html_simple_tags() {
        assert_eq!(strip_html("<p>Hello</p>"), "Hello");
    }

    // Inline tags such as `<b>` are removed without inserting whitespace.
    #[test]
    fn test_strip_html_nested_tags() {
        assert_eq!(
            strip_html("<div><p>Hello <b>World</b></p></div>"),
            "Hello World"
        );
    }

    #[test]
    fn test_strip_html_html_entities() {
        assert_eq!(strip_html("Hello &amp; World"), "Hello & World");
        assert_eq!(strip_html("Hello&nbsp;World"), "Hello World");
    }

    #[test]
    fn test_strip_html_empty() {
        assert_eq!(strip_html(""), "");
    }

    // --- extract_title ---

    #[test]
    fn test_extract_title_from_title_tag() {
        let html = "<html><head><title>My Book Title</title></head><body></body></html>";
        assert_eq!(extract_title(html), Some("My Book Title".to_string()));
    }

    #[test]
    fn test_extract_title_from_h1() {
        let html = "<html><body><h1>Chapter One</h1><p>text</p></body></html>";
        assert_eq!(extract_title(html), Some("Chapter One".to_string()));
    }

    // `<title>` wins over `<h1>` when both are present.
    #[test]
    fn test_extract_title_prefers_title() {
        let html = "<html><head><title>Book</title></head><body><h1>Chapter</h1></body></html>";
        assert_eq!(extract_title(html), Some("Book".to_string()));
    }

    #[test]
    fn test_extract_title_missing() {
        assert_eq!(extract_title("<html><body><p>no title</p></body></html>"), None);
    }

    #[test]
    fn test_extract_title_empty() {
        assert_eq!(extract_title(""), None);
    }

    // --- strip_html: block-level structure ---

    // Paragraphs are separated by newlines; the final separator is trimmed.
    #[test]
    fn test_html_to_plain_paragraphs() {
        let html = "<p>第一段</p><p>第二段</p>";
        let result = strip_html(html);
        assert!(result.contains("第一段"));
        assert!(result.contains("第二段"));
        assert!(result.contains('\n'));
        assert!(result.ends_with("第二段"));
    }

    // Uses `contains` because heading text is wrapped in marker characters.
    #[test]
    fn test_html_to_plain_heading() {
        let html = "<h1>标题</h1><p>正文</p>";
        let result = strip_html(html);
        assert!(result.contains("标题"));
        assert!(result.contains("正文"));
        assert!(result.contains('\n'));
    }

    // Each `<li>` starts with a "- " bullet.
    #[test]
    fn test_html_to_plain_list() {
        let html = "<ul><li>项目一</li><li>项目二</li></ul>";
        let result = strip_html(html);
        assert!(result.starts_with("- "));
        assert!(result.contains("项目一"));
        assert!(result.contains("项目二"));
    }

    #[test]
    fn test_html_to_plain_br() {
        let html = "第一行<br>第二行";
        let result = strip_html(html);
        assert_eq!(result, "第一行\n第二行");
    }

    // Script bodies must never reach the output.
    #[test]
    fn test_html_to_plain_skip_script() {
        let html = "<p>正文</p><script>var x=1;</script><p>更多正文</p>";
        let result = strip_html(html);
        assert!(result.contains("正文"));
        assert!(result.contains("更多正文"));
        assert!(!result.contains("var x=1"));
    }

    // Blank lines between paragraphs are ignored; only the text lines count.
    #[test]
    fn test_html_to_plain_line_break_collapse() {
        let html = "<p>段一</p><p>段二</p><p>段三</p>";
        let result = strip_html(html);
        let non_empty: Vec<&str> = result.lines().filter(|l| !l.is_empty()).collect();
        assert_eq!(non_empty.len(), 3);
        assert_eq!(non_empty[0], "段一");
        assert_eq!(non_empty[1], "段二");
        assert_eq!(non_empty[2], "段三");
    }

    // --- build_toc ---

    #[test]
    fn test_build_toc_empty() {
        let toc = build_toc(&[], &[]);
        assert!(toc.is_empty());
    }
}