feat: improve HTML text extraction with paragraph preservation, add reading margins, paragraph-aware pagination

2026-05-14 20:43:29 +08:00
parent b0071c6617
commit 16f801cdf8
2 changed files with 248 additions and 33 deletions
--- a/src/book.rs
+++ b/src/book.rs
@@ -1,37 +1,155 @@
/// Decodes HTML character references in `text` and returns the decoded string.
///
/// Supported references:
/// - the named references handled by [`push_decoded_entity`] (`&amp;`, `&lt;`,
///   `&nbsp;`, `&mdash;`, ...),
/// - decimal (`&#8212;`) and hexadecimal (`&#x2014;`) numeric references.
///
/// Anything that is not a recognised reference is copied through verbatim.
/// In particular a bare `&` (as in `Tom & Jerry`) no longer swallows the text
/// that follows it, and unknown references such as `&foo;` are kept as
/// written instead of being silently dropped.
fn decode_entities(text: &str) -> String {
    // Longest reference body (the part between `&` and `;`) that is scanned
    // for. Bounding the look-ahead keeps the function linear even for text
    // that contains many stray ampersands and no semicolons.
    const MAX_ENTITY_LEN: usize = 32;

    let mut result = String::with_capacity(text.len());
    let mut rest = text;
    while let Some(amp) = rest.find('&') {
        result.push_str(&rest[..amp]);
        let after = &rest[amp + 1..];
        // `;` is ASCII, so its byte position is always a valid char boundary.
        let terminator = after
            .bytes()
            .take(MAX_ENTITY_LEN + 1)
            .position(|b| b == b';');
        match terminator {
            // The guard appends the decoded text as a side effect when it succeeds.
            Some(end) if push_decoded_entity(&after[..end], &mut result) => {
                rest = &after[end + 1..];
            }
            _ => {
                // Not a reference: keep the ampersand and continue right after it.
                result.push('&');
                rest = after;
            }
        }
    }
    result.push_str(rest);
    result
}

/// Appends the replacement for the reference body `name` (the text between
/// `&` and `;`) to `out`.
///
/// Returns `false`, leaving `out` untouched, when `name` is neither a known
/// named reference nor a valid numeric reference.
fn push_decoded_entity(name: &str, out: &mut String) -> bool {
    let named = match name {
        "amp" => "&",
        "lt" => "<",
        "gt" => ">",
        "quot" => "\"",
        "apos" => "'",
        "nbsp" => " ",
        "emsp" => "  ",
        "ensp" => " ",
        "mdash" => "—",
        "ndash" => "–",
        "ldquo" => "\"",
        "rdquo" => "\"",
        "lsquo" => "'",
        "rsquo" => "'",
        "hellip" => "…",
        _ => "",
    };
    if !named.is_empty() {
        out.push_str(named);
        return true;
    }

    let code_point = name.strip_prefix('#').and_then(|num| {
        let (digits, radix) = match num.strip_prefix(|c: char| c == 'x' || c == 'X') {
            Some(hex) => (hex, 16),
            None => (num, 10),
        };
        // `from_str_radix` would accept a leading `+`, so validate the digits first.
        if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
            return None;
        }
        u32::from_str_radix(digits, radix).ok()
    });
    // Surrogates, out-of-range values and NUL are not usable text characters.
    match code_point.and_then(char::from_u32).filter(|&c| c != '\0') {
        Some(c) => {
            out.push(c);
            true
        }
        None => false,
    }
}
 /// Extracts the element name from the text found between a tag's `<` and `>`.
 ///
 /// The name is the first whitespace-delimited token with any trailing `/`
 /// removed, so both `br/` and `br /` give `br`. A leading `/` is kept, which
 /// means closing tags yield names such as `/p`. Returns `""` when the content
 /// is empty or consists only of whitespace.
 fn tag_name_from(tag_content: &str) -> &str {
    match tag_content.split_whitespace().next() {
        Some(first_token) => first_token.trim_end_matches('/'),
        None => "",
    }
 }
 /// Converts an HTML/XHTML fragment to plain text while keeping its paragraph
 /// structure.
 ///
 /// Behaviour:
 /// - Text between tags is passed through `decode_entities`.
 /// - Tag names are matched case-insensitively (`<BR>` equals `<br>`).
 /// - `<!-- ... -->` comments are dropped, even when they contain `>`; an
 ///   unterminated comment hides the rest of the input.
 /// - `<script>` / `<style>` elements are dropped together with their content.
 ///   A self-closing form (`<script src="x"/>`) has no content to skip.
 /// - `<br>` emits a line break, `<hr>` emits a `---` separator paragraph.
 /// - `<li>` starts a new line prefixed with `- `; closing list and definition
 ///   tags end the current line so following text is not glued to the item.
 /// - `p`, `div`, `h1`-`h6` and `blockquote` start on a fresh line and are
 ///   followed by a blank line.
 /// - All other tags are removed without emitting anything.
 ///
 /// The result is trimmed and runs of three or more newlines are collapsed
 /// to two. An unclosed `<` (no matching `>`) is emitted as ordinary text.
 pub fn strip_html(input: &str) -> String {
    // ASCII lowercasing never changes byte lengths, so every offset computed
    // on `lower` is also a valid offset into `input` (and vice versa).
    let lower = input.to_ascii_lowercase();
    let mut out = String::with_capacity(input.len());
    let mut pos = 0;
    while pos < input.len() {
        let tag_start = match input[pos..].find('<') {
            Some(offset) => pos + offset,
            None => {
                // No more tags: the remainder is plain text.
                out.push_str(&decode_entities(&input[pos..]));
                break;
            }
        };
        if tag_start > pos {
            out.push_str(&decode_entities(&input[pos..tag_start]));
        }

        // Comments may legally contain `>`, so they need their own terminator.
        if input[tag_start..].starts_with("<!--") {
            let body_start = tag_start + 4;
            match input[body_start..].find("-->") {
                Some(end) => {
                    pos = body_start + end + 3;
                    continue;
                }
                None => break,
            }
        }

        let tag_end = match input[tag_start..].find('>') {
            Some(offset) => tag_start + offset,
            None => {
                // Unclosed tag: treat the rest as text.
                out.push_str(&decode_entities(&input[tag_start..]));
                break;
            }
        };
        let tag_content = &lower[tag_start + 1..tag_end];
        let name = tag_name_from(tag_content);
        // Default: resume right after this tag. Only script/style override it.
        pos = tag_end + 1;
        match name {
            "script" | "style" => {
                let self_closing = tag_content.trim_end().ends_with('/');
                if !self_closing {
                    let close_tag = format!("</{}", name);
                    let after_close = lower[tag_end..].find(&close_tag).and_then(|rel| {
                        let close_start = tag_end + rel;
                        lower[close_start..]
                            .find('>')
                            .map(|gt| close_start + gt + 1)
                    });
                    // Without a closing tag the content falls through as text.
                    if let Some(resume) = after_close {
                        pos = resume;
                    }
                }
            }
            "br" => {
                if !out.is_empty() {
                    out.push('\n');
                }
            }
            "hr" => {
                if !out.is_empty() {
                    out.push_str("\n\n");
                }
                out.push_str("---\n\n");
            }
            "li" => {
                end_current_line(&mut out);
                out.push_str("- ");
            }
            "/li" | "/dd" | "/dt" | "/ol" | "/ul" => end_current_line(&mut out),
            "p" | "div" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "blockquote" => {
                // Keep preceding inline text on its own line, but do not
                // detach a block from the bullet of the list item holding it.
                if !at_line_start(&out) {
                    out.push('\n');
                }
            }
            "/p" | "/div" | "/blockquote" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5" | "/h6" => {
                end_current_line(&mut out);
                out.push('\n');
            }
            _ => {}
        }
    }

    // Collapse 3+ consecutive newlines into 2.
    let out = out.trim();
    let mut final_out = String::with_capacity(out.len());
    let mut nl_count = 0usize;
    for c in out.chars() {
        if c == '\n' {
            nl_count += 1;
            if nl_count <= 2 {
                final_out.push(c);
            }
        } else {
            nl_count = 0;
            final_out.push(c);
        }
    }
    final_out
 }

 /// Terminates the current line of `out` unless it is empty or already ends
 /// with a newline.
 fn end_current_line(out: &mut String) {
    if !out.is_empty() && !out.ends_with('\n') {
        out.push('\n');
    }
 }

 /// Returns `true` when the last line of `out` holds no text yet, i.e. it is
 /// blank or consists only of a freshly written list bullet (`- `).
 fn at_line_start(out: &str) -> bool {
    let current_line = out.rsplit('\n').next().unwrap_or("");
    matches!(current_line.trim(), "" | "-")
 }
 #[derive(Debug, Clone)]
 pub struct TocEntry {
    pub label: String,
@@ -196,6 +314,61 @@ mod tests {
        assert_eq!(extract_title(""), None);
    }
    // Two adjacent paragraphs: both texts survive, a line break separates
    // them, and no break trails the last paragraph.
    #[test]
    fn test_html_to_plain_paragraphs() {
        let plain = strip_html("<p>第一段</p><p>第二段</p>");
        for needle in ["第一段", "第二段", "\n"] {
            assert!(plain.contains(needle));
        }
        assert!(plain.ends_with("第二段"));
    }
    // A heading followed by a paragraph keeps both texts and puts a line
    // break somewhere in the output.
    #[test]
    fn test_html_to_plain_heading() {
        let plain = strip_html("<h1>标题</h1><p>正文</p>");
        let has_heading = plain.contains("标题");
        let has_body = plain.contains("正文");
        let has_break = plain.contains('\n');
        assert!(has_heading);
        assert!(has_body);
        assert!(has_break);
    }
    // List items are rendered with a leading "- " bullet and keep their text.
    #[test]
    fn test_html_to_plain_list() {
        let plain = strip_html("<ul><li>项目一</li><li>项目二</li></ul>");
        assert!(plain.starts_with("- "));
        for item in ["项目一", "项目二"] {
            assert!(plain.contains(item));
        }
    }
    // A `<br>` between two runs of text becomes exactly one newline.
    #[test]
    fn test_html_to_plain_br() {
        assert_eq!(strip_html("第一行<br>第二行"), "第一行\n第二行");
    }
    // A `<script>` element between two paragraphs must vanish completely,
    // leaving just the two paragraphs separated by one blank line.
    //
    // The exact output is asserted: checking `contains("正文")` alone proves
    // nothing about the first paragraph, because "更多正文" also contains it.
    #[test]
    fn test_html_to_plain_skip_script() {
        let html = "<p>正文</p><script>var x=1;</script><p>更多正文</p>";
        let result = strip_html(html);
        assert_eq!(result, "正文\n\n更多正文");
        assert!(!result.contains("var x=1"));
    }
    // Three consecutive paragraphs yield exactly three non-empty lines, in
    // document order.
    #[test]
    fn test_html_to_plain_line_break_collapse() {
        let plain = strip_html("<p>段一</p><p>段二</p><p>段三</p>");
        let paragraphs: Vec<&str> = plain.lines().filter(|line| !line.is_empty()).collect();
        assert_eq!(paragraphs, ["段一", "段二", "段三"]);
    }
    #[test]
    fn test_build_toc_empty() {
        let toc = build_toc(&[], &[]);
--- a/src/reader.rs
+++ b/src/reader.rs
@@ -160,14 +160,22 @@ pub fn reading_view(
    // --- Center text area ---
    egui::CentralPanel::default().show_inside(ui, |ui| {
-        let (rect, response) = ui.allocate_at_least(ui.available_size(), egui::Sense::click());
+        let available = ui.available_size();
        let (rect, response) = ui.allocate_at_least(available, egui::Sense::click());
        // Add reading margins (inset)
        let inset = 24.0;
        let text_rect = egui::Rect::from_min_size(
            egui::pos2(rect.min.x + inset, rect.min.y),
            egui::vec2((rect.width() - inset * 2.0).max(100.0), rect.height()),
        );
        if let Some(section) = book.sections.get(*current_section) {
            if *current_page < section.pages.len().saturating_sub(1) {
                let start = section.pages[*current_page];
                let end = section.pages[*current_page + 1];
                let text: String = section.content.chars().skip(start).take(end - start).collect();
-                ui.put(rect, |ui: &mut egui::Ui| {
+                ui.put(text_rect, |ui: &mut egui::Ui| {
                    ui.add(
                        egui::Label::new(
                            egui::RichText::new(&text)
@@ -242,15 +250,49 @@ pub fn calculate_pages(text: &str, chars_per_page: usize) -> Vec<usize> {
        return pages;
    }
-    let total_chars = text.chars().count();
+    let chars: Vec<char> = text.chars().collect();
    let total_chars = chars.len();
    if total_chars <= chars_per_page {
        pages.push(total_chars);
        return pages;
    }
-    let mut pos = 0;
+    let mut pos: usize = 0;
    while pos < total_chars {
-        pos = (pos + chars_per_page).min(total_chars);
+        let next = pos + chars_per_page;
        if next >= total_chars {
            pages.push(total_chars);
            break;
        }
        // Scan backward through a window around `next` (from half a page before it
        // to half a page after it) for a paragraph (\n\n) or line (\n) break
        let search_start = pos + chars_per_page / 2;
        let search_end = (next + chars_per_page / 2).min(total_chars);
        let mut split = next;
        // Prefer double newline (paragraph), then single newline
        let mut found = false;
        for i in (search_start..search_end).rev() {
            if chars[i] == '\n' && i > 0 && chars[i - 1] == '\n' {
                split = i - 1;
                found = true;
                break;
            }
        }
        if !found {
            for i in (search_start..search_end).rev() {
                if chars[i] == '\n' {
                    split = i + 1;
                    break;
                }
            }
        }
        if split <= pos {
            split = next;
        }
        pos = split.min(total_chars);
        pages.push(pos);
    }