feat: improve HTML text extraction with paragraph preservation, add reading margins, paragraph-aware pagination

2026-05-14 20:43:29 +08:00
parent b0071c6617
commit 16f801cdf8
2 changed files with 248 additions and 33 deletions
--- a/src/book.rs
+++ b/src/book.rs
@@ -1,37 +1,155 @@
-pub fn strip_html(input: &str) -> String {
-    let mut result = String::with_capacity(input.len());
-    let mut in_tag = false;
-    let mut in_entity = false;
-    let mut entity = String::new();
-
-    for c in input.chars() {
-        match c {
-            '<' => in_tag = true,
-            '>' if in_tag => in_tag = false,
-            '&' if !in_tag => {
-                in_entity = true;
-                entity.clear();
-            }
-            ';' if in_entity => {
-                in_entity = false;
-                let decoded = match entity.as_str() {
-                    "amp" => "&",
-                    "lt" => "<",
-                    "gt" => ">",
-                    "quot" => "\"",
-                    "nbsp" => " ",
-                    _ => "",
-                };
-                result.push_str(decoded);
-            }
-            c if !in_tag && !in_entity => result.push(c),
-            c if in_entity => entity.push(c),
-            _ => {}
+/// Decodes one character-reference body (the text between `&` and `;`) and
+/// appends the replacement to `out`.
+///
+/// Handles decimal (`#8212`) and hexadecimal (`#x2014`) numeric references as
+/// well as a fixed table of named references. Returns `false` and leaves `out`
+/// untouched when `body` is not a reference this function knows how to decode.
+fn push_decoded_reference(body: &str, out: &mut String) -> bool {
+    if let Some(digits) = body.strip_prefix('#') {
+        let code = match digits.strip_prefix('x').or_else(|| digits.strip_prefix('X')) {
+            Some(hex) => u32::from_str_radix(hex, 16),
+            None => digits.parse::<u32>(),
+        };
+        // Reject NUL, surrogates and out-of-range values instead of emitting them.
+        return match code.ok().and_then(char::from_u32).filter(|ch| *ch != '\0') {
+            Some(ch) => {
+                out.push(ch);
+                true
+            }
+            None => false,
+        };
+    }
+
+    let named = match body {
+        "amp" => "&",
+        "lt" => "<",
+        "gt" => ">",
+        "quot" => "\"",
+        "apos" => "'",
+        "nbsp" | "ensp" => " ",
+        "emsp" => "  ",
+        "mdash" => "—",
+        "ndash" => "–",
+        "ldquo" | "rdquo" => "\"",
+        "lsquo" | "rsquo" => "'",
+        "hellip" => "…",
+        _ => return false,
+    };
+    out.push_str(named);
+    true
+}
+
+/// Replaces HTML character references in `text` with plain characters.
+///
+/// A reference is only decoded when it is well formed: `&`, a run of ASCII
+/// letters/digits/`#`, then `;`. Anything else (a bare `&` as in `AT&T`, an
+/// unknown name, an invalid code point) is copied through verbatim, so no
+/// surrounding text is ever lost.
+fn decode_entities(text: &str) -> String {
+    let mut result = String::with_capacity(text.len());
+    let mut rest = text;
+    while let Some(amp) = rest.find('&') {
+        result.push_str(&rest[..amp]);
+        let after = &rest[amp + 1..];
+        // Only ASCII bytes are counted, so `body_len` is always a char boundary.
+        let body_len = after
+            .bytes()
+            .take_while(|b| b.is_ascii_alphanumeric() || *b == b'#')
+            .count();
+        let terminated = after[body_len..].starts_with(';');
+        if terminated && push_decoded_reference(&after[..body_len], &mut result) {
+            // Skip the reference body and its terminating ';'.
+            rest = &after[body_len + 1..];
+        } else {
+            // Not a reference we understand: keep the '&' and carry on after it.
+            result.push('&');
+            rest = after;
         }
     }
+    result.push_str(rest);
     result
 }

+/// Extracts the element name from the text found between `<` and `>`.
+///
+/// The name is the first whitespace-delimited token with any trailing `/`
+/// removed, so `br/` yields `br`. A leading `/` is kept, which means closing
+/// tags come back as e.g. `/p`. Content with no token yields an empty string.
+fn tag_name_from(tag_content: &str) -> &str {
+    match tag_content.split_whitespace().next() {
+        Some(first_token) => first_token.trim_end_matches('/'),
+        None => "",
+    }
+}
+
+/// Appends a newline to `out` unless it is empty or already ends with one.
+fn ensure_ends_with_newline(out: &mut String) {
+    if !out.is_empty() && !out.ends_with('\n') {
+        out.push('\n');
+    }
+}
+
+/// Finds the end of a raw-text element (`script`/`style`) whose content starts
+/// at byte offset `from`.
+///
+/// Returns the offset just past the `>` of the first `</name ...>` closing tag,
+/// compared ASCII case-insensitively. Returns `from` unchanged when no complete
+/// closing tag exists, so the caller simply continues after the opening tag.
+fn skip_raw_text_element(input: &str, from: usize, name: &str) -> usize {
+    let rest = &input[from..];
+    for (idx, _) in rest.match_indices("</") {
+        let after = &rest[idx + 2..];
+        let name_matches = after
+            .get(..name.len())
+            .map_or(false, |candidate| candidate.eq_ignore_ascii_case(name));
+        // Require the name to end here so `</scripts>` does not close `<script>`.
+        let name_ends = after
+            .as_bytes()
+            .get(name.len())
+            .map_or(true, |b| !b.is_ascii_alphanumeric());
+        if name_matches && name_ends {
+            return match after.find('>') {
+                Some(gt) => from + idx + 2 + gt + 1,
+                None => from,
+            };
+        }
+    }
+    from
+}
+
+/// Limits every run of consecutive newlines in `text` to at most two.
+fn collapse_blank_lines(text: &str) -> String {
+    let mut collapsed = String::with_capacity(text.len());
+    let mut newline_run = 0usize;
+    for c in text.chars() {
+        if c == '\n' {
+            newline_run += 1;
+            if newline_run > 2 {
+                continue;
+            }
+        } else {
+            newline_run = 0;
+        }
+        collapsed.push(c);
+    }
+    collapsed
+}
+
+/// Converts an HTML/XHTML fragment to plain text while keeping its paragraph
+/// structure.
+///
+/// * Text between tags is emitted with character references decoded.
+/// * Comments (`<!-- ... -->`) and the content of `script`/`style` elements
+///   are dropped.
+/// * `<br>` becomes a newline, `<hr>` becomes a `---` line, `<li>` starts a
+///   new line prefixed with `- `.
+/// * Closing `p`, `div`, `blockquote`, heading and list tags end the current
+///   paragraph with a blank line; closing `li`/`dt`/`dd` end the current line.
+/// * All other tags are removed without emitting anything.
+///
+/// Tag names are matched ASCII case-insensitively. The result is trimmed and
+/// never contains more than two consecutive newlines.
+pub fn strip_html(input: &str) -> String {
+    let mut out = String::with_capacity(input.len());
+    let mut pos = 0;
+
+    while pos < input.len() {
+        let tag_start = match input[pos..].find('<') {
+            Some(offset) => pos + offset,
+            None => {
+                // No more tags: the remainder is plain text.
+                out.push_str(&decode_entities(&input[pos..]));
+                break;
+            }
+        };
+
+        if tag_start > pos {
+            out.push_str(&decode_entities(&input[pos..tag_start]));
+        }
+
+        // A comment may legally contain '>', so it has to be skipped up to its
+        // real terminator rather than parsed like a tag. An unterminated
+        // comment swallows the rest of the input.
+        if input[tag_start..].starts_with("<!--") {
+            let body_start = tag_start + 4;
+            pos = match input[body_start..].find("-->") {
+                Some(end) => body_start + end + 3,
+                None => input.len(),
+            };
+            continue;
+        }
+
+        let tag_end = match input[tag_start..].find('>') {
+            Some(offset) => tag_start + offset,
+            None => {
+                // Unclosed '<': treat the remainder as text.
+                out.push_str(&decode_entities(&input[tag_start..]));
+                break;
+            }
+        };
+
+        let tag_content = &input[tag_start + 1..tag_end];
+        let name = tag_name_from(tag_content).to_ascii_lowercase();
+        pos = tag_end + 1;
+
+        match name.as_str() {
+            // In XHTML a self-closing `<script .../>` has no content; hunting
+            // for a closing tag there would discard unrelated body text.
+            "script" | "style" if !tag_content.trim_end().ends_with('/') => {
+                pos = skip_raw_text_element(input, pos, &name);
+            }
+            "br" => {
+                if !out.is_empty() {
+                    out.push('\n');
+                }
+            }
+            "hr" => {
+                if !out.is_empty() {
+                    out.push_str("\n\n");
+                }
+                out.push_str("---\n\n");
+            }
+            "li" => {
+                ensure_ends_with_newline(&mut out);
+                out.push_str("- ");
+            }
+            // End the line so following text is not glued to the item.
+            "/li" | "/dt" | "/dd" => ensure_ends_with_newline(&mut out),
+            "/p" | "/div" | "/blockquote" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5" | "/h6"
+            | "/ul" | "/ol" | "/dl" => {
+                ensure_ends_with_newline(&mut out);
+                out.push('\n');
+            }
+            // Opening block tags and every other tag emit nothing.
+            _ => {}
+        }
+    }
+
+    collapse_blank_lines(out.trim())
+}
+
 #[derive(Debug, Clone)]
 pub struct TocEntry {
    pub label: String,
@@ -196,6 +314,61 @@ mod tests {
        assert_eq!(extract_title(""), None);
    }

+    // Adjacent paragraphs must be separated by exactly one blank line, with
+    // no trailing whitespace left after the last one.
+    #[test]
+    fn test_html_to_plain_paragraphs() {
+        let result = strip_html("<p>第一段</p><p>第二段</p>");
+        assert_eq!(result, "第一段\n\n第二段");
+    }
+
+    // A heading is treated like a paragraph: a blank line separates it from
+    // the body text that follows.
+    #[test]
+    fn test_html_to_plain_heading() {
+        let result = strip_html("<h1>标题</h1><p>正文</p>");
+        assert_eq!(result, "标题\n\n正文");
+    }
+
+    // Every list item gets its own line and a "- " bullet prefix.
+    #[test]
+    fn test_html_to_plain_list() {
+        let result = strip_html("<ul><li>项目一</li><li>项目二</li></ul>");
+        assert_eq!(result, "- 项目一\n- 项目二");
+    }
+
+    // A <br> between two runs of text turns into a single newline.
+    #[test]
+    fn test_html_to_plain_br() {
+        assert_eq!(strip_html("第一行<br>第二行"), "第一行\n第二行");
+    }
+
+    // Script content must vanish entirely while the paragraphs on both sides
+    // survive. Comparing the whole output avoids the vacuous check where
+    // "更多正文" already contains "正文".
+    #[test]
+    fn test_html_to_plain_skip_script() {
+        let result = strip_html("<p>正文</p><script>var x=1;</script><p>更多正文</p>");
+        assert_eq!(result, "正文\n\n更多正文");
+    }
+
+    // Three paragraphs must come out as exactly three non-empty lines, in
+    // document order.
+    #[test]
+    fn test_html_to_plain_line_break_collapse() {
+        let text = strip_html("<p>段一</p><p>段二</p><p>段三</p>");
+        let paragraphs: Vec<&str> = text.lines().filter(|line| !line.is_empty()).collect();
+        assert_eq!(paragraphs, vec!["段一", "段二", "段三"]);
+    }
+
    #[test]
    fn test_build_toc_empty() {
        let toc = build_toc(&[], &[]);