feat: improve HTML text extraction with paragraph preservation, add reading margins, paragraph-aware pagination

This commit is contained in:
Developer
2026-05-14 20:43:29 +08:00
parent b0071c6617
commit 16f801cdf8
2 changed files with 248 additions and 33 deletions

View File

@@ -1,37 +1,155 @@
/// Returns the replacement text for a named character reference (the part
/// between `&` and `;`), or `None` when the name is not in the table.
fn named_entity(name: &str) -> Option<&'static str> {
    let text = match name {
        "amp" => "&",
        "lt" => "<",
        "gt" => ">",
        "quot" => "\"",
        "apos" => "'",
        // All space-like references are normalised to a plain space.
        "nbsp" | "emsp" | "ensp" => " ",
        "mdash" => "\u{2014}",
        "ndash" => "\u{2013}",
        // Typographic quotes are flattened to their ASCII counterparts.
        "ldquo" | "rdquo" => "\"",
        "lsquo" | "rsquo" => "'",
        "hellip" => "\u{2026}",
        _ => return None,
    };
    Some(text)
}

/// Appends the decoded form of the reference body `body` (the text between
/// `&` and `;`) to `out`.
///
/// Returns `true` when `body` is a syntactically valid reference and has been
/// consumed, `false` when it is not a reference at all (nothing is appended in
/// that case and the caller should keep the `&` as literal text).
///
/// * `#123` / `#x7B` forms are decoded to the corresponding character; code
///   points that are not valid `char`s (and NUL) become U+FFFD.
/// * Well-formed but unknown names are consumed and produce no output.
fn push_entity(body: &str, out: &mut String) -> bool {
    if let Some(number) = body.strip_prefix('#') {
        let (radix, digits) = match number.strip_prefix(&['x', 'X'][..]) {
            Some(hex) => (16, hex),
            None => (10, number),
        };
        // Reject signs, empty bodies and stray characters before parsing.
        if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
            return false;
        }
        return match u32::from_str_radix(digits, radix) {
            Ok(code) => {
                let decoded = char::from_u32(code)
                    .filter(|&c| c != '\0')
                    .unwrap_or('\u{FFFD}');
                out.push(decoded);
                true
            }
            // Numeric overflow: not a usable reference.
            Err(_) => false,
        };
    }

    if body.is_empty() || !body.chars().all(|c| c.is_ascii_alphanumeric()) {
        return false;
    }
    if let Some(text) = named_entity(body) {
        out.push_str(text);
    }
    true
}

/// Decodes HTML character references in `text` and returns the result.
///
/// Named references from a small built-in table and decimal/hexadecimal
/// numeric references are replaced by their characters. Well-formed references
/// with an unknown name are removed. An `&` that does not start a well-formed
/// reference (no `;` within a short distance, or a body that is not a valid
/// name/number) is kept as literal text instead of swallowing what follows.
fn decode_entities(text: &str) -> String {
    // Upper bound, in bytes, on the text between `&` and `;`. Keeps a stray
    // `&` from pairing with a far-away `;` and bounds the scan per ampersand.
    const MAX_ENTITY_LEN: usize = 32;

    let mut result = String::with_capacity(text.len());
    let mut rest = text;
    while let Some(amp) = rest.find('&') {
        result.push_str(&rest[..amp]);
        let after = &rest[amp + 1..];
        // `;` is ASCII, so a byte position is always a valid char boundary.
        let terminator = after
            .bytes()
            .take(MAX_ENTITY_LEN + 1)
            .position(|b| b == b';');
        let consumed = match terminator {
            Some(semi) if push_entity(&after[..semi], &mut result) => semi + 1,
            _ => {
                result.push('&');
                0
            }
        };
        rest = &after[consumed..];
    }
    result.push_str(rest);
    result
}
/// Extracts the tag name from the text found between `<` and `>`.
///
/// The name is the first whitespace-delimited token with any trailing `/`
/// removed (so `br/` and `br /` both yield `br`). A leading `/` of a closing
/// tag is kept, e.g. `/p`. Returns an empty string when there is no token.
fn tag_name_from(tag_content: &str) -> &str {
    match tag_content.split_whitespace().next() {
        Some(first_token) => first_token.trim_end_matches('/'),
        None => "",
    }
}
/// Converts an HTML fragment to plain text while keeping paragraph structure.
///
/// * Text between tags is passed through `decode_entities`.
/// * `<script>` / `<style>` elements are skipped together with their content.
/// * Comments (`<!-- ... -->`) are skipped as a unit, even when they contain `>`.
/// * `<br>` emits a newline, `<hr>` emits a `---` separator line, `<li>` starts
///   a new line prefixed with `- `.
/// * Closing `p`, `div`, `blockquote` and `h1`..`h6` tags end the paragraph
///   with a blank line.
/// * Every other tag is dropped without output.
/// * A `<` that is never closed by `>` is emitted as text along with the rest
///   of the input.
///
/// Tag names are matched ASCII-case-insensitively (`<BR>`, `<P>`, `<Script>`).
/// The result is trimmed and runs of three or more newlines are collapsed to
/// two.
pub fn strip_html(input: &str) -> String {
    // ASCII-case-insensitive substring search returning a byte offset.
    // Callers pass needles that start with `<`, so a match always begins on a
    // char boundary of `haystack`.
    fn find_ignore_ascii_case(haystack: &str, needle: &str) -> Option<usize> {
        let needle = needle.as_bytes();
        if needle.is_empty() {
            return Some(0);
        }
        haystack
            .as_bytes()
            .windows(needle.len())
            .position(|window| window.eq_ignore_ascii_case(needle))
    }

    let mut out = String::with_capacity(input.len());
    let mut pos = 0;

    while pos < input.len() {
        // Locate the next tag; everything before it is plain text.
        let tag_start = match input[pos..].find('<') {
            Some(offset) => pos + offset,
            None => {
                out.push_str(&decode_entities(&input[pos..]));
                break;
            }
        };
        if tag_start > pos {
            out.push_str(&decode_entities(&input[pos..tag_start]));
        }

        // Comments end at `-->`, not at the first `>`. An unterminated
        // comment falls through to the generic tag handling below.
        if input[tag_start..].starts_with("<!--") {
            if let Some(offset) = input[tag_start + 4..].find("-->") {
                pos = tag_start + 4 + offset + 3;
                continue;
            }
        }

        let tag_end = match input[tag_start..].find('>') {
            Some(offset) => tag_start + offset,
            None => {
                // Unclosed tag: treat the remainder as text.
                out.push_str(&decode_entities(&input[tag_start..]));
                break;
            }
        };

        let name = tag_name_from(&input[tag_start + 1..tag_end]).to_ascii_lowercase();
        // Default: resume right after this tag. Only script/style move further.
        pos = tag_end + 1;

        match name.as_str() {
            "script" | "style" => {
                // Skip the element body up to and including its closing tag.
                // If no complete closing tag exists, only the opening tag is
                // dropped and the following content is treated as text.
                let close_tag = format!("</{}", name);
                let body = &input[tag_end..];
                if let Some(close_start) = find_ignore_ascii_case(body, &close_tag) {
                    if let Some(close_len) = body[close_start..].find('>') {
                        pos = tag_end + close_start + close_len + 1;
                    }
                }
            }
            "br" => {
                if !out.is_empty() {
                    out.push('\n');
                }
            }
            "hr" => {
                if !out.is_empty() {
                    out.push_str("\n\n");
                }
                out.push_str("---\n\n");
            }
            "li" => {
                if !out.is_empty() && !out.ends_with('\n') {
                    out.push('\n');
                }
                out.push_str("- ");
            }
            "/p" | "/div" | "/blockquote" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5" | "/h6" => {
                // Finish the current line, then add one empty line.
                if !out.is_empty() && !out.ends_with('\n') {
                    out.push('\n');
                }
                out.push('\n');
            }
            // Opening block tags, list/definition closers and all inline or
            // unknown tags produce no output.
            _ => {}
        }
    }

    // Collapse 3+ consecutive newlines into 2.
    let trimmed = out.trim();
    let mut collapsed = String::with_capacity(trimmed.len());
    let mut newline_run = 0usize;
    for c in trimmed.chars() {
        if c == '\n' {
            newline_run += 1;
            if newline_run > 2 {
                continue;
            }
        } else {
            newline_run = 0;
        }
        collapsed.push(c);
    }
    collapsed
}
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct TocEntry { pub struct TocEntry {
pub label: String, pub label: String,
@@ -196,6 +314,61 @@ mod tests {
assert_eq!(extract_title(""), None); assert_eq!(extract_title(""), None);
} }
#[test]
fn test_html_to_plain_paragraphs() {
    // Two adjacent paragraphs: both texts survive, a line break separates
    // them, and no trailing whitespace follows the last paragraph.
    let plain = strip_html("<p>第一段</p><p>第二段</p>");
    let has_first = plain.contains("第一段");
    let has_second = plain.contains("第二段");
    assert!(has_first);
    assert!(has_second);
    assert!(plain.contains('\n'));
    assert!(plain.ends_with("第二段"));
}
#[test]
fn test_html_to_plain_heading() {
    // A heading followed by a paragraph keeps both texts and a line break.
    let plain = strip_html("<h1>标题</h1><p>正文</p>");
    let has_heading = plain.contains("标题");
    let has_body = plain.contains("正文");
    assert!(has_heading);
    assert!(has_body);
    assert!(plain.contains('\n'));
}
#[test]
fn test_html_to_plain_list() {
    // List items are rendered with a leading "- " marker.
    let plain = strip_html("<ul><li>项目一</li><li>项目二</li></ul>");
    let starts_with_marker = plain.starts_with("- ");
    assert!(starts_with_marker);
    assert!(plain.contains("项目一"));
    assert!(plain.contains("项目二"));
}
#[test]
fn test_html_to_plain_br() {
    // A <br> between two runs of text becomes exactly one newline.
    let plain = strip_html("第一行<br>第二行");
    assert_eq!(plain, "第一行\n第二行");
}
#[test]
fn test_html_to_plain_skip_script() {
    // Script content must not leak into the text; surrounding paragraphs stay.
    let plain = strip_html("<p>正文</p><script>var x=1;</script><p>更多正文</p>");
    let leaked_script = plain.contains("var x=1");
    assert!(plain.contains("正文"));
    assert!(plain.contains("更多正文"));
    assert!(!leaked_script);
}
#[test]
fn test_html_to_plain_line_break_collapse() {
    // Three paragraphs yield exactly three non-empty lines, in order.
    let plain = strip_html("<p>段一</p><p>段二</p><p>段三</p>");
    let mut content_lines: Vec<&str> = Vec::new();
    for line in plain.lines() {
        if !line.is_empty() {
            content_lines.push(line);
        }
    }
    assert_eq!(content_lines.len(), 3);
    assert_eq!(content_lines[0], "段一");
    assert_eq!(content_lines[1], "段二");
    assert_eq!(content_lines[2], "段三");
}
#[test] #[test]
fn test_build_toc_empty() { fn test_build_toc_empty() {
let toc = build_toc(&[], &[]); let toc = build_toc(&[], &[]);

View File

@@ -160,14 +160,22 @@ pub fn reading_view(
// --- Center text area --- // --- Center text area ---
egui::CentralPanel::default().show_inside(ui, |ui| { egui::CentralPanel::default().show_inside(ui, |ui| {
let (rect, response) = ui.allocate_at_least(ui.available_size(), egui::Sense::click()); let available = ui.available_size();
let (rect, response) = ui.allocate_at_least(available, egui::Sense::click());
// Add reading margins (inset)
let inset = 24.0;
let text_rect = egui::Rect::from_min_size(
egui::pos2(rect.min.x + inset, rect.min.y),
egui::vec2((rect.width() - inset * 2.0).max(100.0), rect.height()),
);
if let Some(section) = book.sections.get(*current_section) { if let Some(section) = book.sections.get(*current_section) {
if *current_page < section.pages.len().saturating_sub(1) { if *current_page < section.pages.len().saturating_sub(1) {
let start = section.pages[*current_page]; let start = section.pages[*current_page];
let end = section.pages[*current_page + 1]; let end = section.pages[*current_page + 1];
let text: String = section.content.chars().skip(start).take(end - start).collect(); let text: String = section.content.chars().skip(start).take(end - start).collect();
ui.put(rect, |ui: &mut egui::Ui| { ui.put(text_rect, |ui: &mut egui::Ui| {
ui.add( ui.add(
egui::Label::new( egui::Label::new(
egui::RichText::new(&text) egui::RichText::new(&text)
@@ -242,15 +250,49 @@ pub fn calculate_pages(text: &str, chars_per_page: usize) -> Vec<usize> {
return pages; return pages;
} }
let total_chars = text.chars().count(); let chars: Vec<char> = text.chars().collect();
let total_chars = chars.len();
if total_chars <= chars_per_page { if total_chars <= chars_per_page {
pages.push(total_chars); pages.push(total_chars);
return pages; return pages;
} }
let mut pos = 0; let mut pos: usize = 0;
while pos < total_chars { while pos < total_chars {
pos = (pos + chars_per_page).min(total_chars); let next = pos + chars_per_page;
if next >= total_chars {
pages.push(total_chars);
break;
}
// Search backward from next for paragraph (\n\n) or line (\n) breaks
let search_start = pos + chars_per_page / 2;
let search_end = (next + chars_per_page / 2).min(total_chars);
let mut split = next;
// Prefer double newline (paragraph), then single newline
let mut found = false;
for i in (search_start..search_end).rev() {
if chars[i] == '\n' && i > 0 && chars[i - 1] == '\n' {
split = i - 1;
found = true;
break;
}
}
if !found {
for i in (search_start..search_end).rev() {
if chars[i] == '\n' {
split = i + 1;
break;
}
}
}
if split <= pos {
split = next;
}
pos = split.min(total_chars);
pages.push(pos); pages.push(pos);
} }