feat: improve HTML text extraction with paragraph preservation, add reading margins, paragraph-aware pagination
This commit is contained in:
229
src/book.rs
229
src/book.rs
@@ -1,37 +1,155 @@
|
|||||||
pub fn strip_html(input: &str) -> String {
|
/// Decodes HTML character references found in `text`.
///
/// Handles a small table of named entities (`&amp;`, `&lt;`, `&mdash;`, ...)
/// as well as decimal (`&#8212;`) and hexadecimal (`&#x2014;`) numeric
/// references. Anything that is not a recognised, `;`-terminated reference is
/// copied through unchanged, so a bare ampersand such as the one in `AT&T`
/// never swallows the text that follows it.
fn decode_entities(text: &str) -> String {
    // Longest reference body (the part between '&' and ';') we try to
    // interpret. Bounding the look-ahead keeps a stray '&' from scanning the
    // remainder of the input.
    const MAX_ENTITY_LEN: usize = 10;

    // Replacement text for the named entities this reader understands.
    fn named_entity(name: &str) -> Option<&'static str> {
        Some(match name {
            "amp" => "&",
            "lt" => "<",
            "gt" => ">",
            "quot" => "\"",
            "apos" => "'",
            "nbsp" => " ",
            "emsp" => " ",
            "ensp" => " ",
            "mdash" => "—",
            "ndash" => "–",
            "ldquo" => "\"",
            "rdquo" => "\"",
            "lsquo" => "'",
            "rsquo" => "'",
            "hellip" => "…",
            _ => return None,
        })
    }

    // Decodes the body of a numeric reference: `#123` or `#x7B`.
    fn numeric_entity(body: &str) -> Option<char> {
        let digits = body.strip_prefix('#')?;
        let (radix, digits) = match digits.strip_prefix(|c: char| c == 'x' || c == 'X') {
            Some(hex) => (16, hex),
            None => (10, digits),
        };
        // Reject empty bodies and signs such as "+65", which
        // `from_str_radix` would otherwise accept.
        if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
            return None;
        }
        u32::from_str_radix(digits, radix)
            .ok()
            .and_then(char::from_u32)
            .filter(|&c| c != '\0')
    }

    let mut result = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(amp) = rest.find('&') {
        result.push_str(&rest[..amp]);
        let after = &rest[amp + 1..];

        // ';' is ASCII, so a matching byte index is always a char boundary.
        let body = after
            .bytes()
            .take(MAX_ENTITY_LEN + 1)
            .position(|b| b == b';')
            .map(|semi| &after[..semi]);

        // Number of bytes consumed after the '&' when a reference was decoded.
        let consumed = body.and_then(|name| {
            if let Some(replacement) = named_entity(name) {
                result.push_str(replacement);
            } else if let Some(ch) = numeric_entity(name) {
                result.push(ch);
            } else {
                return None;
            }
            Some(name.len() + 1)
        });

        rest = match consumed {
            Some(len) => &after[len..],
            None => {
                // Not a reference we know: keep the ampersand literally and
                // continue scanning right after it.
                result.push('&');
                after
            }
        };
    }

    result.push_str(rest);
    result
}
|
||||||
|
|
||||||
|
/// Extracts the tag name from the text found between `<` and `>`.
///
/// The name is the first whitespace-delimited token with any trailing `/`
/// removed, so `br/` yields `br`; a leading `/` (closing tag) is kept.
/// Returns an empty string when `tag_content` holds no token at all.
fn tag_name_from(tag_content: &str) -> &str {
    let first_token = match tag_content.split_whitespace().next() {
        Some(token) => token,
        None => return "",
    };
    first_token.trim_end_matches('/')
}
|
||||||
|
|
||||||
|
pub fn strip_html(input: &str) -> String {
|
||||||
|
let mut out = String::with_capacity(input.len());
|
||||||
|
let mut pos = 0;
|
||||||
|
|
||||||
|
while pos < input.len() {
|
||||||
|
// Find next '<'
|
||||||
|
let remaining = &input[pos..];
|
||||||
|
let tag_start = remaining.find('<');
|
||||||
|
|
||||||
|
let tag_start = match tag_start {
|
||||||
|
Some(s) => pos + s,
|
||||||
|
None => {
|
||||||
|
// No more tags, emit remaining text
|
||||||
|
out.push_str(&decode_entities(&input[pos..]));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Emit text before the tag
|
||||||
|
if tag_start > pos {
|
||||||
|
out.push_str(&decode_entities(&input[pos..tag_start]));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find '>' to close the tag
|
||||||
|
let tag_end = match input[tag_start..].find('>') {
|
||||||
|
Some(i) => tag_start + i,
|
||||||
|
None => {
|
||||||
|
// Unclosed tag, emit rest as text
|
||||||
|
out.push_str(&decode_entities(&input[tag_start..]));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let tag_content = &input[tag_start + 1..tag_end];
|
||||||
|
let name = tag_name_from(tag_content);
|
||||||
|
|
||||||
|
match name {
|
||||||
|
"script" | "style" => {
|
||||||
|
// Skip content until closing tag
|
||||||
|
let close_tag = format!("</{}", name);
|
||||||
|
if let Some(cs) = input[tag_end..].find(&close_tag) {
|
||||||
|
let close_tag_end = input[tag_end + cs..].find('>');
|
||||||
|
if let Some(ce) = close_tag_end {
|
||||||
|
pos = tag_end + cs + ce + 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pos = tag_end + 1;
|
||||||
|
}
|
||||||
|
"br" => {
|
||||||
|
if !out.is_empty() {
|
||||||
|
out.push('\n');
|
||||||
|
}
|
||||||
|
pos = tag_end + 1;
|
||||||
|
}
|
||||||
|
"hr" => {
|
||||||
|
if !out.is_empty() {
|
||||||
|
out.push_str("\n\n");
|
||||||
|
}
|
||||||
|
out.push_str("---\n\n");
|
||||||
|
pos = tag_end + 1;
|
||||||
|
}
|
||||||
|
"li" => {
|
||||||
|
if !out.is_empty() && !out.ends_with('\n') {
|
||||||
|
out.push('\n');
|
||||||
|
}
|
||||||
|
out.push_str("- ");
|
||||||
|
pos = tag_end + 1;
|
||||||
|
}
|
||||||
|
"/li" | "/dd" | "/dt" | "/ol" | "/ul" => {
|
||||||
|
pos = tag_end + 1;
|
||||||
|
}
|
||||||
|
"p" | "div" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "blockquote" => {
|
||||||
|
pos = tag_end + 1;
|
||||||
|
}
|
||||||
|
"/p" | "/div" | "/blockquote" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5" | "/h6" => {
|
||||||
|
if !out.is_empty() && !out.ends_with('\n') {
|
||||||
|
out.push('\n');
|
||||||
|
}
|
||||||
|
out.push('\n');
|
||||||
|
pos = tag_end + 1;
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
pos = tag_end + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collapse 3+ consecutive newlines into 2
|
||||||
|
let out = out.trim();
|
||||||
|
let mut final_out = String::with_capacity(out.len());
|
||||||
|
let mut nl_count = 0usize;
|
||||||
|
for c in out.chars() {
|
||||||
|
if c == '\n' {
|
||||||
|
nl_count += 1;
|
||||||
|
if nl_count <= 2 {
|
||||||
|
final_out.push(c);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
nl_count = 0;
|
||||||
|
final_out.push(c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
final_out
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct TocEntry {
|
pub struct TocEntry {
|
||||||
pub label: String,
|
pub label: String,
|
||||||
@@ -196,6 +314,61 @@ mod tests {
|
|||||||
assert_eq!(extract_title(""), None);
|
assert_eq!(extract_title(""), None);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Two adjacent paragraphs must both survive, be separated by at least one
// line break, and the output must end with the last paragraph's text.
#[test]
fn test_html_to_plain_paragraphs() {
    let plain = strip_html("<p>第一段</p><p>第二段</p>");

    let paragraphs = ["第一段", "第二段"];
    assert!(paragraphs.iter().all(|text| plain.contains(*text)));
    assert!(plain.contains('\n'));
    assert!(plain.ends_with("第二段"));
}
|
||||||
|
|
||||||
|
// A heading followed by a paragraph keeps both texts and puts a line break
// somewhere in the output.
#[test]
fn test_html_to_plain_heading() {
    let plain = strip_html("<h1>标题</h1><p>正文</p>");

    let has_heading = plain.contains("标题");
    let has_body = plain.contains("正文");
    assert!(has_heading);
    assert!(has_body);
    assert!(plain.contains('\n'));
}
|
||||||
|
|
||||||
|
// List items are rendered as "- " bullets; the first bullet opens the output.
#[test]
fn test_html_to_plain_list() {
    let plain = strip_html("<ul><li>项目一</li><li>项目二</li></ul>");

    assert!(plain.starts_with("- "));
    let items = ["项目一", "项目二"];
    assert!(items.iter().all(|item| plain.contains(*item)));
}
|
||||||
|
|
||||||
|
// A <br> between two text runs turns into exactly one newline.
#[test]
fn test_html_to_plain_br() {
    let plain = strip_html("第一行<br>第二行");
    let expected = "第一行\n第二行";
    assert_eq!(plain, expected);
}
|
||||||
|
|
||||||
|
// Script contents must not leak into the extracted text, while the
// paragraphs around the script element are kept.
#[test]
fn test_html_to_plain_skip_script() {
    let markup = "<p>正文</p><script>var x=1;</script><p>更多正文</p>";
    let plain = strip_html(markup);

    let kept = ["正文", "更多正文"];
    assert!(kept.iter().all(|text| plain.contains(*text)));
    assert!(!plain.contains("var x=1"));
}
|
||||||
|
|
||||||
|
// Ignoring blank separator lines, three paragraphs produce exactly three
// lines of text in their original order.
#[test]
fn test_html_to_plain_line_break_collapse() {
    let plain = strip_html("<p>段一</p><p>段二</p><p>段三</p>");

    let text_lines: Vec<&str> = plain
        .lines()
        .filter(|line| !line.is_empty())
        .collect();

    assert_eq!(text_lines, ["段一", "段二", "段三"]);
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_build_toc_empty() {
|
fn test_build_toc_empty() {
|
||||||
let toc = build_toc(&[], &[]);
|
let toc = build_toc(&[], &[]);
|
||||||
|
|||||||
@@ -160,14 +160,22 @@ pub fn reading_view(
|
|||||||
|
|
||||||
// --- Center text area ---
|
// --- Center text area ---
|
||||||
egui::CentralPanel::default().show_inside(ui, |ui| {
|
egui::CentralPanel::default().show_inside(ui, |ui| {
|
||||||
let (rect, response) = ui.allocate_at_least(ui.available_size(), egui::Sense::click());
|
let available = ui.available_size();
|
||||||
|
let (rect, response) = ui.allocate_at_least(available, egui::Sense::click());
|
||||||
|
|
||||||
|
// Add reading margins (inset)
|
||||||
|
let inset = 24.0;
|
||||||
|
let text_rect = egui::Rect::from_min_size(
|
||||||
|
egui::pos2(rect.min.x + inset, rect.min.y),
|
||||||
|
egui::vec2((rect.width() - inset * 2.0).max(100.0), rect.height()),
|
||||||
|
);
|
||||||
|
|
||||||
if let Some(section) = book.sections.get(*current_section) {
|
if let Some(section) = book.sections.get(*current_section) {
|
||||||
if *current_page < section.pages.len().saturating_sub(1) {
|
if *current_page < section.pages.len().saturating_sub(1) {
|
||||||
let start = section.pages[*current_page];
|
let start = section.pages[*current_page];
|
||||||
let end = section.pages[*current_page + 1];
|
let end = section.pages[*current_page + 1];
|
||||||
let text: String = section.content.chars().skip(start).take(end - start).collect();
|
let text: String = section.content.chars().skip(start).take(end - start).collect();
|
||||||
ui.put(rect, |ui: &mut egui::Ui| {
|
ui.put(text_rect, |ui: &mut egui::Ui| {
|
||||||
ui.add(
|
ui.add(
|
||||||
egui::Label::new(
|
egui::Label::new(
|
||||||
egui::RichText::new(&text)
|
egui::RichText::new(&text)
|
||||||
@@ -242,15 +250,49 @@ pub fn calculate_pages(text: &str, chars_per_page: usize) -> Vec<usize> {
|
|||||||
return pages;
|
return pages;
|
||||||
}
|
}
|
||||||
|
|
||||||
let total_chars = text.chars().count();
|
let chars: Vec<char> = text.chars().collect();
|
||||||
|
let total_chars = chars.len();
|
||||||
if total_chars <= chars_per_page {
|
if total_chars <= chars_per_page {
|
||||||
pages.push(total_chars);
|
pages.push(total_chars);
|
||||||
return pages;
|
return pages;
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut pos = 0;
|
let mut pos: usize = 0;
|
||||||
while pos < total_chars {
|
while pos < total_chars {
|
||||||
pos = (pos + chars_per_page).min(total_chars);
|
let next = pos + chars_per_page;
|
||||||
|
if next >= total_chars {
|
||||||
|
pages.push(total_chars);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Search backward from next for paragraph (\n\n) or line (\n) breaks
|
||||||
|
let search_start = pos + chars_per_page / 2;
|
||||||
|
let search_end = (next + chars_per_page / 2).min(total_chars);
|
||||||
|
let mut split = next;
|
||||||
|
|
||||||
|
// Prefer double newline (paragraph), then single newline
|
||||||
|
let mut found = false;
|
||||||
|
for i in (search_start..search_end).rev() {
|
||||||
|
if chars[i] == '\n' && i > 0 && chars[i - 1] == '\n' {
|
||||||
|
split = i - 1;
|
||||||
|
found = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found {
|
||||||
|
for i in (search_start..search_end).rev() {
|
||||||
|
if chars[i] == '\n' {
|
||||||
|
split = i + 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if split <= pos {
|
||||||
|
split = next;
|
||||||
|
}
|
||||||
|
|
||||||
|
pos = split.min(total_chars);
|
||||||
pages.push(pos);
|
pages.push(pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user