# FableFlow/03_Story/merge_md_to_pdf.py

"""
将当前目录下的所有md文件按文件名中的数字排序（从第一章开始），合并成一个pdf文件
"""

import html
import os
import re  # regular expressions, used to pull the chapter number out of file names

import markdown
from weasyprint import HTML, CSS

def extract_chapter_number(filename):
    """
    Return the first run of digits found in a file name, for use as a sort key.

    :param filename: file name, e.g. ``第1章.md``, ``10.md`` or ``章节2.md``
    :return: the first digit run as an ``int`` (so that 10 sorts after 2);
             ``9999`` when the name holds no digits, a sentinel meant to
             place such files last
    """
    first_digits = re.search(r'\d+', filename)

    # Guard clause: no digits anywhere in the name.
    if first_digits is None:
        return 9999

    # Compare numerically, not lexically ("10" < "2" as strings).
    return int(first_digits.group(0))

# Directory that contains this script; the chapters are read from it and the
# PDF is written back into it.
current_dir = os.path.dirname(os.path.abspath(__file__))

# Collect the Markdown chapters and order them by the number in their name.
# - Only regular files are kept: a directory whose name happens to end in
#   ".md" would otherwise make the later open() call fail.
# - The file name is used as a tie-breaker so that files sharing a chapter
#   number (or having none) come out in a reproducible order instead of the
#   arbitrary order returned by os.listdir().
md_files = sorted(
    (
        name
        for name in os.listdir(current_dir)
        if name.endswith('.md') and os.path.isfile(os.path.join(current_dir, name))
    ),
    key=lambda name: (extract_chapter_number(name), name),
)

# Show the user what will be merged, in merge order.
print(f"找到 {len(md_files)} 个md文件（按章节数字排序）:")
for f in md_files:
    print(f"  - {f}")

# Start of the merged document: the HTML head plus the print stylesheet.
# Chapter content is appended to this string afterwards and the document is
# closed once all chapters have been added.
#
# The embedded stylesheet sets:
#   - 2cm page margins and a page counter in the bottom-right margin box;
#   - a SimSun/宋体 serif body font at 12pt, 1.8 line height, justified;
#   - centred h1 (18pt) and h2 (14pt) headings;
#   - paragraphs with a 2em first-line indent;
#   - a page break before every ".chapter-title" element, except one that is
#     the first child of its parent.
# Inside the triple-quoted literal, \" is just an escaped double quote.
combined_html = """<!DOCTYPE html>
<html lang=\"zh-CN\">
<head>
    <meta charset=\"UTF-8\">
    <title>小说合集</title>
    <style>
        @page {
            margin: 2cm;
            @bottom-right {
                content: counter(page);
            }
        }
        body {
            font-family: \"SimSun\", \"宋体\", serif;
            font-size: 12pt;
            line-height: 1.8;
            text-align: justify;
        }
        h1 {
            font-size: 18pt;
            text-align: center;
            margin-top: 2em;
            margin-bottom: 1em;
        }
        h2 {
            font-size: 14pt;
            text-align: center;
            margin-top: 1.5em;
            margin-bottom: 0.8em;
        }
        p {
            margin: 0.5em 0;
            text-indent: 2em;
        }
        .chapter-title {
            page-break-before: always;
        }
        .chapter-title:first-child {
            page-break-before: auto;
        }
    </style>
</head>
<body>
"""

# Convert every chapter to HTML and append it to the document body.
#
# A single converter (tables and fenced code blocks enabled) is shared by all
# chapters; reset() clears its per-document state before each conversion.
md = markdown.Markdown(extensions=['tables', 'fenced_code'])

# Fragments are collected in a list and joined once at the end rather than
# growing the document with repeated string concatenation.
body_parts = []

for md_file in md_files:
    file_path = os.path.join(current_dir, md_file)

    # 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order mark,
    # which would otherwise end up in front of the first heading and stop it
    # from being recognised as Markdown.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        content = f.read()

    md.reset()
    html_content = md.convert(content)

    # Chapter heading = file name without its extension. splitext() removes
    # only the trailing extension (str.replace would also delete any ".md"
    # occurring inside the name), and the title is HTML-escaped because file
    # names may contain characters such as "&" or "<".
    chapter_title = os.path.splitext(md_file)[0]
    body_parts.append(
        f'<h1 class="chapter-title">{html.escape(chapter_title)}</h1>\n'
    )
    body_parts.append(html_content + '\n')

# Append all chapters and close the document.
combined_html += ''.join(body_parts) + '</body></html>'

# Render the assembled HTML document to a PDF stored next to this script,
# then report where it was written.
output_pdf = os.path.join(current_dir, '小说合集.pdf')
document = HTML(string=combined_html)
document.write_pdf(output_pdf)

print(f"\nPDF已生成: {output_pdf}")