# novel-doomsday-resurgence/tools/clean_duplicate_sections.py

#!/usr/bin/env python3
"""
Clean up duplicated "爽点" (highlight) paragraphs and fix chapter heading format.
"""

import os
import re

# Directory that main() scans for chapter files (*.md). Hard-coded absolute path.
CHAPTERS_DIR = "/root/.openclaw/workspace/projects/末日重生_囤货/chapters"

# Literal prefix that identifies a "爽点" (highlight) marker line.
_HIGHLIGHT_MARKER = '【爽点'
# Captures the text between "【爽点" and "】"; used as the de-duplication key.
_HIGHLIGHT_KEY_RE = re.compile(r'【爽点([^】]+)】')
# A chapter heading line: optional indent, optional '#'s, then "第N章" and an
# optional title.  Anchored at line start so that body sentences which merely
# mention a chapter are never treated as headings.  ``[^\S\n]`` is "whitespace
# except newline", which keeps a match from spanning several lines.
_CHAPTER_HEADING_RE = re.compile(
    r'^[^\S\n]*#*[^\S\n]*第(\d+)章[^\S\n]*(.*)$', re.MULTILINE
)
# The opening title is only looked for if it starts within this many characters.
_OPENING_WINDOW = 200


def _format_heading(match):
    """Render a chapter-heading match as ``# 第N章 <title>`` (title may be empty)."""
    number = match.group(1)
    title = match.group(2).strip()
    return f"# 第{number}章 {title}".rstrip()


def _normalize_lines(content):
    """Fix hash-less chapter headings and collapse runs of blank lines."""
    kept = []
    for line in content.split('\n'):
        if not line.strip():
            # Store blank lines as '' so paragraph splitting on '\n\n' sees
            # them; drop leading blanks and repeated blanks.
            if kept and kept[-1] != '':
                kept.append('')
            continue
        # Lines already starting with '#' are left alone here; only the
        # opening title is normalized later.
        if not line.startswith('#'):
            heading = _CHAPTER_HEADING_RE.match(line)
            if heading:
                line = _format_heading(heading)
        kept.append(line)
    return '\n'.join(kept)


def _move_highlights_to_end(text):
    """Move paragraphs containing highlight markers to the end, de-duplicated."""
    paragraphs = text.split('\n\n')
    body = [p for p in paragraphs if _HIGHLIGHT_MARKER not in p]

    seen = set()
    highlights = []
    for paragraph in paragraphs:
        if _HIGHLIGHT_MARKER not in paragraph:
            continue
        kept = []
        for line in paragraph.split('\n'):
            if _HIGHLIGHT_MARKER in line:
                found = _HIGHLIGHT_KEY_RE.search(line)
                # Markers without a parseable key (e.g. "【爽点】...") are
                # de-duplicated on the whole line instead of being discarded.
                key = found.group(1) if found else line.strip()
                if key in seen:
                    continue
                seen.add(key)
                kept.append(line)
            elif line.strip():
                # Non-marker lines of the paragraph are always kept, even
                # when the marker line above them was a duplicate.
                kept.append(line)
        if kept:
            highlights.append('\n'.join(kept))

    return '\n\n'.join(body + highlights)


def clean_duplicate_sections(content):
    """
    Remove duplicated highlight ("爽点") entries and normalize chapter headings.

    Steps, in order:
      1. Lines that start with "第N章" (optionally indented) but lack a
         leading '#' become ``# 第N章 <title>``.  Lines that only mention a
         chapter somewhere in the middle are left untouched.
      2. Whitespace-only lines are treated as blank and runs of blank lines
         are collapsed to one.
      3. Paragraphs containing "【爽点" are moved to the end of the text.
         Marker lines are de-duplicated by the text between "【爽点" and "】"
         (first occurrence wins); distinct markers are all kept.
      4. The first chapter heading, if it starts within the first 200
         characters, is rewritten to the standard ``# 第N章 <title>`` form.
         Only that one line is replaced.
      5. Three or more consecutive newlines are reduced to two.

    Args:
        content: Full text of one chapter file.

    Returns:
        The cleaned text, stripped of surrounding whitespace and terminated
        by exactly one newline.
    """
    result = _normalize_lines(content)

    if _HIGHLIGHT_MARKER in result:
        result = _move_highlights_to_end(result)

    opening = _CHAPTER_HEADING_RE.search(result)
    if opening and opening.start() < _OPENING_WINDOW:
        # Splice by position instead of re.sub: this touches exactly one line
        # and avoids interpreting backslashes in the title as regex escapes.
        result = (
            result[:opening.start()]
            + _format_heading(opening)
            + result[opening.end():]
        )

    result = re.sub(r'\n{3,}', '\n\n', result)

    return result.strip() + '\n'

def main():
    """
    Clean every ``.md`` file found directly inside CHAPTERS_DIR.

    Files are processed in sorted name order. Each file is read as UTF-8 and
    passed through clean_duplicate_sections(). When the result differs from
    the original text, the original is first saved next to the file with a
    ``.clean.bak`` suffix and the file is then overwritten with the cleaned
    text. Progress is reported on stdout.
    """
    print("清理重复的爽点段落和修复格式...")

    markdown_names = sorted(
        name for name in os.listdir(CHAPTERS_DIR) if name.endswith('.md')
    )

    for name in markdown_names:
        target = os.path.join(CHAPTERS_DIR, name)

        print(f"处理: {name}")

        with open(target, 'r', encoding='utf-8') as source:
            original = source.read()

        cleaned = clean_duplicate_sections(original)

        if original == cleaned:
            print(f"  ✓ 无需清理")
            continue

        # Save the untouched text before overwriting the chapter file.
        with open(target + '.clean.bak', 'w', encoding='utf-8') as backup:
            backup.write(original)

        with open(target, 'w', encoding='utf-8') as destination:
            destination.write(cleaned)

        print(f"  ✓ 已清理重复内容")

    print("\n清理完成！")

# Run the batch clean-up only when this file is executed as a script.
if __name__ == '__main__':
    main()