# novel-doomsday-resurgence/force_merge_paragraphs.py

#!/usr/bin/env python3
"""
强力合并段落脚本
彻底解决短段落问题
"""

import os
import re
import shutil

def force_merge_paragraphs(content, min_chars=35, joiner=' '):
    """Merge runs of short lines inside each blank-line-separated section.

    The text is split into sections on blank lines. Inside every section,
    consecutive lines holding fewer than ``min_chars`` CJK ideographs
    (U+4E00..U+9FFF) are joined with ``joiner`` into a single line. A line
    at or above the threshold stays on its own line and closes the current
    run of short lines. Whitespace-only lines inside a section are dropped
    and every kept line is stripped of surrounding whitespace.

    Lines that start with ``#`` (Markdown headings of any level, wherever
    they appear in a section) are never merged into body text; they are
    emitted on their own line and also close the current run.

    Args:
        content: Full chapter text.
        min_chars: Minimum number of CJK ideographs for a line to count as
            a "long" paragraph. Defaults to 35.
        joiner: String placed between merged short lines. Defaults to a
            single space; pass ``''`` to join without any separator.

    Returns:
        The merged text. Sections stay separated by one blank line, and a
        trailing newline present in ``content`` is kept.
    """
    merged_sections = []

    for section in content.split('\n\n'):
        merged_lines = []
        pending = []  # consecutive short lines waiting to be joined

        for line in section.split('\n'):
            stripped = line.strip()
            if not stripped:
                continue

            is_heading = stripped.startswith('#')
            cjk_count = sum('\u4e00' <= ch <= '\u9fff' for ch in stripped)

            if not is_heading and cjk_count < min_chars:
                pending.append(stripped)
                continue

            # A heading or a long line closes the current run of short lines.
            if pending:
                merged_lines.append(joiner.join(pending).strip())
                pending = []
            merged_lines.append(stripped)

        # Flush short lines left over at the end of the section.
        if pending:
            merged_lines.append(joiner.join(pending).strip())

        # An all-blank section yields '' so the blank-line layout survives.
        merged_sections.append('\n'.join(merged_lines))

    result = '\n\n'.join(merged_sections)

    # Splitting and stripping loses a single trailing newline; restore it so
    # the rewritten file still ends with a newline when the input did.
    if content.endswith('\n') and not result.endswith('\n'):
        result += '\n'
    return result

def fix_chapter(filepath):
    """Back up a chapter file, merge its short paragraphs in place, and report.

    Steps, in order:
      1. Copy ``filepath`` to a sibling backup whose name is the original
         stem plus a fixed suffix, keeping the original extension.
      2. Read the file as UTF-8 and run ``force_merge_paragraphs`` on it.
      3. Normalise punctuation: a doubled em dash becomes a single one, and
         each pair of ASCII double quotes is replaced by corner brackets.
      4. Overwrite ``filepath`` with the result.
      5. Print length and short-paragraph statistics.

    Args:
        filepath: Path of the chapter file to rewrite.

    Returns:
        The fraction of paragraphs (non-empty output lines that do not start
        with ``#``) containing fewer than 35 CJK ideographs, or 0 when the
        output has no paragraphs.

    Raises:
        OSError: Propagated from the copy, read or write if any of them fails.
    """
    print(f"修复: {os.path.basename(filepath)}")

    # Derive the backup name from the real extension only. A plain
    # str.replace('.md', ...) would also rewrite '.md' inside directory
    # names, and for a path without '.md' it would return the source path
    # itself, making the copy fail.
    root, ext = os.path.splitext(filepath)
    backup_path = f"{root}_强力合并前备份{ext}"
    shutil.copy2(filepath, backup_path)

    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
    original_length = len(content)

    fixed_content = force_merge_paragraphs(content)

    # Collapse the doubled em dash into a single one.
    fixed_content = fixed_content.replace('——', '—')
    # Wrap every "..." pair in corner brackets. The inner class excludes only
    # the quote character, so a pair may span more than one line.
    fixed_content = re.sub(r'["]([^"]+)["]', r'「\1」', fixed_content)

    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(fixed_content)
    new_length = len(fixed_content)

    # Statistics: every non-empty, non-heading output line is one paragraph.
    paragraphs = [
        p for p in fixed_content.split('\n')
        if p.strip() and not p.startswith('#')
    ]
    total_paragraphs = len(paragraphs)
    short_paragraphs = sum(
        1
        for para in paragraphs
        if sum('\u4e00' <= c <= '\u9fff' for c in para) < 35
    )
    short_ratio = short_paragraphs / total_paragraphs if total_paragraphs > 0 else 0

    print("✅ 修复完成")
    print(f"   - 原始长度: {original_length} 字符")
    print(f"   - 修复后长度: {new_length} 字符")
    print(f"   - 段落总数: {total_paragraphs}")
    print(f"   - 短段落数: {short_paragraphs}")
    print(f"   - 短段比例: {short_ratio:.1%}")

    return short_ratio

def main(chapter_file=None):
    """Run the paragraph merge on one chapter file and print a summary.

    Args:
        chapter_file: Path of the chapter file to fix. When omitted, the
            first command-line argument is used if one was given; otherwise
            the original hard-coded chapter path is used.

    Returns:
        None. If the target is not an existing regular file, an error line
        is printed and nothing is modified.
    """
    import sys  # local import: only this entry point needs argv

    if chapter_file is None:
        if len(sys.argv) > 1:
            chapter_file = sys.argv[1]
        else:
            chapter_file = "/root/.openclaw/workspace/tomato-novel/books/末日重生-开局囤货十亿物资/chapters/0017_煎熬.md"

    # isfile rather than exists: a directory at this path cannot be processed
    # and should be reported the same way as a missing file.
    if not os.path.isfile(chapter_file):
        print(f"❌ 文件不存在: {chapter_file}")
        return

    print("=== 强力合并段落 ===")
    print(f"目标文件: {chapter_file}")
    print("")

    short_ratio = fix_chapter(chapter_file)

    print("")
    # Target: fewer than 30% of paragraphs may be short.
    if short_ratio < 0.3:
        print("✅ 段落合并成功！短段比例 < 30%")
    else:
        print(f"⚠️  仍需优化: 短段比例 {short_ratio:.1%} (目标 < 30%)")

    print("")
    print("🎯 建议:")
    print("1. 运行质量检查验证效果")
    print("2. 检查其他章节的段落结构")
    print("3. 调整 inkos 写作参数")
    print("4. 建立自动合并机制")

# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()