novel-doomsday-resurgence/tools/clean_duplicate_sections.py
唐天洛 2003fa15ef 章节标题质量改进系统完成
 修复关键标题问题:
1. 筹码_手动修复 → 致命筹码
2. 修复 → 心灵修复
3. 对峙(2) → 生死对峙

 创建完整质量检查与修复工具集:
1. chapter_title_qc.py - 标题质量分析系统
2. apply_title_fixes.py - 自动修复工具
3. clean_ai_markers.py - AI标记清理工具
4. final_format_fix.py - 最终格式修复工具
5. improve_all_titles.py - 全面标题改进工具

 所有29个章节标题质量均已优化,评分A级以上
 移除爽点分析内容,确保正文纯净
 提升标题吸引力和阅读体验
2026-03-30 14:53:52 +08:00

146 lines
4.6 KiB
Python

#!/usr/bin/env python3
"""
Clean up duplicated 爽点 ("highlight") paragraphs and fix chapter heading formatting.
"""
import os
import re
# Directory that main() scans for ``.md`` chapter files, which it rewrites in place.
CHAPTERS_DIR = "/root/.openclaw/workspace/projects/末日重生_囤货/chapters"
def _chapter_heading(line):
    """Return ``# 第N章 <title>`` if *line* is a chapter heading, else ``None``.

    A heading is a line that *starts* (after optional whitespace and ``#``
    characters) with ``第<digits>章`` followed by a non-empty title.  Lines that
    merely mention a chapter somewhere in the middle are not headings.
    """
    found = re.match(r'\s*#*\s*第(\d+)章\s*(.+)', line)
    if not found:
        return None
    title = found.group(2).strip()
    if not title:
        return None
    return f"# 第{found.group(1)}章 {title}"


def clean_duplicate_sections(content: str) -> str:
    """Remove duplicated 【爽点…】 entries and normalise chapter formatting.

    Processing steps, in order:

    1. A line that starts with ``第N章 <title>`` and has no leading ``#`` is
       turned into the Markdown heading ``# 第N章 <title>``.
    2. A 【爽点 line that exactly repeats the previously kept line is dropped.
    3. Runs of blank lines (and leading blank lines) are collapsed.
    4. Paragraphs containing a 【爽点 marker are moved after the main text;
       markers whose key (the text between ``【爽点`` and ``】``) was already
       seen are dropped.
    5. The first chapter heading that begins within the first 200 characters
       is rewritten in the standard ``# 第N章 <title>`` form.
    6. Three or more consecutive newlines are reduced to two.

    Args:
        content: Full text of one chapter file.

    Returns:
        The cleaned text, stripped of surrounding whitespace and terminated
        by exactly one newline.
    """
    marker = '【爽点'

    kept = []
    for line in content.split('\n'):
        # 1. Promote bare chapter lines to Markdown headings.  Only lines that
        #    begin with the chapter label qualify, so body sentences that
        #    mention another chapter are left untouched.
        if not line.startswith('#'):
            heading = _chapter_heading(line)
            if heading is not None:
                line = heading

        # 2. Drop a marker line only when it truly repeats the previous line;
        #    distinct adjacent entries (【爽点1】, 【爽点2】) must both survive.
        if marker in line and kept and kept[-1].strip() == line.strip():
            continue

        # 3. Skip leading blank lines and collapse consecutive blank lines.
        if line.strip() == '' and (not kept or kept[-1].strip() == ''):
            continue

        kept.append(line)

    result = '\n'.join(kept)

    # 4. Move marker paragraphs to the end and de-duplicate them by key.
    if marker in result:
        body_parts = []
        marker_parts = []
        for part in result.split('\n\n'):
            (marker_parts if marker in part else body_parts).append(part)

        seen_keys = set()
        unique_parts = []
        for part in marker_parts:
            part_lines = []
            for text in part.split('\n'):
                if marker in text:
                    found = re.search(r'【爽点([^】]+)】', text)
                    # A marker without a parsable key is de-duplicated on its
                    # full text rather than being silently discarded.
                    key = found.group(1) if found else text.strip()
                    if key in seen_keys:
                        continue
                    seen_keys.add(key)
                    part_lines.append(text)
                elif text.strip():
                    part_lines.append(text)
            if part_lines:
                unique_parts.append('\n'.join(part_lines))

        result = '\n\n'.join(body_parts)
        if unique_parts:
            result += '\n\n' + '\n\n'.join(unique_parts)

    # 5. Standardise the opening chapter heading.  Only the first heading line
    #    that begins within the first 200 characters is rewritten; every other
    #    line is left as is, and the full line is used so a long title is
    #    never truncated at the 200-character boundary.
    out_lines = result.split('\n')
    offset = 0
    for index, text in enumerate(out_lines):
        if offset >= 200:
            break
        heading = _chapter_heading(text)
        if heading is not None:
            out_lines[index] = heading
            break
        offset += len(text) + 1  # +1 for the newline removed by split()
    result = '\n'.join(out_lines)

    # 6. Collapse any remaining runs of blank lines.
    result = re.sub(r'\n{3,}', '\n\n', result)
    return result.strip() + '\n'
def main():
    """Clean every ``.md`` chapter file under ``CHAPTERS_DIR`` in place.

    Each file is read as UTF-8 and passed through
    ``clean_duplicate_sections``.  When the result differs from the original
    text, the original is saved next to the file as ``<name>.clean.bak`` and
    the file is overwritten with the cleaned text.  Progress is printed to
    standard output.

    Returns:
        None.  If ``CHAPTERS_DIR`` is not a directory, a message is printed
        and nothing is processed.
    """
    print("清理重复的爽点段落和修复格式...")

    # The directory is a hard-coded absolute path; report its absence instead
    # of failing with a traceback from os.listdir().
    if not os.path.isdir(CHAPTERS_DIR):
        print(f"目录不存在: {CHAPTERS_DIR}")
        return

    # Only regular files are processed; a sub-directory whose name happens to
    # end in ".md" cannot be opened as a chapter.
    chapter_files = sorted(
        name
        for name in os.listdir(CHAPTERS_DIR)
        if name.endswith('.md')
        and os.path.isfile(os.path.join(CHAPTERS_DIR, name))
    )

    for filename in chapter_files:
        filepath = os.path.join(CHAPTERS_DIR, filename)
        print(f"处理: {filename}")

        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()

        cleaned_content = clean_duplicate_sections(content)
        if content == cleaned_content:
            print(" ✓ 无需清理")
            continue

        # Keep the first backup: on a re-run the file already holds processed
        # text, and overwriting the backup would lose the pristine original.
        backup_path = filepath + '.clean.bak'
        if not os.path.exists(backup_path):
            with open(backup_path, 'w', encoding='utf-8') as f:
                f.write(content)

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(cleaned_content)
        print(" ✓ 已清理重复内容")

    print("\n清理完成!")
# Run the cleanup only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()