novel-doomsday-resurgence/fix_paragraphs.py
唐天洛 5dc8c00de0 feat(sync): 固化小说内容到Git仓库
📚 小说内容:
- 《末日重生-开局囤货十亿物资》33章
- 完整的状态文件、记忆索引、钩子系统

🛠️ 系统配置:
- 版本控制管理系统
- 自动化脚本系统
- 质量监控系统

🧠 固化记忆:
- 长期记忆文件
- 系统配置文档
- 恢复流程指南

💾 数据安全:
- 本地备份系统
- Git版本控制
- 远程同步机制

同步时间: 2026-03-30 16:25:35
系统状态: inkos正常运行中 (PID: 1433309)
创作进度: 第33章《油粮》创作中
2026-03-30 16:25:35 +08:00

92 lines
3.4 KiB
Python

#!/usr/bin/env python3
import re
import sys
def _count_chars(text):
    """Return the number of CJK ideographs and alphanumeric characters in *text*.

    Punctuation and whitespace are not counted.
    """
    return sum(1 for c in text if '\u4e00' <= c <= '\u9fff' or c.isalnum())


def _split_paragraphs(content):
    """Split *content* into paragraph strings and ``''`` blank-line markers.

    Consecutive non-blank lines are joined (each followed by a space) into one
    paragraph; every blank line is kept as its own ``''`` entry so the
    original vertical spacing can be reproduced.
    """
    paragraphs = []
    current = []
    for line in content.split('\n'):
        if line.strip():
            current.append(line + ' ')
            continue
        # A blank line closes the paragraph being collected, if any.
        if current:
            paragraphs.append(''.join(current))
            current = []
        paragraphs.append('')  # keep the blank line itself
    if current:
        paragraphs.append(''.join(current))
    return paragraphs


def _merge_short_paragraphs(paragraphs, min_chars):
    """Merge each paragraph shorter than *min_chars* into the following one.

    Every paragraph produced by ``_split_paragraphs`` is followed by at least
    one ``''`` marker, so the next merge candidate has to be searched for
    *past* those markers; the markers between two merged paragraphs are
    dropped. A short paragraph with nothing after it is left unchanged.
    """
    total = len(paragraphs)
    merged = []
    i = 0
    while i < total:
        para = paragraphs[i]
        i += 1
        if para == '':
            merged.append('')
            continue
        # Keep absorbing following paragraphs until this one is long enough.
        while _count_chars(para) < min_chars:
            nxt = i
            while nxt < total and paragraphs[nxt] == '':
                nxt += 1
            if nxt >= total:
                break  # no later paragraph; trailing blank lines stay as-is
            para = para.rstrip() + ' ' + paragraphs[nxt].lstrip()
            i = nxt + 1
        merged.append(para)
    return merged


def _percent(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when *whole* is zero."""
    return part / whole * 100 if whole else 0.0


def fix_paragraphs(input_file, output_file, min_chars=35):
    """Merge short paragraphs of a text file and write the result.

    The input is read as UTF-8 and split into paragraphs at blank lines.
    Any paragraph with fewer than *min_chars* counted characters (CJK
    ideographs and alphanumerics) is merged with the paragraphs that follow
    it until the threshold is reached. Whitespace runs inside each paragraph
    are collapsed to a single space, the text is written to *output_file*
    as UTF-8, and before/after statistics are printed to stdout.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the file to write; may equal *input_file*,
            because the input is fully read before anything is written.
        min_chars: Paragraphs with fewer counted characters than this are
            treated as short. Defaults to 35.

    Returns:
        None.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    paragraphs = _split_paragraphs(content)
    fixed_paragraphs = _merge_short_paragraphs(paragraphs, min_chars)

    # Rebuild the text, collapsing redundant whitespace inside paragraphs.
    result = [
        '' if para == '' else re.sub(r'\s+', ' ', para).strip()
        for para in fixed_paragraphs
    ]
    final_text = '\n'.join(result)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(final_text)

    # Report the effect. Percentages use _percent so that an empty input or
    # an input without short paragraphs cannot raise ZeroDivisionError.
    original_paras = [p for p in paragraphs if p != '']
    fixed_paras = [p for p in fixed_paragraphs if p != '']
    original_short = sum(1 for p in original_paras if _count_chars(p) < min_chars)
    fixed_short = sum(1 for p in fixed_paras if _count_chars(p) < min_chars)
    reduced = original_short - fixed_short
    print(f'原始段落数: {len(original_paras)}')
    print(f'原始短段落数: {original_short} ({_percent(original_short, len(original_paras)):.1f}%)')
    print(f'修复后段落数: {len(fixed_paras)}')
    print(f'修复后短段落数: {fixed_short} ({_percent(fixed_short, len(fixed_paras)):.1f}%)')
    print(f'短段落减少: {reduced} ({_percent(reduced, original_short):.1f}%)')
if __name__ == '__main__':
    # Script entry point: exactly two positional arguments are required,
    # the input path followed by the output path.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print('用法: python fix_paragraphs.py 输入文件 输出文件')
        sys.exit(1)
    input_file, output_file = cli_args
    fix_paragraphs(input_file, output_file)