📚 小说内容: - 《末日重生-开局囤货十亿物资》33章 - 完整的状态文件、记忆索引、钩子系统 🛠️ 系统配置: - 版本控制管理系统 - 自动化脚本系统 - 质量监控系统 🧠 固化记忆: - 长期记忆文件 - 系统配置文档 - 恢复流程指南 💾 数据安全: - 本地备份系统 - Git版本控制 - 远程同步机制 同步时间: 2026-03-30 16:25:35 系统状态: inkos正常运行中 (PID: 1433309) 创作进度: 第33章《油粮》创作中
92 lines
3.4 KiB
Python
92 lines
3.4 KiB
Python
#!/usr/bin/env python3
|
|
import re
|
|
import sys
|
|
|
|
def _count_chars(text):
    """Return how many characters of *text* are CJK ideographs or alphanumeric."""
    return sum(1 for c in text if '\u4e00' <= c <= '\u9fff' or c.isalnum())


def _percent(part, whole):
    """Return *part* / *whole* as a percentage, or 0.0 when *whole* is zero."""
    return part / whole * 100 if whole else 0.0


def _split_paragraphs(content):
    """Split *content* into paragraph strings, with '' marking each blank line."""
    paragraphs = []
    current_para = []
    for line in content.split('\n'):
        if line.strip():
            # Lines of one paragraph are joined with a trailing space each.
            current_para.append(line + ' ')
            continue
        # A blank line closes the paragraph being collected ...
        if current_para:
            paragraphs.append(''.join(current_para))
            current_para = []
        # ... and is itself kept as an empty entry.
        paragraphs.append('')
    if current_para:
        paragraphs.append(''.join(current_para))
    return paragraphs


def _merge_short_paragraphs(paragraphs, min_chars):
    """Fold each short paragraph into the directly following non-blank entries."""
    merged_paragraphs = []
    i = 0
    while i < len(paragraphs):
        current = paragraphs[i]
        if current == '':
            merged_paragraphs.append('')
            i += 1
            continue
        j = i + 1
        # Keep absorbing the next entry while the text is still short; an
        # empty entry (blank line) always stops the merge.
        while (_count_chars(current) < min_chars
               and j < len(paragraphs)
               and paragraphs[j] != ''):
            current = current.rstrip() + ' ' + paragraphs[j].lstrip()
            j += 1
        merged_paragraphs.append(current)
        i = j
    return merged_paragraphs


def fix_paragraphs(input_file, output_file, min_chars=35):
    """Rejoin the lines of each paragraph in a text file and print statistics.

    The input is read as UTF-8 and split into paragraphs on blank lines; the
    lines of one paragraph are joined with single spaces and every run of
    whitespace is collapsed. Blank lines are written back as empty lines.
    The result is written to *output_file* as UTF-8, then paragraph counts
    are printed to stdout.

    NOTE(review): the short-paragraph merge only joins *adjacent* non-blank
    entries, but the splitter always places an empty entry after a paragraph,
    so with the current splitting the merge never actually fires. The logic is
    kept unchanged; confirm whether merging across blank lines was intended.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the text file to write (overwritten).
        min_chars: A paragraph with fewer counted characters (CJK ideographs
            or alphanumerics) than this is considered short.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Split into paragraphs, then try to merge the short ones.
    paragraphs = _split_paragraphs(content)
    fixed_paragraphs = _merge_short_paragraphs(paragraphs, min_chars)

    # Rebuild the text: collapse redundant whitespace, keep blank entries.
    result = [
        re.sub(r'\s+', ' ', para).strip() if para else ''
        for para in fixed_paragraphs
    ]
    final_text = '\n'.join(result)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(final_text)

    # Report the effect of the run.
    original_paras = [p for p in paragraphs if p != '']
    fixed_paras = [p for p in fixed_paragraphs if p != '']

    original_short = sum(1 for p in original_paras if _count_chars(p) < min_chars)
    fixed_short = sum(1 for p in fixed_paras if _count_chars(p) < min_chars)
    reduced = original_short - fixed_short

    # Percentages go through _percent so that an empty file or a file without
    # any short paragraph prints 0.0% instead of raising ZeroDivisionError.
    print(f'原始段落数: {len(original_paras)}')
    print(f'原始短段落数: {original_short} ({_percent(original_short, len(original_paras)):.1f}%)')
    print(f'修复后段落数: {len(fixed_paras)}')
    print(f'修复后短段落数: {fixed_short} ({_percent(fixed_short, len(fixed_paras)):.1f}%)')
    print(f'短段落减少: {reduced} ({_percent(reduced, original_short):.1f}%)')
|
|
|
|
if __name__ == '__main__':
    # The script takes exactly two arguments: the input path and the output path.
    if len(sys.argv) == 3:
        input_file, output_file = sys.argv[1:]
        fix_paragraphs(input_file, output_file)
    else:
        # Wrong argument count: show the usage line and exit with status 1.
        print('用法: python fix_paragraphs.py 输入文件 输出文件')
        sys.exit(1)