92 lines
3.4 KiB
Python
92 lines
3.4 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
import re
|
||
|
|
import sys
|
||
|
|
|
||
|
|
def _count_chars(text):
    """Return how many characters of *text* are CJK ideographs or alphanumeric."""
    return sum(1 for c in text if '\u4e00' <= c <= '\u9fff' or c.isalnum())


def _percent(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when *whole* is zero."""
    return part / whole * 100 if whole else 0.0


def _split_paragraphs(content):
    """Split *content* into paragraph strings, with '' marking each blank line.

    Consecutive non-blank lines are concatenated (each followed by a single
    space) into one paragraph string. Every blank or whitespace-only line
    yields an '' entry so that blank lines survive the round trip.
    """
    paragraphs = []
    current = []
    for line in content.split('\n'):
        if line.strip():
            current.append(line + ' ')
            continue
        # A blank line closes the paragraph collected so far.
        if current:
            paragraphs.append(''.join(current))
            current = []
        paragraphs.append('')  # keep the blank line itself
    if current:
        paragraphs.append(''.join(current))
    return paragraphs


def _merge_short_paragraphs(paragraphs, min_chars):
    """Merge each short paragraph with the directly following non-empty ones.

    A paragraph is "short" while it has fewer than *min_chars* counted
    characters; it keeps absorbing the next entry until it is long enough,
    an '' entry is reached, or the list ends.
    """
    merged = []
    total = len(paragraphs)
    i = 0
    while i < total:
        para = paragraphs[i]
        i += 1
        if para == '':
            merged.append('')
            continue
        while (_count_chars(para) < min_chars
               and i < total
               and paragraphs[i] != ''):
            para = para.rstrip() + ' ' + paragraphs[i].lstrip()
            i += 1
        merged.append(para)
    return merged


def fix_paragraphs(input_file, output_file, min_chars=35):
    """Re-flow the paragraphs of a UTF-8 text file and print short-paragraph stats.

    The input is split into paragraphs on blank lines, the lines of each
    paragraph are joined with single spaces, runs of whitespace are collapsed,
    and the result is written to *output_file*. Statistics about paragraphs
    with fewer than *min_chars* counted characters are printed to stdout.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to write.
        min_chars: Paragraphs with fewer counted characters (CJK ideographs
            or alphanumerics) than this are considered short. Defaults to 35.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    paragraphs = _split_paragraphs(content)

    # NOTE(review): _split_paragraphs always emits an '' entry after each
    # paragraph closed by a blank line, so two non-empty entries are never
    # adjacent and this merge pass currently changes nothing. Confirm whether
    # short paragraphs were meant to be merged across blank lines.
    fixed_paragraphs = _merge_short_paragraphs(paragraphs, min_chars)

    # Rebuild the text, collapsing redundant whitespace inside each paragraph.
    result = [
        re.sub(r'\s+', ' ', para).strip() if para != '' else ''
        for para in fixed_paragraphs
    ]
    final_text = '\n'.join(result)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(final_text)

    # Report the effect of the pass.
    original_paras = [p for p in paragraphs if p != '']
    fixed_paras = [p for p in fixed_paragraphs if p != '']

    original_short = sum(1 for p in original_paras if _count_chars(p) < min_chars)
    fixed_short = sum(1 for p in fixed_paras if _count_chars(p) < min_chars)
    reduced = original_short - fixed_short

    # _percent guards the empty-file and "no short paragraphs" cases, which
    # would otherwise divide by zero.
    print(f'原始段落数: {len(original_paras)}')
    print(f'原始短段落数: {original_short} ({_percent(original_short, len(original_paras)):.1f}%)')
    print(f'修复后段落数: {len(fixed_paras)}')
    print(f'修复后短段落数: {fixed_short} ({_percent(fixed_short, len(fixed_paras)):.1f}%)')
    print(f'短段落减少: {reduced} ({_percent(reduced, original_short):.1f}%)')
|
||
|
|
|
||
|
|
if __name__ == '__main__':
    # The script takes exactly two positional arguments: the input path and
    # the output path. Anything else prints the usage line and exits with 1.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print('用法: python fix_paragraphs.py 输入文件 输出文件')
        sys.exit(1)

    input_file, output_file = cli_args
    fix_paragraphs(input_file, output_file)
|