#!/usr/bin/env python3
"""Rejoin wrapped lines into paragraphs and report short-paragraph statistics.

Usage: python fix_paragraphs.py INPUT_FILE OUTPUT_FILE
"""
import re
import sys

# Paragraphs with fewer counted characters than this are considered "short".
SHORT_PARAGRAPH_CHARS = 35


def _count_chars(text):
    """Return how many characters of *text* are CJK ideographs or alphanumeric.

    Whitespace and punctuation are not counted.
    """
    return sum(1 for c in text if '\u4e00' <= c <= '\u9fff' or c.isalnum())


def _split_paragraphs(content):
    """Split *content* into paragraph strings, keeping blank lines as ''.

    Consecutive non-blank lines are concatenated (each followed by one space)
    into a single paragraph; every blank line yields its own '' entry so the
    original vertical spacing can be reproduced.
    """
    paragraphs = []
    pending = []
    for line in content.split('\n'):
        if line.strip():
            pending.append(line + ' ')
            continue
        # A blank line terminates the paragraph being collected (if any).
        if pending:
            paragraphs.append(''.join(pending))
            pending = []
        paragraphs.append('')
    if pending:
        paragraphs.append(''.join(pending))
    return paragraphs


def _merge_short_paragraphs(paragraphs, min_chars):
    """Merge each short paragraph with the non-empty entries directly after it.

    A paragraph keeps absorbing the next entry while it has fewer than
    *min_chars* counted characters and the next entry is not a blank ('')
    separator. Blank separators are copied through unchanged.
    """
    merged = []
    total = len(paragraphs)
    i = 0
    while i < total:
        para = paragraphs[i]
        i += 1
        if para == '':
            merged.append('')
            continue
        while (_count_chars(para) < min_chars
               and i < total and paragraphs[i] != ''):
            para = para.rstrip() + ' ' + paragraphs[i].lstrip()
            i += 1
        merged.append(para)
    return merged


def _percentage(part, whole):
    """Return part/whole as a percentage, or 0.0 when *whole* is zero."""
    return part / whole * 100 if whole else 0.0


def _count_short(paragraphs, min_chars):
    """Return how many of *paragraphs* have fewer than *min_chars* counted characters."""
    return sum(1 for p in paragraphs if _count_chars(p) < min_chars)


def fix_paragraphs(input_file, output_file, min_chars=SHORT_PARAGRAPH_CHARS):
    """Rewrite *input_file* into *output_file* with lines joined into paragraphs.

    The input is read as UTF-8, consecutive non-blank lines are joined into
    one paragraph, runs of whitespace inside each paragraph are collapsed to
    a single space, and the result is written as UTF-8 with blank lines
    preserved. Statistics about short paragraphs are printed to stdout.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the text file to write.
        min_chars: A paragraph with fewer counted characters (CJK ideographs
            or alphanumerics) than this is treated as short. Defaults to 35.

    Returns:
        None.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    paragraphs = _split_paragraphs(content)

    # NOTE(review): _split_paragraphs always emits a '' entry right after each
    # blank-line-terminated paragraph, so two non-empty entries are never
    # adjacent and this pass currently leaves the list unchanged. Confirm
    # whether short paragraphs were meant to be merged across blank lines.
    fixed_paragraphs = _merge_short_paragraphs(paragraphs, min_chars)

    # Collapse redundant whitespace inside every non-blank paragraph.
    result = [
        re.sub(r'\s+', ' ', para).strip() if para != '' else ''
        for para in fixed_paragraphs
    ]

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(result))

    # Report the effect of the rewrite.
    original_paras = [p for p in paragraphs if p != '']
    fixed_paras = [p for p in fixed_paragraphs if p != '']
    original_short = _count_short(original_paras, min_chars)
    fixed_short = _count_short(fixed_paras, min_chars)
    reduced = original_short - fixed_short

    # Percentages go through _percentage so that an empty input, or an input
    # without any short paragraph, prints 0.0% instead of raising
    # ZeroDivisionError.
    print(f'原始段落数: {len(original_paras)}')
    print(f'原始短段落数: {original_short} ({_percentage(original_short, len(original_paras)):.1f}%)')
    print(f'修复后段落数: {len(fixed_paras)}')
    print(f'修复后短段落数: {fixed_short} ({_percentage(fixed_short, len(fixed_paras)):.1f}%)')
    print(f'短段落减少: {reduced} ({_percentage(reduced, original_short):.1f}%)')


def main():
    """Command-line entry point: expects exactly an input and an output path."""
    if len(sys.argv) != 3:
        print('用法: python fix_paragraphs.py 输入文件 输出文件')
        sys.exit(1)

    fix_paragraphs(sys.argv[1], sys.argv[2])


if __name__ == '__main__':
    main()