#!/usr/bin/env python3
"""Merge overly short paragraphs of a UTF-8 text file and print statistics."""
import re
import sys

# A merged paragraph stops absorbing neighbours once it contains at least
# this many Chinese characters.
_MERGE_TARGET_CHARS = 50


def count_chinese_chars(text):
    """Return how many characters of *text* lie in U+4E00..U+9FFF."""
    return sum(1 for c in text if '\u4e00' <= c <= '\u9fff')


def is_short_paragraph(text, threshold=35):
    """Return True when *text* has fewer than *threshold* counted characters.

    Counted characters are Chinese characters (see ``count_chinese_chars``)
    plus ASCII letters and digits. Text starting with a level 1-3 Markdown
    heading marker ('# ', '## ', '### ') is never considered short.
    """
    # Heading lines are excluded so they are never merged into body text.
    if text.startswith(('# ', '## ', '### ')):
        return False

    alnum_chars = len(re.findall(r'[a-zA-Z0-9]', text))
    return count_chinese_chars(text) + alnum_chars < threshold


def _split_paragraphs(lines):
    """Group *lines* into paragraphs; every blank line yields a '' marker."""
    paragraphs = []
    current = []
    for line in lines:
        if line.strip():
            current.append(line)
            continue
        if current:
            paragraphs.append(''.join(current))
            current = []
        paragraphs.append('')  # blank-line marker
    if current:
        paragraphs.append(''.join(current))
    return paragraphs


def _merge_short_paragraphs(paragraphs):
    """Join each short paragraph with the short paragraphs that follow it."""
    merged = []
    total = len(paragraphs)
    i = 0
    while i < total:
        if paragraphs[i] == '':
            merged.append('')
            i += 1
            continue

        text = paragraphs[i].strip()
        i += 1
        if is_short_paragraph(text):
            while count_chinese_chars(text) < _MERGE_TARGET_CHARS:
                # Non-empty paragraphs are always separated by at least one
                # '' marker, so the next candidate is found by looking past
                # the markers rather than at the adjacent entry.
                nxt = i
                while nxt < total and paragraphs[nxt] == '':
                    nxt += 1
                if nxt >= total:
                    break
                candidate = paragraphs[nxt].strip()
                if not is_short_paragraph(candidate):
                    break
                text += ' ' + candidate
                # Consume the absorbed paragraph and the markers before it;
                # markers after the merged group are kept.
                i = nxt + 1
        merged.append(text)
    return merged


def _render_paragraph(para):
    """Normalise whitespace and break the text after sentence terminators."""
    para = re.sub(r'\s+', ' ', para)
    # A terminator followed by whitespace starts a new paragraph.
    para = re.sub(r'([。!?])\s+', r'\1\n\n', para)
    # A terminator at the very end gets a trailing newline.
    return re.sub(r'([。!?])$', r'\1\n', para)


def _percent(part, whole):
    """Return part/whole as a percentage, or 0.0 when *whole* is zero."""
    return part / whole * 100 if whole else 0.0


def _mean(values):
    """Return the arithmetic mean of *values*, or 0.0 when it is empty."""
    return sum(values) / len(values) if values else 0.0


def _print_statistics(output_file, paragraphs, merged_paragraphs):
    """Print paragraph counts and average lengths before and after merging."""
    print('修复完成!')
    print(f'输出文件: {output_file}')

    original_non_empty = [p for p in paragraphs if p != '']
    fixed_non_empty = [p for p in merged_paragraphs if p != '']

    original_short = sum(
        1 for p in original_non_empty if is_short_paragraph(p.strip()))
    fixed_short = sum(
        1 for p in fixed_non_empty if is_short_paragraph(p.strip()))

    print(f'原始段落数: {len(original_non_empty)}')
    print(f'原始短段落数: {original_short} '
          f'({_percent(original_short, len(original_non_empty)):.1f}%)')
    print(f'修复后段落数: {len(fixed_non_empty)}')
    print(f'修复后短段落数: {fixed_short} '
          f'({_percent(fixed_short, len(fixed_non_empty)):.1f}%)')

    original_mean = _mean([count_chinese_chars(p) for p in original_non_empty])
    fixed_mean = _mean([count_chinese_chars(p) for p in fixed_non_empty])
    print(f'原始平均段落长度: {original_mean:.1f}字')
    print(f'修复后平均段落长度: {fixed_mean:.1f}字')
    print(f'长度增加: {(fixed_mean - original_mean):.1f}字')


def fix_paragraphs(input_file, output_file):
    """Merge short paragraphs of *input_file* and write to *output_file*.

    The input is read as UTF-8 and split into paragraphs on blank lines.
    Each short paragraph (see ``is_short_paragraph``) absorbs the short
    paragraphs that follow it, skipping the blank lines between them, until
    the merged text holds at least 50 Chinese characters or a non-short
    paragraph (or the end of the file) is reached. Whitespace inside every
    paragraph is collapsed and breaks are inserted after sentence
    terminators before the result is written as UTF-8. Summary statistics
    are printed to stdout; an input without any paragraph reports zeros.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the file to write (overwritten if present).

    Raises:
        OSError: If either file cannot be opened.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    paragraphs = _split_paragraphs(lines)
    merged_paragraphs = _merge_short_paragraphs(paragraphs)

    # Blank markers render as '' and so become empty lines in the output.
    final_text = '\n'.join(_render_paragraph(p) for p in merged_paragraphs)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(final_text)

    _print_statistics(output_file, paragraphs, merged_paragraphs)


def main():
    """Run the command-line entry point: <input file> <output file>."""
    if len(sys.argv) != 3:
        print('用法: python fix_paragraphs_v2.py 输入文件 输出文件')
        sys.exit(1)

    fix_paragraphs(sys.argv[1], sys.argv[2])


if __name__ == '__main__':
    main()