123 lines
4.3 KiB
Python
123 lines
4.3 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
import re
|
||
|
|
import sys
|
||
|
|
|
||
|
|
def count_chinese_chars(text):
    """Count the Chinese characters in ``text``.

    A character is counted when its code point lies in the inclusive range
    U+4E00..U+9FFF; everything else (ASCII, punctuation, whitespace, ...) is
    ignored.

    Args:
        text: The string to inspect.

    Returns:
        The number of characters of ``text`` inside that range.
    """
    return sum(1 for ch in text if '\u4e00' <= ch <= '\u9fff')
|
||
|
|
|
||
|
|
# Markdown ATX heading prefix: one to six '#' characters followed by a space.
_HEADING_PREFIX_RE = re.compile(r'#{1,6} ')

# Characters that count toward a paragraph's length: code points in
# U+4E00..U+9FFF plus ASCII letters and digits. Punctuation and whitespace
# are deliberately not counted.
_COUNTED_CHAR_RE = re.compile(r'[\u4e00-\u9fffa-zA-Z0-9]')


def is_short_paragraph(text, threshold=35):
    """Tell whether ``text`` is a short paragraph.

    Heading lines (``# `` through ``###### ``) are never considered short
    paragraphs. For anything else, the number of counted characters (Chinese
    characters in U+4E00..U+9FFF, ASCII letters and ASCII digits) is compared
    against ``threshold``.

    Args:
        text: Paragraph text; callers are expected to pass it already
            stripped, since the heading check looks at the very first
            characters.
        threshold: Exclusive upper bound on the counted characters for a
            paragraph to be regarded as short.

    Returns:
        ``True`` if ``text`` is not a heading and has fewer than ``threshold``
        counted characters, otherwise ``False``.
    """
    # Headings of every level are excluded, not only levels 1-3.
    if _HEADING_PREFIX_RE.match(text):
        return False

    # One pass over the text counts both Chinese and alphanumeric characters.
    return len(_COUNTED_CHAR_RE.findall(text)) < threshold
|
||
|
|
|
||
|
|
# Patterns used when rendering a paragraph back to text.
_WHITESPACE_RUN_RE = re.compile(r'\s+')
_SENTENCE_END_THEN_SPACE_RE = re.compile(r'([。!?])\s+')
_SENTENCE_END_AT_EOS_RE = re.compile(r'([。!?])$')


def _split_paragraphs(lines):
    """Group raw lines into paragraphs; each blank line becomes a '' marker."""
    paragraphs = []
    pending = []
    for line in lines:
        if line.strip():
            pending.append(line)
            continue
        # A blank line closes the paragraph being collected (if any) and is
        # itself recorded as an empty-paragraph marker.
        if pending:
            paragraphs.append(''.join(pending))
            pending = []
        paragraphs.append('')
    if pending:
        paragraphs.append(''.join(pending))
    return paragraphs


def _merge_short_paragraphs(paragraphs, target_length=50):
    """Join runs of directly adjacent short paragraphs with single spaces."""
    merged = []
    total = len(paragraphs)
    i = 0
    while i < total:
        if paragraphs[i] == '':
            merged.append('')
            i += 1
            continue

        current_text = paragraphs[i].strip()
        # NOTE(review): _split_paragraphs always places a '' marker right after
        # every non-empty paragraph except the last one, so for its output
        # ``has_neighbour`` is never true and no merge actually happens. The
        # logic is kept unchanged; deciding whether merging should look past
        # blank-line markers needs the author's intent.
        has_neighbour = i + 1 < total and paragraphs[i + 1] != ''
        if not (has_neighbour and is_short_paragraph(current_text)):
            merged.append(current_text)
            i += 1
            continue

        merged_text = current_text
        j = i + 1
        while j < total and paragraphs[j] != '':
            candidate = paragraphs[j].strip()
            if not is_short_paragraph(candidate):
                break
            merged_text = f'{merged_text} {candidate}'
            j += 1
            # Stop absorbing paragraphs once the merged text is long enough.
            if count_chinese_chars(merged_text) >= target_length:
                break

        merged.append(merged_text)
        i = j
    return merged


def _render_paragraphs(paragraphs):
    """Normalise whitespace and sentence breaks, then join into one string."""
    rendered = []
    for para in paragraphs:
        if para == '':
            rendered.append('')
            continue
        # Collapse every whitespace run (including newlines) to one space.
        para = _WHITESPACE_RUN_RE.sub(' ', para)
        # Start a new paragraph after sentence-ending punctuation that is
        # followed by whitespace.
        para = _SENTENCE_END_THEN_SPACE_RE.sub(r'\1\n\n', para)
        # Add a trailing newline when the text ends with such punctuation.
        para = _SENTENCE_END_AT_EOS_RE.sub(r'\1\n', para)
        rendered.append(para)
    return '\n'.join(rendered)


def _percentage(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is 0."""
    return part / whole * 100 if whole else 0.0


def _average(values):
    """Return the arithmetic mean of ``values``; 0.0 for an empty list."""
    return sum(values) / len(values) if values else 0.0


def _report_statistics(paragraphs, merged_paragraphs):
    """Print paragraph counts, short-paragraph ratios and average lengths."""
    original_non_empty = [p for p in paragraphs if p != '']
    fixed_non_empty = [p for p in merged_paragraphs if p != '']

    original_short = sum(1 for p in original_non_empty if is_short_paragraph(p.strip()))
    fixed_short = sum(1 for p in fixed_non_empty if is_short_paragraph(p.strip()))

    # The guarded helpers keep an empty or all-blank input from raising
    # ZeroDivisionError here.
    original_ratio = _percentage(original_short, len(original_non_empty))
    fixed_ratio = _percentage(fixed_short, len(fixed_non_empty))

    print(f'原始段落数: {len(original_non_empty)}')
    print(f'原始短段落数: {original_short} ({original_ratio:.1f}%)')
    print(f'修复后段落数: {len(fixed_non_empty)}')
    print(f'修复后短段落数: {fixed_short} ({fixed_ratio:.1f}%)')

    original_avg = _average([count_chinese_chars(p) for p in original_non_empty])
    fixed_avg = _average([count_chinese_chars(p) for p in fixed_non_empty])

    print(f'原始平均段落长度: {original_avg:.1f}字')
    print(f'修复后平均段落长度: {fixed_avg:.1f}字')
    print(f'长度增加: {(fixed_avg - original_avg):.1f}字')


def fix_paragraphs(input_file, output_file):
    """Rewrite the paragraphs of a UTF-8 text file and print statistics.

    The input is split into paragraphs on blank lines, short paragraphs are
    passed through the merge step, whitespace and sentence breaks are
    normalised, and the result is written to ``output_file``. A summary of
    paragraph counts and average lengths (before and after) is printed to
    standard output.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the file to write; it is opened in ``'w'`` mode,
            so existing content is replaced.

    Returns:
        None.

    Raises:
        Exceptions raised by ``open``, reading or writing (for example
        ``OSError`` or ``UnicodeDecodeError``) are not caught here.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    paragraphs = _split_paragraphs(lines)
    merged_paragraphs = _merge_short_paragraphs(paragraphs)
    final_text = _render_paragraphs(merged_paragraphs)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(final_text)

    print('修复完成!')
    print(f'输出文件: {output_file}')

    _report_statistics(paragraphs, merged_paragraphs)
|
||
|
|
|
||
|
|
if __name__ == '__main__':
    # Exactly two positional arguments are expected: the input path followed
    # by the output path. Anything else prints the usage line and exits with
    # status 1.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print('用法: python fix_paragraphs_v2.py 输入文件 输出文件')
        sys.exit(1)

    input_file, output_file = cli_args
    fix_paragraphs(input_file, output_file)
|