novel-doomsday-resurgence/scripts/simple_quality_check.py

#!/usr/bin/env python3
"""
简化版质量检查脚本
"""

import os
import re
import sys
from datetime import datetime

def count_chinese_chars(text):
    """Return how many characters of *text* lie in U+4E00..U+9FFF.

    Only that code-point range is counted; punctuation, Latin letters,
    digits and whitespace are ignored.
    """
    return sum(1 for ch in text if '\u4e00' <= ch <= '\u9fff')

def _split_paragraphs(content):
    """Split *content* into body paragraphs separated by blank lines.

    Consecutive non-blank lines are concatenated (no separator) into one
    paragraph. After stripping, empty paragraphs and paragraphs starting
    with '#' (headings) are dropped.
    """
    paragraphs = []
    pending = []
    for line in content.split('\n'):
        if line.strip():
            pending.append(line)
        elif pending:
            paragraphs.append(''.join(pending))
            pending = []
    if pending:
        paragraphs.append(''.join(pending))

    stripped = (para.strip() for para in paragraphs)
    return [para for para in stripped if para and not para.startswith('#')]

def _extract_dialogues(content):
    """Return the inner text of every quoted span found in *content*.

    The first alternative keeps the original matching for ASCII double
    quotes and corner brackets; the second adds the curly quotes that are
    the usual dialogue delimiters in Chinese prose, which were previously
    never counted. Matches do not span line breaks.
    """
    pattern = r'["「](.+?)["」]|“(.+?)”'
    # Exactly one group participates per match and it is never empty (.+?).
    return [m.group(1) or m.group(2) for m in re.finditer(pattern, content)]

def analyze_chapter(filepath):
    """Analyse one chapter file, print a summary and save a JSON report.

    The chapter number is taken from the first ``NNNN_`` (four digits and an
    underscore) in the file name; it is reported as 0 when absent.

    Checks performed:
      * paragraphs: share of short paragraphs (< 35 Chinese characters) and
        the longest run of consecutive short paragraphs;
      * "golden point" keywords: how many of a fixed keyword list occur;
      * dialogue: quoted characters as a fraction of all characters.

    Args:
        filepath: Path of a UTF-8 encoded chapter text file.

    Returns:
        dict: The report that is also written to
        ``quality_report_chNNNN.json`` in the current working directory.

    Raises:
        OSError: If the chapter cannot be read or the report cannot be written.
        UnicodeDecodeError: If the chapter is not valid UTF-8.
    """
    import json  # local import: json is not among the module-level imports

    print(f"分析文件：{filepath}")

    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Extract the chapter number from the file name.
    filename = os.path.basename(filepath)
    match = re.search(r'(\d{4})_', filename)
    chapter_num = int(match.group(1)) if match else 0

    print(f"章节号：{chapter_num}")
    print(f"文件大小：{len(content)} 字符")

    # 1. Paragraph analysis
    print("\n" + "="*60)
    print("段落分析：")

    filtered_paras = _split_paragraphs(content)
    lengths = [count_chinese_chars(para) for para in filtered_paras]

    short_count = 0
    max_consecutive = 0
    current_streak = 0
    for char_count in lengths:
        if char_count < 35:
            short_count += 1
            current_streak += 1
            max_consecutive = max(max_consecutive, current_streak)
        else:
            current_streak = 0

    total_paras = len(filtered_paras)
    short_ratio = short_count / total_paras if total_paras > 0 else 0
    avg_length = sum(lengths) / len(lengths) if lengths else 0

    print(f"总段落数：{total_paras}")
    print(f"短段落数(<35字)：{short_count}")
    print(f"短段落比例：{short_ratio*100:.1f}%")
    print(f"最长连续短段落：{max_consecutive}")
    print(f"平均段落长度：{avg_length:.1f}字")

    # 2. "Golden point" keyword analysis
    print("\n" + "="*60)
    print("爽点分析：")

    golden_keywords = ["打脸", "升级", "收获", "碾压", "反转", "爽点", "优势", "先知", "重生"]
    found_keywords = [keyword for keyword in golden_keywords if keyword in content]

    print(f"找到爽点关键词：{len(found_keywords)}/{len(golden_keywords)}")
    print(f"关键词：{', '.join(found_keywords)}")

    # 3. Dialogue analysis
    print("\n" + "="*60)
    print("对话分析：")

    dialogues = _extract_dialogues(content)

    total_chars = len(content)
    dialogue_chars = sum(len(d) for d in dialogues)
    dialogue_ratio = dialogue_chars / total_chars if total_chars > 0 else 0

    print(f"对话数量：{len(dialogues)}")
    print(f"对话比例：{dialogue_ratio*100:.1f}%")

    # 4. Problem detection
    print("\n" + "="*60)
    print("问题识别：")

    problems = []

    if short_ratio > 0.3:
        problems.append(f"短段落比例过高 ({short_ratio*100:.1f}%)")

    if max_consecutive > 3:
        problems.append(f"连续短段落过多 ({max_consecutive}个)")

    if len(found_keywords) < 3:
        problems.append(f"爽点不足 (找到{len(found_keywords)}个，需要至少3个)")

    if dialogue_ratio < 0.2:
        problems.append(f"对话比例偏低 ({dialogue_ratio*100:.1f}%)")

    if problems:
        print("⚠️  发现问题：")
        for i, problem in enumerate(problems, 1):
            print(f"  {i}. {problem}")
    else:
        print("✅ 未发现严重问题")

    # 5. Recommendations
    print("\n" + "="*60)
    print("修复建议：")

    recommendations = []

    if short_ratio > 0.3:
        recommendations.append("合并短段落，提高段落平均长度")

    if len(found_keywords) < 3:
        if match is None:
            # Chapter number unknown: the fallback value 0 must not be
            # mistaken for one of the opening chapters, so give generic advice.
            recommendations.append("增加爽点密度：每章至少3个爽点")
        elif chapter_num == 1:
            recommendations.append("第1章需要：1)明确重生优势 2)建立时间紧迫感 3)设置第一个目标")
        elif chapter_num <= 3:
            recommendations.append("黄金三章需要：1)兑现第一个爽点 2)打脸小反派 3)建立升级体系")
        else:
            recommendations.append("增加爽点密度：每章至少3个爽点")

    if dialogue_ratio < 0.2:
        recommendations.append("增加对话比例，目标30-40%")

    if recommendations:
        for i, rec in enumerate(recommendations, 1):
            print(f"  {i}. {rec}")
    else:
        print("  ✅ 无需修复")

    # 6. Build the report
    report = {
        "chapter": chapter_num,
        "file": filename,
        "timestamp": datetime.now().isoformat(),
        "metrics": {
            "paragraphs": {
                "total": total_paras,
                "short": short_count,
                "short_ratio": short_ratio,
                "max_consecutive_short": max_consecutive,
                "avg_length": avg_length
            },
            "golden_points": {
                "found": len(found_keywords),
                "keywords": found_keywords
            },
            "dialogue": {
                "count": len(dialogues),
                "ratio": dialogue_ratio
            }
        },
        "problems": problems,
        "recommendations": recommendations
    }

    # Save the report; the path is relative to the current working directory.
    report_file = f"quality_report_ch{chapter_num:04d}.json"
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, ensure_ascii=False, indent=2)

    print(f"\n📄 报告已保存到：{report_file}")

    return report

def main():
    """Command-line entry point: analyse the chapter file named by argv[1].

    Prints a usage or error message and exits with status 1 when the path
    argument is missing or does not exist.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        print("用法：python simple_quality_check.py <章节文件路径>")
        sys.exit(1)

    filepath = cli_args[0]
    if os.path.exists(filepath):
        analyze_chapter(filepath)
        return

    print(f"错误：文件不存在 - {filepath}")
    sys.exit(1)

# Run the checker only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()