# novel-tools/analyzers/word_count.py

#!/usr/bin/env python3
"""
小说字数统计工具
统计章节字数、总字数、进度等
"""

import os
import re
from datetime import datetime
import json

class WordCounter:
    """Collect word-count statistics for a novel project and render a report.

    A project is a directory containing a ``chapters`` sub-directory; every
    ``.md`` file found below it (recursively) is treated as one chapter.

    Attributes:
        total_words: Sum of the ``words`` value of every counted chapter
            from the most recent ``count_project`` call.
        total_chars: Sum of the ``chars`` value of every counted chapter
            from the most recent ``count_project`` call.
        chapter_stats: Per-chapter dictionaries returned by ``count_file``,
            in chapter order.
    """

    # Thresholds used by the report (words per chapter / words per day).
    CHAPTER_MIN_WORDS = 2000
    CHAPTER_STANDARD_WORDS = 2500
    CHAPTER_MAX_WORDS = 3500
    DAILY_TARGET_WORDS = 4000
    # Dialogue-per-paragraph thresholds used by the dialogue rating.
    MIN_DIALOGUE_RATIO = 0.3
    RICH_DIALOGUE_RATIO = 0.5

    _CHINESE_CHAR_RE = re.compile(r'[\u4e00-\u9fff]')
    # re.ASCII makes \b an ASCII word boundary, so an English word directly
    # adjacent to CJK characters (e.g. "用iPhone打") is still counted.
    _ENGLISH_WORD_RE = re.compile(r'\b[a-zA-Z]+\b', re.ASCII)
    # Quoted spans: straight quotes, curly quotes and corner brackets.
    _DIALOGUE_RE = re.compile(r'["“「][^"”」]+["”」]')
    _DIGIT_RUN_RE = re.compile(r'(\d+)')

    def __init__(self):
        """Start with empty totals and no chapter statistics."""
        self.total_words = 0
        self.total_chars = 0
        self.chapter_stats = []

    @staticmethod
    def _natural_sort_key(path):
        """Return a sort key that orders digit runs in *path* numerically."""
        pieces = WordCounter._DIGIT_RUN_RE.split(path)
        # With a capturing group, odd indices are always the digit runs, so
        # keys from different paths compare str-with-str and int-with-int.
        return [int(piece) if index % 2 else piece
                for index, piece in enumerate(pieces)]

    def count_file(self, file_path):
        """Compute statistics for a single UTF-8 text file.

        Args:
            file_path: Path of the file to analyse.

        Returns:
            A dict with the keys ``file``, ``words`` (CJK characters plus
            English words), ``chars`` (length of the content),
            ``paragraphs`` (non-blank chunks separated by a blank line),
            ``dialogue_lines`` (quoted spans) and ``dialogue_ratio``
            (quoted spans per paragraph, rounded to 2 decimals), or
            ``None`` when the file cannot be read or decoded; the failure
            is printed.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, ValueError) as e:
            # ValueError covers UnicodeDecodeError for non-UTF-8 files.
            print(f"统计文件失败 {file_path}: {e}")
            return None

        chinese_chars = len(self._CHINESE_CHAR_RE.findall(content))
        english_words = len(self._ENGLISH_WORD_RE.findall(content))
        paragraphs = sum(1 for chunk in content.split('\n\n') if chunk.strip())
        dialogue_lines = len(self._DIALOGUE_RE.findall(content))

        return {
            "file": file_path,
            "words": chinese_chars + english_words,
            "chars": len(content),
            "paragraphs": paragraphs,
            "dialogue_lines": dialogue_lines,
            # max(..., 1) avoids dividing by zero for an empty file.
            "dialogue_ratio": round(dialogue_lines / max(paragraphs, 1), 2),
        }

    def count_project(self, project_path):
        """Compute statistics for every chapter of a project.

        Resets and then updates ``total_words``, ``total_chars`` and
        ``chapter_stats`` on the instance.

        Args:
            project_path: Directory that contains the ``chapters`` folder.

        Returns:
            A dict with the keys ``project``, ``total_chapters``,
            ``total_words``, ``total_chars``, ``avg_words_per_chapter`` and
            ``chapters``, or ``None`` when the chapters directory does not
            exist or holds no ``.md`` file. Files that cannot be read are
            skipped.
        """
        chapters_dir = os.path.join(project_path, "chapters")

        if not os.path.exists(chapters_dir):
            print(f"章节目录不存在: {chapters_dir}")
            return None

        chapter_files = [
            os.path.join(root, name)
            for root, _dirs, names in os.walk(chapters_dir)
            for name in names
            if name.endswith('.md')
        ]

        if not chapter_files:
            print("没有找到章节文件")
            return None

        # Natural order so that e.g. "ch2.md" comes before "ch10.md".
        chapter_files.sort(key=self._natural_sort_key)

        self.chapter_stats = []
        self.total_words = 0
        self.total_chars = 0

        for chapter_file in chapter_files:
            stats = self.count_file(chapter_file)
            if stats:
                self.chapter_stats.append(stats)
                self.total_words += stats['words']
                self.total_chars += stats['chars']

        chapter_count = len(self.chapter_stats)
        return {
            "project": project_path,
            "total_chapters": chapter_count,
            "total_words": self.total_words,
            "total_chars": self.total_chars,
            "avg_words_per_chapter": round(self.total_words / max(chapter_count, 1)),
            "chapters": self.chapter_stats,
        }

    def generate_report(self, project_path, output_path=None):
        """Build a Markdown statistics report for a project.

        Args:
            project_path: Directory that contains the ``chapters`` folder.
            output_path: Optional file to write the report to; missing
                parent directories are created.

        Returns:
            The report text, or ``None`` when the project has no chapter
            that could be counted.
        """
        stats = self.count_project(project_path)
        if not stats:
            return None

        chapters = stats['chapters']
        if not chapters:
            # Every chapter file failed to load; there is nothing to average.
            print("没有可统计的章节")
            return None

        completed_chapters = stats['total_chapters']
        total_words = stats['total_words']
        avg_words = stats['avg_words_per_chapter']

        # Chapter-length rating; the upper bound is inclusive so that it
        # agrees with the status line further down.
        if avg_words < self.CHAPTER_MIN_WORDS:
            word_rating = "⚠️ 字数偏少"
        elif avg_words < self.CHAPTER_STANDARD_WORDS:
            word_rating = "✅ 符合标准"
        elif avg_words <= self.CHAPTER_MAX_WORDS:
            word_rating = "✅ 字数充足"
        else:
            word_rating = "⚠️ 字数偏多"

        # Dialogue rating based on the mean of the per-chapter ratios.
        avg_dialogue_ratio = sum(c['dialogue_ratio'] for c in chapters) / len(chapters)
        if avg_dialogue_ratio < self.MIN_DIALOGUE_RATIO:
            dialogue_rating = "⚠️ 对话偏少（影响听书分成）"
        elif avg_dialogue_ratio < self.RICH_DIALOGUE_RATIO:
            dialogue_rating = "✅ 对话适中"
        else:
            dialogue_rating = "✅ 对话丰富（适合听书）"

        within_length = self.CHAPTER_MIN_WORDS <= avg_words <= self.CHAPTER_MAX_WORDS
        length_status = "符合" if within_length else "需要调整"
        enough_dialogue = avg_dialogue_ratio >= self.MIN_DIALOGUE_RATIO
        audio_status = "适合听书" if enough_dialogue else "需要增加对话"
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        header = f"""
# 小说字数统计报告

## 项目信息
- 项目路径: {project_path}
- 统计时间: {timestamp}
- 章节数量: {completed_chapters} 章
- 总字数: {total_words} 字
- 总字符数: {stats['total_chars']} 字符

## 章节统计
- 平均每章字数: {avg_words} 字
- 字数评估: {word_rating}
- 平均对话占比: {avg_dialogue_ratio:.1%}
- 对话评估: {dialogue_rating}

## 番茄平台适配
### 字数要求
- 标准章节字数: {self.CHAPTER_STANDARD_WORDS}-{self.CHAPTER_MAX_WORDS}字
- 当前平均: {avg_words} 字
- 状态: {length_status}

### 更新要求
- 日更全勤要求: {self.DAILY_TARGET_WORDS}字/天
- 当前总字数: {total_words} 字
- 相当于: {total_words // self.DAILY_TARGET_WORDS} 天的全勤更新量

### 听书优化
- 推荐对话占比: >{self.MIN_DIALOGUE_RATIO:.0%}
- 当前对话占比: {avg_dialogue_ratio:.1%}
- 状态: {audio_status}

## 详细章节数据
"""
        # One section per chapter, numbered in sorted order.
        sections = []
        for i, chapter in enumerate(chapters, 1):
            # splitext drops only the final extension, unlike str.replace.
            chapter_name = os.path.splitext(os.path.basename(chapter['file']))[0]
            sections.append(
                f"\n### 第{i}章: {chapter_name}\n"
                f"- 字数: {chapter['words']} 字\n"
                f"- 字符: {chapter['chars']} 字符\n"
                f"- 段落: {chapter['paragraphs']} 段\n"
                f"- 对话行: {chapter['dialogue_lines']} 行\n"
                f"- 对话占比: {chapter['dialogue_ratio']:.1%}\n"
            )
        report = header + "".join(sections)

        if output_path:
            # dirname is "" for a bare filename, and os.makedirs("") raises.
            output_dir = os.path.dirname(output_path)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(report)

            print(f"报告已保存: {output_path}")

        return report

def main():
    """Command-line entry point.

    Parses ``--project`` (defaults to the current directory) and the
    optional ``--output`` path, generates the report and prints it when one
    was produced.
    """
    from argparse import ArgumentParser

    cli = ArgumentParser(description="小说字数统计工具")
    cli.add_argument("--project", default=".", help="项目路径")
    cli.add_argument("--output", help="输出报告路径")
    options = cli.parse_args()

    report_text = WordCounter().generate_report(options.project, options.output)
    if not report_text:
        return

    print(report_text)

# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()