# jianzhihuixiang/skills/character-profile-cn/scripts/validate_profile.py

#!/usr/bin/env python3
"""
人物档案验证脚本
验证markdown格式的人物档案结构完整性
"""

import os
import re
import sys
from pathlib import Path

class ProfileValidator:
    """Validate the structure of Markdown character profiles.

    A profile is a Markdown document in which level-2/level-3 headings name
    sections and bold labels (``**label**``) inside a section name fields.
    The validator compares the headings with ``REQUIRED_SECTIONS`` for the
    profile type, compares the bold labels with ``REQUIRED_FIELDS``, and
    derives a 0-100 structure score from what is missing.
    """

    # Required section headings for each profile type.
    REQUIRED_SECTIONS = {
        'protagonist': [
            '基本信息', '外貌特征', '性格特点', '背景故事',
            '动机层次', '人物关系', '故事发展'
        ],
        'antagonist': [
            '基本信息', '外貌特征', '性格特点', '核心理念',
            '动机发展', '镜像对比', '资源能力', '故事发展'
        ],
        'supporting': [
            '基本定位', '独立身份', '功能性设计', '关系发展',
            '发展可能性'
        ],
        'standard': [
            '基本信息', '外貌特征', '性格特点', '背景故事',
            '人物关系', '故事发展'
        ]
    }

    # Required bold field labels inside each section (per the templates).
    REQUIRED_FIELDS = {
        '基本信息': ['姓名', '年龄', '性别', '职业/身份', '故事中的角色'],
        '外貌特征': ['整体印象', '面部特征', '身材体型', '着装风格'],
        '性格特点': ['核心性格', '优点', '缺点', '价值观'],
        '背景故事': ['出身背景', '关键经历', '转折点'],
        '人物关系': ['与主角关系', '重要关系人'],
        '故事发展': ['角色目标', '内在冲突', '外在冲突', '发展弧线']
    }

    # Marker strings checked in priority order by detect_profile_type().
    _TYPE_MARKERS = (
        ('antagonist', ('镜像对比', '核心理念', '资源能力')),
        ('supporting', ('功能性设计', '独立身份')),
        ('protagonist', ('动机层次', '核心身份')),
    )

    # Plain-text symbols used on Windows.
    _ASCII_SYMBOLS = {
        'building': '[结构]',
        'cross_mark': '[缺失]',
        'warning': '[注意]',
        'check': '[通过]',
        'wrench': '[修复]',
        'check_mark': '[OK]',
        'arrow': '->',
        'green_circle': '[良好]',
        'yellow_circle': '[一般]',
        'red_circle': '[需改进]',
        'file': '[文件]',
        'chart': '[统计]',
        'chart2': '[分布]',
        'bulb': '[建议]',
        'bullet': '-',
        'dash': '-'
    }

    # Emoji symbols used on every other platform.
    _EMOJI_SYMBOLS = {
        'building': '🏗️',
        'cross_mark': '❌',
        'warning': '⚠️',
        'check': '✅',
        'wrench': '🔧',
        'check_mark': '✓',
        'arrow': '→',
        'green_circle': '🟢',
        'yellow_circle': '🟡',
        'red_circle': '🔴',
        'file': '📋',
        'chart': '📊',
        'chart2': '📈',
        'bulb': '💡',
        'bullet': '•',
        'dash': '-'
    }

    # A level-2 or level-3 ATX heading: these are the "sections" of a profile.
    _SECTION_HEADING_RE = re.compile(r'^(#{2,3})\s+(.+?)$')
    # Any ATX heading; used to find where a section ends.
    _ANY_HEADING_RE = re.compile(r'^(#{1,6})\s+')
    # An inline Markdown link such as "[text](target)".
    _INLINE_LINK_RE = re.compile(r'\[.*?\]\(.*?\)')

    def __init__(self, profile_type='auto'):
        """Initialise the validator.

        Args:
            profile_type: Profile type, one of 'protagonist', 'antagonist',
                'supporting', 'standard' or 'auto' (detect from content).
        """
        self.profile_type = profile_type

        # Windows gets the plain-text symbol set instead of emoji.
        self.is_windows = sys.platform.startswith('win')

        # Copy so per-instance tweaks never leak into the class-level tables.
        self.symbols = dict(
            self._ASCII_SYMBOLS if self.is_windows else self._EMOJI_SYMBOLS
        )

    def detect_profile_type(self, content):
        """Guess the profile type from marker strings found in the content.

        The markers are searched anywhere in the text (not only in headings),
        and the first matching group wins: antagonist, then supporting, then
        protagonist.

        Args:
            content: Full Markdown text of the profile.

        Returns:
            'antagonist', 'supporting', 'protagonist' or 'standard'.
        """
        for type_name, markers in self._TYPE_MARKERS:
            if any(marker in content for marker in markers):
                return type_name
        return 'standard'

    def validate_structure(self, filepath):
        """Validate the structure of one profile file.

        Args:
            filepath: Path of the Markdown file.

        Returns:
            A dict with the detected type, the required/present/missing
            sections, the missing fields per present section, the structure
            score and its level label.
        """
        # utf-8-sig also accepts plain UTF-8 but strips a leading BOM, which
        # would otherwise hide a heading placed on the very first line.
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            content = f.read()

        if self.profile_type == 'auto':
            detected_type = self.detect_profile_type(content)
        else:
            detected_type = self.profile_type

        sections = self._extract_sections(content)
        found_titles = set(sections)

        required_sections = self.REQUIRED_SECTIONS.get(detected_type, [])
        present_sections = [s for s in required_sections if s in found_titles]
        missing_sections = [s for s in required_sections if s not in found_titles]

        # Only sections that exist can have missing fields.
        section_field_violations = {}
        for section_title in present_sections:
            section_content = self._get_section_content(content, section_title)
            missing_fields = self._check_required_fields(section_title, section_content)
            if missing_fields:
                section_field_violations[section_title] = missing_fields

        structure_score = self._calculate_structure_score(
            len(required_sections), len(missing_sections), section_field_violations
        )

        return {
            'filepath': filepath,
            'detected_type': detected_type,
            'total_sections_found': len(sections),
            'required_sections': required_sections,
            'present_sections': present_sections,
            'missing_sections': missing_sections,
            'section_field_violations': section_field_violations,
            'structure_score': structure_score,
            'structure_level': self._get_structure_level(structure_score)
        }

    @classmethod
    def _parse_section_heading(cls, line):
        """Parse a level-2/3 heading line into ``(level, title)``.

        Inline links are removed from the title and the remainder is stripped,
        so "## 基本信息 [top](#top)" yields the title "基本信息".

        Returns:
            ``(level, title)`` or ``None`` when the line is not such a heading.
        """
        match = cls._SECTION_HEADING_RE.match(line.strip())
        if match is None:
            return None
        title = cls._INLINE_LINK_RE.sub('', match.group(2).strip()).strip()
        return len(match.group(1)), title

    def _extract_sections(self, content):
        """Return the normalised titles of all level-2/3 headings, in order."""
        parsed = (self._parse_section_heading(line) for line in content.split('\n'))
        return [heading[1] for heading in parsed if heading is not None]

    def _get_section_content(self, content, section_title):
        """Return the body text of the first section titled *section_title*.

        The section runs from its heading to the next heading of the same or
        a higher level, so nested sub-headings stay part of the section.
        Returns an empty string when the section is not found.
        """
        target_level = None
        collected = []

        for line in content.split('\n'):
            stripped = line.strip()

            if target_level is None:
                # Still searching for the section heading itself.
                heading = self._parse_section_heading(stripped)
                if heading is not None and heading[1] == section_title:
                    target_level = heading[0]
                continue

            boundary = self._ANY_HEADING_RE.match(stripped)
            if boundary and len(boundary.group(1)) <= target_level:
                break  # a sibling or parent heading closes the section

            collected.append(line)

        return '\n'.join(collected)

    def _check_required_fields(self, section_title, section_content):
        """Return the required fields of a section that are missing.

        A field counts as present when its label appears in bold, with an
        optional ASCII or full-width colon inside the bold markers
        (``**姓名**`` or ``**姓名：**``).
        """
        required_fields = self.REQUIRED_FIELDS.get(section_title, [])

        missing_fields = []
        for field in required_fields:
            pattern = rf'\*\*\s*{re.escape(field)}\s*[:：]?\s*\*\*'
            if not re.search(pattern, section_content):
                missing_fields.append(field)

        return missing_fields

    def _calculate_structure_score(self, total_required, missing_sections_count, field_violations):
        """Compute the structure score in the range 0-100.

        Sections contribute up to 70 points in proportion to how many of the
        required ones are present; fields start at 30 points and lose 3 per
        missing field (never going below 0).
        """
        if total_required == 0:
            return 100

        present_ratio = (total_required - missing_sections_count) / total_required
        section_score = present_ratio * 70

        field_score = 30
        if field_violations:
            total_violations = sum(len(fields) for fields in field_violations.values())
            field_score -= min(30, total_violations * 3)

        return max(0, min(100, section_score + field_score))

    def _get_structure_level(self, score):
        """Map a structure score to its level label."""
        if score >= 90:
            return "优秀"
        elif score >= 75:
            return "良好"
        elif score >= 60:
            return "一般"
        elif score >= 40:
            return "不完整"
        else:
            return "结构缺失"

    def generate_validation_report(self, validation_result, output_format='text'):
        """Render the result of validate_structure() as a report.

        Args:
            validation_result: Dict returned by validate_structure().
            output_format: 'text' or 'json'.

        Returns:
            The report as a string.

        Raises:
            ValueError: If *output_format* is neither 'text' nor 'json'.
        """
        result = validation_result

        if output_format == 'json':
            import json
            return json.dumps(result, ensure_ascii=False, indent=2)

        if output_format != 'text':
            raise ValueError(f"不支持的输出格式: {output_format}")

        sym = self.symbols
        violations = result['section_field_violations']
        missing_sections = result['missing_sections']
        score = result['structure_score']

        report_lines = [
            "=" * 60,
            "人物档案结构验证报告",
            f"文件: {result['filepath']}",
            f"检测类型: {result['detected_type']}",
            "=" * 60,
            "",
            # Structure summary.
            f"{sym['building']} 结构完整性",
            f"  评分: {score:.1f}/100",
            f"  等级: {result['structure_level']}",
            f"  发现章节: {result['total_sections_found']}",
            "",
        ]

        # Missing required sections.
        if missing_sections:
            report_lines.append(f"{sym['cross_mark']} 缺失的必需章节")
            for section in missing_sections:
                report_lines.append(f"  {sym['bullet']} {section}")
            report_lines.append("")

        # Missing fields, grouped by section.
        if violations:
            report_lines.append(f"{sym['warning']} 章节内缺失字段")
            for section, fields in violations.items():
                report_lines.append(f"  {sym['bullet']} {section}:")
                for field in fields:
                    report_lines.append(f"      {sym['dash']} {field}")
            report_lines.append("")

        # Passed checks.
        report_lines.append(f"{sym['check']} 通过的检查")
        report_lines.append(
            f"  {sym['bullet']} 必需章节: {len(result['present_sections'])}/{len(result['required_sections'])}"
        )

        present_field_count = 0
        total_field_count = 0
        for section in result['present_sections']:
            required_count = len(self.REQUIRED_FIELDS.get(section, []))
            total_field_count += required_count
            present_field_count += required_count - len(violations.get(section, []))

        if total_field_count > 0:
            report_lines.append(
                f"  {sym['bullet']} 必需字段: {present_field_count}/{total_field_count}"
            )

        report_lines.append("")

        # Fix suggestions, chosen by score band.
        report_lines.append(f"{sym['wrench']} 修复建议")
        if score >= 80:
            report_lines.append(f"  {sym['check_mark']} 结构完整，可以继续完善内容细节")
        elif score >= 60:
            if missing_sections:
                report_lines.append(
                    f"  {sym['arrow']} 添加缺失的章节: {', '.join(missing_sections[:3])}"
                )
            if violations:
                first_section = next(iter(violations))
                first_field = violations[first_section][0]
                # Use the platform symbol table so Windows output stays ASCII.
                report_lines.append(
                    f"  {sym['arrow']} 补充字段: {first_section} {sym['arrow']} **{first_field}**"
                )
        else:
            report_lines.append(f"  {sym['arrow']} 需要补充基本的结构框架")
            report_lines.append(f"  {sym['arrow']} 建议使用'{result['detected_type']}'模板重新整理")

        report_lines.append("")
        report_lines.append("=" * 60)

        return "\n".join(report_lines)

    def validate_directory(self, directory_path, recursive=True):
        """Validate every Markdown profile in a directory.

        Args:
            directory_path: Directory to scan for ``*.md`` files.
            recursive: Also scan sub-directories when true.

        Returns:
            A dict with 'total_files', 'successful_validation',
            'failed_validation', 'results' (successful validations, best
            score first) and 'errors' (``{'filepath', 'error'}`` records for
            files that could not be validated).  When no Markdown file is
            found the dict additionally carries 'message' and 'files'.

        Raises:
            FileNotFoundError: If *directory_path* does not exist.
        """
        directory = Path(directory_path)

        if not directory.exists():
            raise FileNotFoundError(f"目录不存在: {directory_path}")

        candidates = directory.rglob("*.md") if recursive else directory.glob("*.md")
        # Sorted for a deterministic order; directories named "*.md" are skipped.
        md_files = sorted(p for p in candidates if p.is_file())

        if not md_files:
            # Same keys as the normal result so report generation never fails.
            return {
                'message': "未找到markdown文件",
                'files': [],
                'total_files': 0,
                'successful_validation': 0,
                'failed_validation': 0,
                'results': [],
                'errors': []
            }

        valid_results = []
        errors = []
        for md_file in md_files:
            try:
                valid_results.append(self.validate_structure(str(md_file)))
            except Exception as e:
                # Best effort: one unreadable file must not abort the batch.
                errors.append({
                    'filepath': str(md_file),
                    'error': str(e)
                })

        sorted_results = sorted(valid_results, key=lambda x: x['structure_score'], reverse=True)

        return {
            'total_files': len(md_files),
            'successful_validation': len(valid_results),
            'failed_validation': len(errors),
            'results': sorted_results,
            'errors': errors
        }

    def generate_directory_validation_report(self, validation_results, output_format='text'):
        """Render the result of validate_directory() as a report.

        Args:
            validation_results: Dict returned by validate_directory().
            output_format: 'text' for the human-readable report; any other
                value produces JSON.

        Returns:
            The report as a string.
        """
        if output_format != 'text':
            import json
            return json.dumps(validation_results, ensure_ascii=False, indent=2)

        sym = self.symbols
        results = validation_results.get('results', [])
        errors = validation_results.get('errors', [])
        failed_count = validation_results.get('failed_validation', len(errors))

        report_lines = [
            "=" * 60,
            "人物档案结构验证报告（目录）",
            f"分析文件数: {validation_results.get('total_files', len(results))}",
            f"成功验证: {validation_results.get('successful_validation', len(results))}",
        ]
        if failed_count > 0:
            report_lines.append(f"验证失败: {failed_count}")
        report_lines.append("=" * 60)
        report_lines.append("")

        # Informational message, e.g. when the directory holds no Markdown.
        if validation_results.get('message'):
            report_lines.append(f"{sym['warning']} {validation_results['message']}")
            report_lines.append("")

        # Files that could not be validated, with the reason.
        if errors:
            report_lines.append(f"{sym['cross_mark']} 验证失败的文件")
            for failure in errors:
                report_lines.append(
                    f"  {sym['bullet']} {failure['filepath']}: {failure['error']}"
                )
            report_lines.append("")

        if results:
            # Ranking, best score first.
            report_lines.append(f"{sym['file']} 文件结构完整性排名")
            for i, result in enumerate(results, 1):
                score = result['structure_score']
                level = result['structure_level']
                filename = os.path.basename(result['filepath'])
                profile_type = result.get('detected_type', '未知')

                if score >= 80:
                    icon = sym['green_circle']
                elif score >= 60:
                    icon = sym['yellow_circle']
                else:
                    icon = sym['red_circle']

                report_lines.append(
                    f"{icon} {i:2d}. {filename:<35} {score:5.1f}分 ({level}, {profile_type})"
                )

            report_lines.append("")

            # Aggregate statistics.
            scores = [r['structure_score'] for r in results]
            avg_score = sum(scores) / len(scores)

            type_distribution = {}
            for result in results:
                profile_type = result.get('detected_type', '未知')
                type_distribution[profile_type] = type_distribution.get(profile_type, 0) + 1

            report_lines.append(f"{sym['chart']} 统计信息")
            report_lines.append(f"  平均结构分: {avg_score:.1f}")
            report_lines.append(f"  最高分: {max(scores):.1f}")
            report_lines.append(f"  最低分: {min(scores):.1f}")
            report_lines.append("")

            report_lines.append(f"{sym['chart2']} 类型分布")
            for profile_type, count in type_distribution.items():
                percentage = (count / len(results)) * 100
                report_lines.append(f"  {profile_type}: {count}个 ({percentage:.1f}%)")

            report_lines.append("")

            # Overall advice, chosen by average score.
            report_lines.append(f"{sym['bulb']} 整体建议")
            if avg_score >= 75:
                report_lines.append(f"  {sym['check_mark']} 整体结构良好")
                report_lines.append(f"  {sym['arrow']} 可以开始关注内容深度和细节")
            elif avg_score >= 50:
                report_lines.append(f"  {sym['warning']} 结构基本完整但有缺失")
                report_lines.append(f"  {sym['arrow']} 建议补充缺失章节和字段")
            else:
                report_lines.append(f"  {sym['cross_mark']} 结构完整性不足")
                report_lines.append(f"  {sym['arrow']} 需要重新整理档案结构框架")

        report_lines.append("")
        report_lines.append("=" * 60)

        return "\n".join(report_lines)


def main():
    """Command-line entry point: validate one profile file or a directory.

    The report is printed to stdout, or written to the ``--output`` file.
    Error diagnostics go to stderr so they never mix with a piped or
    redirected report (for example ``--format json``).

    Returns:
        Process exit status: 0 on success, 1 when the path is invalid or
        validation raised an exception.
    """
    import argparse

    parser = argparse.ArgumentParser(description='验证人物档案markdown文件结构')
    parser.add_argument('path', help='要验证的markdown文件或目录路径')
    parser.add_argument('--type', '-t', choices=['protagonist', 'antagonist', 'supporting', 'standard', 'auto'],
                      default='auto', help='档案类型（默认为自动检测）')
    parser.add_argument('--recursive', '-r', action='store_true', help='递归验证目录')
    parser.add_argument('--format', '-f', choices=['text', 'json'], default='text', help='输出格式')
    parser.add_argument('--output', '-o', help='输出文件路径')

    args = parser.parse_args()

    validator = ProfileValidator(args.type)
    path = Path(args.path)

    try:
        if path.is_file():
            # Single file: only Markdown is accepted.
            if path.suffix.lower() != '.md':
                print("错误: 文件必须是.md格式", file=sys.stderr)
                return 1

            validation = validator.validate_structure(str(path))
            report = validator.generate_validation_report(validation, args.format)

        elif path.is_dir():
            # Directory: validate every Markdown file found in it.
            validation_results = validator.validate_directory(str(path), args.recursive)
            report = validator.generate_directory_validation_report(validation_results, args.format)

        else:
            print(f"错误: 路径不存在: {args.path}", file=sys.stderr)
            return 1

        # Emit the report to the requested destination.
        if args.output:
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(report)
            print(f"报告已保存到: {args.output}")
        else:
            print(report)

        return 0

    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit code.
        print(f"验证失败: {e}", file=sys.stderr)
        return 1


# Run the CLI only when executed as a script; main()'s return value is
# passed to sys.exit() and becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())