# compliance/assets/doc/get_schema.py

import re
import os
import json

def _sanitize_file_stem(text):
    """Return *text* with characters that are unsafe in file names replaced by "_"."""
    # Character set: \ / * ? : " < > | and any whitespace.
    replaced = re.sub(r'[\\/*?:"<>|\s]', "_", text)
    # Collapse runs of underscores so the resulting name stays readable.
    return re.sub(r'_+', "_", replaced)


def _file_name_from_id(id_value):
    """Derive a ``.json`` file name from a schema ``$id`` value, or return None.

    The fragment (``#...``) and query (``?...``) parts are dropped first so an
    ``$id`` such as ``http://host/person.json#`` maps to ``person.json`` rather
    than ``person.json#.json``. The last path segment is then sanitized the
    same way titles are.
    """
    if not isinstance(id_value, str):
        return None
    without_fragment = id_value.split('#', 1)[0].split('?', 1)[0]
    last_segment = without_fragment.split('/')[-1].strip()
    if not last_segment:
        return None
    name = _sanitize_file_stem(last_segment)
    # Segments such as "." or ".." carry no usable name; let the caller fall back.
    if not name.strip("._"):
        return None
    if not name.lower().endswith('.json'):
        name += '.json'
    return name


def _file_name_from_title(title_value):
    """Derive a ``.json`` file name from a schema ``title`` value, or return None."""
    if not isinstance(title_value, str) or not title_value.strip():
        return None
    return f"{_sanitize_file_stem(title_value)}.json"


def _claim_unique_name(file_name, used_names):
    """Return *file_name*, or a ``_2``/``_3``/... variant not yet in *used_names*.

    Names are compared case-insensitively so the result is also safe on
    case-insensitive file systems. The chosen name is recorded in *used_names*.
    """
    stem, ext = os.path.splitext(file_name)
    candidate = file_name
    counter = 2
    while candidate.casefold() in used_names:
        candidate = f"{stem}_{counter}{ext}"
        counter += 1
    used_names.add(candidate.casefold())
    return candidate


def extract_and_save_json_schemas(markdown_file_path: str, output_directory: str = "json_schemas") -> None:
    """
    Extract JSON Schema code blocks from a Markdown file and save each to its own file.

    Every fenced ```json block (fence language matched in any letter case)
    whose content is a JSON object is parsed and written, pretty-printed, into
    ``output_directory``. The file name is taken from the schema's ``$id``
    (last path segment), then from its ``title``, and finally falls back to
    ``schema_<n>.json`` where ``n`` is the 1-based position of the block.
    Blocks that resolve to the same name within one run get a numeric suffix
    instead of overwriting each other. Progress and errors are reported with
    ``print``; I/O and JSON parsing errors are not raised to the caller.

    Args:
        markdown_file_path (str): Path of the input Markdown file.
        output_directory (str): Directory that receives the schema files.
                                Defaults to "json_schemas".
    """
    try:
        with open(markdown_file_path, 'r', encoding='utf-8') as f:
            markdown_content = f.read()
    except FileNotFoundError:
        print(f"错误：文件 '{markdown_file_path}' 未找到。")
        return
    except (OSError, ValueError) as e:
        # ValueError covers UnicodeDecodeError for files that are not valid UTF-8.
        print(f"错误：读取文件 '{markdown_file_path}' 时发生错误: {e}")
        return

    # Matches a fence opened with ```json (any letter case) and closed with ```.
    # The captured group runs from the first "{" to the last "}" before the
    # closing fence; [\s\S] is used so the match can span newlines, and *? keeps
    # the match from running into later code blocks.
    json_block_pattern = r"```[Jj][Ss][Oo][Nn]\s*(\{[\s\S]*?\})\s*```"

    json_schemas_found = re.findall(json_block_pattern, markdown_content)

    if not json_schemas_found:
        print("未找到 JSON Schema 代码块。")
        return

    # exist_ok avoids the check-then-create race; it also surfaces a clear
    # error when the path already exists but is not a directory.
    directory_existed = os.path.isdir(output_directory)
    try:
        os.makedirs(output_directory, exist_ok=True)
    except OSError as e:
        print(f"错误：创建目录 '{output_directory}' 失败: {e}")
        return
    if not directory_existed:
        print(f"目录 '{output_directory}' 已创建。")

    saved_files_count = 0
    used_names = set()  # case-folded names already written during this run
    for position, schema_str in enumerate(json_schemas_found, start=1):
        try:
            schema_json_data = json.loads(schema_str)
        except json.JSONDecodeError as e:
            print(f"警告：第 {position} 个 JSON Schema 解析失败。已跳过。")
            # Show the start of the offending block to make it easy to locate.
            print(f"问题代码块开头:\n{schema_str[:200]}...")
            print(f"问题：{e}")
            continue

        # The regex only captures text starting with "{", so a successful
        # parse always yields a dict here.
        preferred_name = (
            _file_name_from_id(schema_json_data.get('$id'))
            or _file_name_from_title(schema_json_data.get('title'))
            or f"schema_{position}.json"
        )
        file_name = _claim_unique_name(preferred_name, used_names)
        if file_name != preferred_name:
            print(f"警告：文件名 '{preferred_name}' 重复，已改用 '{file_name}'。")

        output_file_path = os.path.join(output_directory, file_name)

        try:
            with open(output_file_path, 'w', encoding='utf-8') as outfile:
                # ensure_ascii=False keeps non-ASCII text (e.g. Chinese) readable in the output.
                json.dump(schema_json_data, outfile, ensure_ascii=False, indent=4)
        except (OSError, ValueError) as e:
            # ValueError covers names or content that cannot be encoded
            # (embedded NUL, lone surrogates); keep going with the next block.
            print(f"错误：写入文件 '{output_file_path}' 时发生错误: {e}")
            continue

        print(f"Schema 已保存至 '{output_file_path}'。")
        saved_files_count += 1

    if saved_files_count > 0:
        print(f"\n总共保存了 {saved_files_count} 个 JSON Schema 文件。")
    else:
        print("\n未能保存任何 JSON Schema 文件。")

# --- Usage ---
#   python get_schema.py [MARKDOWN_FILE] [OUTPUT_DIR]
if __name__ == "__main__":
    import sys

    # Defaults are resolved next to this script instead of through an absolute
    # path on one developer's machine, so the script runs from any checkout.
    # NOTE(review): the previous hard-coded paths appear to point at this
    # script's own directory - confirm the default file names below.
    script_dir = os.path.dirname(os.path.abspath(__file__))

    # Input Markdown file: first command-line argument, else the bundled example.
    if len(sys.argv) > 1:
        markdown_file = sys.argv[1]
    else:
        markdown_file = os.path.join(script_dir, "交换模型schema示例.md")

    # Output directory: second command-line argument, else "extracted_schemas"
    # next to this script.
    if len(sys.argv) > 2:
        output_dir = sys.argv[2]
    else:
        output_dir = os.path.join(script_dir, "extracted_schemas")

    # Check up front so a missing input produces usage guidance and a
    # non-zero exit status.
    if os.path.exists(markdown_file):
        extract_and_save_json_schemas(markdown_file, output_dir)
    else:
        print(f"指定的 Markdown 文件 '{markdown_file}' 不存在。")
        print("请通过第一个命令行参数指定正确的 Markdown 文件路径。")
        sys.exit(1)