diff --git a/schema/leaderboard.schema.json b/schema/leaderboard.schema.json
new file mode 100644
index 0000000..e02bb10
--- /dev/null
+++ b/schema/leaderboard.schema.json
@@ -0,0 +1,174 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "version": "0.0.1",
+  "type": "object",
+  "description": "Schema for storing and validating LLM evaluation data, including model configuration, prompts, instances, outputs, and evaluation metrics",
+  "required": [
+    "schema_version",
+    "evaluation_id",
+    "model_info",
+    "evaluation_results"
+  ],
+  "properties": {
+    "schema_version": {
+      "type": "string",
+      "description": "Version of the schema used for this evaluation data"
+    },
+    "evaluation_id": {
+      "type": "string",
+      "description": "Unique identifier for this specific evaluation run"
+    },
+    "model_info": {
+      "type": "object",
+      "description": "Complete model specification including basic information, technical configuration and inference settings",
+      "required": [
+        "name",
+        "source_url"
+      ],
+      "properties": {
+        "name": {
+          "type": "string",
+          "description": "Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct)"
+        },
+        "source_url": {
+          "type": "string",
+          "description": "URL for the source of the evaluation data"
+        },
+        "provider_name": {
+          "type": "string",
+          "description": "Name of the provider of the evaluation results."
+        },
+        "developer": {
+          "type": "string",
+          "description": "Name of the organization that developed the model (e.g. 'OpenAI')"
+        },
+        "inference_platform": {
+          "type": "string",
+          "description": "Description of platform used to run the evaluations (e.g. local machine, Bedrock)"
+        }
+      }
+    },
+    "evaluation_results": {
+      "type": "array",
+      "description": "Array of evaluation results",
+      "items": {
+        "type": "object",
+        "required": [
+          "evaluation_name",
+          "metric_config",
+          "score_details"
+        ],
+        "properties": {
+          "evaluation_name": {
+            "type": "string",
+            "description": "Name of the evaluation"
+          },
+          "metric_config": {
+            "type": "object",
+            "description": "Details about the metric",
+            "required": [
+              "lower_is_better"
+            ],
+            "properties": {
+              "evaluation_description": {
+                "type": "string",
+                "description": "Description of the evaluation"
+              },
+              "lower_is_better": {
+                "type": "boolean",
+                "description": "Whether a lower score is better"
+              },
+              "score_type": {
+                "type": "string",
+                "description": "Type of score",
+                "enum": [
+                  "binary",
+                  "continuous",
+                  "levels"
+                ]
+              },
+              "score_level_names": {
+                "type": "array",
+                "description": "Names of the score levels",
+                "items": {
+                  "type": "string"
+                }
+              },
+              "min_score": {
+                "type": "number",
+                "description": "Minimum possible score"
+              },
+              "max_score": {
+                "type": "number",
+                "description": "Maximum possible score"
+              }
+            }
+          },
+          "score_details": {
+            "type": "object",
+            "description": "The score for the evaluation and related details",
+            "required": [
+              "score"
+            ],
+            "properties": {
+              "score": {
+                "type": "number",
+                "description": "The score for the evaluation"
+              },
+              "details": {
+                "type": "string",
+                "description": "Any additional details about the score"
+              }
+            }
+          },
+          "detailed_evaluation_results_url": {
+            "type": "string",
+            "description": "Link to detailed evaluation data"
+          },
+          "generation_config": {
+            "type": "object",
+            "properties": {
+              "generation_args": {
+                "type": "object",
+                "description": "Parameters used to generate results - properties may vary by model type",
+                "properties": {
+                  "temperature": {
+                    "type": [
+                      "null",
+                      "number"
+                    ],
+                    "description": "Sampling temperature"
+                  },
+                  "top_p": {
+                    "type": [
+                      "null",
+                      "number"
+                    ],
+                    "description": "Nucleus sampling parameter"
+                  },
+                  "top_k": {
+                    "type": [
+                      "null",
+                      "number"
+                    ],
+                    "description": "Top-k sampling parameter"
+                  },
+                  "max_tokens": {
+                    "type": "integer",
+                    "minimum": 1,
+                    "description": "Maximum number of tokens to generate"
+                  }
+                },
+                "additionalProperties": true
+              },
+              "additional_details": {
+                "type": "string",
+                "description": "Additional details about how the results for this metric were generated."
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/schema/leaderboard_eval_types.py b/schema/leaderboard_eval_types.py
new file mode 100644
index 0000000..6607c43
--- /dev/null
+++ b/schema/leaderboard_eval_types.py
@@ -0,0 +1,84 @@
+# generated by datamodel-codegen:
+#   filename:  leaderboard.schema.json
+#   timestamp: 2025-10-01T17:57:26+00:00
+
+from __future__ import annotations
+
+from enum import Enum
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel, Field
+
+
+class ModelInfo(BaseModel):
+    name: str = Field(
+        ...,
+        description='Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct)',
+    )
+    source_url: str = Field(
+        ..., description='URL for the source of the evaluation data'
+    )
+    provider_name: Optional[str] = Field(
+        None, description='Name of the provider of the evaluation results.'
+    )
+    developer: Optional[str] = Field(
+        None, description="Name of the organization that developed the model (e.g. 'OpenAI')"
+    )
+    inference_platform: Optional[str] = Field(
+        None,
+        description='Description of platform used to run the evaluations (e.g. local machine, Bedrock)',
+    )
+
+
+class ScoreType(Enum):
+    binary = 'binary'
+    continuous = 'continuous'
+    levels = 'levels'
+
+
+class MetricConfig(BaseModel):
+    evaluation_description: Optional[str] = Field(
+        None, description='Description of the evaluation'
+    )
+    lower_is_better: bool = Field(..., description='Whether a lower score is better')
+    score_type: Optional[ScoreType] = Field(None, description='Type of score')
+    score_level_names: Optional[List[str]] = Field(
+        None, description='Names of the score levels'
+    )
+    min_score: Optional[float] = Field(None, description='Minimum possible score')
+    max_score: Optional[float] = Field(None, description='Maximum possible score')
+
+
+class ScoreDetails(BaseModel):
+    score: float = Field(..., description='The score for the evaluation')
+    details: Optional[str] = Field(
+        None, description='Any additional details about the score'
+    )
+
+
+class EvaluationResult(BaseModel):
+    evaluation_name: str = Field(..., description='Name of the evaluation')
+    metric_config: MetricConfig = Field(..., description='Details about the metric')
+    score_details: ScoreDetails = Field(
+        ..., description='The score for the evaluation and related details'
+    )
+    detailed_evaluation_results_url: Optional[str] = Field(
+        None, description='Link to detailed evaluation data'
+    )
+    generation_config: Optional[Dict[str, Any]] = None
+
+
+class LeaderboardEvaluationResult(BaseModel):
+    schema_version: str = Field(
+        ..., description='Version of the schema used for this evaluation data'
+    )
+    evaluation_id: str = Field(
+        ..., description='Unique identifier for this specific evaluation run'
+    )
+    model_info: ModelInfo = Field(
+        ...,
+        description='Complete model specification including basic information, technical configuration and inference settings',
+    )
+    evaluation_results: List[EvaluationResult] = Field(
+        ..., description='Array of evaluation results'
+    )