diff --git a/dataset.schema.json b/dataset.schema.json
new file mode 100644
index 00000000..15f69b7a
--- /dev/null
+++ b/dataset.schema.json
@@ -0,0 +1,39 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://github.com/Machine-Learning-for-Medical-Language/cnlp_transformers/blob/main/dataset.schema.json",
+  "title": "Dataset",
+  "description": "JSON file containing a CNLPT Dataset",
+  "type": "object",
+  "additionalProperties": false,
+  "properties": {
+    "data": {
+      "description": "Array of data instances",
+      "type": "array",
+      "items": {
+        "description": "One data instance",
+        "type": "object",
+        "properties": {
+          "text": {
+            "description": "The raw text of the instance",
+            "type": "string"
+          },
+          "id": {
+            "description": "The identifier of the instance",
+            "type": "string"
+          }
+        },
+        "additionalProperties": {
+          "description": "The label for the subtask given by the name of this property",
+          "type": "string"
+        },
+        "required": ["text", "id"],
+        "minProperties": 3
+      }
+    },
+    "metadata": {
+      "description": "Optional in-file metadata specification; must be included if a metadata.json file is absent",
+      "$ref": "metadata.schema.json"
+    }
+  },
+  "required": ["data"]
+}
diff --git a/metadata.schema.json b/metadata.schema.json
new file mode 100644
index 00000000..544957aa
--- /dev/null
+++ b/metadata.schema.json
@@ -0,0 +1,43 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://github.com/Machine-Learning-for-Medical-Language/cnlp_transformers/blob/main/metadata.schema.json",
+  "title": "Metadata",
+  "description": "Metadata for a CNLPT Dataset",
+  "type": "object",
+  "additionalProperties": false,
+  "properties": {
+    "version": {
+      "description": "The version of this specific dataset",
+      "type": "string"
+    },
+    "task": {
+      "description": "Name of the task this dataset contains",
+      "type": "string"
+    },
+    "subtasks": {
+      "description": "List of individual subtasks this dataset has labels for",
+      "type": "array",
+      "items": {
+        "description": "A subtask with a specific output mode",
+        "type": "object",
+        "additionalProperties": false,
+        "properties": {
+          "task_name": {
+            "description": "Name of this subtask",
+            "type": "string"
+          },
+          "output_mode": {
+            "description": "The type of classification that this subtask has labels for",
+            "type": "string",
+            "enum": ["tagging", "relations", "classification", "conceptnorm"]
+          }
+        },
+        "required": ["task_name", "output_mode"]
+      },
+      "minItems": 1,
+      "uniqueItems": true,
+      "$comment": "uniqueItems compares whole subtask objects for equality, so only exact duplicates (same task_name and same output_mode) are rejected. Two subtasks that share a task_name but differ in output_mode still validate; this schema does not express uniqueness by task_name alone, so any such rule has to be enforced by the code that loads the dataset."
+    }
+  },
+  "required": ["task", "subtasks"]
+}