diff --git a/lmeval/benchmark.py b/lmeval/benchmark.py
index 885cca6..2d3a0bb 100644
--- a/lmeval/benchmark.py
+++ b/lmeval/benchmark.py
@@ -490,7 +490,8 @@ def load_benchmark(path: str, archive = None, use_tempfile: bool | None = None)
     media_to_load = []
     for category in benchmark.categories:
         for task in category.tasks:
-            if isinstance(task.scorer.type, ScorerType):
+            scorer_values = set(item.value for item in ScorerType)
+            if task.scorer.type in scorer_values:
                 scorer_name = str(task.scorer.type)
                 stype = ScorerType[scorer_name]
             else:
diff --git a/lmeval/question.py b/lmeval/question.py
index 1db0c07..1970e6e 100644
--- a/lmeval/question.py
+++ b/lmeval/question.py
@@ -37,10 +37,13 @@ class Question(CustomModel):
     # we need it for (de)serialization - automated added by Task.add()
     id: int = Field(default=-1)
     question: str | None = Field(default=None)
+    description: str = Field(default="")
     language: str = Field(default="en")
 
     # answer
     answer: str | None = Field(default=None)
+    # old answer (e.g. in case the answer (label) changed due to a manual review)
+    answer_old: str | None = Field(default=None)
     # additional answers e.g. for multiple choice questions
     additional_answers: List[str] = Field(default_factory=list)
 
@@ -94,6 +97,7 @@ def add_media(self, path: str| Path, filetype: FileType = FileType.auto,
             ".mp4": [FileType.mp4, Modality.video],
             ".py": [FileType.python, Modality.code],
             ".pdf": [FileType.pdf, Modality.document],
+            ".html": [FileType.html, Modality.code],
         }
 
         # auto detection if needed
@@ -124,4 +128,4 @@ def _compute_file_hash(self, path: Path,
 
 class GroupedQuestion(Question):
     metadata: Dict[str, Any]
-    question_set: List[Question]
\ No newline at end of file
+    question_set: List[Question]
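
Note on the benchmark.py hunk: after deserialization, `task.scorer.type` can presumably arrive as a plain string rather than a `ScorerType` member, in which case the old `isinstance` check would not match; the patch instead tests membership against the enum's string values before resolving back to a member. The sketch below is not lmeval code — the `ScorerType` members and the `resolve_scorer_type` helper are stand-ins — and it assumes, as the `ScorerType[scorer_name]` name lookup requires, that member names equal their string values.

# Minimal sketch of the resolution path above (stand-in enum, hypothetical
# helper; assumes member names equal their string values).
from enum import Enum


class ScorerType(Enum):  # stand-in for lmeval's ScorerType
    contains_answer = "contains_answer"
    exact_match = "exact_match"


def resolve_scorer_type(raw: "ScorerType | str") -> ScorerType:
    scorer_values = set(item.value for item in ScorerType)
    if raw in scorer_values:
        # raw is a plain string (e.g. fresh from deserialization):
        # map it back to the enum member via a name lookup.
        return ScorerType[str(raw)]
    # raw is already a ScorerType member; pass it through.
    return raw


assert resolve_scorer_type("exact_match") is ScorerType.exact_match
assert resolve_scorer_type(ScorerType.exact_match) is ScorerType.exact_match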
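
Note on the question.py hunk: per the added comment, `answer_old` preserves the previous label when a manual review changes `answer`. A minimal usage sketch follows, assuming pydantic v2 and trimming `Question` down to the fields touched here; the review swap itself is an illustration, not lmeval API.

# Hypothetical usage of the new fields (field names come from the diff;
# the review workflow shown is an assumption).
from pydantic import BaseModel, Field


class Question(BaseModel):  # trimmed stand-in for lmeval.question.Question
    question: str | None = Field(default=None)
    description: str = Field(default="")
    answer: str | None = Field(default=None)
    answer_old: str | None = Field(default=None)  # label before manual review


q = Question(question="2 + 2 = ?", answer="5")
# During a manual review, keep the previous label around for auditing:
q.answer_old, q.answer = q.answer, "4"
print(q.model_dump())  # answer='4', answer_old='5'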