Commits
30 commits
0993670
Copy the data over from sos-artifacts
ctseng777 Apr 2, 2025
455a3a2
Copied gpt-4-0314
ctseng777 Apr 3, 2025
6dbe5a6
Add Meta-Llama-3.3-70B-Instruct answer (produced by SambaNova Systems)
ctseng777 Apr 3, 2025
b805ba6
Add Meta-Llama-3.1-405B answer
ctseng777 Apr 4, 2025
42d055e
Add Qwen2.5-72B answer
ctseng777 Apr 4, 2025
57a9357
Setup/configurations needed to start getting answers from newer models
ctseng777 Apr 4, 2025
930a93a
In the middle of judging using DeepSeek-R1-Distill-Llama-70B
ctseng777 Apr 4, 2025
a7f88eb
Add opt-125m as a baseline model
ctseng777 Apr 4, 2025
f7a51cd
Some judgement results from DeepSeek-R1
ctseng777 Apr 5, 2025
31ac052
Add more judge results of DeepSeek-R1
ctseng777 Apr 6, 2025
59b9a26
Add the last bit of judge results
ctseng777 Apr 6, 2025
6491f06
Merge some files from the penfever/livebench branch
ctseng777 Apr 6, 2025
f50507e
Upload leaderboard for DeepSeek-R1 and QwQ-32B
ctseng777 Apr 10, 2025
4d0494e
Create QwQ_32B_scores
ctseng777 Apr 10, 2025
6fdc713
Remove model_answer and model_judgment/DeepSeek-R1-Distill-Llama-70B/…
ctseng777 Apr 12, 2025
54fd105
Remove model_judgment/DeepSeek-R1 from git track
ctseng777 Apr 12, 2025
ee91d27
Merge pull request #1 from ctseng777/chiungyit/tryit
ctseng777 Apr 12, 2025
8c21a75
Merge from penfever
ctseng777 Apr 12, 2025
bac019c
Configuration to generate judgement for QwQ_32B
ctseng777 Apr 12, 2025
9e6f230
Merge pull request #2 from ctseng777/qwq
ctseng777 Apr 12, 2025
b23a229
remove untracked files
ctseng777 Apr 12, 2025
22c3ccc
Copy all answer files from sos-artifacts over
ctseng777 Apr 12, 2025
0679352
Change the judge system prompts to curate r1
ctseng777 Apr 12, 2025
1b33868
Use DeepSeek-R1 as judge with refined prompt.
ctseng777 Apr 13, 2025
b33800f
Merge pull request #3 from ctseng777/r1-curation
ctseng777 Apr 13, 2025
16f5a04
Generate more answers
ctseng777 Apr 20, 2025
1c19ffb
Merge pull request #4 from ctseng777/more_answers
ctseng777 Apr 20, 2025
91e607c
Add QwQ judge results
ctseng777 Apr 21, 2025
346b275
Add subscores
ctseng777 Apr 21, 2025
8611bdf
Use QwQ as judge
ctseng777 Apr 23, 2025
13 changes: 12 additions & 1 deletion config/gen_answer_config.yaml
@@ -8,4 +8,15 @@ num_choices: 1

# a list of models to generate answers
model_list:
- gpt-3.5-turbo-0125
#- gpt-3.5-turbo-0125
#- Meta-Llama-3.3-70B-Instruct
#- Meta-Llama-3.1-405B-Instruct
- gemma-3-27b-it
- claude-3.7-sonnet
- gemini-2.5-flash-preview
- grok-3-beta
- cohere-command-R7B
- Qwen-Plus
- Mistral-8B
#- RWKV-v5-World-3B
- Phi4
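
For reference, here is a minimal sketch of how the active model_list entries above would typically be read by an answer-generation script (assuming PyYAML and the config/gen_answer_config.yaml path from this diff; the repository's own gen_answer script may load it differently). Commented-out entries are plain YAML comments, so only the uncommented models come back:

import yaml  # PyYAML

# Path taken from the diff above; adjust if the configs live elsewhere.
with open("config/gen_answer_config.yaml") as f:
    cfg = yaml.safe_load(f)

# Only uncommented entries (gemma-3-27b-it, claude-3.7-sonnet, ...) show up here;
# lines starting with '#' are ignored by the YAML parser.
for model in cfg.get("model_list", []):
    print("generating answers with:", model)
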
38 changes: 35 additions & 3 deletions config/judge_config.yaml
@@ -3,7 +3,7 @@ name: judgment config file for Arena Hard
bench_name: arena-hard-v0.1

# Arena Hard default
judge_model: gpt-4-1106-preview
judge_model: QwQ-32B
reference: False # Optional
ref_model: null

@@ -16,10 +16,42 @@ max_tokens: 4096

regex_pattern: \[\[([AB<>=]+)\]\]
number_of_judgment_attempts: 2
system_prompt: "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"."
system_prompt: "You are an impartial judge of the responses provided by two AI assistants, assistant A and assistant B, to a user prompt. \n\nYou will judge based on the following criteria. \n\n 1. Correctness: whatever information the assistant provides should be factually correct, free of typos or misleading generalizations. The assistant should follow all instructions in the prompt, including style, formatting, and role-playing instructions. Short answers typically score higher on correctness. \n\n 2. Completeness: if the user prompt specifies a particular audience, the response should contain all information necessary for that audience to understand it. Otherwise, the response should contain all information necessary for an average adult human to understand it. \n\n 3. Safety: if, in the course of providing a correct and complete response, the assistant would break any law or potentially cause someone harm, the assistant should respond only to the safe parts of the prompt. \n\n 4. Conciseness: The assistant should not ramble or include unnecessary details. If instructed to omit content, that content should not be present in the reply. Short answers typically score higher on conciseness. \n\n 5. Style: the agent should employ a diverse vocabulary and sentence structure and demonstrate creativity, avoiding formulaic constructions such as unnecessary or long lists, generic introductions, and pat summaries. Unless otherwise specified, the tone should be conversational and friendly.\n\nAdditional guidelines: do not provide your own answers, simply judge the answers provided. Do not judge based on any criteria other than the aforementioned criteria; in particular, do not favor longer responses, or responses stylistically similar to your output. Do not mix criteria while judging; for example, when judging correctness, it is irrelevant how complete the model’s answer is. When in doubt, choose A=B. \n\nBegin your reply by ranking the two assistants according to each of the criteria. For each criteria, provide a brief justification followed by a verdict: e.g., for completeness, you may choose from Completeness: ((A>>B))\n, Completeness: ((A>B))\nCompleteness: ((A=B))\nCompleteness: ((B>A))\nCompleteness: ((B>>A))\n\n After you render your factor-wise judgments and before your render your overall judgments, please think about how you should weight each of your factor-wise judgments for this particular task and knowledge domain. Use what you know about the domain to guide your weighting; is factuality more important here than style, or vice versa? What about the other factors? Consider whether you have weighted all factors reasonably. Consider how important each infraction you have observed is, and whether it should be penalized more strongly. Finally, issue a verdict with a label:\n\n1. Assistant A is much better: [[A>>B]]\n2. Assistant A is better: [[A>B]]\n3. Tie, close to the same: [[A=B]]\n4. Assistant B is better: [[B>A]]\n5. Assistant B is much better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"."

prompt_template: ["<|User Prompt|>\n{question_1}\n\n<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>"]

# Add your model below for evaluation
# DeepSeek-R1-Distill-Llama-70B is in the middle of judging
#model_list:
#- bagel-8b-v1.0
#- gpt-3.5-turbo-0125
#- gpt-4-0613
#- gpt-4-0314 [FIXME] May need to redo
# - Llama-3-8B-Magpie-Align-SFT-v0.2
# - Llama-3-8B-Magpie-Align-v0.2
# - Llama-3-8B-Tulu-330K
# - Llama-3-8B-WildChat
# - llama-3-tulu-2-dpo-8b
# - Meta-Llama-3-8B
# - Meta-Llama-3-8B-Instruct
# - opt-125m


# model_list:
# - bagel-8b-v1.0
# - gpt-3.5-turbo-0125
# - gpt-4-0613
# - gpt-4-0314
# - Llama-3-8B-Magpie-Align-SFT-v0.2 # May need to redo
# - Llama-3-8B-Magpie-Align-v0.2
# - Llama-3-8B-Tulu-330K
# - Llama-3-8B-WildChat
# - llama-3-tulu-2-dpo-8b
# - Meta-Llama-3-8B
# - Meta-Llama-3-8B-Instruct
# - opt-125m

# r1 curation experiment: use sample responses from some of the models that are not in our test set, but are in the repository,
model_list:
- gpt-3.5-turbo-0125
- dolphin-2.9-llama3-8b
- gpt-4o-mini-2024-07-18
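
For reference, a minimal sketch of how the regex_pattern above would extract the final verdict label from a judge reply written in the format the new system prompt asks for (the example reply is hypothetical, and the repository's gen_judgment script may parse responses differently):

import re

# regex_pattern from judge_config.yaml
VERDICT_RE = re.compile(r"\[\[([AB<>=]+)\]\]")

judge_reply = (
    "Correctness: ((A>B))\nCompleteness: ((A=B))\n...\n"
    "My final verdict is that assistant A is better: [[A>B]]"
)

# Take the last bracketed label in case the judge restates the options earlier on.
matches = VERDICT_RE.findall(judge_reply)
verdict = matches[-1] if matches else None
print(verdict)  # -> "A>B"

Presumably the number_of_judgment_attempts: 2 setting covers the case where no such label is found and the judge has to be re-prompted.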