|
3 | 3 | # Term Challenge Comprehensive Docker Integration Test |
4 | 4 | # ============================================================================= |
5 | 5 | # Spins up a 3-instance challenge-server network via Docker Compose and runs |
6 | | -# 44 tests covering: |
| 6 | +# 53 tests covering: |
7 | 7 | # |
8 | | -# 1. Server health & startup |
9 | | -# 2. Challenge configuration |
10 | | -# 3. Validation API |
11 | | -# 4. Evaluation API & scoring |
12 | | -# 5. Custom challenge routes (leaderboard, stats, decay, agent) |
13 | | -# 6. Leaderboard & scoring consistency |
14 | | -# 7. Multi-instance consistency |
15 | | -# 8. Fault tolerance (stop/restart) |
16 | | -# 9. Resource & stability checks |
17 | | -# 10. Edge cases & error handling |
| 8 | +# 1. Server health & startup (4 tests) |
| 9 | +# 2. Health response schema (4 tests) |
| 10 | +# 3. Challenge configuration via /config (4 tests) |
| 11 | +# 4. Validation API via /validate (5 tests) |
| 12 | +# 5. Evaluation API & scoring (5 tests) |
| 13 | +# 6. Custom challenge routes (leaderboard, stats, decay, agent) (5 tests) |
| 14 | +# 7. Leaderboard & scoring consistency (4 tests) |
| 15 | +# 8. Multi-instance consistency (4 tests) |
| 16 | +# 9. Fault tolerance (stop/restart) (4 tests) |
| 17 | +# 10. Resource & stability checks (4 tests) |
| 18 | +# 11. Edge cases & error handling (5 tests) |
| 19 | +# 12. Validate endpoint edge cases (5 tests) |
18 | 20 | # |
19 | 21 | # Usage: |
20 | 22 | # bash tests/docker/test-comprehensive.sh |
@@ -275,7 +277,76 @@ run_test "Health response includes uptime_secs field" test_health_has_uptime_fie |
275 | 277 | run_test "Server uptime increases over time" test_health_uptime_increases |
276 | 278 |
|
277 | 279 | # ============================================================================= |
278 | | -# TEST SUITE 3: Validation API (5 tests) |
| 280 | +# TEST SUITE 3: Challenge Configuration via /config (4 tests) |
| 281 | +# ============================================================================= |
| 282 | + |
# Smoke-checks the /config route on the first server in SERVER_PORTS.
# Outputs: progress/diagnostic lines via log_info.
# Returns: 0 when curl_json yields a non-empty body whose ".name" field is
#          present and not null; 1 otherwise.
test_config_endpoint_responds() {
    local config_body challenge_name

    config_body=$(curl_json "http://localhost:${SERVER_PORTS[0]}/config")
    if [[ -z "${config_body}" ]]; then
        log_info "No response from /config"
        return 1
    fi

    # jq prints the literal "null" for an absent or null field, and nothing at
    # all when the body is not valid JSON; both count as a failure.
    challenge_name=$(jq -r '.name' 2>/dev/null <<<"${config_body}")
    if [[ -z "${challenge_name}" || "${challenge_name}" == "null" ]]; then
        log_info "Config response missing name: ${config_body}"
        return 1
    fi

    log_info "Config endpoint responds with name=${challenge_name}"
    return 0
}
| 299 | + |
# Checks that /config on the first server reports the expected challenge.
# Globals: SERVER_PORTS (read), CHALLENGE_ID (read).
# Returns: 0 when the ".challenge_id" field equals CHALLENGE_ID; 1 otherwise.
test_config_has_challenge_id() {
    local config_body reported_id

    config_body=$(curl_json "http://localhost:${SERVER_PORTS[0]}/config")
    reported_id=$(jq -r '.challenge_id' 2>/dev/null <<<"${config_body}")

    if [[ "${reported_id}" != "${CHALLENGE_ID}" ]]; then
        log_info "Config challenge_id mismatch: got ${reported_id}, expected ${CHALLENGE_ID}"
        return 1
    fi

    log_info "Config challenge_id matches: ${reported_id}"
    return 0
}
| 312 | + |
# Checks that /config on the first server exposes a "features" array.
# Globals: SERVER_PORTS (read).
# Outputs: diagnostic lines via log_info.
# Returns: 0 when ".features" is a JSON array (an empty array is accepted);
#          1 when the field is absent, null, not an array, or the body is not
#          parseable JSON.
test_config_has_features() {
    local response
    response=$(curl_json "http://localhost:${SERVER_PORTS[0]}/config")

    # Inspect the type of the field itself. Checking only for "not null" would
    # let a string, number, object or boolean pass as a "features array".
    local features_type
    features_type=$(jq -r '.features | type' 2>/dev/null <<<"${response}")

    case "${features_type}" in
        array)
            ;;
        '' | null)
            # Empty output means jq could not evaluate the filter at all.
            log_info "Config missing features field: ${response}"
            return 1
            ;;
        *)
            log_info "Config features is ${features_type}, expected array: ${response}"
            return 1
            ;;
    esac

    local features count
    features=$(jq '.features' 2>/dev/null <<<"${response}")
    count=$(jq 'length' 2>/dev/null <<<"${features}")
    log_info "Config has ${count} features: ${features}"
    return 0
}
| 329 | + |
# Checks that /config on the first server exposes a "limits" object.
# Globals: SERVER_PORTS (read).
# Outputs: diagnostic lines via log_info.
# Returns: 0 when ".limits" is a JSON object; 1 when the field is absent,
#          null, of another type, or the body is not parseable JSON.
test_config_has_limits() {
    local response
    response=$(curl_json "http://localhost:${SERVER_PORTS[0]}/config")

    # Verify the field's type. A plain "not null" check would also accept a
    # string, number, array or boolean as the "limits object".
    local limits_type
    limits_type=$(jq -r '.limits | type' 2>/dev/null <<<"${response}")

    case "${limits_type}" in
        object)
            ;;
        '' | null)
            # Empty output means jq could not evaluate the filter at all.
            log_info "Config missing limits field: ${response}"
            return 1
            ;;
        *)
            log_info "Config limits is ${limits_type}, expected object: ${response}"
            return 1
            ;;
    esac

    local limits
    limits=$(jq '.limits' 2>/dev/null <<<"${response}")
    log_info "Config has limits: ${limits}"
    return 0
}
| 342 | + |
# Suite 3 registration. The table alternates a human-readable title with the
# name of the test function; each pair is handed to run_test in listed order.
config_suite_cases=(
    "GET /config endpoint responds"        test_config_endpoint_responds
    "Config contains correct challenge_id" test_config_has_challenge_id
    "Config contains features array"       test_config_has_features
    "Config contains limits object"        test_config_has_limits
)
for ((case_idx = 0; case_idx < ${#config_suite_cases[@]}; case_idx += 2)); do
    run_test "${config_suite_cases[case_idx]}" "${config_suite_cases[case_idx + 1]}"
done
| 347 | + |
| 348 | +# ============================================================================= |
| 349 | +# TEST SUITE 4: Validation API via /validate (5 tests) |
279 | 350 | # ============================================================================= |
280 | 351 |
|
281 | 352 | test_validate_valid_submission() { |
@@ -396,7 +467,7 @@ run_test "Missing required fields rejected" test_validate_missing_required_field |
396 | 467 | run_test "Invalid score range (>1.0) rejected" test_validate_invalid_score_range |
397 | 468 |
|
398 | 469 | # ============================================================================= |
399 | | -# TEST SUITE 4: Evaluation API & Scoring (5 tests) |
| 470 | +# TEST SUITE 5: Evaluation API & Scoring (5 tests) |
400 | 471 | # ============================================================================= |
401 | 472 |
|
402 | 473 | test_eval_all_tasks_passed() { |
@@ -565,7 +636,7 @@ run_test "Evaluation returns execution_time_ms" test_eval_returns_execution_time |
565 | 636 | run_test "Evaluation echoes back request_id" test_eval_returns_request_id |
566 | 637 |
|
567 | 638 | # ============================================================================= |
568 | | -# TEST SUITE 5: Custom Challenge Routes (5 tests) |
| 639 | +# TEST SUITE 6: Custom Challenge Routes (5 tests) |
569 | 640 | # ============================================================================= |
570 | 641 |
|
571 | 642 | test_leaderboard_returns_json_array() { |
@@ -644,7 +715,7 @@ run_test "GET /agent/:hotkey/score returns 404 for unknown" test_agent_unknown_h |
644 | 715 | run_test "GET /agent/:hotkey/submissions returns count" test_agent_submissions_endpoint |
645 | 716 |
|
646 | 717 | # ============================================================================= |
647 | | -# TEST SUITE 6: Leaderboard & Scoring Consistency (4 tests) |
| 718 | +# TEST SUITE 7: Leaderboard & Scoring Consistency (4 tests) |
648 | 719 | # ============================================================================= |
649 | 720 |
|
650 | 721 | test_leaderboard_has_entries_after_evaluations() { |
@@ -725,7 +796,7 @@ run_test "Leaderboard sorted by score descending" test_leaderboard_sorted_by_sco |
725 | 796 | run_test "Leaderboard ranks start at 1" test_leaderboard_ranks_sequential |
726 | 797 |
|
727 | 798 | # ============================================================================= |
728 | | -# TEST SUITE 7: Multi-Instance Consistency (4 tests) |
| 799 | +# TEST SUITE 8: Multi-Instance Consistency (4 tests) |
729 | 800 | # ============================================================================= |
730 | 801 |
|
731 | 802 | test_all_servers_same_version() { |
@@ -823,7 +894,7 @@ run_test "All 3 servers healthy simultaneously" test_all_servers_healthy_simulta |
823 | 894 | run_test "Independent evaluation produces same score" test_independent_evaluation_on_each_server |
824 | 895 |
|
825 | 896 | # ============================================================================= |
826 | | -# TEST SUITE 8: Fault Tolerance (4 tests) |
| 897 | +# TEST SUITE 9: Fault Tolerance (4 tests) |
827 | 898 | # ============================================================================= |
828 | 899 |
|
829 | 900 | test_network_survives_single_server_stop() { |
@@ -917,7 +988,7 @@ run_test "Restarted server serves requests" test_restarted_server_serves_request |
917 | 988 | run_test "All servers recover after restart" test_all_servers_recover_after_restart |
918 | 989 |
|
919 | 990 | # ============================================================================= |
920 | | -# TEST SUITE 9: Resource & Stability (4 tests) |
| 991 | +# TEST SUITE 10: Resource & Stability (4 tests) |
921 | 992 | # ============================================================================= |
922 | 993 |
|
923 | 994 | test_server_memory_within_limits() { |
@@ -993,7 +1064,7 @@ run_test "No panics or fatal errors in logs" test_no_panics_in_logs |
993 | 1064 | run_test "No crash loops detected" test_no_crash_loops |
994 | 1065 |
|
995 | 1066 | # ============================================================================= |
996 | | -# TEST SUITE 10: Edge Cases & Error Handling (5 tests) |
| 1067 | +# TEST SUITE 11: Edge Cases & Error Handling (5 tests) |
997 | 1068 | # ============================================================================= |
998 | 1069 |
|
999 | 1070 | test_unknown_route_returns_404() { |
@@ -1122,6 +1193,115 @@ run_test "Empty body to /evaluate returns error" test_empty_body_to_evaluate |
1122 | 1193 | run_test "Large evaluation (50 tasks) succeeds" test_large_number_of_task_results |
1123 | 1194 | run_test "Concurrent evaluations all succeed" test_concurrent_evaluations |
1124 | 1195 |
|
| 1196 | +# ============================================================================= |
| 1197 | +# TEST SUITE 12: Validate Endpoint Edge Cases (5 tests) |
| 1198 | +# ============================================================================= |
| 1199 | + |
# Posts a fully populated submission to /validate on the first server.
# Globals: SERVER_PORTS (read).
# Returns: 0 when the reply's ".valid" field is true; 1 otherwise.
test_validate_endpoint_valid_data() {
    local payload reply verdict

    payload='{
            "data": {
                "agent_hash": "abc123",
                "miner_hotkey": "5GrwvaEF5zXb26Fz9rcQpDWS57CtERHpNehXCPcNoHGKutQY",
                "epoch": 1,
                "task_results": [
                    {"task_id": "task-1", "passed": true, "score": 0.9, "execution_time_ms": 1000, "test_output": "ok", "agent_output": "done", "error": null}
                ]
            }
        }'

    reply=$(curl_json_quiet -X POST "http://localhost:${SERVER_PORTS[0]}/validate" \
        -d "${payload}")
    verdict=$(jq -r '.valid' 2>/dev/null <<<"${reply}")

    if [[ "${verdict}" != "true" ]]; then
        log_info "Valid data rejected by /validate: ${reply}"
        return 1
    fi

    log_info "Valid data passes /validate"
    return 0
}
| 1222 | + |
# Posts {"data": null} to /validate on the first server.
# Globals: SERVER_PORTS (read).
# Returns: 0 when the reply's ".valid" field is false; 1 otherwise.
test_validate_endpoint_null_data() {
    local reply verdict

    reply=$(curl_json_quiet -X POST "http://localhost:${SERVER_PORTS[0]}/validate" \
        -d '{"data": null}')
    verdict=$(jq -r '.valid' 2>/dev/null <<<"${reply}")

    if [[ "${verdict}" != "false" ]]; then
        log_info "Null data not rejected by /validate: ${reply}"
        return 1
    fi

    log_info "Null data correctly fails /validate"
    return 0
}
| 1236 | + |
# Posts a submission whose miner_hotkey is the empty string to /validate on
# the first server.
# Globals: SERVER_PORTS (read).
# Returns: 0 when the reply's ".valid" field is false; 1 otherwise.
test_validate_endpoint_empty_hotkey() {
    local payload reply verdict

    payload='{
            "data": {
                "agent_hash": "abc123",
                "miner_hotkey": "",
                "epoch": 1,
                "task_results": [
                    {"task_id": "task-1", "passed": true, "score": 0.9}
                ]
            }
        }'

    reply=$(curl_json_quiet -X POST "http://localhost:${SERVER_PORTS[0]}/validate" \
        -d "${payload}")
    verdict=$(jq -r '.valid' 2>/dev/null <<<"${reply}")

    if [[ "${verdict}" != "false" ]]; then
        log_info "Empty miner_hotkey not rejected: ${reply}"
        return 1
    fi

    log_info "Empty miner_hotkey correctly fails /validate"
    return 0
}
| 1259 | + |
# Posts {"data": null} to /validate on the first server and inspects the shape
# of the reply.
# Globals: SERVER_PORTS (read).
# Returns: 0 when ".errors" in the reply is a JSON array; 1 otherwise.
test_validate_endpoint_returns_errors_array() {
    local reply errors_kind error_total

    reply=$(curl_json_quiet -X POST "http://localhost:${SERVER_PORTS[0]}/validate" \
        -d '{"data": null}')

    # Without -r, jq prints the type name as a JSON string, quotes included.
    errors_kind=$(jq '.errors | type' 2>/dev/null <<<"${reply}")
    if [[ "${errors_kind}" != '"array"' ]]; then
        log_info "Validate response missing errors array: ${reply}"
        return 1
    fi

    error_total=$(jq '.errors | length' 2>/dev/null <<<"${reply}")
    log_info "Validate returns errors array with ${error_total} entries"
    return 0
}
| 1275 | + |
# Posts a minimal submission to /validate on the first server and inspects the
# shape of the reply.
# Globals: SERVER_PORTS (read).
# Returns: 0 when ".warnings" in the reply is a JSON array; 1 otherwise.
test_validate_endpoint_returns_warnings_array() {
    local payload reply warnings_kind

    payload='{
            "data": {
                "agent_hash": "abc123",
                "miner_hotkey": "5GrwvaEF5zXb26Fz9rcQpDWS57CtERHpNehXCPcNoHGKutQY",
                "epoch": 1,
                "task_results": [
                    {"task_id": "task-1", "passed": true, "score": 0.9}
                ]
            }
        }'

    reply=$(curl_json_quiet -X POST "http://localhost:${SERVER_PORTS[0]}/validate" \
        -d "${payload}")

    # Without -r, jq prints the type name as a JSON string, quotes included.
    warnings_kind=$(jq '.warnings | type' 2>/dev/null <<<"${reply}")
    if [[ "${warnings_kind}" != '"array"' ]]; then
        log_info "Validate response missing warnings array: ${reply}"
        return 1
    fi

    log_info "Validate returns warnings array"
    return 0
}
| 1298 | + |
# Suite 12 registration. The table alternates a human-readable title with the
# name of the test function; each pair is handed to run_test in listed order.
validate_edge_suite_cases=(
    "POST /validate accepts valid data"         test_validate_endpoint_valid_data
    "POST /validate rejects null data"          test_validate_endpoint_null_data
    "POST /validate rejects empty miner_hotkey" test_validate_endpoint_empty_hotkey
    "POST /validate returns errors array"       test_validate_endpoint_returns_errors_array
    "POST /validate returns warnings array"     test_validate_endpoint_returns_warnings_array
)
for ((case_idx = 0; case_idx < ${#validate_edge_suite_cases[@]}; case_idx += 2)); do
    run_test "${validate_edge_suite_cases[case_idx]}" "${validate_edge_suite_cases[case_idx + 1]}"
done
| 1304 | + |
1125 | 1305 | # ============================================================================= |
1126 | 1306 | # Collect final logs per server |
1127 | 1307 | # ============================================================================= |
|
0 commit comments