Skip to content

Commit 6856e4d

Browse files
author
EchoBT
committed
test: Add comprehensive test suite
- Add unit tests for core, consensus, network, storage - Add integration tests for API - Add security tests - Add performance benchmarks - Add compatibility tests - Add edge case tests - Reorganize test structure into categories - Add job results from terminal-bench evaluation
1 parent 35bc17e commit 6856e4d

File tree

185 files changed

+34581
-14
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

185 files changed

+34581
-14
lines changed

jobs/2025-12-16__16-39-09/adaptive-rejection-sampler__U5yPphc/agent/oracle.txt

Lines changed: 1013 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
{
2+
"task": {
3+
"path": "adaptive-rejection-sampler",
4+
"git_url": "https://github.com/laude-institute/terminal-bench-2.git",
5+
"git_commit_id": "69671fbaac6d67a7ef0dfec016cc38a64ef7a77c",
6+
"overwrite": false,
7+
"download_dir": null,
8+
"source": "terminal-bench"
9+
},
10+
"trial_name": "adaptive-rejection-sampler__U5yPphc",
11+
"trials_dir": "jobs/2025-12-16__16-39-09",
12+
"timeout_multiplier": 1.0,
13+
"agent": {
14+
"name": "oracle",
15+
"import_path": null,
16+
"model_name": null,
17+
"override_timeout_sec": null,
18+
"max_timeout_sec": null,
19+
"kwargs": {}
20+
},
21+
"environment": {
22+
"type": "docker",
23+
"force_build": false,
24+
"delete": true,
25+
"override_cpus": null,
26+
"override_memory_mb": null,
27+
"override_storage_mb": null,
28+
"kwargs": {}
29+
},
30+
"verifier": {
31+
"override_timeout_sec": null,
32+
"max_timeout_sec": null,
33+
"disable": false
34+
},
35+
"job_id": "4fb0d895-53ed-462d-b536-cb05e557469a"
36+
}
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
{
2+
"id": "16348ecb-fd61-4a59-9a28-e2eea3ef514b",
3+
"task_name": "adaptive-rejection-sampler",
4+
"trial_name": "adaptive-rejection-sampler__U5yPphc",
5+
"trial_uri": "file:///home/ubuntu/repo/all/Infinity/Full/platform-chain/jobs/2025-12-16__16-39-09/adaptive-rejection-sampler__U5yPphc",
6+
"task_id": {
7+
"git_url": "https://github.com/laude-institute/terminal-bench-2.git",
8+
"git_commit_id": "69671fbaac6d67a7ef0dfec016cc38a64ef7a77c",
9+
"path": "adaptive-rejection-sampler"
10+
},
11+
"source": "terminal-bench",
12+
"task_checksum": "4fde6e809c475b000ae3ea4135f906ffe4775bccae7dafd0b123f4ecc9fcf697",
13+
"config": {
14+
"task": {
15+
"path": "adaptive-rejection-sampler",
16+
"git_url": "https://github.com/laude-institute/terminal-bench-2.git",
17+
"git_commit_id": "69671fbaac6d67a7ef0dfec016cc38a64ef7a77c",
18+
"overwrite": false,
19+
"download_dir": null,
20+
"source": "terminal-bench"
21+
},
22+
"trial_name": "adaptive-rejection-sampler__U5yPphc",
23+
"trials_dir": "jobs/2025-12-16__16-39-09",
24+
"timeout_multiplier": 1.0,
25+
"agent": {
26+
"name": "oracle",
27+
"import_path": null,
28+
"model_name": null,
29+
"override_timeout_sec": null,
30+
"max_timeout_sec": null,
31+
"kwargs": {}
32+
},
33+
"environment": {
34+
"type": "docker",
35+
"force_build": false,
36+
"delete": true,
37+
"override_cpus": null,
38+
"override_memory_mb": null,
39+
"override_storage_mb": null,
40+
"kwargs": {}
41+
},
42+
"verifier": {
43+
"override_timeout_sec": null,
44+
"max_timeout_sec": null,
45+
"disable": false
46+
},
47+
"job_id": "4fb0d895-53ed-462d-b536-cb05e557469a"
48+
},
49+
"agent_info": {
50+
"name": "oracle",
51+
"version": "1.0.0",
52+
"model_info": null
53+
},
54+
"agent_result": {
55+
"n_input_tokens": null,
56+
"n_cache_tokens": null,
57+
"n_output_tokens": null,
58+
"cost_usd": null,
59+
"rollout_details": null,
60+
"metadata": null
61+
},
62+
"verifier_result": {
63+
"rewards": {
64+
"reward": 1.0
65+
}
66+
},
67+
"exception_info": null,
68+
"started_at": "2025-12-16T16:39:12.996001",
69+
"finished_at": "2025-12-16T16:40:07.112502",
70+
"environment_setup": {
71+
"started_at": "2025-12-16T16:39:12.996745",
72+
"finished_at": "2025-12-16T16:39:15.365182"
73+
},
74+
"agent_setup": {
75+
"started_at": "2025-12-16T16:39:15.365487",
76+
"finished_at": "2025-12-16T16:39:15.365660"
77+
},
78+
"agent_execution": {
79+
"started_at": "2025-12-16T16:39:15.365725",
80+
"finished_at": "2025-12-16T16:39:37.671729"
81+
},
82+
"verifier": {
83+
"started_at": "2025-12-16T16:39:37.671828",
84+
"finished_at": "2025-12-16T16:39:55.799975"
85+
}
86+
}
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
{
2+
"results": {
3+
"tool": {
4+
"name": "pytest",
5+
"version": "8.4.1"
6+
},
7+
"summary": {
8+
"tests": 9,
9+
"passed": 9,
10+
"failed": 0,
11+
"skipped": 0,
12+
"pending": 0,
13+
"other": 0,
14+
"start": 1765903194.15074,
15+
"stop": 1765903195.7155526
16+
},
17+
"tests": [
18+
{
19+
"name": "test_outputs.py::test_ars_function_exists",
20+
"status": "passed",
21+
"duration": 0.0006830068305134773,
22+
"start": 1765903194.6066408,
23+
"stop": 1765903194.607604,
24+
"retries": 0,
25+
"file_path": "test_outputs.py"
26+
},
27+
{
28+
"name": "test_outputs.py::test_can_generate_standard_distribution_samples",
29+
"status": "passed",
30+
"duration": 0.32918688328936696,
31+
"start": 1765903194.6078,
32+
"stop": 1765903194.9374826,
33+
"retries": 0,
34+
"file_path": "test_outputs.py"
35+
},
36+
{
37+
"name": "test_outputs.py::test_has_test_function",
38+
"status": "passed",
39+
"duration": 0.00106425816193223,
40+
"start": 1765903194.9381516,
41+
"stop": 1765903194.9393542,
42+
"retries": 0,
43+
"file_path": "test_outputs.py"
44+
},
45+
{
46+
"name": "test_outputs.py::test_formal_testing_with_known_truth",
47+
"status": "passed",
48+
"duration": 0.328892785590142,
49+
"start": 1765903194.9395168,
50+
"stop": 1765903195.2688258,
51+
"retries": 0,
52+
"file_path": "test_outputs.py"
53+
},
54+
{
55+
"name": "test_outputs.py::test_sample_files_generated",
56+
"status": "passed",
57+
"duration": 0.006135749164968729,
58+
"start": 1765903195.26943,
59+
"stop": 1765903195.2758567,
60+
"retries": 0,
61+
"file_path": "test_outputs.py"
62+
},
63+
{
64+
"name": "test_outputs.py::test_implementation_is_modular",
65+
"status": "passed",
66+
"duration": 0.0005171275697648525,
67+
"start": 1765903195.276044,
68+
"stop": 1765903195.2766724,
69+
"retries": 0,
70+
"file_path": "test_outputs.py"
71+
},
72+
{
73+
"name": "test_outputs.py::test_implementation_handles_errors",
74+
"status": "passed",
75+
"duration": 0.00034605152904987335,
76+
"start": 1765903195.2768118,
77+
"stop": 1765903195.2772608,
78+
"retries": 0,
79+
"file_path": "test_outputs.py"
80+
},
81+
{
82+
"name": "test_outputs.py::test_input_validation_functionality",
83+
"status": "passed",
84+
"duration": 0.1798468604683876,
85+
"start": 1765903195.2773955,
86+
"stop": 1765903195.4574487,
87+
"retries": 0,
88+
"file_path": "test_outputs.py"
89+
},
90+
{
91+
"name": "test_outputs.py::test_log_concavity_functionality",
92+
"status": "passed",
93+
"duration": 0.25703709246590734,
94+
"start": 1765903195.4577022,
95+
"stop": 1765903195.7151,
96+
"retries": 0,
97+
"file_path": "test_outputs.py"
98+
}
99+
]
100+
}
101+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
1

jobs/2025-12-16__16-39-09/adaptive-rejection-sampler__U5yPphc/verifier/test-stdout.txt

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
Hit:1 http://security.ubuntu.com/ubuntu noble-security InRelease
2+
Hit:2 http://archive.ubuntu.com/ubuntu noble InRelease
3+
Hit:3 http://archive.ubuntu.com/ubuntu noble-updates InRelease
4+
Hit:4 http://archive.ubuntu.com/ubuntu noble-backports InRelease
5+
Reading package lists...
6+
Reading package lists...
7+
Building dependency tree...
8+
Reading state information...
9+
The following NEW packages will be installed:
10+
curl
11+
0 upgraded, 1 newly installed, 0 to remove and 9 not upgraded.
12+
Need to get 226 kB of archives.
13+
After this operation, 534 kB of additional disk space will be used.
14+
Get:1 http://archive.ubuntu.com/ubuntu noble-updates/main amd64 curl amd64 8.5.0-2ubuntu10.6 [226 kB]
15+
Fetched 226 kB in 1s (423 kB/s)
16+
Selecting previously unselected package curl.
17+
(Reading database ... (Reading database ... 5%(Reading database ... 10%(Reading database ... 15%(Reading database ... 20%(Reading database ... 25%(Reading database ... 30%(Reading database ... 35%(Reading database ... 40%(Reading database ... 45%(Reading database ... 50%(Reading database ... 55%(Reading database ... 60%(Reading database ... 65%(Reading database ... 70%(Reading database ... 75%(Reading database ... 80%(Reading database ... 85%(Reading database ... 90%(Reading database ... 95%(Reading database ... 100%(Reading database ... 7996 files and directories currently installed.)
18+
Preparing to unpack .../curl_8.5.0-2ubuntu10.6_amd64.deb ...
19+
Unpacking curl (8.5.0-2ubuntu10.6) ...
20+
Setting up curl (8.5.0-2ubuntu10.6) ...
21+
no checksums to verify
22+
installing to /root/.local/bin
23+
uv
24+
uvx
25+
everything's installed!
26+
27+
To add $HOME/.local/bin to your PATH, either restart your shell or run:
28+
29+
source $HOME/.local/bin/env (sh, bash, zsh)
30+
source $HOME/.local/bin/env.fish (fish)
31+
============================= test session starts ==============================
32+
platform linux -- Python 3.13.9, pytest-8.4.1, pluggy-1.6.0
33+
rootdir: /tests
34+
plugins: json-ctrf-0.3.5
35+
collected 9 items
36+
37+
../tests/test_outputs.py ......... [100%]
38+
39+
==================================== PASSES ====================================
40+
=========================== short test summary info ============================
41+
PASSED ../tests/test_outputs.py::test_ars_function_exists
42+
PASSED ../tests/test_outputs.py::test_can_generate_standard_distribution_samples
43+
PASSED ../tests/test_outputs.py::test_has_test_function
44+
PASSED ../tests/test_outputs.py::test_formal_testing_with_known_truth
45+
PASSED ../tests/test_outputs.py::test_sample_files_generated
46+
PASSED ../tests/test_outputs.py::test_implementation_is_modular
47+
PASSED ../tests/test_outputs.py::test_implementation_handles_errors
48+
PASSED ../tests/test_outputs.py::test_input_validation_functionality
49+
PASSED ../tests/test_outputs.py::test_log_concavity_functionality
50+
============================== 9 passed in 1.57s ===============================

0 commit comments

Comments
 (0)