
Commit 98a1893

Author: Varun Deep Saini

Updated tests and enhanced kill caller with an offset

Signed-off-by: Varun Deep Saini <varun.23bcs10048@ms.sst.scaler.com>

1 parent: baf371e

File tree: 45 files changed, +811 -683 lines

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
bundle:
  name: wal-chain-test

resources:
  jobs:
    # Linear chain: job_01 -> job_02 -> ... -> job_10
    # Execution order: job_01 first, job_10 last
    job_01:
      name: "job-01"
      description: "first in chain"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_02:
      name: "job-02"
      description: "depends on ${resources.jobs.job_01.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_03:
      name: "job-03"
      description: "depends on ${resources.jobs.job_02.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_04:
      name: "job-04"
      description: "depends on ${resources.jobs.job_03.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_05:
      name: "job-05"
      description: "depends on ${resources.jobs.job_04.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_06:
      name: "job-06"
      description: "depends on ${resources.jobs.job_05.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_07:
      name: "job-07"
      description: "depends on ${resources.jobs.job_06.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_08:
      name: "job-08"
      description: "depends on ${resources.jobs.job_07.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_09:
      name: "job-09"
      description: "depends on ${resources.jobs.job_08.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_10:
      name: "job-10"
      description: "depends on ${resources.jobs.job_09.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
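The ${resources.jobs.job_NN.id} interpolations in the descriptions are what pin the order: a job that references another job's id cannot be created until that id exists, so the deployer has to create job_01 first and job_10 last (the WAL entries in the output below record exactly these depends_on edges). As a rough illustration of that idea only — not the CLI's actual resolver; the deployOrder helper and its regex are invented for this sketch — the ordering reduces to a topological sort over reference edges:

// deps.go - illustrative sketch, NOT the CLI's implementation.
// Shows how a reference like ${resources.jobs.job_01.id} inside another
// job's fields can be read as a dependency edge, which yields the
// creation order the comments above describe (job_01 first, job_10 last).
package main

import (
	"fmt"
	"regexp"
)

var ref = regexp.MustCompile(`\$\{(resources\.jobs\.\w+)\.id\}`)

// deployOrder (hypothetical helper) returns keys so that every
// referenced job precedes the job that references it (DFS post-order;
// cycle detection omitted for brevity).
func deployOrder(fields map[string]string) []string {
	var order []string
	done := map[string]bool{}
	var visit func(key string)
	visit = func(key string) {
		if done[key] {
			return
		}
		done[key] = true
		for _, m := range ref.FindAllStringSubmatch(fields[key], -1) {
			visit(m[1]) // create the referenced job first
		}
		order = append(order, key)
	}
	for key := range fields {
		visit(key)
	}
	return order
}

func main() {
	fmt.Println(deployOrder(map[string]string{
		"resources.jobs.job_01": "first in chain",
		"resources.jobs.job_02": "depends on ${resources.jobs.job_01.id}",
		"resources.jobs.job_03": "depends on ${resources.jobs.job_02.id}",
	}))
	// [resources.jobs.job_01 resources.jobs.job_02 resources.jobs.job_03]
}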

acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml

Lines changed: 5 additions & 0 deletions (generated file; diff not rendered)
Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
=== First deploy (crashes on job_10) ===

>>> errcode [CLI] bundle deploy
Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files...
Deploying resources...
[PROCESS_KILLED]

Exit code: [KILLED]

=== WAL content after crash ===
{"lineage":"[UUID]","serial": [SERIAL]}
{"k":"resources.jobs.job_01","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"first in chain","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-01","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]}}}
{"k":"resources.jobs.job_02","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-02","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_01","label":"${resources.jobs.job_01.id}"}]}}
{"k":"resources.jobs.job_03","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-03","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_02","label":"${resources.jobs.job_02.id}"}]}}
{"k":"resources.jobs.job_04","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-04","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_03","label":"${resources.jobs.job_03.id}"}]}}
{"k":"resources.jobs.job_05","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-05","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_04","label":"${resources.jobs.job_04.id}"}]}}
{"k":"resources.jobs.job_06","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-06","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_05","label":"${resources.jobs.job_05.id}"}]}}
{"k":"resources.jobs.job_07","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-07","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_06","label":"${resources.jobs.job_06.id}"}]}}
{"k":"resources.jobs.job_08","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-08","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_07","label":"${resources.jobs.job_07.id}"}]}}
{"k":"resources.jobs.job_09","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-09","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_08","label":"${resources.jobs.job_08.id}"}]}}

=== Number of jobs saved in WAL ===
9

=== Bundle summary (reads from WAL) ===
Name: wal-chain-test
Target: default
Workspace:
  User: [USERNAME]
  Path: /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default
Resources:
  Jobs:
    job_01:
      Name: job-01
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_02:
      Name: job-02
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_03:
      Name: job-03
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_04:
      Name: job-04
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_05:
      Name: job-05
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_06:
      Name: job-06
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_07:
      Name: job-07
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_08:
      Name: job-08
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_09:
      Name: job-09
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_10:
      Name: job-10
      URL: (not deployed)

=== Second deploy (recovery) ===

>>> [CLI] bundle deploy --force-lock
Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files...
Deploying resources...
Updating deployment state...
Deployment complete!

=== WAL after successful deploy ===
WAL deleted (expected)
Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
echo "=== First deploy (crashes on job_10) ==="
trace errcode $CLI bundle deploy

echo ""
echo "=== WAL content after crash ==="
cat .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "No WAL file"

echo ""
echo "=== Number of jobs saved in WAL ==="
grep -c '"k":"resources.jobs' .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "0"

echo ""
echo "=== Bundle summary (reads from WAL) ==="
$CLI bundle summary

echo ""
echo "=== Second deploy (recovery) ==="
trace $CLI bundle deploy --force-lock

echo ""
echo "=== WAL after successful deploy ==="
cat .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "WAL deleted (expected)"
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
print("test")
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
# Linear chain: job_01 -> job_02 -> ... -> job_10
# Let the first 9 jobs/create calls succeed, then kill the caller on the 10th

[[Server]]
Pattern = "POST /api/2.2/jobs/create"
KillCallerOffset = 9
KillCaller = 1
Response.Body = '{"job_id": 1001}'

[[Server]]
Pattern = "POST /api/2.2/jobs/reset"
Response.Body = '{}'

[[Server]]
Pattern = "GET /api/2.2/jobs/get"
Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}'
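KillCaller makes the stub server terminate the process that issued the matching request; per the commit message ("enhanced kill caller with an offset"), KillCallerOffset defers that kill until the given number of matching requests have been served. So here the first nine POST /api/2.2/jobs/create calls answer normally and the tenth kills the CLI mid-deploy, which is what leaves exactly nine entries in the WAL. A minimal sketch of the counting logic, assuming a plain net/http handler and that the harness knows the spawned CLI's pid (killAfterOffset is an invented name, not the acceptance framework's actual API):

// killafter.go - hypothetical sketch of offset-based caller killing;
// not the acceptance framework's real code.
package testserver

import (
	"net/http"
	"sync/atomic"
	"syscall"
)

// killAfterOffset serves the first `offset` matching requests normally,
// then sends SIGKILL to the caller's pid to simulate a crash mid-deploy,
// leaving any write-ahead log on disk for the recovery test.
func killAfterOffset(offset int64, callerPID int, next http.Handler) http.Handler {
	var served atomic.Int64
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if served.Add(1) <= offset {
			next.ServeHTTP(w, r) // e.g. respond {"job_id": 1001}
			return
		}
		// Offset exhausted: kill the caller before writing a response,
		// so the in-flight jobs/create never completes.
		_ = syscall.Kill(callerPID, syscall.SIGKILL)
	})
}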

acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt

Lines changed: 3 additions & 35 deletions
@@ -1,45 +1,13 @@
 === Creating state file with serial 5 ===
-=== Creating WAL with corrupted entry ===
+=== Creating WAL with corrupted LAST entry ===
 === WAL content ===
 {"lineage":"test-lineage-123","serial": [SERIAL]}
 {"k":"resources.jobs.valid_job","v":{"__id__": "[ID]","state":{"name":"valid-job"}}}
-not valid json - this line should be skipped
 {"k":"resources.jobs.another_valid","v":{"__id__": "[ID]","state":{"name":"another-valid"}}}
-=== Deploy (should recover valid entries, skip corrupted) ===
+not valid json - corrupted last line (partial write from crash)
+=== Deploy (should recover valid entries, skip corrupted last line) ===
 
 >>> [CLI] bundle deploy
-Warning: Single node cluster is not correctly configured
-  at resources.jobs.another_valid.tasks[0].new_cluster
-  in databricks.yml:23:13
-
-num_workers should be 0 only for single-node clusters. To create a
-valid single node cluster please ensure that the following properties
-are correctly set in the cluster specification:
-
-  spark_conf:
-    spark.databricks.cluster.profile: singleNode
-    spark.master: local[*]
-
-  custom_tags:
-    ResourceClass: SingleNode
-
-
-Warning: Single node cluster is not correctly configured
-  at resources.jobs.valid_job.tasks[0].new_cluster
-  in databricks.yml:13:13
-
-num_workers should be 0 only for single-node clusters. To create a
-valid single node cluster please ensure that the following properties
-are correctly set in the cluster specification:
-
-  spark_conf:
-    spark.databricks.cluster.profile: singleNode
-    spark.master: local[*]
-
-  custom_tags:
-    ResourceClass: SingleNode
-
-
 Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-test/default/files...
 Deploying resources...
 Updating deployment state...

acceptance/bundle/deploy/wal/corrupted-wal-entry/script

Lines changed: 6 additions & 4 deletions
@@ -10,19 +10,21 @@ cat > .databricks/bundle/default/resources.json << 'EOF'
 }
 EOF
 
-echo "=== Creating WAL with corrupted entry ==="
+echo "=== Creating WAL with corrupted LAST entry ==="
+# Corrupted last line is expected (partial write from crash) and should be skipped.
+# Valid entries before it should be recovered.
 cat > .databricks/bundle/default/resources.json.wal << 'EOF'
 {"lineage":"test-lineage-123","serial":6}
 {"k":"resources.jobs.valid_job","v":{"__id__":"1111","state":{"name":"valid-job"}}}
-not valid json - this line should be skipped
 {"k":"resources.jobs.another_valid","v":{"__id__":"2222","state":{"name":"another-valid"}}}
+not valid json - corrupted last line (partial write from crash)
 EOF
 
 echo "=== WAL content ==="
 cat .databricks/bundle/default/resources.json.wal
 
-echo "=== Deploy (should recover valid entries, skip corrupted) ==="
-trace $CLI bundle deploy 2>&1 | python3 sort_warnings.py
+echo "=== Deploy (should recover valid entries, skip corrupted last line) ==="
+trace $CLI bundle deploy 2>&1
 
 echo "=== Final state (should have recovered entries) ==="
 cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys | sort)}'
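Both WAL tests pin down the same recovery contract: the WAL is JSON lines — a header carrying lineage and serial, then one {"k": ..., "v": ...} entry per deployed resource — and an unparseable final line (a torn write from the crash) is skipped while the valid entries before it are recovered. A minimal replay sketch under those assumptions (the entry type, the replay function, and the choice to treat mid-file garbage as an error are illustrative, not the CLI's actual code):

// walreplay.go - illustrative sketch of WAL recovery; not the CLI's code.
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"os"
)

type entry struct {
	K string          `json:"k"`
	V json.RawMessage `json:"v"`
}

// replay recovers entries from a JSON-lines WAL. An unparseable line is
// skipped only when it is the last line (a partial write from a crash);
// garbage followed by more data suggests real corruption and is an error.
func replay(path string) (map[string]json.RawMessage, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	state := make(map[string]json.RawMessage)
	sc := bufio.NewScanner(f)
	sc.Scan() // header: {"lineage": "...", "serial": N}
	var torn error
	for sc.Scan() {
		if torn != nil {
			return nil, fmt.Errorf("corrupted non-final WAL line: %w", torn)
		}
		var e entry
		if err := json.Unmarshal(sc.Bytes(), &e); err != nil {
			torn = err // tolerate only if nothing follows
			continue
		}
		state[e.K] = e.V
	}
	return state, sc.Err()
}

func main() {
	state, err := replay(".databricks/bundle/default/resources.json.wal")
	fmt.Println(len(state), err) // e.g. 2 <nil> for the fixture above
}

Run against the fixture written by this script, the sketch would keep valid_job and another_valid and drop the trailing garbage line, matching what the jq check on resources.json expects after deploy.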
