-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcreate_nodes.py
More file actions
223 lines (175 loc) · 7.61 KB
/
create_nodes.py
File metadata and controls
223 lines (175 loc) · 7.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
"""
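Create the gateway and worker instances of the network. If no 'nvidia+docker'
snapshot exists yet, first build an image that has the Nvidia Drivers, Docker
and Docker-compose preinstalled.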
"""
import argparse
import time
import subprocess
import string
import random
from genesiscloud.client import Client, INSTANCE_TYPES
# Command-line interface of the script. Both options are optional as far as
# argparse is concerned; a missing value is passed on as None.
parser = argparse.ArgumentParser(
    # The two fragments need a separating space, otherwise the help output
    # reads "Drivers,docker".
    description='Build an image that has the nvidia Drivers, ' +
                'docker and docker-compose preinstalled.')
parser.add_argument(
    "--api_token",
    type=str,
    help="The API token. Generate a new one under https://account.genesiscloud.com/dashboard/security."
)
parser.add_argument(
    "--ssh_key",
    type=str,
    help="The name of your ssh key. Public key needs to be stored here (" +
         "https://account.genesiscloud.com/dashboard/security) and private key in ~/.ssh/id_rsa ."
)
def check_for_file(public_ip, file_name):
    """
    Check over ssh whether a file exists on a remote instance.

    Runs ``ls <file_name>`` as user ``ubuntu`` on the given host. An attempt
    that yields neither output from ``ls`` nor an error message from ``ls``
    is treated as a failed ssh call and retried after five seconds, up to
    six attempts in total.

    :param public_ip: address of the instance to connect to
    :param file_name: path of the file to look for on the instance
    :return: True if ``ls`` printed something, False if ``ls`` reported an error
    :raises SystemExit: with exit status 1 when all six attempts failed
    """
    command = ['ssh', 'ubuntu@' + public_ip,
               '-o', 'StrictHostKeyChecking=accept-new',
               '-o', 'ConnectTimeout=50',
               'ls ' + file_name]
    for _ in range(6):
        output = subprocess.run(command, capture_output=True)
        # Decode tolerantly: the answer only depends on whether there is any
        # output, so undecodable bytes must not abort the check.
        stdout = output.stdout.decode('utf-8', errors='replace').strip()
        stderr = output.stderr.decode('utf-8', errors='replace')
        if stdout:
            # success
            return True
        # ssh may print its own notices on stderr before the remote command's
        # output (e.g. when a new host key is accepted), so look for the
        # ``ls:`` prefix on every line rather than only at the very start.
        if any(line.startswith('ls:') for line in stderr.splitlines()):
            # ssh call succeeded, but file missing
            return False
        time.sleep(5)
    print('Checking for the presence of the confirmation file failed.')
    # Terminate with a non-zero status so the failure is visible to the caller
    # of the script; the bare ``exit()`` helper would have reported success.
    raise SystemExit(1)
def wait_for_file(ip, filename):
    """
    Block until check_for_file reports that ``filename`` exists on ``ip``.

    :param ip: address of the instance to poll
    :param filename: path of the file to wait for
    """
    while True:
        if check_for_file(ip, filename):
            return
        # not there yet, pause before asking again
        time.sleep(5)
def create_gateway(client, ssh_key_name, name='gateway'):
    """
    Launch the gateway instance and wait until its startup script is done.

    When a snapshot named 'nvidia+docker' exists the instance is booted from
    it with 'refresh_base_image.sh' as startup script. Otherwise it is booted
    from the 'Ubuntu 18.04' image with 'base_image_cloud_init.yml', and once
    the build has finished a new 'nvidia+docker' snapshot is taken from it.
    :param client: client object used for all API calls
    :param ssh_key_name: name of the ssh key to install on the instance
    :param name: name and hostname of the gateway
    :return: the instance object as last fetched via client.Instances.get()
             (or as returned by create() if it was active right away)
    """
    # decide between the prebuilt snapshot and a fresh base image
    existing_snapshots = list(client.Snapshots.find({'name': 'nvidia+docker'}))
    from_scratch = not existing_snapshots
    if from_scratch:
        image = list(client.Images.find({"name": 'Ubuntu 18.04'}))[0]
        script_path = 'base_image_cloud_init.yml'
        notice = 'Building a image where docker is newly installed.'
    else:
        image = existing_snapshots[0]
        script_path = 'refresh_base_image.sh'
        notice = 'Using the prebuild snapshot with docker installed.'
    with open(script_path, 'r') as stream:
        startup_script = stream.read()
    print(notice)

    # look up ssh key, instance type and security groups
    sshkey = list(client.SSHKeys.find({"name": ssh_key_name}))[0]
    instance_type = list(INSTANCE_TYPES.keys())[0]
    security_groups = []
    for group_name in ['standard', 'pygrid']:
        group = next(client.SecurityGroups.find({'name': group_name}))
        security_groups.append(group.id)

    # launch the instance
    instance = client.Instances.create(
        name=name,
        hostname=name,
        ssh_keys=[sshkey.id],
        security_groups=security_groups,
        image=image.id,
        type=instance_type,
        metadata={"startup_script": startup_script},
    )

    # poll every five seconds until the instance reports 'active'
    print('Instance {} is starting.'.format(str(instance.id)))
    while True:
        if instance.status == 'active':
            break
        time.sleep(5)
        instance = client.Instances.get(instance.id)
        print(f"{instance.status}\r", end="")
    time.sleep(5)
    print('instance is active at {}'.format(str(instance.public_ip)))

    # wait for the startup script to leave its marker file
    if not from_scratch:
        wait_for_file(instance.public_ip, '/home/ubuntu/refresh_finished.txt')
        return instance

    wait_for_file(instance.public_ip, '/home/ubuntu/build_finished.txt')
    # freshly built image: keep it as a snapshot for later launches
    snapshot = client.Snapshots.create(name='nvidia+docker',
                                       instance_id=instance.id)
    while snapshot.status == 'creating':
        time.sleep(5)
        snapshot = client.Snapshots.get(snapshot.id)
    return instance
def create_worker(client, ssh_key_name, gateway_ip, name):
    """
    Creates a worker node based on the nvidia+docker snapshot.

    The startup script is read from 'node_starter.sh'; its third and fifth
    lines are replaced by the GATEWAYIP and NAME assignments. The call returns
    once the instance is active and '/home/ubuntu/worker_ready.txt' exists on it.
    :param client: client object used for all API calls
    :param ssh_key_name: name of the ssh key
    :param gateway_ip: ip of the gateway
    :param name: name of the worker
    :return: the instance object as last fetched via client.Instances.get()
             (or as returned by create() if it was active right away)
    :raises IndexError: if the snapshot or ssh key is not found, or if
                        'node_starter.sh' has fewer than five lines
    """
    # pick the right image
    image = list(client.Snapshots.find({'name': 'nvidia+docker'}))[0]
    # assemble the startup script
    with open('node_starter.sh', 'r') as stream:
        startup_script = stream.readlines()
    startup_script[2] = 'GATEWAYIP=' + gateway_ip + '\n'
    # Each replaced line needs its own line terminator; without it the NAME
    # assignment would be fused with the following script line when joined.
    startup_script[4] = 'NAME=' + name + '\n'
    startup_script = ''.join(startup_script)
    # pick the right ssh key
    sshkey = list(client.SSHKeys.find({"name": ssh_key_name}))[0]
    # pick the right security groups
    security_groups = [next(client.SecurityGroups.find({'name': key})).id
                       for key in ['standard', 'pygrid']]
    # pick the right instance type
    instance_type = list(INSTANCE_TYPES.keys())[0]
    # create the instance
    instance = client.Instances.create(name=name,
                                       hostname=name,
                                       ssh_keys=[sshkey.id],
                                       security_groups=security_groups,
                                       image=image.id,
                                       type=instance_type,
                                       metadata={"startup_script":
                                                 startup_script})
    # wait for it to become active
    print('Worker '+name+'('+str(instance.id)+') is starting.')
    while instance.status != 'active':
        time.sleep(5)
        instance = client.Instances.get(instance.id)
        print(f"{instance.status}\r", end="")
    time.sleep(5)
    # wait for the startup script to finish
    wait_for_file(instance.public_ip, '/home/ubuntu/worker_ready.txt')
    return instance
def random_string(n):
    """
    Return a string of ``n`` lowercase ASCII letters, each drawn
    independently with random.choice.
    """
    alphabet = string.ascii_lowercase
    picked = []
    for _ in range(n):
        picked.append(random.choice(alphabet))
    return ''.join(picked)
def get_list_of_names(n_o_names):
    """
    Return ``n_o_names`` distinct names for worker nodes.

    The first names are taken from a fixed list of twelve; if more are
    requested, the remainder is filled with random 7-letter strings.
    :param n_o_names: number of names to produce, must not be negative
    :return: list of ``n_o_names`` names
    :raises ValueError: if ``n_o_names`` is negative
    """
    if n_o_names < 0:
        # A negative count would otherwise slice from the end of the list
        # and silently hand back names nobody asked for.
        raise ValueError('n_o_names must not be negative, got ' + str(n_o_names))
    name_repo = ['Alice', 'Bob', 'Carlos', 'Eve', 'Frank', 'Grace',
                 'Judy', 'Hans', 'Heidi', 'Mike', 'Ted', 'Wendy']
    if n_o_names <= len(name_repo):
        return name_repo[:n_o_names]
    # Fill up with random names, skipping the (unlikely) duplicates so that
    # every returned name is unique.
    taken = set(name_repo)
    random_names = []
    while len(random_names) < n_o_names - len(name_repo):
        candidate = random_string(7)
        if candidate not in taken:
            taken.add(candidate)
            random_names.append(candidate)
    return name_repo + random_names
def create_workers(client, ssh_key_name, gateway_ip, no_workers=1):
    """
    Launch ``no_workers`` worker instances, one after another.

    Names come from get_list_of_names; each worker is started with
    create_worker, so every launch is finished before the next one begins.
    :param client: client object used for all API calls
    :param ssh_key_name: name of the ssh key
    :param gateway_ip: ip of the gateway the workers are configured with
    :param no_workers: number of workers to launch
    :return: list of the instances returned by create_worker, in launch order
             (empty if no worker was requested)
    """
    names = get_list_of_names(no_workers)
    if not names:
        # Nothing to launch; the announcement below needs at least one name.
        return []
    if len(names) == 1:
        print('Launching ' + names[0] + ' as a worker node on a separate instance.')
    else:
        print('Launching ' + ', '.join(names[:-1]) + ' and ' + names[-1] +
              ' as worker nodes on separate instances.')
    nodes = [create_worker(client, ssh_key_name, gateway_ip, name) for name in names]
    return nodes
if __name__ == '__main__':
    # Script entry point: connect to the API, start the gateway, then start
    # the workers and hand them the gateway's private ip.
    args = parser.parse_args()
    api_token_arg = args.api_token
    ssh_key_arg = args.ssh_key
    my_client = Client(api_token_arg)
    if not my_client.connect():
        # Report the failed connection through a non-zero exit status; the
        # bare ``exit()`` helper would have signalled success.
        raise SystemExit(1)
    gateway = create_gateway(my_client, ssh_key_arg)
    create_workers(my_client, ssh_key_arg, gateway.private_ip)