-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdask_cluster_deprecated.py
More file actions
235 lines (220 loc) · 7.67 KB
/
dask_cluster_deprecated.py
File metadata and controls
235 lines (220 loc) · 7.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import argparse
import subprocess
import os
import time
import json
def nodeset_expand(tira):
    """Expand a SLURM-style range token.

    A token like "3-7" expands to [3, 4, 5, 6, 7] (ints); a token with
    no dash is returned unchanged inside a one-element list (still a
    string).

    Raises:
        ValueError: for malformed ranges such as "1-2-3".
    """
    if '-' not in tira:
        return [tira]
    bounds = tira.split('-')
    if len(bounds) != 2:
        raise ValueError("Mal")
    low, high = int(bounds[0]), int(bounds[1])
    return list(range(low, high + 1))
def nodeset_like(tira):
    """Expand a SLURM nodelist string into individual node names.

    Example: "cn-[1-3,7]" -> ["cn-1", "cn-2", "cn-3", "cn-7"].

    The prefix up to (and including) the first '-' is the node-name
    stem; the remainder is a (possibly bracketed) comma-separated list
    of ids and ranges, expanded with nodeset_expand.
    """
    prefix = tira.split('-')[0] + '-'
    spec = tira[tira.find('-') + 1:]
    if '[' in spec:
        spec = spec.replace('[', '')
        spec = spec.replace(']', '')
    expanded = []
    for token in spec.split(','):
        expanded.extend(nodeset_expand(token))
    return [prefix + str(node_id) for node_id in expanded]
def look_in_environment(environment_variable):
    """Return the value of an environment variable.

    Inputs:
        environment_variable: name of the environment variable to read.
    Outputs:
        value_variable: the variable's value, if present.
    Raises:
        KeyError: if the variable is not set in the environment.
    """
    try:
        value_variable = os.environ[environment_variable]
    except KeyError:
        print("Not "+environment_variable+" environment variable!!")
        # Re-raise the caught exception (which carries the variable name)
        # instead of `raise KeyError`, which raised a NEW empty KeyError
        # and discarded that information.
        raise
    return value_variable
def launch_scheduler(preload=None):
    """Launch a dask scheduler bound to the ib0 interface (blocking).

    The scheduler picks a free port (--port 0) and writes its contact
    information to ./scheduler_info.json so workers and clients can
    find it.

    Inputs:
        preload: optional path to a file with code to preload into the
            scheduler process.
    Outputs:
        The CompletedProcess of the dask-scheduler run.
    """
    dask_scheduler = 'dask-scheduler --port 0 --dashboard --interface ib0 --scheduler-file {}'.format(
        './scheduler_info.json')
    if preload is not None:
        dask_scheduler = dask_scheduler + " --preload {}".format(preload)
    print('Command line to create Scheduler: {}'.format(dask_scheduler))
    # Run the command that starts the scheduler. This was commented out,
    # which left the function a no-op even though the __main__ block
    # assigns its return value; it now mirrors launch_worker below.
    process = subprocess.run(dask_scheduler.split(), stdout=subprocess.PIPE)
    return process
def launch_worker(scheduler_file=None, scheduler_address=None, local_folder='/tmp', preload=None):
    """Launch a dask worker attached to an existing scheduler (blocking).

    Either `scheduler_file` or `scheduler_address` must be given; the
    file takes precedence when both are supplied.

    Inputs:
        scheduler_file: path to the scheduler's JSON contact file.
        scheduler_address: scheduler address, e.g. tcp://host:port.
        local_folder: worker's local scratch directory.
        preload: optional file with code to preload into the worker.
    Outputs:
        The CompletedProcess of the dask-worker run.
    Raises:
        ValueError: if neither scheduler locator is provided.
    """
    # Guard clause: we need some way to reach the scheduler.
    if scheduler_file is None and scheduler_address is None:
        raise ValueError('No tengo direccion del scheduler')
    if scheduler_file is not None:
        worker = 'dask-worker --scheduler-file {} --interface ib0 --no-nanny --nthreads 1 --local-directory {}'.format(
            scheduler_file, local_folder)
    else:
        worker = 'dask-worker {} --interface ib0 --no-nanny --nthreads 1 --local-directory {}'.format(
            scheduler_address, local_folder)
    if preload is not None:
        worker = '{} --preload {}'.format(worker, preload)
    print('Command line to create worker: {}'.format(worker))
    # Blocks until the worker process exits.
    return subprocess.run(worker.split(), stdout=subprocess.PIPE)
def test_scheduler_file(json_file_name):
isfile = False
counter = 0
while not isfile:
isfile = os.path.isfile(json_file_name)
if isfile:
return json_file_name
else:
print('NO ESTA!!')
time.sleep(5)
counter = counter + 1
if counter > 10:
raise FileNotFoundError('Not scheduler json!!')
def create_ssh_file(json_file_name):
    """Build the ssh tunnelling command that exposes the dask dashboard.

    Reads the scheduler JSON contact file, extracts the scheduler host
    and dashboard port, and composes a double-hop ssh command
    (local machine -> ft3.cesga.es -> first compute node).

    Inputs:
        json_file_name: path of the scheduler JSON contact file.
    Outputs:
        tira_ssh: the ssh tunnelling command as a string.
    """
    # Block until the scheduler has written its contact file.
    json_file_name = test_scheduler_file(json_file_name)
    # Use a context manager so the file handle is always closed
    # (the original opened the file and never closed it).
    with open(json_file_name) as json_file:
        data = json.load(json_file)
    scheduler_addrs = data['address']
    scheduler_addrs = scheduler_addrs.replace('tcp://', '')
    scheduler_addrs = scheduler_addrs.split(':')[0]
    dashboard_port = data['services']['dashboard']
    dashboard_addrs = scheduler_addrs + ':' + str(dashboard_port)
    try:
        nodes = look_in_environment('SLURM_STEP_NODELIST')
    except KeyError:
        # Not running inside a SLURM step: fall back to the local hostname.
        nodes = os.uname()[1]
    node_name = nodeset_like(nodes)
    log_name = look_in_environment('LOGNAME')
    tira_ssh = 'ssh -t -L {}:localhost:{} {}@ft3.cesga.es ssh -L {}:{} {}'.format(
        dashboard_port,
        dashboard_port,
        log_name,
        dashboard_port,
        dashboard_addrs,
        node_name[0]
    )
    print(tira_ssh)
    return tira_ssh
def create_dask_client(json_file_name="./scheduler_info.json"):
    """Create a distributed.Client from the scheduler's JSON file.

    Waits for the contact file to exist before connecting.

    Inputs:
        json_file_name: path of the scheduler JSON contact file.
    Outputs:
        A connected dask distributed Client.
    """
    from distributed import Client
    # Block until the scheduler has written its contact file.
    verified_file = test_scheduler_file(json_file_name)
    return Client(scheduler_file=verified_file)
if __name__ == "__main__":
    # Command-line entry point: launch a scheduler, worker(s), a full
    # cluster (one task per SLURM rank), and/or helper artifacts
    # (ssh-tunnel command file, dask client).
    FLAGS = None
    parser = argparse.ArgumentParser()
    parser.add_argument('-local', default='/tmp', help='Local Storage')
    parser.add_argument(
        "--scheduler",
        dest="scheduler",
        default=False,
        action="store_true",
        help="Launch only the scheduler",
    )
    parser.add_argument(
        "--worker",
        dest="worker",
        default=False,
        action="store_true",
        help="Launch only the worker(s)",
    )
    parser.add_argument(
        "--dask_cluster",
        dest="dask_cluster",
        default=False,
        action="store_true",
        help="Deployment of complete Cluster",
    )
    parser.add_argument(
        "--ssh_file",
        dest="ssh_file",
        default=False,
        action="store_true",
        help="Create file with ssh tunelling command",
    )
    parser.add_argument(
        "-scheduler_address",
        dest="scheduler_address",
        # BUG FIX: was type=int, which made argparse reject every address
        # of the documented form "tcp://10.120.10.7:8085". Addresses are
        # strings (launch_worker formats them straight into the command).
        type=str,
        default=None,
        help="Scheduler IP. ex: tcp://10.120.10.7:8085",
    )
    parser.add_argument(
        "--client",
        dest="client",
        default=False,
        action="store_true",
        help="Get the dask client",
    )
    parser.add_argument(
        '-preload',
        dest="preload",
        default=None,
        help="File for preload code in dask scheduler and workers."
    )
    FLAGS, unparsed = parser.parse_known_args()
    print(FLAGS)
    if FLAGS.scheduler:
        scheduler_dask = launch_scheduler(
            preload=FLAGS.preload
        )
    if FLAGS.worker:
        if FLAGS.scheduler_address is not None:
            worker_dask = launch_worker(
                scheduler_address=FLAGS.scheduler_address,
                local_folder=FLAGS.local,
                preload=FLAGS.preload
            )
        else:
            # Verify that the scheduler's JSON contact file exists
            # before launching the worker.
            test_scheduler_file('./scheduler_info.json')
            worker_dask = launch_worker(
                scheduler_file='./scheduler_info.json',
                scheduler_address=FLAGS.scheduler_address,
                local_folder=FLAGS.local,
                preload=FLAGS.preload
            )
    if FLAGS.ssh_file:
        ssh_file = create_ssh_file('./scheduler_info.json')
        # `with` guarantees the file is closed even on write errors.
        with open("./ssh_command.txt", "w") as f:
            f.write(ssh_file)
    if FLAGS.client:
        dask_client = create_dask_client("./scheduler_info.json")
        print(dask_client)
    if FLAGS.dask_cluster:
        # Start from a clean slate: a stale contact file would make
        # workers connect to a dead scheduler.
        try:
            os.remove("./scheduler_info.json")
        except FileNotFoundError:
            print('Ya esta borrada')
        final_id = int(look_in_environment('SLURM_PROCID'))
        if final_id == 0:
            # Rank 0 hosts the scheduler.
            scheduler_dask = launch_scheduler(
                preload=FLAGS.preload
            )
        else:
            # Every worker task waits for the cluster's contact file.
            test_scheduler_file('./scheduler_info.json')
            if final_id == 1:
                # Exactly one task writes the ssh-tunnelling command file.
                ssh_file = create_ssh_file('./scheduler_info.json')
                with open("./ssh_command.txt", "w") as f:
                    f.write(ssh_file)
            worker_dask = launch_worker(
                scheduler_file='./scheduler_info.json',
                scheduler_address=None,
                local_folder=FLAGS.local,
                preload=FLAGS.preload
            )