-
Notifications
You must be signed in to change notification settings - Fork 5
replaced squeue calls with sacct for more robust interface #100
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,7 +5,8 @@ | |
| import subprocess | ||
| import sys | ||
| import re | ||
| import psutil | ||
| # import psutil | ||
| from esm_parser import user_error | ||
|
|
||
| class Slurm: | ||
| """ | ||
|
|
@@ -152,19 +153,86 @@ def get_job_state(jobid): | |
| str : | ||
| The short job state. | ||
| """ | ||
| state_command = f'squeue -j {str(jobid)} -o "%T"' | ||
|
|
||
| squeue_output = subprocess.Popen( | ||
| state_command.split(), | ||
| stdout = subprocess.PIPE, | ||
| stderr = subprocess.PIPE, | ||
| ).communicate()[0] | ||
| out_pattern = 'b\\\'"STATE\"\\\\n"(.+?)"\\\\n\\\'' | ||
| out_search = re.search(out_pattern, str(squeue_output)) | ||
| if out_search: | ||
| return out_search.group(1) | ||
| # state_command = f'squeue -j {str(jobid)} -o "%T"' | ||
|
|
||
| # squeue_output = subprocess.Popen( | ||
| # state_command.split(), | ||
| # stdout = subprocess.PIPE, | ||
| # stderr = subprocess.PIPE, | ||
| # ).communicate()[0] | ||
| # out_pattern = 'b\\\'"STATE\"\\\\n"(.+?)"\\\\n\\\'' | ||
| # out_search = re.search(out_pattern, str(squeue_output)) | ||
| # if out_search: | ||
| # return out_search.group(1) | ||
|
|
||
| # deniz: sacct is much better and persistent compared to squeue. Also | ||
| # getoutput returns standard strings compared to byte strings. This | ||
| # allows easier regex | ||
| command = f'sacct -j {str(jobid)} --parsable --format=jobid,State' | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This looks much cleaner than what I hacked together this morning for one of my utilities: `squeue -u $(whoami) -o "%Z %30j %T %M" | grep ${PROJECT_BASE} | cut -f 2- -d' ' | sort`. I wish I had learned |
||
| output = subprocess.getoutput(state_command) | ||
| # output will be like | ||
| # JobID|State| | ||
| # 29319673|COMPLETED| | ||
| # 29319673.batch|COMPLETED| | ||
| # 29319673.0|COMPLETED| | ||
| pattern = f'{jobid}\|([A-Z]+)\|' | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Looking at the function header, it seems that `jobid` is the Slurm job number, which corresponds to:
Maybe we can add a link in the docstring and refer to the section "OUTPUT ENVIRONMENT VARIABLES" here: https://slurm.schedmd.com/sbatch.html |
||
| match = re.search(pattern, output) | ||
|
|
||
| # If regex matches then return the Slurm status. Otherwise, something | ||
| # is really wrong | ||
| if match: | ||
| # return the Slurm job state code: e.g. COMPLETED, RUNNING, CANCELLED, ... | ||
| # https://slurm.schedmd.com/sacct.html | ||
| return match.group(1) | ||
| else: | ||
| err_msg = f"Job ID {jobid} does not correspond to a valid Slurm job" | ||
| user_error("RUNTIME ERROR", err_msg, 1) | ||
|
|
||
|
|
||
| @staticmethod | ||
| def job_is_still_running(jobid): | ||
| """Returns a boolean if the job is still running""" | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Don't erase the docstring!! |
||
| return psutil.pid_exists(jobid) | ||
| # """Returns a boolean if the job is still running""" | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same as above: maybe we can remove the old code directly rather than just commenting it out; that might also make the diffs easier to read. |
||
| # return psutil.pid_exists(jobid) | ||
|
|
||
| # deniz: these are the official Slurm job state codes, from: | ||
| # https://slurm.schedmd.com/sacct.html | ||
| wait_status_list = ['S', 'SUSPENDED', 'PD', 'PENDING', 'RQ', 'REQUEUED'] | ||
|
|
||
| running_status_list = ['R', 'RUNNING'] | ||
|
|
||
| bad_exit_status_list = ['BF', 'BOOT_FAIL', 'CA', 'CANCELLED', | ||
| 'DL', 'DEADLINE', 'F', 'FAILED', 'NF', 'NODE_FAIL', | ||
| 'OOM', 'OUT_OF_MEMORY', 'PR', 'PREEMPTED', 'TO', 'TIMEOUT'] | ||
|
|
||
| good_exit_status_list = ['COMPLETED'] | ||
|
|
||
| # deniz: could not categorize these 2 | ||
| other_status_list = ['RS', 'RESIZING', 'RV', 'REVOKED'] | ||
|
|
||
| # merge all states | ||
| all_states = wait_status_list + running_status_list + \ | ||
| bad_exit_status_list + good_exit_status_list | ||
|
|
||
| # get the state of the job and check if it is valid | ||
| job_state = get_job_state(jobid) | ||
|
|
||
| if not job_state in all_states: | ||
| err_msg = f"job state: {job_state} is not a valid Slurm job state" | ||
| user_error("RUNTIME ERROR", err_msg, 1) | ||
|
|
||
| # deniz: TODO: add inside the verbose group ??? | ||
| if job_state in good_exit_status_list: | ||
| print('already completed') | ||
|
|
||
| if job_state in running_status_list: | ||
| return True | ||
| # eg. COMPLETED, PENDING, ... | ||
| else: | ||
| return False | ||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we erase old code instead of commenting it out? If it's ever needed again, we can always go back in the git history and figure out what we had before.