Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 82 additions & 14 deletions esm_runscripts/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
import subprocess
import sys
import re
import psutil
# import psutil
from esm_parser import user_error

class Slurm:
"""
Expand Down Expand Up @@ -152,19 +153,86 @@ def get_job_state(jobid):
str :
The short job state.
"""
state_command = f'squeue -j {str(jobid)} -o "%T"'

squeue_output = subprocess.Popen(
state_command.split(),
stdout = subprocess.PIPE,
stderr = subprocess.PIPE,
).communicate()[0]
out_pattern = 'b\\\'"STATE\"\\\\n"(.+?)"\\\\n\\\''
out_search = re.search(out_pattern, str(squeue_output))
if out_search:
return out_search.group(1)
# state_command = f'squeue -j {str(jobid)} -o "%T"'
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we erase the old code instead of commenting it out? If it's ever needed again, we can always go back through the git history and find what we had before.


# squeue_output = subprocess.Popen(
# state_command.split(),
# stdout = subprocess.PIPE,
# stderr = subprocess.PIPE,
# ).communicate()[0]
# out_pattern = 'b\\\'"STATE\"\\\\n"(.+?)"\\\\n\\\''
# out_search = re.search(out_pattern, str(squeue_output))
# if out_search:
# return out_search.group(1)

# deniz: sacct is much better and persistent compared to squeue. Also
# getoutput returns standard strings compared to byte strings. This
# allows easier regex
command = f'sacct -j {str(jobid)} --parsable --format=jobid,State'
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks much cleaner than what I hacked together this morning for one of my utilities:

  squeue -u $(whoami) -o "%Z %30j %T %M" | grep ${PROJECT_BASE} | cut -f 2- -d' ' | sort

I wish I had learned sacct earlier... 😭

output = subprocess.getoutput(state_command)
# output will be like
# JobID|State|
# 29319673|COMPLETED|
# 29319673.batch|COMPLETED|
# 29319673.0|COMPLETED|
pattern = f'{jobid}\|([A-Z]+)\|'
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is jobid the Slurm job number or the Slurm job name? What if someone uses the same name for multiple jobs? I don't think Slurm technically forbids having the same name twice (even if it is confusing).

Looking at the function header, it seems that it is the Slurm job number, which corresponds to:

SLURM_JOB_ID (and SLURM_JOBID for backwards compatibility)
The ID of the job allocation.

Maybe we can add a link in the docstring and refer to the section "OUTPUT ENVIRONMENT VARIABLES" here: https://slurm.schedmd.com/sbatch.html

match = re.search(pattern, output)

# If regex matches then return the Slurm status. Otherwise, something
# is really wrong
if match:
# return the Slurm job state code, e.g. COMPLETED, RUNNING, CANCELLED, ...
# https://slurm.schedmd.com/sacct.html
return match.group(1)
else:
err_msg = f"Job ID {jobid} does not correspond to a valid Slurm job"
user_error("RUNTIME ERROR", err_msg, 1)


@staticmethod
def job_is_still_running(jobid):
    """
    Returns a boolean if the job is still running.

    The state is obtained from ``Slurm.get_job_state`` and compared with
    the job state codes listed in the sacct documentation:
    https://slurm.schedmd.com/sacct.html

    Parameters
    ----------
    jobid :
        The job identifier, passed unchanged to ``Slurm.get_job_state``.

    Returns
    -------
    bool :
        ``True`` only when the state is ``R`` / ``RUNNING``. Every other
        state (pending, suspended, completed, failed, ...) gives ``False``.

    Notes
    -----
    A state that is not one of the known Slurm state codes is reported
    through ``user_error``.
    """
    # Official Slurm job state codes (short and long form), grouped by
    # meaning. Sets are used because they are only tested for membership.
    wait_states = {"S", "SUSPENDED", "PD", "PENDING", "RQ", "REQUEUED"}

    running_states = {"R", "RUNNING"}

    bad_exit_states = {
        "BF", "BOOT_FAIL",
        "CA", "CANCELLED",
        "DL", "DEADLINE",
        "F", "FAILED",
        "NF", "NODE_FAIL",
        "OOM", "OUT_OF_MEMORY",
        "PR", "PREEMPTED",
        "TO", "TIMEOUT",
    }

    good_exit_states = {"CD", "COMPLETED"}

    # Valid states that do not fit any category above. They must still be
    # part of the known states, otherwise a resizing or revoked job would be
    # rejected as having an invalid state. They are treated as "not running".
    other_states = {"RS", "RESIZING", "RV", "REVOKED"}

    known_states = (
        wait_states
        | running_states
        | bad_exit_states
        | good_exit_states
        | other_states
    )

    # The lookup has to be qualified with the class: inside a static method
    # a bare ``get_job_state`` is resolved as a module-level name, not as a
    # sibling method of the class.
    job_state = Slurm.get_job_state(jobid)

    if job_state not in known_states:
        err_msg = f"job state: {job_state} is not a valid Slurm job state"
        user_error("RUNTIME ERROR", err_msg, 1)

    # TODO: only print this when running in verbose mode
    if job_state in good_exit_states:
        print('already completed')

    return job_state in running_states