Review notes (git apply and patch(1) skip any text before the first "diff --git" header):
  * Line breaks and indentation were rebuilt from a whitespace-collapsed copy of this
    patch; blank context lines and indent depths are inferred from the hunk counts, so
    check the context against the tree before applying.
  * Review edits: the slurmrestd warning text and the CI banner now name the data_parser
    plugin list, and the over-long Popen line in test_slurm_worker.py is wrapped.
  * The blob ids on the "index" lines are carried over unchanged and no longer match the
    edited post-images (only relevant to 3-way apply).

diff --git a/beeflow/client/core.py b/beeflow/client/core.py
index 7bc2152a8..c5df9c59b 100644
--- a/beeflow/client/core.py
+++ b/beeflow/client/core.py
@@ -279,7 +279,7 @@ def start_slurm_restd():
     slurmrestd_log = '/'.join([bee_workdir, 'logs', 'restd.log'])
     openapi_version = worker_utils.get_slurmrestd_version()
     print(f"Inferred slurmrestd version: {openapi_version}")
-    slurm_args = f'-s openapi/{openapi_version}'
+    slurm_args = f'-d {openapi_version} -s openapi/slurmctld'
     # The following adds the db plugin we opted not to use for now
     # slurm_args = f'-s openapi/{openapi_version},openapi/db{openapi_version}'
     slurm_socket = paths.slurm_socket()
diff --git a/beeflow/common/worker/slurm_worker.py b/beeflow/common/worker/slurm_worker.py
index 9fddd130b..064379c62 100644
--- a/beeflow/common/worker/slurm_worker.py
+++ b/beeflow/common/worker/slurm_worker.py
@@ -233,7 +233,7 @@ def query_task(self,job_id):
             check_slurm_error(data, f'Failed to query job {job_id}, slurm error.')
             # For some versions of slurm, the job_state isn't included on failure
             try:
-                job_state = data['jobs'][0]['job_state']
+                job_state = data['jobs'][0]['job_state'][0]
                 job_info = deepcopy(data['jobs'][0])
 
             except (KeyError, IndexError) as exc:
diff --git a/beeflow/common/worker/utils.py b/beeflow/common/worker/utils.py
index 44d8814ed..df2c9fbdf 100644
--- a/beeflow/common/worker/utils.py
+++ b/beeflow/common/worker/utils.py
@@ -40,15 +40,15 @@ def parse_key_val(pair):
 
 def get_slurmrestd_version():
     """Get the newest slurmrestd version."""
-    resp = subprocess.run(["slurmrestd", "-s", "list"], check=True, stderr=subprocess.PIPE,
-                          text=True).stderr
+    resp = subprocess.run(["slurmrestd", "-d", "list"], check=True, stdout=subprocess.PIPE,
+                          stderr=subprocess.STDOUT, text=True).stdout
     resp = resp.split("\n")
     # Confirm slurmrestd format is the same
     # If the slurmrestd list outputs has changed potentially something else has broken
-    if "Possible OpenAPI plugins" not in resp[0]:
-        print("Slurmrestd OpenAPI format has changed and things may break")
-    api_versions = [line.split('/')[1] for line in resp[1:] if re.search(r"openapi/v\d+\.\d+\.\d+",
-                                                                         line)]
+    if "Possible data_parser plugins" not in resp[0]:
+        print("Slurmrestd data_parser plugin list format has changed and things may break")
+    api_versions = [line.split('/')[1] for line in resp[1:] if
+                    re.search(r"data_parser/v\d+\.\d+\.\d+", line)]
     # Sort the versions and grab the newest one
     newest_api = sorted(api_versions, key=Version, reverse=True)[0]
     return newest_api
diff --git a/beeflow/tests/test_slurm_worker.py b/beeflow/tests/test_slurm_worker.py
index 13dc332e0..34e6ed85b 100644
--- a/beeflow/tests/test_slurm_worker.py
+++ b/beeflow/tests/test_slurm_worker.py
@@ -56,7 +56,8 @@ def slurm_worker(request):
     bee_workdir = os.path.expanduser(f'/tmp/{uuid.uuid4().hex}.tmp')
     os.mkdir(bee_workdir)
     openapi_version = worker_utils.get_slurmrestd_version()
-    proc = subprocess.Popen(f'slurmrestd -s openapi/{openapi_version} unix:{slurm_socket}',
+    proc = subprocess.Popen(f'slurmrestd -d {openapi_version} -s openapi/slurmctld '
+                            f'unix:{slurm_socket}',
                             shell=True)
     time.sleep(1)
     worker_iface = WorkerInterface(worker=SlurmWorker, container_runtime='Charliecloud',
diff --git a/ci/deps_install.sh b/ci/deps_install.sh
index a4f9d5d9b..c4e02119b 100755
--- a/ci/deps_install.sh
+++ b/ci/deps_install.sh
@@ -3,7 +3,7 @@
 set -e
 
 sudo apt-get update
-sudo apt-get install -y slurmctld slurmd slurmrestd munge python3 python3-venv \
+sudo apt-get install -y libhttp-parser-dev libjson-c-dev libjwt-dev munge python3 python3-venv \
     curl build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev \
     libssl-dev libsqlite3-dev libreadline-dev libffi-dev libbz2-dev \
     libmunge-dev \
@@ -16,7 +16,15 @@ curl -O -L https://github.com/hpc/charliecloud/releases/download/v${CHARLIECLOUD
 tar -xvf charliecloud-${CHARLIECLOUD_VERSION}.tar.gz
 (cd charliecloud-${CHARLIECLOUD_VERSION}
     ./configure --prefix=/usr
-    make
+    make -j4
+    sudo make install)
+
+# Install Slurm
+curl -O -L https://download.schedmd.com/slurm/slurm-${SLURM_VERSION}.tar.bz2
+tar -xvf slurm-${SLURM_VERSION}.tar.bz2
+(cd slurm-${SLURM_VERSION}
+    ./configure --prefix=/usr
+    make -j4
     sudo make install)
 
 # Install Python3
diff --git a/ci/env.sh b/ci/env.sh
index 8a725fab4..fbcf2e91a 100644
--- a/ci/env.sh
+++ b/ci/env.sh
@@ -12,6 +12,8 @@ mkdir -p $SLURMD_SPOOL_DIR $SLURM_STATE_SAVE_LOCATION $LOG_DIR
 export SLURMCTLD_LOG=$LOG_DIR/slurmctld.log
 export SLURMD_LOG=$LOG_DIR/slurmd.log
 export SLURM_USER=`whoami`
+export SLURMRESTD_SECURITY=disable_user_check
+export SLURM_VERSION=24.11.7
 export MUNGE_SOCKET=/tmp/munge.sock
 export MUNGE_LOG=/tmp/munge.log
 export MUNGE_PID=/tmp/munge.pid
diff --git a/ci/slurm_start.sh b/ci/slurm_start.sh
index 6221277d5..c2c81cba0 100755
--- a/ci/slurm_start.sh
+++ b/ci/slurm_start.sh
@@ -3,10 +3,6 @@
 
 . ./ci/env.sh
 
-printf "#### SLURM VERSION ####\n"
-srun -V
-printf "#######################\n"
-
 # Determine config of CI host
 export NODE_CONFIG=`slurmd -C | head -n 1`
 
@@ -55,6 +51,10 @@ $NODE_CONFIG
 PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP
 EOF
 
+printf "#### SLURM VERSION ####\n"
+srun -V
+printf "#######################\n"
+
 printf "\n\n"
 printf "#### slurm.conf ####\n"
 cat $SLURM_CONF
@@ -82,5 +82,5 @@ srun --mpi=list
 printf "#######################\n"
 printf "\n"
-printf "#### OPENAPI VERSIONS ####\n"
-slurmrestd -s list
-printf "##########################\n"
+printf "#### DATA_PARSER VERSIONS ####\n"
+slurmrestd -d list
+printf "##############################\n"