Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions slurm_drmaa/drmaa.c
Original file line number Diff line number Diff line change
Expand Up @@ -76,13 +76,20 @@ slurmdrmaa_get_DRM_system( fsd_drmaa_singletone_t *self )
if(slurmdrmaa_version[0] == '\0') /*no locks as drmaa_get_drm_system is usually called only once */
{
#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(20,11,0)
slurm_conf_t * conf_info_msg_ptr = NULL;
slurm_conf_t * conf_info_msg_ptr = NULL;
#else
slurm_ctl_conf_t * conf_info_msg_ptr = NULL;
slurm_ctl_conf_t * conf_info_msg_ptr = NULL;
#endif
if ( slurm_load_ctl_conf ((time_t) NULL, &conf_info_msg_ptr ) == -1 )
{
fsd_log_error(("slurm_load_ctl_conf error: %s",slurm_strerror(slurm_get_errno())));
int _serrno;
#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(24,11,0)
if ( (_serrno = slurm_load_ctl_conf ((time_t) NULL, &conf_info_msg_ptr )) != SLURM_SUCCESS )
{
#else
if ( slurm_load_ctl_conf ((time_t) NULL, &conf_info_msg_ptr ) == -1 )
{
_serrno = slurm_get_errno();
#endif
fsd_log_error(("slurm_load_ctl_conf error: %s",slurm_strerror(_serrno)));
fsd_snprintf(NULL, slurmdrmaa_version, sizeof(slurmdrmaa_version)-1,"SLURM");
}
else
Expand Down Expand Up @@ -192,7 +199,7 @@ slurmdrmaa_wcoredump(
)
{
/**core_dumped = 0;*/
*core_dumped = ((stat)&0200);
*core_dumped = ((stat)&0200);
return DRMAA_ERRNO_SUCCESS;
}

Expand Down
61 changes: 50 additions & 11 deletions slurm_drmaa/job.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,21 @@ slurmdrmaa_job_control( fsd_job_t *self, int action )
job_id_spec.original = self->job_id;
self->job_id = slurmdrmaa_set_job_id(&job_id_spec);

int _serrno;

switch( action )
{
case DRMAA_CONTROL_SUSPEND:
#if SLURM_VERSION_NUMBER > SLURM_VERSION_NUM(14,10,0)
if(slurm_suspend2(self->job_id, NULL) == -1) {
#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(24,11,0)
if(( _serrno = slurm_suspend2(self->job_id, NULL)) != SLURM_SUCCESS) {
#elif SLURM_VERSION_NUMBER > SLURM_VERSION_NUM(14,10,0)
if( slurm_suspend2(self->job_id, NULL) == -1) {
int _serrno = slurm_get_errno();
#else
if(slurm_suspend(fsd_atoi(self->job_id)) == -1) {
int _serrno = slurm_get_errno();
#endif
fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR, "slurm_suspend error: %s,job_id: %s", slurm_strerror(slurm_get_errno()), self->job_id);
fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR, "slurm_suspend error: %s,job_id: %s", slurm_strerror( _serrno ), self->job_id);
}
slurm_self->user_suspended = true;
break;
Expand All @@ -89,17 +95,26 @@ slurmdrmaa_job_control( fsd_job_t *self, int action )
job_desc.job_id = atoi(self->job_id);
job_desc.priority = 0;
job_desc.alloc_sid = 0;
#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(24,11,0)
if(( _serrno = slurm_update_job(&job_desc)) != SLURM_SUCCESS ) {
#else
if(slurm_update_job(&job_desc) == -1) {
fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR, "slurm_update_job error: %s,job_id: %s", slurm_strerror(slurm_get_errno()), self->job_id);
_serrno = slurm_get_errno();
#endif
fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR, "slurm_update_job error: %s,job_id: %s", slurm_strerror( _serrno ), self->job_id);
}
break;
case DRMAA_CONTROL_RESUME:
#if SLURM_VERSION_NUMBER > SLURM_VERSION_NUM(14,10,0)
#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(24,11,0)
if(( _serrno = slurm_resume2(self->job_id, NULL)) != SLURM_SUCCESS ) {
#elif SLURM_VERSION_NUMBER > SLURM_VERSION_NUM(14,10,0)
if(slurm_resume2(self->job_id, NULL) == -1) {
_serrno = slurm_get_errno();
#else
if(slurm_resume(fsd_atoi(self->job_id)) == -1) {
_serrno = slurm_get_errno();
#endif
fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR, "slurm_resume error: %s,job_id: %s", slurm_strerror(slurm_get_errno()), self->job_id);
fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR, "slurm_resume error: %s,job_id: %s", slurm_strerror( _serrno ), self->job_id);
}
slurm_self->user_suspended = false;
break;
Expand All @@ -108,19 +123,29 @@ slurmdrmaa_job_control( fsd_job_t *self, int action )
slurm_init_job_desc_msg(&job_desc);
job_desc.priority = INFINITE;
job_desc.job_id = atoi(self->job_id);
#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(24,11,0)
if((_serrno = slurm_update_job(&job_desc)) != SLURM_SUCCESS ) {
#else
if(slurm_update_job(&job_desc) == -1) {
fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR, "slurm_update_job error: %s,job_id: %s", slurm_strerror(slurm_get_errno()), self->job_id);
_serrno = slurm_get_errno();
#endif
fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR, "slurm_update_job error: %s,job_id: %s", slurm_strerror( _serrno ), self->job_id);
}
break;
case DRMAA_CONTROL_TERMINATE:
#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(21,8,0)
#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(24,11,0)
if(( _serrno = slurm_kill_job2(self->job_id, SIGKILL, 0, NULL)) != SLURM_SUCCESS ) {
#elif SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(21,8,0)
if(slurm_kill_job2(self->job_id, SIGKILL, 0, NULL) == -1) {
_serrno = slurm_get_errno();
#elif SLURM_VERSION_NUMBER > SLURM_VERSION_NUM(14,10,0)
if(slurm_kill_job2(self->job_id, SIGKILL, 0) == -1) {
_serrno = slurm_get_errno();
#else
if(slurm_kill_job(fsd_atoi(self->job_id), SIGKILL, 0) == -1) {
_serrno = slurm_get_errno();
#endif
fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR, "slurm_terminate_job error: %s,job_id: %s", slurm_strerror(slurm_get_errno()), self->job_id);
fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR, "slurm_terminate_job error: %s,job_id: %s", slurm_strerror( _serrno ), self->job_id);
}
break;
default:
Expand Down Expand Up @@ -152,16 +177,21 @@ slurmdrmaa_find_job_info( fsd_job_t *self, job_info_msg_t **job_info ) {

if (! (str_i = strchr( self->job_id, '_' ))) {
/* single job */
#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(24,11,0)
int _slurm_errno;
if (( _slurm_errno = slurm_load_job( job_info, fsd_atoi( self->job_id ), SHOW_ALL)) != SLURM_SUCCESS ) {
#else
if ( slurm_load_job( job_info, fsd_atoi( self->job_id ), SHOW_ALL) ) {
int _slurm_errno = slurm_get_errno();
#endif

if (_slurm_errno == ESLURM_INVALID_JOB_ID) {
self->on_missing(self);
} else if (_slurm_errno == SLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT ||
_slurm_errno == SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR) {
fsd_exc_raise_fmt(FSD_ERRNO_DRM_COMMUNICATION_FAILURE, "slurm_load_jobs error: %s,job_id: %s", slurm_strerror(_slurm_errno), self->job_id);
} else {
fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR, "slurm_load_jobs error: %s,job_id: %s", slurm_strerror(slurm_get_errno()), self->job_id);
fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR, "slurm_load_jobs error: %s,job_id: %s", slurm_strerror(_slurm_errno), self->job_id);
}
}

Expand Down Expand Up @@ -191,8 +221,13 @@ slurmdrmaa_find_job_info( fsd_job_t *self, job_info_msg_t **job_info ) {

fsd_log_debug(( "looking for task (%u) of job (%s)", task_id, parent_job ));

#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(24,11,0)
int _slurm_errno;
if (( _slurm_errno = slurm_load_job( job_info, fsd_atoi( parent_job ), SHOW_ALL)) != SLURM_SUCCESS ) {
#else
if ( slurm_load_job( job_info, fsd_atoi( parent_job ), SHOW_ALL) ) {
int _slurm_errno = slurm_get_errno();
#endif

if (_slurm_errno == ESLURM_INVALID_JOB_ID) {
self->on_missing(self);
Expand All @@ -201,7 +236,7 @@ slurmdrmaa_find_job_info( fsd_job_t *self, job_info_msg_t **job_info ) {
_slurm_errno == SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR) {
fsd_exc_raise_fmt(FSD_ERRNO_DRM_COMMUNICATION_FAILURE, "slurm_load_jobs error: %s,job_id: %s", slurm_strerror(_slurm_errno), self->job_id);
} else {
fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR, "slurm_load_jobs error: %s,job_id: %s", slurm_strerror(slurm_get_errno()), self->job_id);
fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR, "slurm_load_jobs error: %s,job_id: %s", slurm_strerror(_slurm_errno), self->job_id);
}
}

Expand Down Expand Up @@ -444,7 +479,11 @@ slurmdrmaa_job_on_missing( fsd_job_t *self )
job_id_spec_t job_id_spec;
slurmdb_job_cond_t *job_cond = NULL;
slurmdb_job_rec_t *job = NULL;
#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(24,11,0)
list_t *jobs;
#else
List jobs;
#endif

#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(24,5,0)
list_itr_t *itr = NULL;
Expand Down
30 changes: 21 additions & 9 deletions slurm_drmaa/session.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ slurmdrmaa_session_run_bulk(

/* zero out the struct, and set default values */
slurm_init_job_desc_msg( &job_desc );

TRY
{
unsigned i;
Expand All @@ -125,13 +125,18 @@ slurmdrmaa_session_run_bulk(

connection_lock = fsd_mutex_lock( &self->drm_connection_mutex );
slurmdrmaa_job_create_req( self, jt, (fsd_environ_t**)&env , &job_desc );
int _slurm_errno;
#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(24,11,0)
if (_slurm_errno = slurm_submit_batch_job(&job_desc,&submit_response)) {
#else
if (slurm_submit_batch_job(&job_desc,&submit_response)) {
int _slurm_errno = slurm_get_errno();
_slurm_errno = slurm_get_errno();
#endif
if (_slurm_errno == EAGAIN ||
(_slurm_errno >= 5000 && _slurm_errno < 6000)) {
fsd_exc_raise_fmt(FSD_ERRNO_DRM_COMMUNICATION_FAILURE,"slurm_submit_batch_job error: %s", slurm_strerror(_slurm_errno));
} else if (_slurm_errno >= 2000 && _slurm_errno < 4000) {
fsd_exc_raise_fmt(FSD_ERRNO_DENIED_BY_DRM,"slurm_submit_batch_job error: %s", slurm_strerror(slurm_get_errno()));
fsd_exc_raise_fmt(FSD_ERRNO_DENIED_BY_DRM,"slurm_submit_batch_job error: %s", slurm_strerror(_slurm_errno));
} else {
fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,"slurm_submit_batch_job error (%d): %s", _slurm_errno, slurm_strerror(_slurm_errno));
}
Expand All @@ -145,7 +150,16 @@ slurmdrmaa_session_run_bulk(
fsd_log_debug(("job %u submitted on cluster %s", submit_response->job_id, working_cluster_rec->name));

if ( start != 0 || end != 0 || incr != 0 ) {
if ( SLURM_SUCCESS == slurm_load_job( &job_info, submit_response->job_id, 0) ) {
int _serrno;
#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(24,11,0)
if (( _serrno = slurm_load_job( &job_info, submit_response->job_id, 0)) != SLURM_SUCCESS) {
#else
if ( SLURM_SUCCESS != slurm_load_job( &job_info, submit_response->job_id, 0) ) {
_serrno = slurm_get_errno();
#endif
fsd_exc_raise_fmt( FSD_ERRNO_INTERNAL_ERROR,"slurm_load_job: %s",slurm_strerror(_serrno));
}
else {
#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(14, 10, 0)
for (i = 0, v = start; i < n_jobs; i++, v += incr) {
job_ids[i] = fsd_asprintf("%d_%d", submit_response->job_id, v);
Expand All @@ -163,8 +177,6 @@ slurmdrmaa_session_run_bulk(
job->release( job );
job = NULL;
}
} else {
fsd_exc_raise_fmt( FSD_ERRNO_INTERNAL_ERROR,"slurm_load_job: %s",slurm_strerror(slurm_get_errno()));
}
} else {
if (!working_cluster_rec)
Expand All @@ -187,8 +199,8 @@ slurmdrmaa_session_run_bulk(
}
FINALLY
{


if( connection_lock )
fsd_mutex_unlock( &self->drm_connection_mutex );

Expand All @@ -204,7 +216,7 @@ slurmdrmaa_session_run_bulk(

if( fsd_exc_get() != NULL )
fsd_free_vector( job_ids );

slurmdrmaa_free_job_desc(&job_desc);
}
END_TRY
Expand Down
4 changes: 4 additions & 0 deletions slurm_drmaa/slurm_missing.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,16 @@
#ifndef __LL_DRMAA__SLURM_MISSING_H
#define __LL_DRMAA__SLURM_MISSING_H

/* Prototype is compiled only when building against Slurm older than 24.11.
 * Its parameter uses the `List` type.
 * NOTE(review): presumably `List` is no longer provided by Slurm 24.11+
 * headers, which is why the declaration is guarded - confirm. */
#if SLURM_VERSION_NUMBER < SLURM_VERSION_NUM(24,11,0)
extern void * slurm_list_peek (List l);
#endif
/* Prototype is compiled only when building against Slurm older than 24.5.
 * Its parameter uses the `ListIterator` type.
 * NOTE(review): presumably `ListIterator` is unavailable from 24.5 on - confirm. */
#if SLURM_VERSION_NUMBER < SLURM_VERSION_NUM(24,5,0)
extern void * slurm_list_remove (ListIterator i);
#endif

/* Prototype is compiled only when building against Slurm older than 24.11;
 * like slurm_list_peek above, it takes a `List` argument. */
#if SLURM_VERSION_NUMBER < SLURM_VERSION_NUM(24,11,0)
extern int slurm_addto_step_list(List step_list, char *names);
#endif

/* --clusters is not supported with Slurm < 15.08, but these are defined to
* avoid compiler warnings
Expand Down
4 changes: 4 additions & 0 deletions slurm_drmaa/util.c
Original file line number Diff line number Diff line change
Expand Up @@ -690,7 +690,11 @@ slurmdrmaa_unset_job_id(job_id_spec_t *job_id_spec)
void
slurmdrmaa_set_cluster(const char * value)
{
#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(24,11,0)
list_t *cluster_list = NULL;
#else
volatile List cluster_list = NULL;
#endif

fsd_log_enter(( "({value=%s})", value));

Expand Down