Skip to content

Commit 256e17b

Browse files
dthainbtovar
authored andcommitted
Factory: Improved Shutdown of Local Workers (#4196)
* Add batch_queue_remove_mode_t parameter to allow for friendly and unfriendly kill modes. Conflicts: batch_job/src/batch_queue_internal.h * format * Modify factory to shutdown workers in parallel. And, for local workers, send SIGKILL after 30s until gone. * Adjust makeflow to use batch_queue_remove_mode_t. * Extend worker kill from vine_factory to work_queue_factory. * Remove mistaken prune * Remove unused int count * Remove unused int count * After waiting 30s, send the kill signal every 5s until worker exited.
1 parent 0f0ffcf commit 256e17b

15 files changed

+122
-34
lines changed

batch_job/src/batch_queue.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -244,9 +244,9 @@ batch_queue_id_t batch_queue_wait_timeout(struct batch_queue *q, struct batch_jo
244244
return q->module->wait(q, info, stoptime);
245245
}
246246

247-
int batch_queue_remove(struct batch_queue *q, batch_queue_id_t jobid)
247+
int batch_queue_remove(struct batch_queue *q, batch_queue_id_t jobid, batch_queue_remove_mode_t mode)
248248
{
249-
return q->module->remove(q, jobid);
249+
return q->module->remove(q, jobid, mode);
250250
}
251251

252252
/* vim: set noexpandtab tabstop=8: */

batch_job/src/batch_queue.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,12 @@ typedef enum {
8484
BATCH_QUEUE_TYPE_UNKNOWN = -1 /**< An invalid batch queue type. */
8585
} batch_queue_type_t;
8686

87+
/** Indicates how aggressively to remove a batch job. */
88+
typedef enum {
89+
BATCH_QUEUE_REMOVE_MODE_FRIENDLY, /**< Remove a batch job with a friendly signal that allows it to clean up. */
90+
BATCH_QUEUE_REMOVE_MODE_KILL, /**< Remove a batch job by sending a kill signal that terminates quickly. */
91+
} batch_queue_remove_mode_t;
92+
8793
/** Create a new batch queue.
8894
@param type The type of the queue.
8995
@param ssl_key_file The location of the queue manager's ssl key file, if it has one.
@@ -133,7 +139,7 @@ You must still call @ref batch_queue_wait to wait for the removal to complete.
133139
@param jobid The job to be removed.
134140
@return Greater than zero if the job exists and was removed, zero otherwise.
135141
*/
136-
int batch_queue_remove(struct batch_queue *q, batch_queue_id_t jobid);
142+
int batch_queue_remove(struct batch_queue *q, batch_queue_id_t jobid, batch_queue_remove_mode_t mode );
137143

138144
/** Converts a string into a batch queue type.
139145
@param str A string listing all of the known batch queue types (which changes over time.)

batch_job/src/batch_queue_amazon.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -789,7 +789,7 @@ kill the local ssh process forcibly, and then we save
789789
the Amazon instance and delete other expired instances.
790790
*/
791791

792-
static int batch_queue_amazon_remove(struct batch_queue *q, batch_queue_id_t jobid)
792+
static int batch_queue_amazon_remove(struct batch_queue *q, batch_queue_id_t jobid, batch_queue_remove_mode_t mode)
793793
{
794794
struct batch_queue_amazon_info *info;
795795

batch_job/src/batch_queue_cluster.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,7 @@ static batch_queue_id_t batch_queue_cluster_wait(struct batch_queue *q, struct b
393393
return -1;
394394
}
395395

396-
static int batch_queue_cluster_remove(struct batch_queue *q, batch_queue_id_t jobid)
396+
static int batch_queue_cluster_remove(struct batch_queue *q, batch_queue_id_t jobid, batch_queue_remove_mode_t mode)
397397
{
398398
struct batch_job_info *info;
399399

batch_job/src/batch_queue_condor.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,7 @@ static batch_queue_id_t batch_queue_condor_wait(struct batch_queue *q, struct ba
358358
return -1;
359359
}
360360

361-
static int batch_queue_condor_remove(struct batch_queue *q, batch_queue_id_t jobid)
361+
static int batch_queue_condor_remove(struct batch_queue *q, batch_queue_id_t jobid, batch_queue_remove_mode_t mode)
362362
{
363363
char *command = string_format("condor_rm %" PRIbjid, jobid);
364364

batch_job/src/batch_queue_dryrun.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ See the file COPYING for details.
1515
#include "debug.h"
1616
#include "process.h"
1717
#include "macros.h"
18-
#include "stringtools.h"
1918
#include "path.h"
19+
#include "stringtools.h"
2020
#include "xxmalloc.h"
2121

2222
static batch_queue_id_t batch_queue_dryrun_submit(struct batch_queue *q, struct batch_job *bt)
@@ -84,7 +84,7 @@ static batch_queue_id_t batch_queue_dryrun_wait(struct batch_queue *q, struct ba
8484
}
8585
}
8686

87-
static int batch_queue_dryrun_remove(struct batch_queue *q, batch_queue_id_t jobid)
87+
static int batch_queue_dryrun_remove(struct batch_queue *q, batch_queue_id_t jobid, batch_queue_remove_mode_t mode)
8888
{
8989
return 0;
9090
}

batch_job/src/batch_queue_flux.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,7 @@ static batch_queue_id_t batch_queue_flux_wait(struct batch_queue *q, struct batc
258258
return batch_queue_flux_wait_jobid(q, info_out, stoptime, 0);
259259
}
260260

261-
static int batch_queue_flux_remove(struct batch_queue *q, batch_queue_id_t jobid)
261+
static int batch_queue_flux_remove(struct batch_queue *q, batch_queue_id_t jobid, batch_queue_remove_mode_t mode)
262262
{
263263
struct flux_job_info *info = itable_lookup(batch_queue_jobid_info_table, jobid);
264264
if (!info) {

batch_job/src/batch_queue_internal.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ struct batch_queue_module {
3232

3333
batch_queue_id_t (*submit) (struct batch_queue *Q, struct batch_job *bt );
3434
batch_queue_id_t (*wait) (struct batch_queue *Q, struct batch_job_info *info, time_t stoptime);
35-
int (*remove) (struct batch_queue *Q, batch_queue_id_t id);
35+
36+
int (*remove) (struct batch_queue *Q, batch_queue_id_t id, batch_queue_remove_mode_t mode );
3637
};
3738

3839
struct batch_queue {

batch_job/src/batch_queue_k8s.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -349,7 +349,7 @@ static batch_queue_id_t batch_queue_k8s_submit(struct batch_queue *q, struct bat
349349
return jobid;
350350
}
351351

352-
static int batch_queue_k8s_remove(struct batch_queue *q, batch_queue_id_t jobid)
352+
static int batch_queue_k8s_remove(struct batch_queue *q, batch_queue_id_t jobid, batch_queue_remove_mode_t mode)
353353
{
354354

355355
pid_t pid = fork();
@@ -408,7 +408,7 @@ static void batch_queue_k8s_handle_complete_task(char *pod_id, int job_id,
408408
free(info);
409409

410410
list_remove(running_pod_lst, pod_id);
411-
batch_queue_k8s_remove(q, job_id);
411+
batch_queue_k8s_remove(q, job_id, BATCH_QUEUE_REMOVE_MODE_FRIENDLY);
412412

413413
if (curr_k8s_job_info->is_running == 0) {
414414
process_wait(timeout);

batch_job/src/batch_queue_local.c

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -107,10 +107,16 @@ static batch_queue_id_t batch_queue_local_wait(struct batch_queue *q, struct bat
107107
}
108108
}
109109

110-
static int batch_queue_local_remove(struct batch_queue *q, batch_queue_id_t jobid)
110+
/* Note that batch_queue_remove triggers a job to exit, but it must still be returned by batch_queue_wait */
111+
112+
static int batch_queue_local_remove(struct batch_queue *q, batch_queue_id_t jobid, batch_queue_remove_mode_t mode)
111113
{
112-
int max_wait = 5; // maximum seconds we wish to wait for a given process
113-
process_kill_waitpid(jobid, max_wait);
114+
if (mode == BATCH_QUEUE_REMOVE_MODE_FRIENDLY) {
115+
kill(jobid, SIGQUIT);
116+
} else {
117+
kill(jobid, SIGKILL);
118+
}
119+
114120
return 0;
115121
}
116122

0 commit comments

Comments
 (0)