|
56 | 56 | # or some combination thereof.
|
57 | 57 | # Refer to qstat man page for additional details.
|
58 | 58 | # o(rphaned) is not considered as busy since we assume a node in orphaned state is not present in ASG anymore
|
59 |
| -SGE_BUSY_STATES = ["u", "C", "s", "d", "D", "E", "P"] |
| 59 | +SGE_BUSY_STATES = ["u", "C", "s", "D", "E", "P"] |
| 60 | + |
| 61 | +# This state is set by nodewatcher when the node is locked and is being terminated. |
| 62 | +SGE_DISABLED_STATE = "d" |
60 | 63 |
|
61 | 64 | # If an o(rphaned) state is displayed for a queue instance, it indicates that the queue instance is no longer demanded
|
62 | 65 | # by the current cluster queue configuration or the host group configuration. The queue instance is kept because jobs
|
@@ -133,10 +136,11 @@ def remove_hosts_from_queue(hosts):
|
133 | 136 | def install_sge_on_compute_nodes(hosts, cluster_user):
|
134 | 137 | """Start sge on compute nodes in parallel."""
|
135 | 138 | command = (
|
136 |
| - "sudo sh -c 'cd {0} && {0}/inst_sge -noremote -x -auto /opt/parallelcluster/templates/sge/sge_inst.conf'" |
| 139 | + "sudo sh -c 'ps aux | grep [s]ge_execd || " |
| 140 | + "(cd {0} && {0}/inst_sge -noremote -x -auto /opt/parallelcluster/templates/sge/sge_inst.conf)'" |
137 | 141 | ).format(sge.SGE_ROOT)
|
138 | 142 | hostnames = [host.hostname for host in hosts]
|
139 |
| - result = RemoteCommandExecutor.run_remote_command_on_multiple_hosts(command, hostnames, cluster_user) |
| 143 | + result = RemoteCommandExecutor.run_remote_command_on_multiple_hosts(command, hostnames, cluster_user, timeout=20) |
140 | 144 |
|
141 | 145 | succeeded_hosts = []
|
142 | 146 | for host in hosts:
|
@@ -206,6 +210,7 @@ def get_jobs_info(hostname_filter=None, job_state_filter=None):
|
206 | 210 | def get_pending_jobs_info(max_slots_filter=None, skip_if_state=None):
|
207 | 211 | """
|
208 | 212 | Retrieve the list of pending jobs.
|
| 213 | +
|
209 | 214 | :param max_slots_filter: discard jobs that require a number of slots bigger than the given value
|
210 | 215 | :param skip_if_state: discard jobs that are in the given state
|
211 | 216 | :return: the list of filtered pending jobs.
|
|
0 commit comments