From c29480288451760d9837df114c08f25054a88adb Mon Sep 17 00:00:00 2001 From: pauloestrela Date: Mon, 17 Oct 2022 11:14:08 -0300 Subject: [PATCH 01/13] Add support for proxy, AWS credentials and add comment tags to reduce log output --- common.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/common.py b/common.py index 58aa9a7..e34d9ae 100644 --- a/common.py +++ b/common.py @@ -7,6 +7,11 @@ import boto3 +# set to use proxy and AWS credentials +# os.environ["https_proxy"] = "https://user:password@proxy-server:port" +# os.environ["AWS_ACCESS_KEY_ID"] = "" +# os.environ["AWS_SECRET_ACCESS_KEY"] = "" + dir_path = os.path.dirname(os.path.realpath(__file__)) # Folder where resides the Python files @@ -142,6 +147,7 @@ def get_common(scriptname): # Create a logger logger = get_logger(scriptname, config['LogLevel'], config['LogFileName']) + # comment to reduce output logger.debug('Config: %s' %json.dumps(config, indent=4)) # Validate the structure of config.json @@ -171,6 +177,7 @@ def get_common(scriptname): sys.exit(1) finally: partitions = partitions_json['Partitions'] + # comment to reduce output logger.debug('Partitions: %s' %json.dumps(partitions_json, indent=4)) return logger, config, partitions From 75307f25fb4e53734c61e5fb8674226836f4fa07 Mon Sep 17 00:00:00 2001 From: pauloestrela Date: Mon, 17 Oct 2022 15:54:23 -0300 Subject: [PATCH 02/13] Initial nodegroup hostname support --- common.py | 60 +++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/common.py b/common.py index e34d9ae..17d5c9f 100644 --- a/common.py +++ b/common.py @@ -13,6 +13,10 @@ # os.environ["AWS_SECRET_ACCESS_KEY"] = "" + + + + dir_path = os.path.dirname(os.path.realpath(__file__)) # Folder where resides the Python files logger = None # Global variable for the logging.Logger object @@ -198,11 +202,17 @@ def get_node_name(partition, nodegroup, node_id=''): nodegroup_name = nodegroup['NodeGroupName'] else: nodegroup_name = 
nodegroup - - if node_id == '': - return '%s-%s' %(partition_name, nodegroup_name) - else: - return '%s-%s-%s' %(partition_name, nodegroup_name, node_id) + + if config['NodeNameStartsWithNodeGroupName']: + if node_id == '': + return '%s' %(nodegroup_name) + else: + return '%s%s' %(nodegroup_name, node_id) + else + if node_id == '': + return '%s-%s' %(partition_name, nodegroup_name) + else: + return '%s-%s-%s' %(partition_name, nodegroup_name, node_id) # Return the name of a node [partition_name]-[nodegroup_name][id] @@ -215,7 +225,18 @@ def get_node_range(partition, nodegroup, nb_nodes=None): nb_nodes = nodegroup['MaxNodes'] if nb_nodes > 1: - return '%s-[0-%s]' %(get_node_name(partition, nodegroup), nb_nodes-1) + if config['NodeNameStartsFrom1']: + if config['NodeNameStartsWithNodeGroupName']: + digits=len(str(nb_nodes)) + return '%s[%s-%s]' %(get_node_name(partition, nodegroup),str(1).zfill(digits),nb_nodes) + else + return '%s-[1-%s]' %(get_node_name(partition, nodegroup), nb_nodes) + else + if config['NodeNameStartsWithNodeGroupName']: + digits=len(str(nb_nodes)) + return '%s[%s-%s]' %(get_node_name(partition, nodegroup),str(0).zfill(digits),nb_nodes-1) + else + return '%s-[0-%s]' %(get_node_name(partition, nodegroup), nb_nodes-1) else: return '%s-0' %(get_node_name(partition, nodegroup)) @@ -248,14 +269,23 @@ def expand_hostlist(hostlist): # Take a list of node names in input and return a dict with result[partition_name][nodegroup_name] = list of node ids def parse_node_names(node_names): result = {} + + if config['NodeNameStartsWithNodeGroupName']: + groups = 2 + pattern = '^([-a-zA-Z0-9]+[-a-zA-z])([0-9]+)$' + else + pattern = '^([a-zA-Z0-9]+)-([a-zA-Z0-9]+)-([0-9]+)$' + for node_name in node_names: - # For each node: extract partition name, node group name and node id - pattern = '^([a-zA-Z0-9]+)-([a-zA-Z0-9]+)-([0-9]+)$' match = re.match(pattern, node_name) if match: - partition_name, nodegroup_name, node_id = match.groups() - + if groups == 2: + 
nodegroup_name, node_id = match.groups() + partition_name = get_partition_name(nodegroup_name) + else + partition_name, nodegroup_name, node_id = match.groups() + # Add to result if not partition_name in result: result[partition_name] = {} @@ -278,6 +308,16 @@ def get_partition_nodegroup(partition_name, nodegroup_name): # Return None if it does not exist return None +# Return partition name based on node group name +def get_partition_name(nodegroup_name): + + for partition in partitions: + for nodegroup in partition['NodeGroups']: + if nodegroup['NodeGroupName'] == nodegroup_name: + return partition['PartitionName'] + # ReturnNone if it doesn't exist + return None + # Use 'scontrol update node' to update nodes def update_node(node_name, parameters): From a8947810a1b2be9a30417abd530adafc80ac3ec6 Mon Sep 17 00:00:00 2001 From: pauloestrela Date: Mon, 17 Oct 2022 16:18:26 -0300 Subject: [PATCH 03/13] syntax fixing --- common.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/common.py b/common.py index 17d5c9f..8f67105 100644 --- a/common.py +++ b/common.py @@ -12,11 +12,6 @@ # os.environ["AWS_ACCESS_KEY_ID"] = "" # os.environ["AWS_SECRET_ACCESS_KEY"] = "" - - - - - dir_path = os.path.dirname(os.path.realpath(__file__)) # Folder where resides the Python files logger = None # Global variable for the logging.Logger object @@ -208,7 +203,7 @@ def get_node_name(partition, nodegroup, node_id=''): return '%s' %(nodegroup_name) else: return '%s%s' %(nodegroup_name, node_id) - else + else: if node_id == '': return '%s-%s' %(partition_name, nodegroup_name) else: From 7388e46e786a1d1abeba87d8486637ddddaac83a Mon Sep 17 00:00:00 2001 From: pauloestrela Date: Mon, 17 Oct 2022 16:19:20 -0300 Subject: [PATCH 04/13] syntax fixing --- common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.py b/common.py index 8f67105..474951c 100644 --- a/common.py +++ b/common.py @@ -226,7 +226,7 @@ def get_node_range(partition, nodegroup, 
nb_nodes=None): return '%s[%s-%s]' %(get_node_name(partition, nodegroup),str(1).zfill(digits),nb_nodes) else return '%s-[1-%s]' %(get_node_name(partition, nodegroup), nb_nodes) - else + else: if config['NodeNameStartsWithNodeGroupName']: digits=len(str(nb_nodes)) return '%s[%s-%s]' %(get_node_name(partition, nodegroup),str(0).zfill(digits),nb_nodes-1) From 14732a64577a266790af9a1383dc13edc631a5a3 Mon Sep 17 00:00:00 2001 From: pauloestrela Date: Mon, 17 Oct 2022 16:21:11 -0300 Subject: [PATCH 05/13] Typo fixing --- common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common.py b/common.py index 474951c..c1b5142 100644 --- a/common.py +++ b/common.py @@ -224,13 +224,13 @@ def get_node_range(partition, nodegroup, nb_nodes=None): if config['NodeNameStartsWithNodeGroupName']: digits=len(str(nb_nodes)) return '%s[%s-%s]' %(get_node_name(partition, nodegroup),str(1).zfill(digits),nb_nodes) - else + else: return '%s-[1-%s]' %(get_node_name(partition, nodegroup), nb_nodes) else: if config['NodeNameStartsWithNodeGroupName']: digits=len(str(nb_nodes)) return '%s[%s-%s]' %(get_node_name(partition, nodegroup),str(0).zfill(digits),nb_nodes-1) - else + else: return '%s-[0-%s]' %(get_node_name(partition, nodegroup), nb_nodes-1) else: return '%s-0' %(get_node_name(partition, nodegroup)) From a7c71d9e5b33752300fc96b6c9bcfdd1666784ce Mon Sep 17 00:00:00 2001 From: pauloestrela Date: Tue, 18 Oct 2022 10:10:19 -0300 Subject: [PATCH 06/13] Fixing typos --- common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common.py b/common.py index c1b5142..c3e6fe2 100644 --- a/common.py +++ b/common.py @@ -268,7 +268,7 @@ def parse_node_names(node_names): if config['NodeNameStartsWithNodeGroupName']: groups = 2 pattern = '^([-a-zA-Z0-9]+[-a-zA-z])([0-9]+)$' - else + else: pattern = '^([a-zA-Z0-9]+)-([a-zA-Z0-9]+)-([0-9]+)$' for node_name in node_names: @@ -278,7 +278,7 @@ def parse_node_names(node_names): if groups == 2: nodegroup_name, node_id 
= match.groups() partition_name = get_partition_name(nodegroup_name) - else + else: partition_name, nodegroup_name, node_id = match.groups() # Add to result From 0b6dbc73263ce0fa9b477b1bd440835b44e13443 Mon Sep 17 00:00:00 2001 From: pauloestrela Date: Tue, 18 Oct 2022 14:47:22 -0300 Subject: [PATCH 07/13] Documentation updated to reflect new hostnames behavior --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index f054714..8878e35 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,8 @@ This JSON file specifies the plugin and Slurm configuration parameters. "LogLevel": "STRING", "LogFileName": "STRING", "SlurmBinPath": "STRING", + "NodeNameStartsFrom1": "STRING", + "NodeNameStartsWithNodeGroupName": "STRING", "SlurmConf": { "PrivateData": "STRING", "ResumeProgram": "STRING", @@ -60,6 +62,8 @@ This JSON file specifies the plugin and Slurm configuration parameters. * `LogLevel`: Logging level. Possible values are `CRITICAL`, `ERROR`, `WARNING`, `INFO`, `DEBUG`. Default is `DEBUG`. * `LogFileName`: Full path to the log file location. Default is `PLUGIN_PATH\aws_plugin.log`. * `SlurmBinPath`: Full path to the folder that contains Slurm binaries like `scontrol` or `sinfo`. Example: `/slurm/bin`. +* `NodeNameStartsFrom1`: Optional. By default node number starts from 0, like aws-c5_24xlarge-0. This flag changes this behavior to start from 1. +* `NodeNameStartsWithNodeGroupName`: Optional. By default node name starts with partition name followed by "-". This flag changes this behavior to allow node names to be the node group name followed by a number like: c5_24xlarge001. * `SlurmConf`: These attributes are used by `generate_conf.py` to generate the content that must be appended to the Slurm configuration file. You must specify at least the following attributes: * `PrivateData`: Must be equal to `CLOUD` such that EC2 compute nodes that are idle are returned by Slurm command outputs such as `sinfo`. 
* `ResumeProgram`: Full path to the location of `resume.py`. Example: `/slurm/etc/aws/resume.py`. From 8d41327b4888525b62a440d8f55f8fcdcf6a347b Mon Sep 17 00:00:00 2001 From: pauloestrela Date: Tue, 18 Oct 2022 14:53:10 -0300 Subject: [PATCH 08/13] Fixing new parameters types --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8878e35..d7abe20 100644 --- a/README.md +++ b/README.md @@ -43,8 +43,8 @@ This JSON file specifies the plugin and Slurm configuration parameters. "LogLevel": "STRING", "LogFileName": "STRING", "SlurmBinPath": "STRING", - "NodeNameStartsFrom1": "STRING", - "NodeNameStartsWithNodeGroupName": "STRING", + "NodeNameStartsFrom1": BOOLEAN, + "NodeNameStartsWithNodeGroupName": BOOLEAN, "SlurmConf": { "PrivateData": "STRING", "ResumeProgram": "STRING", From 2ff2faaa1fc46968129d5977a0e569f3de4c697f Mon Sep 17 00:00:00 2001 From: pauloestrela Date: Tue, 18 Oct 2022 16:11:36 -0300 Subject: [PATCH 09/13] Testing scontrol show config --- common.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/common.py b/common.py index c3e6fe2..0dd037e 100644 --- a/common.py +++ b/common.py @@ -12,12 +12,17 @@ # os.environ["AWS_ACCESS_KEY_ID"] = "" # os.environ["AWS_SECRET_ACCESS_KEY"] = "" + dir_path = os.path.dirname(os.path.realpath(__file__)) # Folder where resides the Python files logger = None # Global variable for the logging.Logger object config = None # Global variable for the config parameters partitions = None # Global variable that stores partitions details +arguments = [ 'show', 'config', '|', 'grep', 'SlurmctldHost', '|', 'wc', '-l'] +out = run_scommand('scontrol',arguments) +print out + # Create and return a logging.Logger object # - scriptname: name of the module From 84c7e27c572e87973d5892f6bdfd0fdd4d186965 Mon Sep 17 00:00:00 2001 From: pauloestrela Date: Tue, 18 Oct 2022 16:17:07 -0300 Subject: [PATCH 10/13] Fixing typo --- common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/common.py b/common.py index 0dd037e..77ed642 100644 --- a/common.py +++ b/common.py @@ -21,7 +21,7 @@ arguments = [ 'show', 'config', '|', 'grep', 'SlurmctldHost', '|', 'wc', '-l'] out = run_scommand('scontrol',arguments) -print out +print(out) # Create and return a logging.Logger object From 69cf573315f0959b9590cfaf87e739988d170931 Mon Sep 17 00:00:00 2001 From: pauloestrela Date: Tue, 18 Oct 2022 16:28:24 -0300 Subject: [PATCH 11/13] Change HA teste location --- common.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/common.py b/common.py index 77ed642..a43b2f0 100644 --- a/common.py +++ b/common.py @@ -19,11 +19,6 @@ config = None # Global variable for the config parameters partitions = None # Global variable that stores partitions details -arguments = [ 'show', 'config', '|', 'grep', 'SlurmctldHost', '|', 'wc', '-l'] -out = run_scommand('scontrol',arguments) -print(out) - - # Create and return a logging.Logger object # - scriptname: name of the module # - levelname: log level (DEBUG, INFO, WARNING, ERROR, CRITICAL) @@ -350,3 +345,8 @@ def get_ec2_client(nodegroup): sys.exit(1) else: return boto3.client('ec2', region_name=nodegroup['Region']) + + +arguments = [ 'show', 'config', '|', 'grep', 'SlurmctldHost', '|', 'wc', '-l'] +out = run_scommand('scontrol',arguments) +print(out) \ No newline at end of file From fd04c074e18fa44551b338adab9826c29494616d Mon Sep 17 00:00:00 2001 From: pauloestrela Date: Tue, 18 Oct 2022 16:31:21 -0300 Subject: [PATCH 12/13] Removing HA code --- common.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/common.py b/common.py index a43b2f0..da4125e 100644 --- a/common.py +++ b/common.py @@ -344,9 +344,4 @@ def get_ec2_client(nodegroup): logger.critical('Failed to create a EC2 client - %s' %e) sys.exit(1) else: - return boto3.client('ec2', region_name=nodegroup['Region']) - - -arguments = [ 'show', 'config', '|', 'grep', 'SlurmctldHost', '|', 'wc', '-l'] -out = 
run_scommand('scontrol',arguments) -print(out) \ No newline at end of file + return boto3.client('ec2', region_name=nodegroup['Region']) \ No newline at end of file From 5d7c58557e17e7b7f3991c210edae9d18f1ff995 Mon Sep 17 00:00:00 2001 From: pauloestrela Date: Wed, 19 Oct 2022 08:35:17 -0300 Subject: [PATCH 13/13] Adding High Avaiability support --- common.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/common.py b/common.py index da4125e..acf2914 100644 --- a/common.py +++ b/common.py @@ -344,4 +344,26 @@ def get_ec2_client(nodegroup): logger.critical('Failed to create a EC2 client - %s' %e) sys.exit(1) else: - return boto3.client('ec2', region_name=nodegroup['Region']) \ No newline at end of file + return boto3.client('ec2', region_name=nodegroup['Region']) + + +def check_ha(): + args = ['show', 'config'] + out = run_scommand('scontrol', args) + ctld_hosts = 0 + for line in out: + if 'SlurmctldHost' in line: + ctld_hosts += 1 + if 'ClusterName' in line: + cluster_name = line.split('= ')[-1] # get last list element + if ctld_hosts > 1: + args = ['show', 'cluster', cluster_name] + out = run_scommand('sacctmgr', args) + for line in out: + if cluster_name in line: + primary_ip = re.sub(r'([ ])(\1+)', r'\1', line).split(' ')[2] + hostname = socket.gethostname() + host_ip = socket.gethostbyname(hostname) + if primary_ip != host_ip: + logger.info('This host is not the primary slurmctld. Exiting...') + sys.exit(1) \ No newline at end of file