Skip to content

Commit 6caf490

Browse files
verdimrcVerdi March
andauthored
smhp: quality-of-life improvements (#300)
* Selective backport from the playground repo (ppc) - ssh to compute nodes from intra-cluster no longer ask for host key - fix alt (option) shortcuts when connecting from OSX - generate keypair for ubuntu, if home dir doesn't already have one. Use-case: when re-using FSx Lustre filesystem - install git-remote-codecommit to connect to Amazon CodeCommit - install mountpoint-for-s3 - generate how-to install miniconda3 - install additional CLI tools (install-pkgs.sh) * smhp: add friendly name to auto-generated ssh keypair --------- Co-authored-by: Verdi March <[email protected]>
1 parent 24c2156 commit 6caf490

File tree

10 files changed

+169
-4
lines changed

10 files changed

+169
-4
lines changed

1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/config.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,16 @@ class Config:
66
enable_docker_enroot_pyxis = True
77

88
# Set true if you want to install metric exporter software and Prometheus for observability
9-
# DCGM Exporter and EFA Node Exporter are installed on compute nodes,
9+
# DCGM Exporter and EFA Node Exporter are installed on compute nodes,
1010
# Slurm Exporter and Prometheus are installed on controller node.
1111
enable_observability = False
1212

1313
# Set true if you want to install SSSD for ActiveDirectory/LDAP integration.
1414
# You need to configure parameters in SssdConfig as well.
1515
enable_sssd = False
1616

17+
# Set true to install quality-of-life improvements
18+
enable_initsmhp = False
1719

1820
# Configuration parameters for ActiveDirectory/LDAP/SSSD
1921
class SssdConfig:
@@ -29,7 +31,7 @@ class SssdConfig:
2931

3032
# The default bind DN to use for performing LDAP operations
3133
ldap_default_bind_dn = "CN=ReadOnly,OU=Users,OU=hyperpod,DC=hyperpod,DC=abc123,DC=com"
32-
34+
3335
# "password" or "obfuscated_password". Obfuscated password is recommended.
3436
ldap_default_authtok_type = "obfuscated_password"
3537

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#!/bin/bash
2+
3+
[[ "$1" == "" ]] && NODE_TYPE=other || NODE_TYPE="$1"
4+
5+
set -exuo pipefail
6+
7+
# Directory containing this script. BASH_SOURCE[0] is the current file; expanding
# the whole array unquoted could hand realpath several (or word-split) paths.
BIN_DIR=$(dirname "$(realpath "${BASH_SOURCE[0]}")")
8+
chmod ugo+x $BIN_DIR/initsmhp/*.sh
9+
10+
declare -a PKGS_SCRIPTS=(
11+
install-pkgs.sh
12+
install-mount-s3.sh
13+
install-git-remote-codecommit.sh
14+
)
15+
# -p: do not fail (and abort under `set -e`) when the log directory already exists.
mkdir -p /var/log/initsmhp
16+
for i in "${PKGS_SCRIPTS[@]}"; do
17+
bash -x $BIN_DIR/initsmhp/$i &> /var/log/initsmhp/$i.txt \
18+
&& echo "SUCCESS: $i" >> /var/log/initsmhp/initsmhp.txt \
19+
|| echo "FAIL: $i" >> /var/log/initsmhp/initsmhp.txt
20+
done
21+
22+
bash -x $BIN_DIR/initsmhp/fix-profile.sh
23+
bash -x $BIN_DIR/initsmhp/ssh-to-compute.sh
24+
25+
# /opt/ml/config/resource_config.json is not world-readable, so extract only the part that is
26+
# later used for the ssh-keygen comment.
27+
cat /opt/ml/config/resource_config.json | jq '.ClusterConfig' > /opt/initsmhp-cluster_config.json
28+
29+
if [[ "${NODE_TYPE}" == "controller" ]]; then
30+
runuser -l ubuntu $BIN_DIR/initsmhp/gen-keypair-ubuntu.sh
31+
bash -x $BIN_DIR/initsmhp/howto-miniconda.sh
32+
fi
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/bin/bash
2+
3+
set -exuo pipefail
4+
5+
cat << 'EOF' > /opt/inputrc-osx
6+
# A few bash shortcuts when ssh-ing from OSX
7+
"ƒ": forward-word # alt-f
8+
"∫": backward-word # alt-b
9+
"≥": yank-last-arg # alt-.
10+
"∂": kill-word # alt-d
11+
12+
";3D": backward-word # alt-left
13+
";3C": forward-word # alt-right
14+
15+
"\e[1;3D": backward-word ### Alt left
16+
"\e[1;3C": forward-word ### Alt right
17+
EOF
18+
19+
echo -e "\nbind -f /opt/inputrc-osx" >> /etc/profile.d/z99-initsmhp.sh
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/bash
2+
3+
set -exuo pipefail
4+
5+
mkdir -p ~/.ssh
6+
cd ~/.ssh
7+
# Reuse the keypair only when the private key exists and its public key is already
# listed in authorized_keys. The key is matched as a whole-line fixed string (-x -F)
# read from the file, so its characters are never interpreted as regex syntax.
{ test -f id_rsa && grep -qxF -f id_rsa.pub authorized_keys 2> /dev/null ; } && GENERATE_KEYPAIR=0 || GENERATE_KEYPAIR=1
8+
if [[ $GENERATE_KEYPAIR == 1 ]]; then
9+
echo Generate a new keypair...
10+
SSH_KEYGEN_ARGS=""
11+
if [[ -f /opt/initsmhp-cluster_config.json ]]; then
12+
CLUSTER_ARN=$(jq -r ".ClusterArn" /opt/initsmhp-cluster_config.json)
13+
CLUSTER_NAME=$(jq -r ".ClusterName" /opt/initsmhp-cluster_config.json)
14+
SSH_KEYGEN_ARGS="-C $(whoami)@$(hostname)__${CLUSTER_NAME}__${CLUSTER_ARN}"
15+
fi
16+
17+
ssh-keygen -t rsa -q -f id_rsa -N "" ${SSH_KEYGEN_ARGS}
18+
cat id_rsa.pub >> authorized_keys
19+
else
20+
echo Use existing keypair...
21+
fi
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/bin/bash
2+
3+
set -exuo pipefail
4+
5+
cat << 'EOF' > /etc/skel/HOWTO-install-miniconda.md
6+
# How to install miniconda
7+
8+
**Pre-requisite:** home directory is located on the shared `/fsx` filesystem. If this is not the
9+
case, please contact your sysadmins.
10+
11+
```bash
12+
cd ~
13+
curl -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
14+
chmod 755 Miniconda3-latest-Linux-x86_64.sh
15+
./Miniconda3-latest-Linux-x86_64.sh -b -f
16+
~/miniconda3/bin/conda init
17+
18+
# For conda command to become available, close and re-open your current shell.
19+
exit
20+
# ... then reconnect back, and now you should see shell prompts start with (base).
21+
22+
# Example based on https://pytorch.org/get-started/locally/
23+
conda create -y -n pt220-p312 python=3.12
24+
conda activate pt220-p312
25+
# Make sure your shell prompts start with (pt220-p312).
26+
conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia -y
27+
```
28+
EOF
29+
30+
runuser -u ubuntu -- cp /etc/skel/HOWTO-install-miniconda.md ~ubuntu/
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#!/bin/bash
2+
3+
set -exuo pipefail
4+
5+
apt install -y -o DPkg::Lock::Timeout=120 python3-jmespath
6+
/usr/bin/pip3 install git-remote-codecommit
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#!/bin/bash
2+
3+
# This script only installs the mount-s3 package. Users must mount the S3 themselves as part of
4+
# their cluster usage.
5+
6+
set -exuo pipefail
7+
cd /tmp
8+
# -O pins the output name so a re-run overwrites the previous download instead of
# saving mount-s3.deb.1 and leaving the stale ./mount-s3.deb to be installed below.
wget -O mount-s3.deb https://s3.amazonaws.com/mountpoint-s3-release/latest/x86_64/mount-s3.deb
9+
apt-get install -y -o DPkg::Lock::Timeout=120 ./mount-s3.deb
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/bash
2+
3+
set -exuo pipefail
4+
5+
# https://askubuntu.com/a/1431746
6+
export NEEDRESTART_MODE=a
7+
export DEBIAN_FRONTEND=noninteractive
8+
9+
add-apt-repository ppa:git-core/ppa -y
10+
apt -o DPkg::Lock::Timeout=120 update
11+
12+
declare -a PKG=(git unzip tree most fio dstat dos2unix tig jq ncdu inxi mediainfo git-lfs nvme-cli aria2 ripgrep bat python3-venv python3-pip)
13+
[[ $(apt-cache search ^duf$) ]] && PKG+=(duf)
14+
15+
apt-get -y -o DPkg::Lock::Timeout=120 install "${PKG[@]}"
16+
# When the binary was installed as `batcat`, expose it as `bat`. -f replaces an
# existing link so a re-run does not fail with "File exists" and abort under `set -e`.
[[ -e /usr/bin/batcat ]] && ln -sf /usr/bin/batcat /usr/bin/bat
17+
echo -e '\nexport DSTAT_OPTS="-cdngym"' >> /etc/profile.d/z99-initsmhp.sh
18+
19+
# VSCode: https://code.visualstudio.com/docs/setup/linux#_visual-studio-code-is-unable-to-watch-for-file-changes-in-this-large-workspace-error-enospc
20+
echo -e '\nfs.inotify.max_user_watches=524288' | tee -a /etc/sysctl.conf
21+
sysctl -p
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/bin/bash
2+
3+
set -exuo pipefail
4+
5+
[[ -f /opt/slurm/etc/slurm.conf ]] \
6+
&& SLURM_CONFIG=/opt/slurm/etc/slurm.conf \
7+
|| SLURM_CONFIG=/var/spool/slurmd/conf-cache/slurm.conf
8+
9+
# https://github.com/aws-samples/aws-efa-nccl-baseami-pipeline/blob/9d8a9273f72d7dee36f7f3e5e8a968b5e0f5f21b/nvidia-efa-ami_base/nvidia-efa-ml-ubuntu2004.yml#L163-L169
10+
# Overwrite (not append) so re-running this script does not duplicate the stanzas.
cat << EOF > /etc/ssh/ssh_config.d/initsmhp-ssh.conf
11+
Host 127.0.0.1 localhost $(hostname)
12+
StrictHostKeyChecking no
13+
HostbasedAuthentication no
14+
CheckHostIP no
15+
UserKnownHostsFile /dev/null
16+
17+
Match host * exec "grep '^NodeName=%h ' $SLURM_CONFIG &> /dev/null"
18+
StrictHostKeyChecking no
19+
HostbasedAuthentication no
20+
CheckHostIP no
21+
UserKnownHostsFile /dev/null
22+
EOF

1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/lifecycle_script.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ def wait_for_scontrol():
140140

141141
print(f"Exceeded maximum wait time of {timeout} seconds. No output from scontrol.")
142142
return False
143-
143+
144144

145145
def main(args):
146146
params = ProvisioningParameters(args.provisioning_parameters)
@@ -205,6 +205,9 @@ def main(args):
205205
if Config.enable_sssd:
206206
subprocess.run(["python3", "-u", "setup_sssd.py", "--node-type", node_type], check=True)
207207

208+
if Config.enable_initsmhp:
209+
ExecuteBashScript("./initsmhp.sh").run(node_type)
210+
208211
print("[INFO]: Success: All provisioning scripts completed")
209212

210213

@@ -214,4 +217,4 @@ def main(args):
214217
parser.add_argument("-pp", "--provisioning_parameters", help="Provisioning Parameters containing the head, login and compute ID/names")
215218
args=parser.parse_args()
216219

217-
main(args)
220+
main(args)

0 commit comments

Comments
 (0)