diff --git a/terraform/gcp_old/tpu-inference/modules/benchmark/main.tf b/terraform/gcp_old/modules/benchmark/main.tf
similarity index 91%
rename from terraform/gcp_old/tpu-inference/modules/benchmark/main.tf
rename to terraform/gcp_old/modules/benchmark/main.tf
index e5db2063..9f45df69 100644
--- a/terraform/gcp_old/tpu-inference/modules/benchmark/main.tf
+++ b/terraform/gcp_old/modules/benchmark/main.tf
@@ -4,18 +4,18 @@
 # Type: v6e-8
 # Runtime: v2-alpha-tpuv6e
 
-data "google_secret_manager_secret_version" "buildkite_agent_token_benchmark_cluster" {
-  secret  = "projects/${var.project_id}/secrets/bm-agent-hf-token"
+data "google_secret_manager_secret_version" "buildkite_benchmark_agent_token" {
+  secret  = "projects/${var.project_id}/secrets/${var.buildkite_benchmark_agent_token_name}"
   version = "latest"
 }
 
 data "google_secret_manager_secret_version" "huggingface_token" {
-  secret  = "projects/${var.project_id}/secrets/tpu_commons_buildkite_hf_token"
+  secret  = "projects/${var.project_id}/secrets/${var.huggingface_token_name}"
   version = "latest"
 }
 
 locals {
-  buildkite_token_value   = data.google_secret_manager_secret_version.buildkite_agent_token_benchmark_cluster.secret_data
+  buildkite_token_value   = data.google_secret_manager_secret_version.buildkite_benchmark_agent_token.secret_data
   huggingface_token_value = data.google_secret_manager_secret_version.huggingface_token.secret_data
 }
 
diff --git a/terraform/gcp_old/modules/benchmark/variables.tf b/terraform/gcp_old/modules/benchmark/variables.tf
new file mode 100644
index 00000000..915e22c0
--- /dev/null
+++ b/terraform/gcp_old/modules/benchmark/variables.tf
@@ -0,0 +1,13 @@
+variable "project_id" {
+  default = "cloud-tpu-inference-test"
+}
+
+variable "buildkite_benchmark_agent_token_name" {
+  type        = string
+  description = "google_secret_manager_secret name for benchmark agent token"
+}
+
+variable "huggingface_token_name" {
+  type        = string
+  description = "google_secret_manager_secret name for huggingface token"
+}
diff --git a/terraform/gcp_old/tpu-inference/modules/ci_cpu/main.tf b/terraform/gcp_old/modules/ci_cpu/main.tf
similarity index 92%
rename from terraform/gcp_old/tpu-inference/modules/ci_cpu/main.tf
rename to terraform/gcp_old/modules/ci_cpu/main.tf
index a5e115ab..0b77697d 100644
--- a/terraform/gcp_old/tpu-inference/modules/ci_cpu/main.tf
+++ b/terraform/gcp_old/modules/ci_cpu/main.tf
@@ -3,13 +3,13 @@
 # Region: us-east5-b
 # Type: e2-standard-2
 
-data "google_secret_manager_secret_version" "buildkite_agent_token_ci_cluster" {
-  secret  = "projects/${var.project_id}/secrets/tpu_commons_buildkite_agent_token"
+data "google_secret_manager_secret_version" "buildkite_ci_agent_token" {
+  secret  = "projects/${var.project_id}/secrets/${var.buildkite_ci_agent_token_name}"
   version = "latest"
 }
 
 locals {
-  buildkite_token_value = data.google_secret_manager_secret_version.buildkite_agent_token_ci_cluster.secret_data
+  buildkite_token_value = data.google_secret_manager_secret_version.buildkite_ci_agent_token.secret_data
 }
 
 resource "google_compute_instance" "buildkite-agent-instance" {
diff --git a/terraform/gcp_old/modules/ci_cpu/variables.tf b/terraform/gcp_old/modules/ci_cpu/variables.tf
new file mode 100644
index 00000000..16fec6e3
--- /dev/null
+++ b/terraform/gcp_old/modules/ci_cpu/variables.tf
@@ -0,0 +1,12 @@
+variable "project_id" {
+  default = "cloud-tpu-inference-test"
+}
+
+variable "instance_count" {
+  default = 8
+}
+
+variable "buildkite_ci_agent_token_name" {
+  type        = string
+  description = "google_secret_manager_secret name for ci agent token"
+}
diff --git a/terraform/gcp_old/tpu-inference/modules/ci_v5/main.tf b/terraform/gcp_old/modules/ci_v5/main.tf
similarity index 91%
rename from terraform/gcp_old/tpu-inference/modules/ci_v5/main.tf
rename to terraform/gcp_old/modules/ci_v5/main.tf
index f4a152ce..a202d530 100644
--- a/terraform/gcp_old/tpu-inference/modules/ci_v5/main.tf
+++ b/terraform/gcp_old/modules/ci_v5/main.tf
@@ -1,15 +1,15 @@
-data "google_secret_manager_secret_version" "buildkite_agent_token_ci_cluster" {
-  secret  = "projects/${var.project_id}/secrets/buildkite_agent_token_ci_cluster"
+data "google_secret_manager_secret_version" "buildkite_ci_agent_token" {
+  secret  = "projects/${var.project_id}/secrets/${var.buildkite_ci_agent_token_name}"
   version = "latest"
 }
 
 data "google_secret_manager_secret_version" "huggingface_token" {
-  secret  = "projects/${var.project_id}/secrets/huggingface_token"
+  secret  = "projects/${var.project_id}/secrets/${var.huggingface_token_name}"
   version = "latest"
 }
 
 locals {
-  buildkite_token_value   = data.google_secret_manager_secret_version.buildkite_agent_token_ci_cluster.secret_data
+  buildkite_token_value   = data.google_secret_manager_secret_version.buildkite_ci_agent_token.secret_data
   huggingface_token_value = data.google_secret_manager_secret_version.huggingface_token.secret_data
 }
 
diff --git a/terraform/gcp_old/modules/ci_v5/variables.tf b/terraform/gcp_old/modules/ci_v5/variables.tf
new file mode 100644
index 00000000..911bfa6b
--- /dev/null
+++ b/terraform/gcp_old/modules/ci_v5/variables.tf
@@ -0,0 +1,13 @@
+variable "project_id" {
+  default = "vllm-405802"
+}
+
+variable "buildkite_ci_agent_token_name" {
+  type        = string
+  description = "google_secret_manager_secret name for ci agent token"
+}
+
+variable "huggingface_token_name" {
+  type        = string
+  description = "google_secret_manager_secret name for huggingface token"
+}
diff --git a/terraform/gcp_old/tpu-inference/modules/ci_v6/main.tf b/terraform/gcp_old/modules/ci_v6/main.tf
similarity index 89%
rename from terraform/gcp_old/tpu-inference/modules/ci_v6/main.tf
rename to terraform/gcp_old/modules/ci_v6/main.tf
index 1fd1e604..f6a3704d 100644
--- a/terraform/gcp_old/tpu-inference/modules/ci_v6/main.tf
+++ b/terraform/gcp_old/modules/ci_v6/main.tf
@@ -4,34 +4,34 @@
 # Type: v6e-1
 # Runtime: v2-alpha-tpuv6e
 
-data "google_secret_manager_secret_version" "buildkite_agent_token_ci_cluster" {
-  secret  = "projects/${var.project_id}/secrets/tpu_commons_buildkite_agent_token"
+data "google_secret_manager_secret_version" "buildkite_ci_agent_token" {
+  secret  = "projects/${var.project_id}/secrets/${var.buildkite_ci_agent_token_name}"
   version = "latest"
 }
 
 data "google_secret_manager_secret_version" "huggingface_token" {
-  secret  = "projects/${var.project_id}/secrets/tpu_commons_buildkite_hf_token"
+  secret  = "projects/${var.project_id}/secrets/${var.huggingface_token_name}"
   version = "latest"
 }
 
 locals {
-  buildkite_token_value   = data.google_secret_manager_secret_version.buildkite_agent_token_ci_cluster.secret_data
+  buildkite_token_value   = data.google_secret_manager_secret_version.buildkite_ci_agent_token.secret_data
   huggingface_token_value = data.google_secret_manager_secret_version.huggingface_token.secret_data
 }
 
 resource "google_compute_disk" "disk_east5_b" {
   provider = google-beta.us-east5-b
-  count    = 24
+  count    = var.ci_v6_instance_count
 
   name = "tpu-disk-east5-b-${count.index}"
-  size = 2048
+  size = var.ci_v6_disk_size
   type = "hyperdisk-balanced"
   zone = "us-east5-b"
 }
 
 resource "google_tpu_v2_vm" "tpu_v6_ci" {
   provider = google-beta.us-east5-b
-  count    = 24
+  count    = var.ci_v6_instance_count
   name     = "vllm-tpu-v6-ci-${count.index}"
   zone     = "us-east5-b"
 
diff --git a/terraform/gcp_old/modules/ci_v6/variables.tf b/terraform/gcp_old/modules/ci_v6/variables.tf
new file mode 100644
index 00000000..874b2d7e
--- /dev/null
+++ b/terraform/gcp_old/modules/ci_v6/variables.tf
@@ -0,0 +1,23 @@
+variable "project_id" {
+  default = "cloud-tpu-inference-test"
+}
+
+variable "buildkite_ci_agent_token_name" {
+  type        = string
+  description = "google_secret_manager_secret name for ci agent token"
+}
+
+variable "huggingface_token_name" {
+  type        = string
+  description = "google_secret_manager_secret name for huggingface token"
+}
+
+variable "ci_v6_instance_count" {
+  type        = number
+  description = "number of instances to spawn for ci_v6"
+}
+
+variable "ci_v6_disk_size" {
+  type        = number
+  description = "disk size for ci_v6"
+}
diff --git a/terraform/gcp_old/tpu-inference/main.tf b/terraform/gcp_old/tpu-inference/main.tf
index d84ac1ea..d6fb81d9 100644
--- a/terraform/gcp_old/tpu-inference/main.tf
+++ b/terraform/gcp_old/tpu-inference/main.tf
@@ -1,35 +1,20 @@
-# module "benchmark" {
-#   source = "./modules/benchmark"
-#   providers = {
-#     google-beta.us-east1-d = google-beta.us-east1-d
-#   }
-
-#   buildkite_agent_token_benchmark_cluster = var.buildkite_agent_token_benchmark_cluster
-#   huggingface_token = var.huggingface_token
-# }
-
 module "ci_v6" {
-  source = "./modules/ci_v6"
+  source = "../modules/ci_v6"
   providers = {
     google-beta.us-east5-b = google-beta.us-east5-b
   }
   project_id = var.project_id
+  buildkite_ci_agent_token_name = var.buildkite_ci_agent_token_name
+  huggingface_token_name = var.huggingface_token_name
+  ci_v6_instance_count = var.ci_v6_instance_count
+  ci_v6_disk_size = var.ci_v6_disk_size
 }
 
 module "ci_cpu" {
-  source = "./modules/ci_cpu"
+  source = "../modules/ci_cpu"
   providers = {
     google-beta.us-east5-b = google-beta.us-east5-b
   }
-  project_id = var.project_id
+  project_id                    = var.project_id
+  buildkite_ci_agent_token_name = var.buildkite_ci_agent_token_name
 }
-
-# module "ci_v5" {
-#   source = "./modules/ci_v5"
-#   providers = {
-#     google-beta.us-south1-a = google-beta.us-south1-a
-#   }
-
-#   buildkite_agent_token_ci_cluster = var.buildkite_agent_token_ci_cluster
-#   huggingface_token = var.huggingface_token
-# }
diff --git a/terraform/gcp_old/tpu-inference/modules/benchmark/variables.tf b/terraform/gcp_old/tpu-inference/modules/benchmark/variables.tf
deleted file mode 100644
index 83c5f21c..00000000
--- a/terraform/gcp_old/tpu-inference/modules/benchmark/variables.tf
+++ /dev/null
@@ -1,3 +0,0 @@
-variable "project_id" {
-  default = "cloud-tpu-inference-test"
-}
diff --git a/terraform/gcp_old/tpu-inference/modules/ci_cpu/variables.tf b/terraform/gcp_old/tpu-inference/modules/ci_cpu/variables.tf
deleted file mode 100644
index 15f18f5f..00000000
--- a/terraform/gcp_old/tpu-inference/modules/ci_cpu/variables.tf
+++ /dev/null
@@ -1,7 +0,0 @@
-variable "project_id" {
-  default = "cloud-tpu-inference-test"
-}
-
-variable "instance_count" {
-  default = 8
-}
diff --git a/terraform/gcp_old/tpu-inference/modules/ci_v5/variables.tf b/terraform/gcp_old/tpu-inference/modules/ci_v5/variables.tf
deleted file mode 100644
index 35c6609c..00000000
--- a/terraform/gcp_old/tpu-inference/modules/ci_v5/variables.tf
+++ /dev/null
@@ -1,3 +0,0 @@
-variable "project_id" {
-  default = "vllm-405802"
-}
diff --git a/terraform/gcp_old/tpu-inference/modules/ci_v6/variables.tf b/terraform/gcp_old/tpu-inference/modules/ci_v6/variables.tf
deleted file mode 100644
index 83c5f21c..00000000
--- a/terraform/gcp_old/tpu-inference/modules/ci_v6/variables.tf
+++ /dev/null
@@ -1,3 +0,0 @@
-variable "project_id" {
-  default = "cloud-tpu-inference-test"
-}
diff --git a/terraform/gcp_old/tpu-inference/variables.tf b/terraform/gcp_old/tpu-inference/variables.tf
index 83c5f21c..1d22241c 100644
--- a/terraform/gcp_old/tpu-inference/variables.tf
+++ b/terraform/gcp_old/tpu-inference/variables.tf
@@ -1,3 +1,23 @@
 variable "project_id" {
   default = "cloud-tpu-inference-test"
 }
+
+variable "buildkite_benchmark_agent_token_name" {
+  default = "bm-agent-hf-token"
+}
+
+variable "buildkite_ci_agent_token_name" {
+  default = "tpu_commons_buildkite_agent_token"
+}
+
+variable "huggingface_token_name" {
+  default = "tpu_commons_buildkite_hf_token"
+}
+
+variable "ci_v6_instance_count" {
+  default = 24
+}
+
+variable "ci_v6_disk_size" {
+  default = 2048
+}
diff --git a/terraform/gcp_old/vllm/main.tf b/terraform/gcp_old/vllm/main.tf
index 666db811..a9c116e7 100644
--- a/terraform/gcp_old/vllm/main.tf
+++ b/terraform/gcp_old/vllm/main.tf
@@ -1,24 +1,32 @@
 module "benchmark" {
-  source = "./modules/benchmark"
+  source = "../modules/benchmark"
   providers = {
     google-beta.us-east1-d = google-beta.us-east1-d
   }
 
   project_id = var.project_id
+  buildkite_benchmark_agent_token_name = var.buildkite_benchmark_agent_token_name
+  huggingface_token_name = var.huggingface_token_name
 }
 
 module "ci_v6" {
-  source = "./modules/ci_v6"
+  source = "../modules/ci_v6"
   providers = {
     google-beta.us-east5-b = google-beta.us-east5-b
   }
   project_id = var.project_id
+  buildkite_ci_agent_token_name = var.buildkite_ci_agent_token_name
+  huggingface_token_name = var.huggingface_token_name
+  ci_v6_instance_count = var.ci_v6_instance_count
+  ci_v6_disk_size = var.ci_v6_disk_size
 }
 
 module "ci_v5" {
-  source = "./modules/ci_v5"
+  source = "../modules/ci_v5"
   providers = {
     google-beta.us-south1-a = google-beta.us-south1-a
   }
   project_id = var.project_id
+  buildkite_ci_agent_token_name = var.buildkite_ci_agent_token_name
+  huggingface_token_name = var.huggingface_token_name
 }
\ No newline at end of file
diff --git a/terraform/gcp_old/vllm/modules/benchmark/main.tf b/terraform/gcp_old/vllm/modules/benchmark/main.tf
deleted file mode 100644
index d25198ec..00000000
--- a/terraform/gcp_old/vllm/modules/benchmark/main.tf
+++ /dev/null
@@ -1,91 +0,0 @@
-# 2 nodes for Performance Benchmark cluster
-# 8 TPU v6e devices each
-# Region: us-east1-d
-# Type: v6e-8
-# Runtime: v2-alpha-tpuv6e
-
-data "google_secret_manager_secret_version" "buildkite_agent_token_benchmark_cluster" {
-  secret  = "projects/${var.project_id}/secrets/buildkite_agent_token_benchmark_cluster"
-  version = "latest"
-}
-
-data "google_secret_manager_secret_version" "huggingface_token" {
-  secret  = "projects/${var.project_id}/secrets/huggingface_token"
-  version = "latest"
-}
-
-locals {
-  buildkite_token_value   = data.google_secret_manager_secret_version.buildkite_agent_token_benchmark_cluster.secret_data
-  huggingface_token_value = data.google_secret_manager_secret_version.huggingface_token.secret_data
-}
-
-resource "google_compute_disk" "disk_east1_d" {
-  provider = google-beta.us-east1-d
-  count    = 0
-
-  name = "tpu-disk-east1-d${count.index + 1}"
-  size = 512
-  type = "hyperdisk-balanced"
-  zone = "us-east1-d"
-}
-
-resource "google_tpu_v2_vm" "tpu_v6_benchmark" {
-  provider = google-beta.us-east1-d
-  count    = 0
-  name     = "vllm-tpu-v6-benchmark-${count.index + 1}"
-  zone     = "us-east1-d"
-
-  runtime_version  = "v2-alpha-tpuv6e"
-  accelerator_type = "v6e-8"
-
-  data_disks {
-    source_disk = google_compute_disk.disk_east1_d[count.index].id
-    mode        = "READ_WRITE"
-  }
-
-  network_config {
-    network             = "projects/${var.project_id}/global/networks/default"
-    enable_external_ips = true
-  }
-
-  metadata = {
-    "startup-script" = <<-EOF
-      #!/bin/bash
-
-      apt-get update
-      apt-get install -y curl build-essential jq
-
-      curl -o- https://get.docker.com/ | bash -
-
-      curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
-      /root/.cargo/bin/cargo install minijinja-cli
-      cp /root/.cargo/bin/minijinja-cli /usr/bin/minijinja-cli
-      chmod 777 /usr/bin/minijinja-cli
-
-      curl -fsSL https://keys.openpgp.org/vks/v1/by-fingerprint/32A37959C2FA5C3C99EFBC32A79206696452D198 | sudo gpg --dearmor -o /usr/share/keyrings/buildkite-agent-archive-keyring.gpg
-      echo "deb [signed-by=/usr/share/keyrings/buildkite-agent-archive-keyring.gpg] https://apt.buildkite.com/buildkite-agent stable main" | sudo tee /etc/apt/sources.list.d/buildkite-agent.list
-      apt-get update
-      apt-get install -y buildkite-agent
-
-      sudo usermod -a -G docker buildkite-agent
-      sudo -u buildkite-agent gcloud auth configure-docker us-central1-docker.pkg.dev --quiet
-
-      sudo sed -i "s/xxx/${local.buildkite_token_value}/g" /etc/buildkite-agent/buildkite-agent.cfg
-      sudo sed -i 's/name="%hostname-%spawn"/name="vllm-tpu-v6-${count.index}"/' /etc/buildkite-agent/buildkite-agent.cfg
-      echo 'tags="queue=tpu_8_v6e_queue"' | sudo tee -a /etc/buildkite-agent/buildkite-agent.cfg
-      echo 'HF_TOKEN=${local.huggingface_token_value}' | sudo tee -a /etc/environment
-
-      sudo mkfs.ext4 -m 0 -E lazy_itable_init=0,lazy_journal_init=0,discard /dev/sdb
-      sudo mkdir -p /mnt/disks/persist
-      sudo mount -o discard,defaults /dev/sdb /mnt/disks/persist
-
-      jq ". + {\"data-root\": \"/mnt/disks/persist\"}" /etc/docker/daemon.json > /tmp/daemon.json.tmp && mv /tmp/daemon.json.tmp /etc/docker/daemon.json
-      systemctl stop docker
-      systemctl daemon-reload
-      systemctl start docker
-
-      systemctl enable buildkite-agent
-      systemctl start buildkite-agent
-    EOF
-  }
-}
diff --git a/terraform/gcp_old/vllm/modules/benchmark/variables.tf b/terraform/gcp_old/vllm/modules/benchmark/variables.tf
deleted file mode 100644
index 35c6609c..00000000
--- a/terraform/gcp_old/vllm/modules/benchmark/variables.tf
+++ /dev/null
@@ -1,3 +0,0 @@
-variable "project_id" {
-  default = "vllm-405802"
-}
diff --git a/terraform/gcp_old/vllm/modules/ci_v5/main.tf b/terraform/gcp_old/vllm/modules/ci_v5/main.tf
deleted file mode 100644
index f4a152ce..00000000
--- a/terraform/gcp_old/vllm/modules/ci_v5/main.tf
+++ /dev/null
@@ -1,86 +0,0 @@
-data "google_secret_manager_secret_version" "buildkite_agent_token_ci_cluster" {
-  secret  = "projects/${var.project_id}/secrets/buildkite_agent_token_ci_cluster"
-  version = "latest"
-}
-
-data "google_secret_manager_secret_version" "huggingface_token" {
-  secret  = "projects/${var.project_id}/secrets/huggingface_token"
-  version = "latest"
-}
-
-locals {
-  buildkite_token_value   = data.google_secret_manager_secret_version.buildkite_agent_token_ci_cluster.secret_data
-  huggingface_token_value = data.google_secret_manager_secret_version.huggingface_token.secret_data
-}
-
-resource "google_compute_disk" "disk_v5" {
-  provider = google-beta.us-south1-a
-  count    = 7
-
-  name = "tpu-disk-south1-a-${count.index + 1}"
-  size = 512
-  type = "pd-ssd"
-  zone = "us-south1-a"
-}
-
-resource "google_tpu_v2_vm" "tpu_v5" {
-  provider = google-beta.us-south1-a
-  count    = 7
-  name     = "vllm-tpu-v5-${count.index + 1}"
-  zone     = "us-south1-a"
-
-  runtime_version = "v2-alpha-tpuv5-lite"
-
-  accelerator_type = "v5litepod-1"
-
-  data_disks {
-    source_disk = google_compute_disk.disk_v5[count.index].id
-    mode        = "READ_WRITE"
-  }
-
-  network_config {
-    network             = "projects/${var.project_id}/global/networks/default"
-    enable_external_ips = true
-  }
-
-  metadata = {
-    "startup-script" = <<-EOF
-      #!/bin/bash
-
-      apt-get update
-      apt-get install -y curl build-essential jq
-
-      curl -o- https://get.docker.com/ | bash -
-
-      curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
-      /root/.cargo/bin/cargo install minijinja-cli
-      cp /root/.cargo/bin/minijinja-cli /usr/bin/minijinja-cli
-      chmod 777 /usr/bin/minijinja-cli
-
-      curl -fsSL https://keys.openpgp.org/vks/v1/by-fingerprint/32A37959C2FA5C3C99EFBC32A79206696452D198 | sudo gpg --dearmor -o /usr/share/keyrings/buildkite-agent-archive-keyring.gpg
-      echo "deb [signed-by=/usr/share/keyrings/buildkite-agent-archive-keyring.gpg] https://apt.buildkite.com/buildkite-agent stable main" | sudo tee /etc/apt/sources.list.d/buildkite-agent.list
-      apt-get update
-      apt-get install -y buildkite-agent
-
-      sudo usermod -a -G docker buildkite-agent
-      sudo -u buildkite-agent gcloud auth configure-docker us-central1-docker.pkg.dev --quiet
-
-      sudo sed -i "s/xxx/${local.buildkite_token_value}/g" /etc/buildkite-agent/buildkite-agent.cfg
-      sudo sed -i 's/name="%hostname-%spawn"/name="vllm-tpu-${count.index}"/' /etc/buildkite-agent/buildkite-agent.cfg
-      echo 'tags="queue=tpu_v5_queue"' | sudo tee -a /etc/buildkite-agent/buildkite-agent.cfg
-      echo 'HF_TOKEN=${local.huggingface_token_value}' | sudo tee -a /etc/environment
-
-      sudo mkfs.ext4 -m 0 -E lazy_itable_init=0,lazy_journal_init=0,discard /dev/sdb
-      sudo mkdir -p /mnt/disks/persist
-      sudo mount -o discard,defaults /dev/sdb /mnt/disks/persist
-
-      jq ". + {\"data-root\": \"/mnt/disks/persist\"}" /etc/docker/daemon.json > /tmp/daemon.json.tmp && mv /tmp/daemon.json.tmp /etc/docker/daemon.json
-      systemctl stop docker
-      systemctl daemon-reload
-      systemctl start docker
-
-      systemctl enable buildkite-agent
-      systemctl start buildkite-agent
-    EOF
-  }
-}
diff --git a/terraform/gcp_old/vllm/modules/ci_v5/variables.tf b/terraform/gcp_old/vllm/modules/ci_v5/variables.tf
deleted file mode 100644
index 35c6609c..00000000
--- a/terraform/gcp_old/vllm/modules/ci_v5/variables.tf
+++ /dev/null
@@ -1,3 +0,0 @@
-variable "project_id" {
-  default = "vllm-405802"
-}
diff --git a/terraform/gcp_old/vllm/modules/ci_v6/main.tf b/terraform/gcp_old/vllm/modules/ci_v6/main.tf
deleted file mode 100644
index c75e972d..00000000
--- a/terraform/gcp_old/vllm/modules/ci_v6/main.tf
+++ /dev/null
@@ -1,97 +0,0 @@
-# 16 nodes for CI cluster
-# 1 TPU v6e device each
-# Region: us-east5-b
-# Type: v6e-1
-# Runtime: v2-alpha-tpuv6e
-
-data "google_secret_manager_secret_version" "buildkite_agent_token_ci_cluster" {
-  secret  = "projects/${var.project_id}/secrets/buildkite_agent_token_ci_cluster"
-  version = "latest"
-}
-
-data "google_secret_manager_secret_version" "huggingface_token" {
-  secret  = "projects/${var.project_id}/secrets/huggingface_token"
-  version = "latest"
-}
-
-locals {
-  buildkite_token_value   = data.google_secret_manager_secret_version.buildkite_agent_token_ci_cluster.secret_data
-  huggingface_token_value = data.google_secret_manager_secret_version.huggingface_token.secret_data
-}
-
-resource "google_compute_disk" "disk_east5_b" {
-  provider = google-beta.us-east5-b
-  count    = 16
-
-  name = "tpu-disk-east5-b-${count.index}"
-  size = 512
-  type = "hyperdisk-balanced"
-  zone = "us-east5-b"
-}
-
-resource "google_tpu_v2_vm" "tpu_v6_ci" {
-  provider = google-beta.us-east5-b
-  count    = 16
-  name     = "vllm-tpu-v6-ci-${count.index}"
-  zone     = "us-east5-b"
-
-  runtime_version = "v2-alpha-tpuv6e"
-
-  accelerator_type = "v6e-1"
-
-  network_config {
-    network             = "projects/${var.project_id}/global/networks/default"
-    enable_external_ips = true
-  }
-
-  data_disks {
-    source_disk = google_compute_disk.disk_east5_b[count.index].id
-    mode        = "READ_WRITE"
-  }
-
-  metadata = {
-    "startup-script" = <<-EOF
-      #!/bin/bash
-
-      apt-get update
-      apt-get install -y curl build-essential jq
-
-      curl -o- https://get.docker.com/ | bash -
-
-      curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
-      /root/.cargo/bin/cargo install minijinja-cli
-      cp /root/.cargo/bin/minijinja-cli /usr/bin/minijinja-cli
-      chmod 777 /usr/bin/minijinja-cli
-
-      curl -fsSL https://keys.openpgp.org/vks/v1/by-fingerprint/32A37959C2FA5C3C99EFBC32A79206696452D198 | sudo gpg --dearmor -o /usr/share/keyrings/buildkite-agent-archive-keyring.gpg
-      echo "deb [signed-by=/usr/share/keyrings/buildkite-agent-archive-keyring.gpg] https://apt.buildkite.com/buildkite-agent stable main" | sudo tee /etc/apt/sources.list.d/buildkite-agent.list
-      apt-get update
-      apt-get install -y buildkite-agent
-
-      sudo usermod -a -G docker buildkite-agent
-      sudo -u buildkite-agent gcloud auth configure-docker us-central1-docker.pkg.dev --quiet
-
-      sudo sed -i "s/xxx/${local.buildkite_token_value}/g" /etc/buildkite-agent/buildkite-agent.cfg
-      sudo sed -i 's/name="%hostname-%spawn"/name="vllm-tpu-${count.index}"/' /etc/buildkite-agent/buildkite-agent.cfg
-      echo 'tags="queue=tpu_v6e_queue"' | sudo tee -a /etc/buildkite-agent/buildkite-agent.cfg
-      echo 'HF_TOKEN=${local.huggingface_token_value}' | sudo tee -a /etc/environment
-
-      # Mount persistent disk
-      if ! blkid /dev/nvme0n2; then
-        echo "Formatting /dev/nvme0n2 as ext4..."
-        sudo mkfs.ext4 -m 0 -E lazy_itable_init=0,lazy_journal_init=0,discard /dev/nvme0n2
-      fi
-
-      sudo mkdir -p /mnt/disks/persist
-      sudo mount -o discard,defaults /dev/nvme0n2 /mnt/disks/persist
-
-      jq ". + {\"data-root\": \"/mnt/disks/persist\"}" /etc/docker/daemon.json > /tmp/daemon.json.tmp && mv /tmp/daemon.json.tmp /etc/docker/daemon.json
-      systemctl stop docker
-      systemctl daemon-reload
-      systemctl start docker
-
-      systemctl enable buildkite-agent
-      systemctl start buildkite-agent
-    EOF
-  }
-}
\ No newline at end of file
diff --git a/terraform/gcp_old/vllm/modules/ci_v6/variables.tf b/terraform/gcp_old/vllm/modules/ci_v6/variables.tf
deleted file mode 100644
index 35c6609c..00000000
--- a/terraform/gcp_old/vllm/modules/ci_v6/variables.tf
+++ /dev/null
@@ -1,3 +0,0 @@
-variable "project_id" {
-  default = "vllm-405802"
-}
diff --git a/terraform/gcp_old/vllm/variables.tf b/terraform/gcp_old/vllm/variables.tf
index 35c6609c..4e1ccfd1 100644
--- a/terraform/gcp_old/vllm/variables.tf
+++ b/terraform/gcp_old/vllm/variables.tf
@@ -1,3 +1,23 @@
 variable "project_id" {
   default = "vllm-405802"
 }
+
+variable "buildkite_benchmark_agent_token_name" {
+  default = "buildkite_agent_token_benchmark_cluster"
+}
+
+variable "buildkite_ci_agent_token_name" {
+  default = "buildkite_agent_token_ci_cluster"
+}
+
+variable "huggingface_token_name" {
+  default = "huggingface_token"
+}
+
+variable "ci_v6_instance_count" {
+  default = 16
+}
+
+variable "ci_v6_disk_size" {
+  default = 512
+}