diff --git a/iac/provider-gcp/Makefile b/iac/provider-gcp/Makefile
index d9c4ddc3d1..da545b52b6 100644
--- a/iac/provider-gcp/Makefile
+++ b/iac/provider-gcp/Makefile
@@ -42,6 +42,7 @@ tf_vars := TF_VAR_environment=$(TERRAFORM_ENVIRONMENT) \
 	$(call tfvar, ALLOW_SANDBOX_INTERNET) \
 	$(call tfvar, API_RESOURCES_CPU_COUNT) \
 	$(call tfvar, API_RESOURCES_MEMORY_MB) \
+	$(call tfvar, INGRESS_COUNT) \
 	$(call tfvar, CLIENT_PROXY_COUNT) \
 	$(call tfvar, CLIENT_PROXY_RESOURCES_CPU_COUNT) \
 	$(call tfvar, CLIENT_PROXY_RESOURCES_MEMORY_MB) \
diff --git a/iac/provider-gcp/init/main.tf b/iac/provider-gcp/init/main.tf
index db12efe1ca..b861b56f5b 100644
--- a/iac/provider-gcp/init/main.tf
+++ b/iac/provider-gcp/init/main.tf
@@ -208,6 +208,27 @@ resource "google_secret_manager_secret_version" "analytics_collector_api_token"
   depends_on = [time_sleep.secrets_api_wait_60_seconds]
 }
 
+resource "google_secret_manager_secret" "routing_domains" {
+  secret_id = "${var.prefix}routing-domains"
+
+  replication {
+    auto {}
+  }
+
+  depends_on = [time_sleep.secrets_api_wait_60_seconds]
+}
+
+resource "google_secret_manager_secret_version" "routing_domains" {
+  secret = google_secret_manager_secret.routing_domains.name
+  secret_data = jsonencode([])
+
+  lifecycle {
+    ignore_changes = [secret_data]
+  }
+
+  depends_on = [time_sleep.secrets_api_wait_60_seconds]
+}
+
 resource "google_artifact_registry_repository" "orchestration_repository" {
   format = "DOCKER"
   repository_id = "e2b-orchestration"
@@ -222,7 +243,6 @@ resource "time_sleep" "artifact_registry_api_wait_90_seconds" {
   create_duration = "90s"
 }
 
-
 resource "google_artifact_registry_repository_iam_member" "orchestration_repository_member" {
   repository = google_artifact_registry_repository.orchestration_repository.name
   role = "roles/artifactregistry.reader"
diff --git a/iac/provider-gcp/init/outputs.tf b/iac/provider-gcp/init/outputs.tf
index be11b49b1c..2b2a72a546 100644
--- a/iac/provider-gcp/init/outputs.tf
+++ b/iac/provider-gcp/init/outputs.tf
@@ -30,6 +30,10 @@ output "analytics_collector_api_token_secret_name" {
   value = google_secret_manager_secret.analytics_collector_api_token.name
 }
 
+output "routing_domains_secret_name" {
+  value = google_secret_manager_secret.routing_domains.name
+}
+
 output "orchestration_repository_name" {
   value = google_artifact_registry_repository.orchestration_repository.name
 }
diff --git a/iac/provider-gcp/main.tf b/iac/provider-gcp/main.tf
index 82eb375291..5a8118d961 100644
--- a/iac/provider-gcp/main.tf
+++ b/iac/provider-gcp/main.tf
@@ -54,6 +54,19 @@ provider "google" {
   zone = var.gcp_zone
 }
 
+data "google_secret_manager_secret_version" "routing_domains" {
+  secret = module.init.routing_domains_secret_name
+}
+
+locals {
+  // Reading additional domains from the local env (var.additional_domains) is kept only for backward compatibility.
+  additional_domains_from_secret = nonsensitive(jsondecode(data.google_secret_manager_secret_version.routing_domains.secret_data))
+  additional_domains_from_env = (var.additional_domains != "" ?
+  [for item in split(",", var.additional_domains) : trimspace(item)] : [])
+
+  additional_domains = distinct(concat(local.additional_domains_from_env, local.additional_domains_from_secret))
+}
+
 module "init" {
   source = "./init"
 
@@ -108,6 +121,7 @@ module "cluster" {
   api_use_nat = var.api_use_nat
   api_nat_ips = var.api_nat_ips
 
+  ingress_port = var.ingress_port
   edge_api_port = var.edge_api_port
   edge_proxy_port = var.edge_proxy_port
   api_port = var.api_port
@@ -115,9 +129,8 @@ module "cluster" {
   nomad_port = var.nomad_port
   google_service_account_email = module.init.service_account_email
   domain_name = var.domain_name
-  additional_domains = (var.additional_domains != "" ?
-  [for item in split(",", var.additional_domains) : trimspace(item)] : [])
 
+  additional_domains = local.additional_domains
   additional_api_services = (var.additional_api_services_json != "" ?
   jsondecode(var.additional_api_services_json) : [])
 
@@ -172,6 +185,10 @@ module "nomad" {
   clickhouse_job_constraint_prefix = var.clickhouse_job_constraint_prefix
   clickhouse_node_pool = var.clickhouse_node_pool
 
+  # Ingress
+  ingress_port = var.ingress_port
+  ingress_count = var.ingress_count
+
   # API
   api_resources_cpu_count = var.api_resources_cpu_count
   api_resources_memory_mb = var.api_resources_memory_mb
diff --git a/iac/provider-gcp/nomad-cluster/main.tf b/iac/provider-gcp/nomad-cluster/main.tf
index c89f2f18da..3c9d5fb1d7 100644
--- a/iac/provider-gcp/nomad-cluster/main.tf
+++ b/iac/provider-gcp/nomad-cluster/main.tf
@@ -96,6 +96,7 @@ module "network" {
   api_use_nat = var.api_use_nat
   api_nat_ips = var.api_nat_ips
 
+  ingress_port = var.ingress_port
   api_port = var.api_port
   docker_reverse_proxy_port = var.docker_reverse_proxy_port
   network_name = var.network_name
diff --git a/iac/provider-gcp/nomad-cluster/network/ingress.tf b/iac/provider-gcp/nomad-cluster/network/ingress.tf
new file mode 100644
index 0000000000..eedb8ea79d
--- /dev/null
+++ b/iac/provider-gcp/nomad-cluster/network/ingress.tf
@@ -0,0 +1,61 @@
+# Global external HTTPS load balancer for the ingress port of var.api_instance_group:
+# health check -> backend service -> URL map -> HTTPS proxy -> forwarding rule.
+resource "google_compute_health_check" "ingress" {
+  name = "${var.prefix}ingress"
+
+  timeout_sec = 3
+  check_interval_sec = 5
+  healthy_threshold = 2
+  unhealthy_threshold = 2
+
+  http_health_check {
+    port = var.ingress_port.port
+    request_path = var.ingress_port.health_path
+  }
+}
+
+resource "google_compute_backend_service" "ingress" {
+  name = "${var.prefix}ingress"
+
+  protocol = "HTTP"
+  port_name = var.ingress_port.name
+
+  session_affinity = null
+  health_checks = [google_compute_health_check.ingress.id]
+
+  timeout_sec = 65
+
+  load_balancing_scheme = "EXTERNAL_MANAGED"
+  locality_lb_policy = "ROUND_ROBIN"
+
+  backend {
+    group = var.api_instance_group
+  }
+}
+
+resource "google_compute_url_map" "ingress" {
+  name = "${var.prefix}ingress"
+  default_service = google_compute_backend_service.ingress.self_link
+}
+
+resource "google_compute_global_forwarding_rule" "ingress" {
+  # NOTE(review): the name says "http", but this rule forwards port 443 to the HTTPS proxy.
+  name = "${var.prefix}ingress-forward-http"
+  ip_protocol = "TCP"
+  port_range = "443"
+  load_balancing_scheme = "EXTERNAL_MANAGED"
+  ip_address = google_compute_global_address.ingress_ipv4.address
+  target = google_compute_target_https_proxy.ingress.self_link
+}
+
+resource "google_compute_global_address" "ingress_ipv4" {
+  name = "${var.prefix}ingress-ipv4"
+  ip_version = "IPV4"
+}
+
+resource "google_compute_target_https_proxy" "ingress" {
+  name = "${var.prefix}ingress-https"
+  url_map = google_compute_url_map.ingress.self_link
+
+  certificate_map = "//certificatemanager.googleapis.com/${google_certificate_manager_certificate_map.certificate_map.id}"
+}
diff --git a/iac/provider-gcp/nomad-cluster/network/main.tf b/iac/provider-gcp/nomad-cluster/network/main.tf
index 92927fe19a..c4a9508fa6 100644
--- a/iac/provider-gcp/nomad-cluster/network/main.tf
+++ b/iac/provider-gcp/nomad-cluster/network/main.tf
@@ -455,6 +455,11 @@ resource "google_compute_firewall" "default-hc" {
     }
   }
 
+  allow {
+    protocol = "tcp"
+    ports = [var.ingress_port.port]
+  }
+
   dynamic "allow" {
     for_each = toset(var.additional_ports)
 
diff --git a/iac/provider-gcp/nomad-cluster/network/variables.tf b/iac/provider-gcp/nomad-cluster/network/variables.tf
index 640672814b..aa6db683f2 100644
--- a/iac/provider-gcp/nomad-cluster/network/variables.tf
+++ b/iac/provider-gcp/nomad-cluster/network/variables.tf
@@ -51,6 +51,14 @@ variable "api_port" {
   })
 }
 
+variable "ingress_port" {
+  type = object({
+    name = string
+    port = number
+    health_path = string
+  })
+}
+
 variable "docker_reverse_proxy_port" {
   type = object({
     name = string
diff --git a/iac/provider-gcp/nomad-cluster/nodepool-api.tf b/iac/provider-gcp/nomad-cluster/nodepool-api.tf
index 9fbe6422ab..848fb6f92a 100644
--- a/iac/provider-gcp/nomad-cluster/nodepool-api.tf
+++ b/iac/provider-gcp/nomad-cluster/nodepool-api.tf
@@ -71,6 +71,11 @@ resource "google_compute_instance_group_manager" "api_pool" {
     port = var.docker_reverse_proxy_port.port
   }
 
+  named_port {
+    name = var.ingress_port.name
+    port = var.ingress_port.port
+  }
+
   dynamic "named_port" {
     for_each = local.api_additional_ports
     content {
diff --git a/iac/provider-gcp/nomad-cluster/variables.tf b/iac/provider-gcp/nomad-cluster/variables.tf
index f18d7072a1..c6d449cbcb 100644
--- a/iac/provider-gcp/nomad-cluster/variables.tf
+++ b/iac/provider-gcp/nomad-cluster/variables.tf
@@ -100,6 +100,14 @@ variable "api_port" {
   })
 }
 
+variable "ingress_port" {
+  type = object({
+    name = string
+    port = number
+    health_path = string
+  })
+}
+
 variable "docker_reverse_proxy_port" {
   type = object({
     name = string
diff --git a/iac/provider-gcp/nomad/jobs/ingress.hcl b/iac/provider-gcp/nomad/jobs/ingress.hcl
new file mode 100644
index 0000000000..378eff328e
--- /dev/null
+++ b/iac/provider-gcp/nomad/jobs/ingress.hcl
@@ -0,0 +1,101 @@
+# Traefik ingress job. This file is rendered by Terraform's templatefile(), so
+# dollar-brace and percent-brace sequences below are template directives.
+job "ingress" {
+  datacenters = ["${gcp_zone}"]
+  node_pool = "${node_pool}"
+  priority = 90
+
+  group "ingress" {
+    count = ${count}
+
+    # Keep allocations on distinct nodes: every allocation binds the same static ports.
+    constraint {
+      operator = "distinct_hosts"
+      value = "true"
+    }
+
+    network {
+      port "ingress" {
+        static = "${ingress_port}"
+      }
+
+      port "control" {
+        static = "${control_port}"
+      }
+    }
+
+# https://developer.hashicorp.com/nomad/docs/job-specification/update
+%{ if update_stanza }
+    update {
+      max_parallel = 1 # Update only 1 node at a time
+    }
+%{ endif }
+
+    service {
+      port = "ingress"
+      name = "ingress"
+      task = "ingress"
+
+      check {
+        type = "http"
+        name = "health"
+        path = "/ping"
+        interval = "3s"
+        timeout = "3s"
+        port = "${ingress_port}"
+      }
+    }
+
+    task "ingress" {
+      driver = "docker"
+
+      %{ if update_stanza }
+      kill_timeout = "24h"
+      %{ endif }
+
+      kill_signal = "SIGTERM"
+
+      config {
+        network_mode = "host"
+        image = "traefik:v3.5"
+        ports = ["control", "ingress"]
+        args = [
+          # Entry-points that are set internally by Traefik
+          "--entrypoints.web.address=:${ingress_port}",
+          "--entrypoints.traefik.address=:${control_port}",
+
+          # Traefik internals (logging, metrics, ...)
+          "--api.dashboard=true",
+          "--api.insecure=false",
+
+          "--accesslog=true",
+          # /ping is served on the "web" entry point, i.e. on the ingress port probed by the service check above.
+          "--ping=true",
+          "--ping.entryPoint=web",
+          "--metrics=true",
+          "--metrics.prometheus=true",
+          "--metrics.prometheus.entryPoint=traefik",
+
+          # Traefik Nomad provider
+          # Route only services that opt in with traefik.enable=true, matching the Consul provider below.
+          "--providers.nomad=true",
+          "--providers.nomad.exposedByDefault=false",
+          "--providers.nomad.endpoint.address=${nomad_endpoint}",
+          "--providers.nomad.endpoint.token=${nomad_token}",
+
+          # Traefik Consul provider
+          "--providers.consulcatalog=true",
+          "--providers.consulcatalog.exposedByDefault=false",
+          "--providers.consulcatalog.endpoint.address=${consul_endpoint}",
+          "--providers.consulcatalog.endpoint.token=${consul_token}",
+        ]
+      }
+
+      resources {
+        memory_max = ${memory_mb * 1.5}
+        memory = ${memory_mb}
+        cpu = ${cpu_count * 1000}
+      }
+    }
+  }
+}
diff --git a/iac/provider-gcp/nomad/main.tf b/iac/provider-gcp/nomad/main.tf
index eb93b32d23..40a13cd6a2 100644
--- a/iac/provider-gcp/nomad/main.tf
+++ b/iac/provider-gcp/nomad/main.tf
@@ -68,6 +68,27 @@ resource "docker_image" "db_migrator_image" {
   platform = "linux/amd64/v8"
 }
 
+resource "nomad_job" "ingress" {
+  jobspec = templatefile("${path.module}/jobs/ingress.hcl",
+    {
+      count = var.ingress_count
+      update_stanza = var.api_machine_count > 1
+      cpu_count = 1
+      memory_mb = 512
+      node_pool = var.api_node_pool
+      gcp_zone = var.gcp_zone
+
+      ingress_port = var.ingress_port.port
+      control_port = 8900
+
+      nomad_endpoint = "http://localhost:4646"
+      nomad_token = var.nomad_acl_token_secret
+
+      consul_token = var.consul_acl_token_secret
+      consul_endpoint = "http://localhost:8500"
+  })
+}
+
 resource "nomad_job" "api" {
   jobspec = templatefile("${path.module}/jobs/api.hcl", {
     update_stanza = var.api_machine_count > 1
diff --git a/iac/provider-gcp/nomad/variables.tf b/iac/provider-gcp/nomad/variables.tf
index d946b7ca2a..6e747e230f 100644
--- a/iac/provider-gcp/nomad/variables.tf
+++ b/iac/provider-gcp/nomad/variables.tf
@@ -64,6 +64,18 @@ variable "api_port" {
   })
 }
 
+variable "ingress_port" {
+  type = object({
+    name = string
+    port = number
+    health_path = string
+  })
+}
+
+variable "ingress_count" {
+  type = number
+}
+
 variable "api_resources_cpu_count" {
   type = number
 }
diff --git a/iac/provider-gcp/variables.tf b/iac/provider-gcp/variables.tf
index d979c37dc6..97942b6e1a 100644
--- a/iac/provider-gcp/variables.tf
+++ b/iac/provider-gcp/variables.tf
@@ -155,6 +155,11 @@ variable "client_proxy_count" {
   default = 1
 }
 
+variable "ingress_count" {
+  type = number
+  default = 1
+}
+
 variable "client_proxy_resources_memory_mb" {
   type = number
   default = 1024
@@ -223,6 +228,19 @@ variable "api_port" {
   }
 }
 
+variable "ingress_port" {
+  type = object({
+    name = string
+    port = number
+    health_path = string
+  })
+  default = {
+    name = "ingress"
+    port = 8800
+    health_path = "/ping"
+  }
+}
+
 variable "docker_reverse_proxy_port" {
   type = object({
     name = string