From 1d7e313b47865acd8b65bcb9f30d364432f0d680 Mon Sep 17 00:00:00 2001 From: X1aoZEOuO Date: Tue, 5 Aug 2025 11:10:00 +0800 Subject: [PATCH 1/3] feat: add label for prometheus query. Signed-off-by: X1aoZEOuO --- pkg/controller/inference/service_controller.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/controller/inference/service_controller.go b/pkg/controller/inference/service_controller.go index 1f62cabd..393a80f4 100644 --- a/pkg/controller/inference/service_controller.go +++ b/pkg/controller/inference/service_controller.go @@ -131,7 +131,7 @@ func (r *ServiceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct } // Create a service for the leader pods of the lws for loadbalancing. - if err := CreateServiceIfNotExists(ctx, r.Client, r.Scheme, service); err != nil { + if err := CreateServiceIfNotExists(ctx, r.Client, r.Scheme, service, models); err != nil { return ctrl.Result{}, err } @@ -419,7 +419,7 @@ func setControllerReferenceForWorkload(owner metav1.Object, lws *applyconfigurat return nil } -func CreateServiceIfNotExists(ctx context.Context, k8sClient client.Client, Scheme *runtime.Scheme, service *inferenceapi.Service) error { +func CreateServiceIfNotExists(ctx context.Context, k8sClient client.Client, Scheme *runtime.Scheme, service *inferenceapi.Service, model []*coreapi.OpenModel) error { log := ctrl.LoggerFrom(ctx) // The load balancing service name. svcName := service.Name + "-lb" @@ -433,6 +433,7 @@ func CreateServiceIfNotExists(ctx context.Context, k8sClient client.Client, Sche ObjectMeta: metav1.ObjectMeta{ Name: svcName, Namespace: service.Namespace, + Labels: modelLabels(model[0]), }, Spec: corev1.ServiceSpec{ Ports: []corev1.ServicePort{ From 78648c20a8a8169b745f82bb283bb9dbf6d3f843 Mon Sep 17 00:00:00 2001 From: X1aoZEOuO Date: Mon, 1 Sep 2025 11:10:00 +0800 Subject: [PATCH 2/3] feat: add serverless config for keda. 
Signed-off-by: X1aoZEOuO --- docs/examples/serverless/basic.yaml | 76 +++++++++++++++++++ docs/examples/serverless/scaled-object.yaml | 21 +++++ docs/examples/serverless/service-monitor.yaml | 18 +++++ 3 files changed, 115 insertions(+) create mode 100644 docs/examples/serverless/basic.yaml create mode 100644 docs/examples/serverless/scaled-object.yaml create mode 100644 docs/examples/serverless/service-monitor.yaml diff --git a/docs/examples/serverless/basic.yaml b/docs/examples/serverless/basic.yaml new file mode 100644 index 00000000..9fc5761b --- /dev/null +++ b/docs/examples/serverless/basic.yaml @@ -0,0 +1,76 @@ +apiVersion: llmaz.io/v1alpha1 +kind: OpenModel +metadata: + name: qwen2-0--5b +spec: + familyName: qwen2 + source: + modelHub: + modelID: Qwen/Qwen2-0.5B-Instruct-GGUF + filename: qwen2-0_5b-instruct-q5_k_m.gguf +--- +apiVersion: inference.llmaz.io/v1alpha1 +kind: Playground +metadata: + name: qwen2-0--5b +spec: + replicas: 0 + modelClaim: + modelName: qwen2-0--5b + backendRuntimeConfig: + backendName: llamacpp + configName: default + args: + - -fa # use flash attention +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: GatewayClass +metadata: + name: default-envoy-ai-gateway +spec: + controllerName: gateway.envoyproxy.io/gatewayclass-controller +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: default-envoy-ai-gateway +spec: + gatewayClassName: default-envoy-ai-gateway + listeners: + - name: http + protocol: HTTP + port: 80 +--- +apiVersion: aigateway.envoyproxy.io/v1alpha1 +kind: AIGatewayRoute +metadata: + name: default-envoy-ai-gateway +spec: + schema: + name: OpenAI + targetRefs: + - name: default-envoy-ai-gateway + kind: Gateway + group: gateway.networking.k8s.io + rules: + - matches: + - headers: + - type: Exact + name: x-ai-eg-model + value: qwen2-0--5b + backendRefs: + - name: qwen2-0--5b +--- +apiVersion: aigateway.envoyproxy.io/v1alpha1 +kind: AIServiceBackend +metadata: + name: qwen2-0--5b +spec: + 
timeouts: + request: 3m + schema: + name: OpenAI + backendRef: + name: qwen2-0--5b-lb + kind: Service + port: 8080 \ No newline at end of file diff --git a/docs/examples/serverless/scaled-object.yaml b/docs/examples/serverless/scaled-object.yaml new file mode 100644 index 00000000..c8bd4326 --- /dev/null +++ b/docs/examples/serverless/scaled-object.yaml @@ -0,0 +1,21 @@ +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: qwen2-0--5b-scaler + namespace: default +spec: + scaleTargetRef: + apiVersion: inference.llmaz.io/v1alpha1 + kind: Playground + name: qwen2-0--5b + pollingInterval: 30 + cooldownPeriod: 50 + minReplicaCount: 0 + maxReplicaCount: 3 + triggers: + - type: prometheus + metadata: + serverAddress: http://prometheus-operated.llmaz-system.svc.cluster.local:9090 + metricName: llamacpp:requests_processing + query: sum(llamacpp:requests_processing) + threshold: "0.2" \ No newline at end of file diff --git a/docs/examples/serverless/service-monitor.yaml b/docs/examples/serverless/service-monitor.yaml new file mode 100644 index 00000000..779c88c5 --- /dev/null +++ b/docs/examples/serverless/service-monitor.yaml @@ -0,0 +1,18 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: qwen2-0--5b-lb-monitor + namespace: llmaz-system + labels: + control-plane: controller-manager + app.kubernetes.io/name: servicemonitor +spec: + namespaceSelector: + any: true + selector: + matchLabels: + llmaz.io/model-name: qwen2-0--5b + endpoints: + - port: http + path: /metrics + scheme: http \ No newline at end of file From b73e6f066a6a5a740a9558972881b3b5f61dfe43 Mon Sep 17 00:00:00 2001 From: X1aoZEOuO Date: Tue, 9 Sep 2025 11:10:00 +0800 Subject: [PATCH 3/3] feat: add serverless usage doc for llmaz. 
Signed-off-by: X1aoZEOuO --- Makefile | 9 ++ docs/examples/serverless/README.md | 119 ++++++++++++++++++ .../inference/service_controller.go | 2 +- 3 files changed, 129 insertions(+), 1 deletion(-) create mode 100644 docs/examples/serverless/README.md diff --git a/Makefile b/Makefile index d653c968..fca04dc0 100644 --- a/Makefile +++ b/Makefile @@ -302,6 +302,15 @@ install-prometheus: uninstall-prometheus: kubectl delete -k config/prometheus +.PHONY: install-keda +install-keda: + helm repo add kedacore https://kedacore.github.io/charts + helm install keda kedacore/keda --namespace keda --create-namespace + +.PHONY: uninstall-keda +uninstall-keda: + helm uninstall keda -n keda + ##@Release .PHONY: artifacts diff --git a/docs/examples/serverless/README.md b/docs/examples/serverless/README.md new file mode 100644 index 00000000..feb49289 --- /dev/null +++ b/docs/examples/serverless/README.md @@ -0,0 +1,119 @@ +# Serverless Configuration and Documentation + +## Overview + +This document provides a detailed guide on configuring serverless environments using Kubernetes, with a focus on integrating Prometheus for monitoring and KEDA for scaling. The configuration aims to ensure efficient resource utilization and seamless scaling of applications. + +## Concepts + +### Prometheus Configuration + +Prometheus is used for monitoring and alerting. To enable cross-namespace ServiceMonitor discovery, use `namespaceSelector`. In Prometheus, define `serviceMonitorSelector` to associate with ServiceMonitors. + +```yaml +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: qwen2-0--5b-lb-monitor + namespace: llmaz-system + labels: + control-plane: controller-manager + app.kubernetes.io/name: servicemonitor +spec: + namespaceSelector: + any: true + selector: + matchLabels: + llmaz.io/model-name: qwen2-0--5b + endpoints: + - port: http + path: /metrics + scheme: http +``` + +- Ensure that the `namespaceSelector` is set to allow cross-namespace monitoring. 
+- Label your services appropriately to be discovered by Prometheus. + +### KEDA Configuration + +KEDA (Kubernetes Event-driven Autoscaling) is used for scaling applications based on custom metrics. It can be integrated with Prometheus to trigger scaling actions. + + +```yaml +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: qwen2-0--5b-scaler + namespace: default +spec: + scaleTargetRef: + apiVersion: inference.llmaz.io/v1alpha1 + kind: Playground + name: qwen2-0--5b + pollingInterval: 30 + cooldownPeriod: 50 + minReplicaCount: 0 + maxReplicaCount: 3 + triggers: + - type: prometheus + metadata: + serverAddress: http://prometheus-operated.llmaz-system.svc.cluster.local:9090 + metricName: llamacpp:requests_processing + query: sum(llamacpp:requests_processing) + threshold: "0.2" +``` + +- Ensure that the `serverAddress` points to the correct Prometheus service. +- Adjust `pollingInterval` and `cooldownPeriod` to optimize scaling behavior and avoid conflicts with other scaling mechanisms. + +### Integration with Activator + +Consider integrating the serverless configuration with an activator for scale-from-zero scenarios. The activator can be implemented using a controller pattern or as a standalone goroutine. + +### Controller Runtime Framework + +Using the Controller Runtime framework can simplify the development of Kubernetes controllers. It provides abstractions for managing resources and handling events. + +#### Key Components + +1. **Controller**: Monitors resource states and triggers actions to align actual and desired states. +2. **Reconcile Function**: Core logic for transitioning resource states. +3. **Manager**: Manages the lifecycle of controllers and shared resources. +4. **Client**: Interface for interacting with the Kubernetes API. +5. **Scheme**: Registry for resource types. +6. **Event Source and Handler**: Define event sources and handling logic. + + +## Quick Start Guide + +1. 
Install Prometheus and KEDA using Helm charts, following the official documentation [Install Guide](https://llmaz.inftyai.com/docs/getting-started/installation/). +```bash +helm install llmaz oci://registry-1.docker.io/inftyai/llmaz --namespace llmaz-system --create-namespace --version 0.0.10 +make install-keda +make install-prometheus +``` + +2. Create a ServiceMonitor for Prometheus to discover your services. +```bash +kubectl apply -f service-monitor.yaml +``` + +3. Create a ScaledObject for KEDA to manage scaling. +```bash +kubectl apply -f scaled-object.yaml +``` + +4. Test with a cold start application. +```bash +kubectl exec -it -n kube-system deploy/activator -- wget -O- qwen2-0--5b-lb.default.svc:8080 +``` + +5. Check the Prometheus and KEDA dashboards in the web UI to monitor metrics and scaling activity. +```bash +kubectl port-forward services/prometheus-operated 9090:9090 --address 0.0.0.0 -n llmaz-system +``` + +## Conclusion + +This configuration guide provides a comprehensive approach to setting up a serverless environment with Kubernetes, Prometheus, and KEDA. By following these guidelines, you can ensure efficient scaling and monitoring of your applications. \ No newline at end of file diff --git a/pkg/controller/inference/service_controller.go b/pkg/controller/inference/service_controller.go index 393a80f4..61f939a2 100644 --- a/pkg/controller/inference/service_controller.go +++ b/pkg/controller/inference/service_controller.go @@ -433,7 +433,7 @@ func CreateServiceIfNotExists(ctx context.Context, k8sClient client.Client, Sche ObjectMeta: metav1.ObjectMeta{ Name: svcName, Namespace: service.Namespace, - Labels: modelLabels(model[0]), + Labels: modelLabels(model[0]), }, Spec: corev1.ServiceSpec{ Ports: []corev1.ServicePort{