Commit c8dbf91

Merge pull request #11 from meta-llama/release/0.0.54
Release/0.0.54
2 parents 806f072 + 74526a2 commit c8dbf91

File tree

12 files changed: +607 additions, -117 deletions

README.md

Lines changed: 151 additions & 115 deletions
Large diffs are not rendered by default.

build-libs.sh

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,7 @@ export SKIP_MOCK_TESTS=true
 ./gradlew :llama-stack-client-kotlin-core:spotlessApply
 ./gradlew :llama-stack-client-kotlin-client-okhttp:spotlessApply
 ./gradlew :llama-stack-client-kotlin:spotlessApply
+./gradlew :llama-stack-client-kotlin-client-local:spotlessApply

 ./gradlew build

@@ -17,3 +18,4 @@ echo $BUILD_JARS_DIR
 cp -a llama-stack-client-kotlin/build/libs/. $BUILD_JARS_DIR
 cp -a llama-stack-client-kotlin-client-okhttp/build/libs/. $BUILD_JARS_DIR
 cp -a llama-stack-client-kotlin-core/build/libs/. $BUILD_JARS_DIR
+cp -a llama-stack-client-kotlin-client-local/build/libs/. $BUILD_JARS_DIR

build.gradle.kts

Lines changed: 1 addition & 1 deletion
@@ -4,5 +4,5 @@ plugins {

 allprojects {
     group = "com.llama.llamastack"
-    version = "0.0.1-alpha.2"
+    version = "0.0.54"
 }
(New image file, 55.8 KB; preview not rendered.)
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+plugins {
+    id("llama-stack-client.kotlin")
+    id("llama-stack-client.publish")
+}
+
+dependencies {
+    api(project(":llama-stack-client-kotlin-core"))
+    testImplementation(kotlin("test"))
+    implementation(fileTree("libs") { include("*.jar") })
+    implementation(files("jni/**/*.so"))
+}
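The new module's build script above depends on the core project and bundles any JARs dropped into its libs/ directory. For the module to build at all it presumably also has to be registered in the root settings.gradle.kts; that file is not part of this diff, so the snippet below is only a sketch, with the module name taken from build-libs.sh.

// settings.gradle.kts (sketch, not part of this diff)
// Registers the new local-inference module alongside the existing ones.
include(":llama-stack-client-kotlin")
include(":llama-stack-client-kotlin-core")
include(":llama-stack-client-kotlin-client-okhttp")
include(":llama-stack-client-kotlin-client-local")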
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Purpose: The goal of this script is to get the ExecuTorch AAR file for local inferencing.
+# How to run:
+# - `sh download-prebuild-et-lib.sh` : download ExecuTorch AAR into app/libs (meant to be run in an Android app environment)
+# - `sh download-prebuild-et-lib.sh --unzip` : download and unzip ExecuTorch AAR in the working directory, then clean up.
+# Based on ExecuTorch Commit ID: 0a12e33d22a3d44d1aa2af5f0d0673d45b962553
+
+set -eu
+
+AAR_URL="https://ossci-android.s3.amazonaws.com/executorch/release/executorch-20241202/executorch.aar"
+AAR_SHASUM_URL="https://ossci-android.s3.amazonaws.com/executorch/release/executorch-20241202/executorch.aar.sha256sums"
+LIBS_PATH="$(dirname "$0")/app/libs"
+UNZIP=false
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    --unzip)
+      UNZIP=true
+      LIBS_PATH="$(dirname "$0")"
+      shift
+      ;;
+    *)
+      echo "Unknown option: $1"
+      exit 1
+      ;;
+  esac
+done
+
+if [ "$UNZIP" = false ]; then
+  mkdir -p "${LIBS_PATH}"
+fi
+
+curl -O "${AAR_SHASUM_URL}"
+shasum --check --status executorch.aar.sha256sums || curl "${AAR_URL}" -o "${LIBS_PATH}/executorch.aar"
+
+if [ "$UNZIP" = true ]; then
+  unzip -q executorch.aar
+  rm "executorch.aar" "executorch.aar.sha256sums"
+fi
Lines changed: 107 additions & 0 deletions
@@ -0,0 +1,107 @@
+// File generated from our OpenAPI spec by Stainless.
+
+package com.llama.llamastack.client.local
+
+import com.llama.llamastack.client.local.util.PromptFormatLocal
+import com.llama.llamastack.core.JsonValue
+import com.llama.llamastack.core.RequestOptions
+import com.llama.llamastack.models.CompletionMessage
+import com.llama.llamastack.models.EmbeddingsResponse
+import com.llama.llamastack.models.InferenceChatCompletionParams
+import com.llama.llamastack.models.InferenceChatCompletionResponse
+import com.llama.llamastack.models.InferenceCompletionParams
+import com.llama.llamastack.models.InferenceCompletionResponse
+import com.llama.llamastack.models.InferenceEmbeddingsParams
+import com.llama.llamastack.services.blocking.InferenceService
+import org.pytorch.executorch.LlamaCallback
+
+class InferenceServiceLocalImpl
+constructor(
+    private val clientOptions: LocalClientOptions,
+) : InferenceService, LlamaCallback {
+
+    private var resultMessage: String = ""
+    private var onResultComplete: Boolean = false
+    private var statsMetric: Float = 0.0f
+    private var onStatsComplete: Boolean = false
+    private var modelName: String = ""
+
+    private var sequenceLengthKey: String = "seq_len"
+
+    override fun onResult(p0: String?) {
+        if (PromptFormatLocal.getStopTokens(modelName).any { it == p0 }) {
+            onResultComplete = true
+            return
+        }
+
+        if (p0.equals("\n\n") || p0.equals("\n")) {
+            if (resultMessage.isNotEmpty()) {
+                resultMessage += p0
+            }
+        } else {
+            resultMessage += p0
+        }
+    }
+
+    override fun onStats(p0: Float) {
+        // Required since in some cases where seq_len is met, EOT is not appended by the
+        // ExecuTorch logic.
+        onResultComplete = true
+        statsMetric = p0
+        onStatsComplete = true
+    }
+
+    override fun chatCompletion(
+        params: InferenceChatCompletionParams,
+        requestOptions: RequestOptions
+    ): InferenceChatCompletionResponse {
+        resultMessage = ""
+        val mModule = clientOptions.llamaModule
+        modelName = params.modelId()
+        val formattedPrompt =
+            PromptFormatLocal.getTotalFormattedPrompt(params.messages(), modelName)
+
+        // Developers can pass in their own sequence length; if not, it defaults to a dynamic
+        // value so that enough tokens are available for a reasonably complete response:
+        // 0.75 is the approximate words-per-token ratio, and 64 is a buffer of tokens for the
+        // generated response.
+        val seqLength =
+            params._additionalQueryParams().values(sequenceLengthKey).lastOrNull()?.toInt()
+                ?: ((formattedPrompt.length * 0.75) + 64).toInt()
+
+        println("Chat Completion Prompt is: $formattedPrompt with seqLength of $seqLength")
+        onResultComplete = false
+        mModule.generate(formattedPrompt, seqLength, this, false)
+
+        while (!onResultComplete && !onStatsComplete) {
+            Thread.sleep(100)
+        }
+        onResultComplete = false
+        onStatsComplete = false
+
+        return InferenceChatCompletionResponse.ofChatCompletionResponse(
+            InferenceChatCompletionResponse.ChatCompletionResponse.builder()
+                .completionMessage(
+                    CompletionMessage.builder()
+                        .content(CompletionMessage.Content.ofString(resultMessage))
+                        .build()
+                )
+                .putAdditionalProperty("tps", JsonValue.from(statsMetric))
+                .build()
+        )
+    }
+
+    override fun completion(
+        params: InferenceCompletionParams,
+        requestOptions: RequestOptions
+    ): InferenceCompletionResponse {
+        TODO("Not yet implemented")
+    }
+
+    override fun embeddings(
+        params: InferenceEmbeddingsParams,
+        requestOptions: RequestOptions
+    ): EmbeddingsResponse {
+        TODO("Not yet implemented")
+    }
+}
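To make the default sequence-length heuristic in chatCompletion concrete, here is a small worked example; the 400-character prompt length is made up purely for illustration.

// Worked example of the seq_len default above (illustrative numbers only):
// a 400-character formatted prompt and no explicit "seq_len" query parameter.
val promptLength = 400
val seqLength = ((promptLength * 0.75) + 64).toInt() // 300 + 64 = 364 tokens requested from generate()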
Lines changed: 98 additions & 0 deletions

@@ -0,0 +1,98 @@
+// File generated from our OpenAPI spec by Stainless.
+
+package com.llama.llamastack.client.local
+
+import com.llama.llamastack.client.LlamaStackClientClient
+import com.llama.llamastack.client.LlamaStackClientClientAsync
+import com.llama.llamastack.models.*
+import com.llama.llamastack.services.blocking.*
+
+class LlamaStackClientClientLocalImpl
+constructor(
+    private val clientOptions: LocalClientOptions,
+) : LlamaStackClientClient {
+
+    private val inference: InferenceService by lazy { InferenceServiceLocalImpl(clientOptions) }
+
+    override fun inference(): InferenceService = inference
+
+    override fun async(): LlamaStackClientClientAsync {
+        TODO("Not yet implemented")
+    }
+
+    override fun telemetry(): TelemetryService {
+        TODO("Not yet implemented")
+    }
+
+    override fun datasetio(): DatasetioService {
+        TODO("Not yet implemented")
+    }
+
+    override fun scoring(): ScoringService {
+        TODO("Not yet implemented")
+    }
+
+    override fun scoringFunctions(): ScoringFunctionService {
+        TODO("Not yet implemented")
+    }
+
+    override fun evalTasks(): EvalTaskService {
+        TODO("Not yet implemented")
+    }
+
+    override fun agents(): AgentService {
+        TODO("Not yet implemented")
+    }
+
+    override fun batchInference(): BatchInferenceService {
+        TODO("Not yet implemented")
+    }
+
+    override fun datasets(): DatasetService {
+        TODO("Not yet implemented")
+    }
+
+    override fun eval(): EvalService {
+        TODO("Not yet implemented")
+    }
+
+    override fun inspect(): InspectService {
+        TODO("Not yet implemented")
+    }
+
+    override fun safety(): SafetyService {
+        TODO("Not yet implemented")
+    }
+
+    override fun memory(): MemoryService {
+        TODO("Not yet implemented")
+    }
+
+    override fun postTraining(): PostTrainingService {
+        TODO("Not yet implemented")
+    }
+
+    override fun providers(): ProviderService {
+        TODO("Not yet implemented")
+    }
+
+    override fun routes(): RouteService {
+        TODO("Not yet implemented")
+    }
+
+    override fun syntheticDataGeneration(): SyntheticDataGenerationService {
+        TODO("Not yet implemented")
+    }
+
+    override fun models(): ModelService {
+        TODO("Not yet implemented")
+    }
+
+    override fun memoryBanks(): MemoryBankService {
+        TODO("Not yet implemented")
+    }
+
+    override fun shields(): ShieldService {
+        TODO("Not yet implemented")
+    }
+}
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+package com.llama.llamastack.client.local
+
+import com.llama.llamastack.client.LlamaStackClientClient
+
+class LlamaStackClientLocalClient private constructor() {
+
+    companion object {
+        fun builder() = Builder()
+    }
+
+    class Builder {
+
+        private var clientOptions: LocalClientOptions.Builder = LocalClientOptions.builder()
+        private var modelPath: String? = null
+        private var tokenizerPath: String? = null
+        private var temperature: Float = 0.0F
+
+        fun modelPath(modelPath: String) = apply { this.modelPath = modelPath }
+
+        fun tokenizerPath(tokenizerPath: String) = apply { this.tokenizerPath = tokenizerPath }
+
+        fun temperature(temperature: Float) = apply { this.temperature = temperature }
+
+        fun fromEnv() = apply { clientOptions.fromEnv() }
+
+        fun build(): LlamaStackClientClient {
+
+            return LlamaStackClientClientLocalImpl(
+                clientOptions
+                    .modelPath(modelPath!!)
+                    .tokenizerPath(tokenizerPath!!)
+                    .temperature(temperature)
+                    .build()
+            )
+        }
+    }
+}
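This builder, together with InferenceServiceLocalImpl above, is how an app obtains a locally backed LlamaStackClientClient. A hedged usage sketch follows: the on-device file paths and model ID are hypothetical, and the InferenceChatCompletionParams builder calls are assumed from the modelId()/messages() accessors used in the service implementation rather than shown in this diff.

// Sketch only: paths, model ID, and the params builder API are assumptions, not taken from this diff.
val client = LlamaStackClientLocalClient.builder()
    .modelPath("/data/local/tmp/llama/llama3_2_1b.pte")     // hypothetical ExecuTorch .pte model
    .tokenizerPath("/data/local/tmp/llama/tokenizer.model") // hypothetical tokenizer file
    .temperature(0.0F)
    .build()

val response = client.inference().chatCompletion(
    InferenceChatCompletionParams.builder()
        .modelId("meta-llama/Llama-3.2-1B-Instruct")
        .messages(messages) // hypothetical list of messages built with the models package; requestOptions assumed to default
        .build()
)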
Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+// File generated from our OpenAPI spec by Stainless.
+
+package com.llama.llamastack.client.local
+
+import com.llama.llamastack.errors.LlamaStackClientException
+import org.pytorch.executorch.LlamaModule
+
+class LocalClientOptions
+private constructor(
+    val modelPath: String,
+    val tokenizerPath: String,
+    val temperature: Float,
+    val llamaModule: LlamaModule
+) {
+
+    companion object {
+        fun builder() = Builder()
+    }
+
+    class Builder {
+        private var modelPath: String? = null
+        private var tokenizerPath: String? = null
+        private var temperature: Float = 0.0F
+        private var llamaModule: LlamaModule? = null
+
+        fun modelPath(modelPath: String) = apply { this.modelPath = modelPath }
+
+        fun tokenizerPath(tokenizerPath: String) = apply { this.tokenizerPath = tokenizerPath }
+
+        fun temperature(temperature: Float) = apply { this.temperature = temperature }
+
+        fun fromEnv() = apply {}
+
+        fun build(): LocalClientOptions {
+            checkNotNull(modelPath) { "`modelPath` is required but not set" }
+            checkNotNull(tokenizerPath) { "`tokenizerPath` is required but not set" }
+
+            try {
+                this.llamaModule = LlamaModule(1, modelPath, tokenizerPath, temperature)
+                checkNotNull(llamaModule) { "`llamaModule` failed to initialize" }
+                llamaModule!!.load()
+                println(
+                    "llamaModule loading with modelPath: $modelPath | " +
+                        "tokenizerPath: $tokenizerPath | temperature: $temperature"
+                )
+                return LocalClientOptions(modelPath!!, tokenizerPath!!, temperature, llamaModule!!)
+            } catch (e: NoClassDefFoundError) {
+                throw LlamaStackClientException(
+                    "ExecuTorch AAR file needs to be included in the libs/ for your app. " +
+                        "Please see the README for more details: " +
+                        "https://github.com/meta-llama/llama-stack-client-kotlin/tree/main",
+                    e
+                )
+            }
+        }
+    }
+}
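Because build() eagerly constructs and loads the ExecuTorch LlamaModule, a missing AAR surfaces as a LlamaStackClientException when the local client is constructed. A sketch of how an app might guard against that follows; the fallback-to-null behaviour and the path variables are illustrative assumptions, not part of this commit.

// Illustrative only: fall back to null (e.g. to a remote client) if the ExecuTorch AAR is absent.
val localClient: LlamaStackClientClient? =
    try {
        LlamaStackClientLocalClient.builder()
            .modelPath(modelPath)         // hypothetical variable holding the on-device model path
            .tokenizerPath(tokenizerPath) // hypothetical variable holding the tokenizer path
            .build()
    } catch (e: LlamaStackClientException) {
        println("Local inference unavailable, ExecuTorch AAR missing: ${e.message}")
        null
    }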
