README.md (3 additions, 1 deletion)
```diff
@@ -596,7 +596,7 @@ Options:
   -c, --ctx-size <CTX_SIZE>
           Sets context sizes for chat and embedding models, respectively. The sizes are separated by comma without space, for example, '--ctx-size 4096,384'. The first value is for the chat model, and the second is for the embedding model [default: 4096,384]
   -p, --prompt-template <PROMPT_TEMPLATE>
-          Sets prompt templates for chat and embedding models, respectively. The prompt templates are separated by comma without space, for example, '--prompt-template llama-2-chat,embedding'. The first value is for the chat model, and the second is for the embedding model [possible values: llama-2-chat, llama-3-chat, llama-3-tool, mistral-instruct, mistral-tool, mistrallite, mistral-small-chat, openchat, codellama-instruct, codellama-super-instruct, human-assistant, vicuna-1.0-chat, vicuna-1.1-chat, vicuna-llava, chatml, chatml-tool, internlm-2-tool, baichuan-2, wizard-coder, zephyr, stablelm-zephyr, intel-neural, deepseek-chat, deepseek-coder, deepseek-chat-2, deepseek-chat-25, deepseek-chat-3, solar-instruct, phi-2-chat, phi-2-instruct, phi-3-chat, phi-3-instruct, phi-4-chat, gemma-instruct, octopus, glm-4-chat, groq-llama3-tool, mediatek-breeze, nemotron-chat, nemotron-tool, functionary-32, functionary-31, minicpmv, moxin-chat, falcon3, megrez, qwen2-vision, embedding, none]
+          Sets prompt templates for chat and embedding models, respectively. The prompt templates are separated by comma without space, for example, '--prompt-template llama-2-chat,embedding'. The first value is for the chat model, and the second is for the embedding model [possible values: llama-2-chat, llama-3-chat, llama-3-tool, mistral-instruct, mistral-tool, mistrallite, mistral-small-chat, mistral-small-tool, openchat, codellama-instruct, codellama-super-instruct, human-assistant, vicuna-1.0-chat, vicuna-1.1-chat, vicuna-llava, chatml, chatml-tool, internlm-2-tool, baichuan-2, wizard-coder, zephyr, stablelm-zephyr, intel-neural, deepseek-chat, deepseek-coder, deepseek-chat-2, deepseek-chat-25, deepseek-chat-3, solar-instruct, phi-2-chat, phi-2-instruct, phi-3-chat, phi-3-instruct, phi-4-chat, gemma-instruct, octopus, glm-4-chat, groq-llama3-tool, mediatek-breeze, nemotron-chat, nemotron-tool, functionary-32, functionary-31, minicpmv, moxin-chat, falcon3, megrez, qwen2-vision, embedding, none]
   -r, --reverse-prompt <REVERSE_PROMPT>
           Halt generation at PROMPT, return control
   -n, --n-predict <N_PREDICT>
@@ -637,6 +637,8 @@ Options:
           Maximum number of user messages used in the retrieval [default: 1]
       --kw-search-url <KW_SEARCH_URL>
           URL of the keyword search service
+      --include-usage
+          Whether to include usage in the stream response. Defaults to false
       --socket-addr <SOCKET_ADDR>
           Socket address of LlamaEdge-RAG API Server instance. For example, `0.0.0.0:8080`
```
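The two additions in this diff, the `mistral-small-tool` prompt template and the `--include-usage` flag, would show up in a server launch roughly like the sketch below. Only the flags and the chat/embedding pairing convention come from the help text above; the binary name (`rag-api-server.wasm`), the model file names, and the `--nn-preload`/`--model-name` arguments are assumptions for illustration, not part of this diff.

```shell
# Hypothetical launch sketch for a LlamaEdge-RAG API server.
# Model file names and preload aliases are placeholders, not from the diff.
wasmedge --dir .:. \
  --nn-preload default:GGML:AUTO:Mistral-Small-Instruct-Q5_K_M.gguf \
  --nn-preload embedding:GGML:AUTO:nomic-embed-text-v1.5-Q5_K_M.gguf \
  rag-api-server.wasm \
  --model-name mistral-small,nomic-embed-text-v1.5 \
  --ctx-size 4096,384 \
  --prompt-template mistral-small-tool,embedding \
  --include-usage \
  --socket-addr 0.0.0.0:8080
```

Note how each paired option gives the chat model's value first and the embedding model's value second, comma-separated without spaces, matching the `--ctx-size 4096,384` example in the help text.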