
Commit 7e460e6

lukeramsden and aaronvg authored
Add cached input token tracking to Usage reporting (#2394)
## Summary

This PR adds comprehensive cached input token tracking to the BAML Usage reporting system. The Collector and Usage now track and report cached input tokens alongside the existing input and output tokens.

### Changes Made

- **Core Usage struct**: Added a `cached_input_tokens: Option<u64>` field to track cached tokens
- **LLM Provider Integration**: Implemented cached token extraction for all supported providers:
  - **Anthropic**: Extracts from `cache_read_input_tokens`
  - **OpenAI**: Extracts from `input_tokens_details.cached_tokens`
  - **Google/Vertex**: Uses the `cached_content_token_count` field
  - **AWS Bedrock**: Set to `None` (the SDK version BAML currently uses has no cached token support, and upgrading is blocked by a dependency issue; see Cargo.toml)
- **Token Aggregation**: Updated all token aggregation logic in Collector and FunctionLog to sum cached tokens
- **Language Bindings**: Added cached token support to all client libraries:
  - TypeScript: `usage.cachedInputTokens`
  - Python: `usage.cached_input_tokens`
  - Go: `usage.CachedInputTokens()`
  - Ruby: `usage.cached_input_tokens`
- **RPC Integration**: Updated RPC types and converters to include cached token data

### Test Plan

- [x] Core library compilation verified
- [x] All provider response handlers updated with cached token extraction
- [x] Language binding interfaces expanded with cached token accessors
- [x] Token aggregation logic preserves cached token counts across multiple calls
- [x] RPC serialization includes cached token data

### Technical Notes

- Cached tokens are tracked separately from input/output tokens for better cost analysis
- Provider-specific token extraction handles cases where cached token data is unavailable
- All changes are backward compatible with the existing Usage API
- Language bindings maintain consistent naming conventions across all supported languages

Closes #2349

---

> [!IMPORTANT]
> Add cached input token tracking to BAML Usage reporting, updating core structures, provider integrations, token aggregation, language bindings, and tests.
>
> - **Behavior**:
>   - Added a `cached_input_tokens` field to `LLMUsage` (`events.rs`, `trace_event.rs`) and `LLMCompleteResponseMetadata` (`mod.rs`) to track cached tokens.
>   - Implemented cached token extraction for providers: `Anthropic` (from `cache_read_input_tokens`), `OpenAI` (from `input_tokens_details.cached_tokens`), `Google/Vertex` (from `cached_content_token_count`), and `AWS Bedrock` (set to `None`).
>   - Updated token aggregation logic in `storage.rs` and `llm_response_to_log_event.rs` to sum cached tokens.
> - **Language Bindings**:
>   - Added cached token support to TypeScript (`native.d.ts`), Python (`log_collector.rs`), Go (`rawobjects_public.go`), and Ruby (`log_collector.rs`).
> - **RPC Integration**:
>   - Updated RPC types and converters in `trace_data.rs` to include cached token data.
> - **Tests**:
>   - Added tests in `test_collector.py` and `collector.test.ts` to verify cached token tracking for various providers and scenarios.

---

Co-authored-by: aaronvg <[email protected]>
1 parent 013577d commit 7e460e6

36 files changed: +1603 -1873 lines
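The Collector/FunctionLog aggregation change itself (in `storage.rs` and `llm_response_to_log_event.rs`) is not among the excerpts below. As a rough illustration of what "sum cached tokens" across multiple calls can look like when every provider reports them optionally, here is a hedged sketch — illustrative only, with made-up type and function names, not the PR's actual aggregation code:

```rust
// Hedged sketch (not the PR's code): summing optional per-call token counts so that a
// call whose provider reports no cached-token data does not erase an existing total.
#[derive(Debug, Default, Clone, Copy)]
struct Usage {
    input_tokens: Option<u64>,
    output_tokens: Option<u64>,
    cached_input_tokens: Option<u64>,
}

// Treat None as "no data": only stay None if neither side has a value.
fn add_opt(total: Option<u64>, delta: Option<u64>) -> Option<u64> {
    match (total, delta) {
        (None, None) => None,
        (t, d) => Some(t.unwrap_or(0) + d.unwrap_or(0)),
    }
}

fn aggregate(calls: &[Usage]) -> Usage {
    calls.iter().fold(Usage::default(), |acc, call| Usage {
        input_tokens: add_opt(acc.input_tokens, call.input_tokens),
        output_tokens: add_opt(acc.output_tokens, call.output_tokens),
        cached_input_tokens: add_opt(acc.cached_input_tokens, call.cached_input_tokens),
    })
}

fn main() {
    let calls = [
        Usage { input_tokens: Some(120), output_tokens: Some(30), cached_input_tokens: Some(64) },
        // Second call's provider reported no cached-token data at all.
        Usage { input_tokens: Some(80), output_tokens: Some(25), cached_input_tokens: None },
    ];
    let total = aggregate(&calls);
    assert_eq!(total.input_tokens, Some(200));
    assert_eq!(total.cached_input_tokens, Some(64));
}
```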

engine/baml-lib/baml-types/src/tracing/events.rs

Lines changed: 1 addition & 0 deletions
```diff
@@ -593,6 +593,7 @@ pub struct LLMUsage {
     pub input_tokens: Option<u64>,
     pub output_tokens: Option<u64>,
     pub total_tokens: Option<u64>,
+    pub cached_input_tokens: Option<u64>,
 }
 
 #[cfg(test)]
```

engine/baml-rpc/src/runtime_api/trace_event.rs

Lines changed: 1 addition & 0 deletions
```diff
@@ -183,4 +183,5 @@ pub struct LLMUsage {
     pub input_tokens: Option<u64>,
     pub output_tokens: Option<u64>,
     pub total_tokens: Option<u64>,
+    pub cached_input_tokens: Option<u64>,
 }
```

engine/baml-runtime/src/internal/llm_client/mod.rs

Lines changed: 1 addition & 0 deletions
```diff
@@ -294,6 +294,7 @@ pub struct LLMCompleteResponseMetadata {
     pub prompt_tokens: Option<u64>,
     pub output_tokens: Option<u64>,
     pub total_tokens: Option<u64>,
+    pub cached_input_tokens: Option<u64>,
 }
 
 // This is how the response gets logged if you print the result to the console.
```

engine/baml-runtime/src/internal/llm_client/primitive/anthropic/response_handler.rs

Lines changed: 4 additions & 0 deletions
```diff
@@ -90,6 +90,7 @@ pub fn parse_anthropic_response<C: WithClient + RequestBuilder>(
             prompt_tokens: Some(response.usage.input_tokens),
             output_tokens: Some(response.usage.output_tokens),
             total_tokens: Some(response.usage.input_tokens + response.usage.output_tokens),
+            cached_input_tokens: response.usage.cache_read_input_tokens,
         },
     })
 }
@@ -137,6 +138,7 @@ pub fn scan_anthropic_response_stream(
                 inner.prompt_tokens = Some(body.usage.input_tokens);
                 inner.output_tokens = Some(body.usage.output_tokens);
                 inner.total_tokens = Some(body.usage.input_tokens + body.usage.output_tokens);
+                inner.cached_input_tokens = body.usage.cache_read_input_tokens;
             }
             MessageChunk::ContentBlockDelta(event) => {
                 if let super::types::ContentBlockDelta::TextDelta { text } = event.delta {
@@ -153,6 +155,7 @@ pub fn scan_anthropic_response_stream(
                 inner.finish_reason = body.delta.stop_reason.clone();
                 inner.output_tokens = Some(body.usage.output_tokens);
                 inner.total_tokens = Some(inner.prompt_tokens.unwrap_or(0) + body.usage.output_tokens);
+                inner.cached_input_tokens = body.usage.cache_read_input_tokens;
             }
             MessageChunk::MessageStop => (),
             MessageChunk::Error { error } => {
@@ -218,6 +221,7 @@ mod tests {
                 prompt_tokens: Some(321),
                 output_tokens: Some(158),
                 total_tokens: Some(479),
+                cached_input_tokens: Some(0),
             },
         };
 
```

engine/baml-runtime/src/internal/llm_client/primitive/anthropic/types.rs

Lines changed: 3 additions & 8 deletions
```diff
@@ -34,6 +34,8 @@ pub enum AnthropicMessageContent {
 pub struct AnthropicUsage {
     pub input_tokens: u64,
     pub output_tokens: u64,
+    pub cache_creation_input_tokens: Option<u64>,
+    pub cache_read_input_tokens: Option<u64>,
 }
 
 #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
@@ -200,7 +202,7 @@ pub struct MessageDeltaChunk {
     /// The result of this stream.
     pub delta: StreamStop,
     /// The billing and rate-limit usage of this stream.
-    pub usage: DeltaUsage,
+    pub usage: AnthropicUsage,
 }
 
 /// The text delta content block.
@@ -222,13 +224,6 @@ pub struct StreamStop {
     pub stop_sequence: Option<StopSequence>,
 }
 
-/// The delta usage of the stream.
-#[derive(Debug, Deserialize, Clone, PartialEq, Serialize)]
-pub struct DeltaUsage {
-    /// The number of output tokens which were used.
-    pub output_tokens: u64,
-}
-
 #[cfg(test)]
 mod tests {
     use anyhow::Result;
```
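Because both cache fields are `Option<u64>`, usage payloads that omit them (for example, requests without prompt caching) still deserialize cleanly. A small standalone sketch of that behavior, using an illustrative struct rather than the crate's own types and assuming `serde`/`serde_json` as dependencies:

```rust
// Illustrative sketch (not the crate's own types): the cache fields are Option<u64>,
// so a usage payload that omits them still deserializes, with both falling back to None.
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct Usage {
    input_tokens: u64,
    output_tokens: u64,
    cache_creation_input_tokens: Option<u64>,
    cache_read_input_tokens: Option<u64>,
}

fn main() -> Result<(), serde_json::Error> {
    // Cache hit: the provider reports how many input tokens were read from the cache.
    let with_cache: Usage = serde_json::from_str(
        r#"{"input_tokens": 12, "output_tokens": 40, "cache_read_input_tokens": 2048}"#,
    )?;
    assert_eq!(with_cache.cache_read_input_tokens, Some(2048));

    // No caching involved: the cache fields are simply absent and become None.
    let without_cache: Usage =
        serde_json::from_str(r#"{"input_tokens": 12, "output_tokens": 40}"#)?;
    assert_eq!(without_cache.cache_read_input_tokens, None);

    println!("{with_cache:?}\n{without_cache:?}");
    Ok(())
}
```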

engine/baml-runtime/src/internal/llm_client/primitive/aws/aws_client.rs

Lines changed: 4 additions & 0 deletions
```diff
@@ -899,6 +899,7 @@ impl WithStreamChat for AwsClient {
                     prompt_tokens: None,
                     output_tokens: None,
                     total_tokens: None,
+                    cached_input_tokens: None,
                 },
             }),
             response,
@@ -962,6 +963,8 @@ impl WithStreamChat for AwsClient {
                                 Some(usage.output_tokens() as u64);
                             new_state.metadata.total_tokens =
                                 Some((usage.total_tokens()) as u64);
+                            // AWS Bedrock does not currently support cached tokens
+                            new_state.metadata.cached_input_tokens = None;
                         }
                     }
                     _ => {
@@ -1303,6 +1306,7 @@ impl WithChat for AwsClient {
                         .usage
                         .as_ref()
                        .and_then(|i| i.total_tokens.try_into().ok()),
+                    cached_input_tokens: None, // AWS Bedrock does not currently support cached tokens
                 },
             }),
             Err(e) => LLMResponse::LLMFailure(LLMErrorResponse {
```

engine/baml-runtime/src/internal/llm_client/primitive/google/response_handler.rs

Lines changed: 4 additions & 0 deletions
```diff
@@ -97,6 +97,7 @@ pub fn parse_google_response<C: WithClient + RequestBuilder>(
             prompt_tokens: response.usage_metadata.prompt_token_count,
             output_tokens: response.usage_metadata.candidates_token_count,
             total_tokens: response.usage_metadata.total_token_count,
+            cached_input_tokens: response.usage_metadata.cached_content_token_count,
         },
     })
 }
@@ -171,6 +172,7 @@ pub fn scan_google_response_stream(
             inner.metadata.prompt_tokens = event.usage_metadata.prompt_token_count;
             inner.metadata.output_tokens = event.usage_metadata.candidates_token_count;
             inner.metadata.total_tokens = event.usage_metadata.total_token_count;
+            inner.metadata.cached_input_tokens = event.usage_metadata.cached_content_token_count;
 
             inner.latency = instant_now.elapsed();
             Ok(())
@@ -285,6 +287,7 @@ mod tests {
                 prompt_token_count: Some(166),
                 candidates_token_count: Some(39),
                 total_token_count: Some(205),
+                cached_content_token_count: None,
             },
         };
 
@@ -331,6 +334,7 @@ mod tests {
                 prompt_tokens: Some(166),
                 output_tokens: Some(39),
                 total_tokens: Some(205),
+                cached_input_tokens: None,
             },
         };
 
```

engine/baml-runtime/src/internal/llm_client/primitive/google/types.rs

Lines changed: 1 addition & 0 deletions
```diff
@@ -338,6 +338,7 @@ pub struct UsageMetaData {
     pub prompt_token_count: Option<u64>,
     pub candidates_token_count: Option<u64>,
     pub total_token_count: Option<u64>,
+    pub cached_content_token_count: Option<u64>,
 }
 
 #[cfg(test)]
```

engine/baml-runtime/src/internal/llm_client/primitive/openai/response_handler.rs

Lines changed: 34 additions & 0 deletions
```diff
@@ -92,6 +92,13 @@ pub fn parse_openai_response<C: WithClient + RequestBuilder>(
             prompt_tokens: usage.map(|u| u.prompt_tokens),
             output_tokens: usage.map(|u| u.completion_tokens),
             total_tokens: usage.map(|u| u.total_tokens),
+            cached_input_tokens: usage.and_then(|u| {
+                // Extract cached tokens from input_tokens_details if available
+                u.input_tokens_details
+                    .as_ref()
+                    .and_then(|details| details.get("cached_tokens"))
+                    .and_then(|cached| cached.as_u64())
+            }),
         },
     })
 }
@@ -143,6 +150,12 @@ pub fn scan_openai_chat_completion_stream(
                 inner.metadata.prompt_tokens = Some(usage.prompt_tokens);
                 inner.metadata.output_tokens = Some(usage.completion_tokens);
                 inner.metadata.total_tokens = Some(usage.total_tokens);
+                inner.metadata.cached_input_tokens =
+                    usage.input_tokens_details.as_ref().and_then(|details| {
+                        details
+                            .get("cached_tokens")
+                            .and_then(|cached| cached.as_u64())
+                    })
             }
 
             Ok(())
@@ -226,6 +239,7 @@ mod tests {
                 prompt_tokens: Some(128),
                 output_tokens: Some(71),
                 total_tokens: Some(199),
+                cached_input_tokens: Some(0),
             },
         };
 
@@ -322,6 +336,13 @@ pub fn parse_openai_responses_response<C: WithClient + RequestBuilder>(
             prompt_tokens: usage.map(|u| u.prompt_tokens),
             output_tokens: usage.map(|u| u.completion_tokens),
             total_tokens: usage.map(|u| u.total_tokens),
+            cached_input_tokens: usage.and_then(|u| {
+                // Extract cached tokens from input_tokens_details if available
+                u.input_tokens_details
+                    .as_ref()
+                    .and_then(|details| details.get("cached_tokens"))
+                    .and_then(|cached| cached.as_u64())
+            }),
         },
     })
 }
@@ -390,6 +411,12 @@ pub fn scan_openai_responses_stream(
                     inner.metadata.prompt_tokens = Some(usage.prompt_tokens);
                     inner.metadata.output_tokens = Some(usage.completion_tokens);
                     inner.metadata.total_tokens = Some(usage.total_tokens);
+                    inner.metadata.cached_input_tokens =
+                        usage.input_tokens_details.as_ref().and_then(|details| {
+                            details
+                                .get("cached_tokens")
+                                .and_then(|cached| cached.as_u64())
+                        })
                 }
             }
             ResponseFailed { response, .. } => {
@@ -441,6 +468,12 @@ pub fn scan_openai_responses_stream(
                     inner.metadata.prompt_tokens = Some(usage.prompt_tokens);
                     inner.metadata.output_tokens = Some(usage.completion_tokens);
                     inner.metadata.total_tokens = Some(usage.total_tokens);
+                    inner.metadata.cached_input_tokens =
+                        usage.input_tokens_details.as_ref().and_then(|details| {
+                            details
+                                .get("cached_tokens")
+                                .and_then(|cached| cached.as_u64())
+                        })
                 }
             }
             OutputTextDelta { delta, .. } => {
@@ -507,6 +540,7 @@ mod responses_tests {
                 prompt_tokens: Some(36),
                 output_tokens: Some(87),
                 total_tokens: Some(123),
+                cached_input_tokens: Some(0),
             },
         };
 
```
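The extraction pattern above keeps `input_tokens_details` as an untyped `serde_json::Value` and walks it with `Option` combinators, so a missing details object, a missing `cached_tokens` key, or a non-numeric value all fall through to `None`. A standalone sketch of just that lookup (hypothetical helper name, not part of the PR; assumes `serde_json` as a dependency):

```rust
// Standalone sketch of the lookup: the details object stays an untyped serde_json::Value,
// and the cached token count is pulled out with a chain of Option combinators so that any
// missing piece simply yields None.
use serde_json::{json, Value};

fn cached_tokens(input_tokens_details: Option<&Value>) -> Option<u64> {
    input_tokens_details
        .and_then(|details| details.get("cached_tokens"))
        .and_then(|cached| cached.as_u64())
}

fn main() {
    // Details object as reported alongside usage, e.g. {"cached_tokens": 1920, "audio_tokens": 0}.
    let details = json!({"cached_tokens": 1920, "audio_tokens": 0});
    assert_eq!(cached_tokens(Some(&details)), Some(1920));

    // No details object at all (older models or providers): the whole chain collapses to None.
    assert_eq!(cached_tokens(None), None);
}
```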

engine/baml-runtime/src/internal/llm_client/primitive/openai/types.rs

Lines changed: 2 additions & 0 deletions
```diff
@@ -255,7 +255,9 @@ pub struct CompletionUsage {
     /// Total number of tokens used in the request (prompt + completion).
     pub total_tokens: u64,
     /// Additional fields that may be present in responses API
+    #[serde(alias = "prompt_tokens_details")]
     pub input_tokens_details: Option<serde_json::Value>,
+    #[serde(alias = "completion_tokens_details")]
     pub output_tokens_details: Option<serde_json::Value>,
 }
 
```
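The aliases let a single `CompletionUsage` struct absorb both naming schemes: the Chat Completions API reports `prompt_tokens_details`/`completion_tokens_details`, while the Responses API uses `input_tokens_details`/`output_tokens_details`. A minimal sketch of the alias behavior with an illustrative struct (not the crate's actual type; assumes `serde`/`serde_json` as dependencies):

```rust
// Illustrative sketch of #[serde(alias = ...)]: either JSON key deserializes into the same
// field, so one struct covers both the Chat Completions and Responses payload shapes.
use serde::Deserialize;
use serde_json::Value;

#[derive(Debug, Deserialize)]
struct UsageDetails {
    #[serde(alias = "prompt_tokens_details")]
    input_tokens_details: Option<Value>,
}

fn main() -> Result<(), serde_json::Error> {
    // Responses API shape uses the field's real name.
    let responses: UsageDetails =
        serde_json::from_str(r#"{"input_tokens_details": {"cached_tokens": 64}}"#)?;
    // Chat Completions shape is accepted through the alias.
    let chat: UsageDetails =
        serde_json::from_str(r#"{"prompt_tokens_details": {"cached_tokens": 64}}"#)?;
    assert!(responses.input_tokens_details.is_some());
    assert!(chat.input_tokens_details.is_some());
    Ok(())
}
```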
