Skip to content

Commit d6bae3e

Browse files
committed
chore: fix edge cases
Signed-off-by: Elyas Mehtabuddin <[email protected]>
1 parent 377255e commit d6bae3e

File tree

1 file changed

+57
-13
lines changed

1 file changed

+57
-13
lines changed

lib/parsers/src/tool_calling/json/base_json_parser.rs

Lines changed: 57 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -73,27 +73,40 @@ fn handle_single_token_tool_calls(input: &str, start_token: &str) -> Option<Stri
7373
if s.is_empty() {
7474
continue;
7575
}
76-
// Only consider segments that start like JSON
76+
// Only consider segments that start like JSON (objects or arrays)
7777
if s.starts_with('{') {
78-
// Trim trailing non-JSON by cutting at the last closing brace/bracket
78+
// Trim trailing non-JSON by cutting at the last closing brace
7979
if let Some(pos) = s.rfind('}') {
8080
let candidate = &s[..=pos].trim();
8181
// Keep only valid JSON candidates
8282
if serde_json::from_str::<serde_json::Value>(candidate).is_ok() {
8383
items.push(candidate.to_string());
8484
}
8585
}
86+
} else if s.starts_with('[') {
87+
// Handle array format (like phi4: functools[{...}])
88+
if let Some(pos) = s.rfind(']') {
89+
let candidate = &s[..=pos].trim();
90+
// Keep only valid JSON arrays
91+
if serde_json::from_str::<serde_json::Value>(candidate).is_ok() {
92+
// For arrays, we need to extract the individual elements (each one is re-serialized on its own)
93+
if let Ok(serde_json::Value::Array(arr)) =
94+
serde_json::from_str::<serde_json::Value>(candidate)
95+
{
96+
for item in arr {
97+
if let Ok(item_str) = serde_json::to_string(&item) {
98+
items.push(item_str);
99+
}
100+
}
101+
}
102+
}
103+
}
86104
}
87105
}
88106
if items.is_empty() {
89-
// Remove everything up to and including the first occurrence of the start token
90-
if let Some(idx) = input.find(start_token) {
91-
let rest = &input[idx + start_token.len()..];
92-
return Some(rest.trim_start().to_string());
93-
} else {
94-
// Shouldn't happen because we checked contains() above, but be defensive
95-
return None;
96-
}
107+
// If we found the start token but no valid JSON after it, return empty string
108+
// to avoid leaking the invalid content (important for phi4 and similar models)
109+
return Some(String::new());
97110
}
98111
Some(format!("[{}]", items.join(",")))
99112
}
@@ -174,6 +187,7 @@ pub fn try_tool_call_parse_basic_json(
174187
// Assumption: a single message will not contain different tags for tool calls. We iterate over the tags so that the differing tags of multiple models are supported by default.
175188
let mut json = trimmed.to_string();
176189
let mut normal_text = trimmed.to_string();
190+
let mut found_start_token_with_no_valid_json = false;
177191

178192
// First, check if ANY start token exists in the input
179193
let has_start_token = tool_call_start_tokens
@@ -204,18 +218,32 @@ pub fn try_tool_call_parse_basic_json(
204218
// Single token case
205219
let result = handle_single_token_tool_calls(&json, start_token);
206220
if let Some(content) = result {
221+
// Check if we found a start token but got empty JSON back
222+
// This indicates the token was found but no valid JSON followed
223+
if content.is_empty() {
224+
found_start_token_with_no_valid_json = true;
225+
}
226+
207227
json = content;
208228
// For single token case, use the normal text we extracted earlier
209229
normal_text = new_normal_text;
230+
210231
break; // Found content, exit early
211232
}
212233
}
213234
(false, false) => {
214235
// Start and end token case
215236
let result = extract_tool_call_content(&json, start_token, end_token);
216237
if let Some(content) = result {
238+
// Check if we found a start token but got empty JSON back
239+
// This indicates the token was found but no valid JSON followed
240+
if content.is_empty() {
241+
found_start_token_with_no_valid_json = true;
242+
}
243+
217244
json = content;
218245
normal_text = new_normal_text;
246+
219247
break; // Found content, exit early
220248
}
221249
}
@@ -304,7 +332,13 @@ pub fn try_tool_call_parse_basic_json(
304332
return Ok((results, Some(normal_text)));
305333
}
306334

307-
Ok((vec![], Some(trimmed.to_string())))
335+
// If we found a start token but no valid JSON, return empty content
336+
// to avoid leaking the token and invalid JSON content
337+
if found_start_token_with_no_valid_json {
338+
Ok((vec![], Some(String::new())))
339+
} else {
340+
Ok((vec![], Some(trimmed.to_string())))
341+
}
308342
}
309343

310344
pub fn detect_tool_call_start_basic_json(chunk: &str, config: &JsonParserConfig) -> bool {
@@ -330,12 +364,22 @@ pub fn detect_tool_call_start_basic_json(chunk: &str, config: &JsonParserConfig)
330364
return false;
331365
}
332366
// Check if the chunk could be a prefix of this start token
333-
// We need to be careful to avoid false positives
334367
// Handle Unicode character boundaries properly
335368
for i in 1..=token.chars().count() {
336369
if let Some(prefix) = token.chars().take(i).collect::<String>().get(..) {
337370
let prefix_str = &prefix[..prefix.len()];
338-
if trimmed == prefix_str || trimmed.ends_with(prefix_str) {
371+
// Check for exact prefix match
372+
if trimmed == prefix_str {
373+
return true;
374+
}
375+
// For longer prefixes (3+ bytes; `len()` counts bytes, not chars), allow them anywhere in the input
376+
// This allows "funny joke" to match "functools" via "fun"
377+
// but prevents "<tool_call>" from matching "<TOOLCALL>" via single char "<"
378+
if prefix_str.len() >= 3 && trimmed.contains(prefix_str) {
379+
return true;
380+
}
381+
// For shorter prefixes, only match if they're at the end (streaming scenario)
382+
if prefix_str.len() < 3 && trimmed.ends_with(prefix_str) {
339383
return true;
340384
}
341385
}

0 commit comments

Comments
 (0)