diff --git a/src/codegen.rs b/src/codegen.rs index 61a661f..8fa04d3 100644 --- a/src/codegen.rs +++ b/src/codegen.rs @@ -9,7 +9,7 @@ use syn::{Expr, Ident, LitStr, spanned::Spanned}; /// Generate parsing code from tokens. /// -/// Returns `(code, anon_count)` or error for consecutive placeholders / missing args. +/// Returns `(code, anon_count)` or error for consecutive placeholders/missing args. fn generate_parsing_code( tokens: &[FormatToken], explicit_args: &[&Expr], @@ -87,157 +87,126 @@ fn generate_parsing_code( Ok((generated, anon_index)) } -/// Generate code for named placeholder with separator. -fn generate_named_placeholder_with_separator( - name: &str, +/// Generate code for placeholder with separator (named or anonymous). +fn generate_placeholder_with_separator( + assignment_stmt: &proc_macro2::TokenStream, + var_desc: &str, separator: &LitStr, ) -> proc_macro2::TokenStream { - let ident = Ident::new(name, Span::call_site()); - quote! { - if let Some(pos) = remaining.find(#separator) { + { + let pos = remaining.find(#separator).ok_or_else(|| { + std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!( + "Expected separator {:?} for {} not found in remaining input: {:?}", + #separator, + #var_desc, + remaining + ) + ) + })?; let slice = &remaining[..pos]; - match slice.parse() { - Ok(parsed) => { - #ident = parsed; - } - Err(error) => { - result = result.and(Err(std::io::Error::new( - std::io::ErrorKind::InvalidInput, - format!("Failed to parse variable '{}' from {:?}: {}", #name, slice, error) - ))); - } - } - remaining = &remaining[pos + #separator.len()..]; - } else { - result = result.and(Err(std::io::Error::new( - std::io::ErrorKind::InvalidInput, - format!( - "Expected separator {:?} for variable '{}' not found in remaining input: {:?}", - #separator, - #name, - remaining + let parsed = slice.parse().map_err(|error| { + std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!("Failed to parse {} from {:?}: {}", #var_desc, slice, error) ) - ))); + })?; + #assignment_stmt; + remaining = &remaining[pos + #separator.len()..]; } } } +/// Generate code for named placeholder with separator. +/// +/// Note: `assignment_stmt` contains the expression `#ident = parsed` WITHOUT a trailing +/// semicolon. The semicolon is added explicitly at the insertion point to form a complete +/// statement within the generated block. +fn generate_named_placeholder_with_separator( + name: &str, + separator: &LitStr, +) -> proc_macro2::TokenStream { + let ident = Ident::new(name, Span::call_site()); + let assignment_stmt = quote! { #ident = parsed }; // No trailing semicolon + let var_desc = format!("variable '{name}'"); + generate_placeholder_with_separator(&assignment_stmt, &var_desc, separator) +} + /// Generate code for anonymous placeholder with separator. +/// +/// Note: `assignment_stmt` contains the expression `*#arg_expr = parsed` WITHOUT a trailing +/// semicolon. The semicolon is added explicitly at the insertion point to form a complete +/// statement within the generated block. fn generate_anonymous_placeholder_with_separator( arg_expr: &Expr, placeholder_num: usize, separator: &LitStr, ) -> proc_macro2::TokenStream { - quote! { - if let Some(pos) = remaining.find(#separator) { - let slice = &remaining[..pos]; - match slice.parse() { - Ok(parsed) => { - *#arg_expr = parsed; - } - Err(error) => { - result = result.and(Err(std::io::Error::new( - std::io::ErrorKind::InvalidInput, - format!( - "Failed to parse anonymous placeholder #{} from {:?}: {}", - #placeholder_num, - slice, - error - ) - ))); - } - } - remaining = &remaining[pos + #separator.len()..]; - } else { - result = result.and(Err(std::io::Error::new( - std::io::ErrorKind::InvalidInput, - format!( - "Expected separator {:?} for anonymous placeholder #{} not found in remaining input: {:?}", - #separator, - #placeholder_num, - remaining - ) - ))); - } - } + let assignment_stmt = quote! { *#arg_expr = parsed }; // No trailing semicolon + let var_desc = format!("anonymous placeholder #{placeholder_num}"); + generate_placeholder_with_separator(&assignment_stmt, &var_desc, separator) } /// Generate code for fixed text matching at current position. fn generate_fixed_text_match(text: &LitStr) -> proc_macro2::TokenStream { quote! { - if let Some(pos) = remaining.find(#text) { - if pos == 0 { - remaining = &remaining[#text.len()..]; - } else { - result = result.and(Err(std::io::Error::new( + { + if !remaining.starts_with(#text) { + return Err(std::io::Error::new( std::io::ErrorKind::InvalidInput, format!( - "Expected text {:?} at current position, but found it at offset {}. \ - Remaining input: {:?}", + "Expected text {:?} at current position. Remaining input: {:?}", #text, - pos, remaining ) - ))); + )); } - } else { - result = result.and(Err(std::io::Error::new( - std::io::ErrorKind::InvalidInput, - format!( - "Required text separator {:?} not found. Remaining input: {:?}", - #text, - remaining - ) - ))); + remaining = &remaining[#text.len()..]; } } } -/// Generate code for final named placeholder (consumes rest of input). -fn generate_final_named_placeholder(name: &str) -> proc_macro2::TokenStream { - let ident = Ident::new(name, Span::call_site()); - +/// Generate code for final placeholder (consumes rest of input). +fn generate_final_placeholder( + assignment_stmt: &proc_macro2::TokenStream, + var_desc: &str, +) -> proc_macro2::TokenStream { quote! { - match remaining.parse() { - Ok(parsed) => { - #ident = parsed; - } - Err(error) => { - result = result.and(Err(std::io::Error::new( + { + let parsed = remaining.parse().map_err(|error| { + std::io::Error::new( std::io::ErrorKind::InvalidInput, - format!("Failed to parse variable '{}' from remaining input {:?}: {}", #name, remaining, error) - ))); - } + format!("Failed to parse {} from remaining input {:?}: {}", #var_desc, remaining, error) + ) + })?; + #assignment_stmt; + remaining = ""; } - remaining = ""; } } +/// Generate code for final named placeholder (consumes rest of input). +/// +/// Note: `assignment_stmt` contains the expression WITHOUT a trailing semicolon. +fn generate_final_named_placeholder(name: &str) -> proc_macro2::TokenStream { + let ident = Ident::new(name, Span::call_site()); + let assignment_stmt = quote! { #ident = parsed }; // No trailing semicolon + let var_desc = format!("variable '{name}'"); + generate_final_placeholder(&assignment_stmt, &var_desc) +} + /// Generate code for final anonymous placeholder (consumes rest of input). +/// +/// Note: `assignment_stmt` contains the expression WITHOUT a trailing semicolon. fn generate_final_anonymous_placeholder( arg_expr: &Expr, placeholder_num: usize, ) -> proc_macro2::TokenStream { - quote! { - match remaining.parse() { - Ok(parsed) => { - *#arg_expr = parsed; - } - Err(error) => { - result = result.and(Err(std::io::Error::new( - std::io::ErrorKind::InvalidInput, - format!( - "Failed to parse anonymous placeholder #{} from remaining input {:?}: {}", - #placeholder_num, - remaining, - error - ) - ))); - } - } - remaining = ""; - } + let assignment_stmt = quote! { *#arg_expr = parsed }; // No trailing semicolon + let var_desc = format!("anonymous placeholder #{placeholder_num}"); + generate_final_placeholder(&assignment_stmt, &var_desc) } /// Create error for missing anonymous placeholder argument. @@ -250,9 +219,8 @@ fn make_missing_argument_error( syn::Error::new( format_lit.span(), format!( - "{}anonymous placeholder '{{}}' at position {} has no corresponding argument. \ - Provide a mutable reference argument (e.g., &mut var) or use a named placeholder (e.g., '{{var}}')", - prefix, position + "{prefix}anonymous placeholder '{{}}' at position {position} has no corresponding argument. \ + Provide a mutable reference argument (e.g., &mut var) or use a named placeholder (e.g., '{{var}}')" ), ) .to_compile_error() @@ -295,9 +263,8 @@ pub fn generate_scanf_implementation( return Err(syn::Error::new( explicit_args[anon_index].span(), format!( - "Too many arguments: {} unused argument(s) provided. \ - The format string only has {} anonymous placeholder(s)", - unused_count, anon_index + "Too many arguments: {unused_count} unused argument(s) provided. \ + The format string only has {anon_index} anonymous placeholder(s)" ), ) .to_compile_error() diff --git a/src/lib.rs b/src/lib.rs index fcfaf24..365fac1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,13 +6,13 @@ //! # Architecture //! //! Compile-time: `tokenization` → `codegen` → expansion -//! Runtime: Generated code parses input with `.find()` and `.parse()` +//! Runtime: Generated code uses `.find()` and `.parse()` //! //! Modules: `constants`, `types`, `validation`, `parsing`, `tokenization`, `codegen` //! //! # Hygiene //! -//! Generated code uses isolated scopes `{ }` (block expressions) - no prefix pollution. +//! Generated code uses isolated scopes - no variable pollution. //! //! # Limitations //! @@ -23,7 +23,7 @@ //! //! # Security //! -//! **DoS limits:** 10K bytes format, 256 tokens, 128 char identifiers +//! **`DoS` limits:** 10K bytes format, 256 tokens, 128 char identifiers //! **Memory:** `#![forbid(unsafe_code)]`, `Box`, bounds-checked //! **Validation:** Rejects empty formats, keywords, invalid identifiers @@ -88,10 +88,11 @@ pub fn sscanf(input: TokenStream) -> TokenStream { // Scope isolation ensures macro hygiene let expanded = quote! {{ - let mut result: std::io::Result<()> = Ok(()); - let mut remaining = #input_expr; - #(#generated)* - result + (|| -> std::io::Result<()> { + let mut remaining = #input_expr; + #(#generated)* + Ok(()) + })() }}; TokenStream::from(expanded) @@ -132,18 +133,15 @@ pub fn scanf(input: TokenStream) -> TokenStream { // Scope isolation ensures macro hygiene let expanded = quote! {{ - let mut result: std::io::Result<()> = Ok(()); - let mut buffer = String::new(); - let _ = std::io::Write::flush(&mut std::io::stdout()); - match std::io::stdin().read_line(&mut buffer) { - Ok(_) => { - let input = buffer.trim_end_matches('\n').trim_end_matches('\r'); - let mut remaining: &str = input; - #(#generated)* - result - } - Err(e) => Err(e) - } + (|| -> std::io::Result<()> { + let mut buffer = String::new(); + let _ = std::io::Write::flush(&mut std::io::stdout()); + std::io::stdin().read_line(&mut buffer)?; + let input = buffer.trim_end_matches('\n').trim_end_matches('\r'); + let mut remaining: &str = input; + #(#generated)* + Ok(()) + })() }}; TokenStream::from(expanded) } diff --git a/src/parsing.rs b/src/parsing.rs index 9413860..477837c 100644 --- a/src/parsing.rs +++ b/src/parsing.rs @@ -27,7 +27,7 @@ impl Parse for SscanfArgs { Punctuated::parse_terminated(input)? }; - Ok(SscanfArgs { + Ok(Self { input: input_expr, format, args, diff --git a/src/tokenization.rs b/src/tokenization.rs index d967371..d119b14 100644 --- a/src/tokenization.rs +++ b/src/tokenization.rs @@ -9,9 +9,32 @@ use crate::validation::is_valid_identifier; use proc_macro::TokenStream; use syn::LitStr; +/// Push token with `MAX_TOKENS` validation. +#[inline] +fn push_token( + tokens: &mut Vec, + token: FormatToken, + format_lit: &LitStr, +) -> Result<(), TokenStream> { + if tokens.len() >= MAX_TOKENS { + return Err(syn::Error::new( + format_lit.span(), + format!( + "Too many tokens (would exceed {}). Maximum: {}. Prevents compile-time DoS.", + tokens.len() + 1, + MAX_TOKENS + ), + ) + .to_compile_error() + .into()); + } + tokens.push(token); + Ok(()) +} + /// Tokenize format string into text/placeholders. Handles `{{`/`}}` escapes. /// -/// Security: enforces MAX_FORMAT_STRING_LEN, MAX_TOKENS, MAX_IDENTIFIER_LEN limits. +/// Enforces `MAX_FORMAT_STRING_LEN`, `MAX_TOKENS`, `MAX_IDENTIFIER_LEN` limits. pub fn tokenize_format_string( format_str: &str, format_lit: &LitStr, @@ -34,25 +57,6 @@ pub fn tokenize_format_string( let mut chars = format_str.chars().peekable(); let mut current_text = String::with_capacity(TEXT_SEGMENT_CAPACITY); - let push_token = - |tokens: &mut Vec, token: FormatToken| -> Result<(), TokenStream> { - if tokens.len() >= MAX_TOKENS { - return Err(syn::Error::new( - format_lit.span(), - format!( - "Too many tokens in format string (would exceed {}). Maximum allowed: {}. \ - This limit prevents compile-time resource exhaustion.", - tokens.len() + 1, - MAX_TOKENS - ), - ) - .to_compile_error() - .into()); - } - tokens.push(token); - Ok(()) - }; - while let Some(ch) = chars.next() { match ch { '{' => { @@ -66,6 +70,7 @@ pub fn tokenize_format_string( push_token( &mut tokens, FormatToken::Text(std::mem::take(&mut current_text).into_boxed_str()), + format_lit, )?; current_text = String::with_capacity(TEXT_SEGMENT_CAPACITY); } @@ -80,9 +85,8 @@ pub fn tokenize_format_string( return Err(syn::Error::new( format_lit.span(), format!( - "Identifier in placeholder too long (>{} characters). \ - This limit prevents compile-time DoS attacks.", - MAX_IDENTIFIER_LEN + "Identifier in placeholder too long (>{MAX_IDENTIFIER_LEN} characters). \ + This limit prevents compile-time DoS attacks." ), ) .to_compile_error() @@ -96,21 +100,22 @@ pub fn tokenize_format_string( push_token( &mut tokens, FormatToken::Placeholder(Placeholder::Anonymous), + format_lit, )?; } else if is_valid_identifier(&content) { push_token( &mut tokens, FormatToken::Placeholder(Placeholder::Named(content.into_boxed_str())), + format_lit, )?; } else { return Err(syn::Error::new( format_lit.span(), format!( - "Invalid identifier '{}' in placeholder. \ + "Invalid identifier '{content}' in placeholder. \ Identifiers must start with a letter or underscore, \ contain only alphanumeric characters or underscores, \ - and not be Rust keywords. Use '{{}}' for anonymous placeholders.", - content + and not be Rust keywords. Use '{{}}' for anonymous placeholders." ), ) .to_compile_error() @@ -138,6 +143,7 @@ pub fn tokenize_format_string( push_token( &mut tokens, FormatToken::Text(current_text.into_boxed_str()), + format_lit, )?; } diff --git a/src/validation.rs b/src/validation.rs index 0bea883..21ff9d2 100644 --- a/src/validation.rs +++ b/src/validation.rs @@ -17,7 +17,7 @@ pub fn is_valid_identifier(s: &str) -> bool { } let mut chars = s.chars(); - let first = chars.next().unwrap(); // OK to unwrap: checked is_empty above + let first = chars.next().unwrap(); // Safe: is_empty checked above if !first.is_alphabetic() && first != '_' { return false; diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index ffaf616..fd71dfd 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -371,3 +371,67 @@ fn test_security_long_but_valid_identifier() { sscanf!(input, "{this_is_a_very_long_variable_name_but_still_valid}").unwrap(); assert_eq!(this_is_a_very_long_variable_name_but_still_valid, 42); } + +#[test] +fn test_variable_scope_no_collision_anonymous() { + // Test that macro internal variables don't collide with user variables when using anonymous placeholders + let input = "test"; + let mut value: String = String::new(); + let remaining: String = String::from("should not be modified"); + let buffer: String = String::from("should not be modified"); + + sscanf!(input, "{}", &mut value).unwrap(); + + assert_eq!(value, "test"); + assert_eq!(remaining, "should not be modified"); + assert_eq!(buffer, "should not be modified"); +} + +#[test] +fn test_only_literal_text_no_placeholders() { + // Test format with only literal text (no placeholders) + let input = "exact match"; + let result = sscanf!(input, "exact match"); + assert!(result.is_ok()); +} + +#[test] +fn test_only_literal_text_mismatch() { + // Test format with only literal text that doesn't match + let input = "wrong text"; + let result = sscanf!(input, "exact match"); + assert!(result.is_err()); +} + +#[test] +fn test_unicode_separator() { + // Test Unicode characters in separators + let input = "10→20"; + let mut a: i32 = 0; + let mut b: i32 = 0; + sscanf!(input, "{}→{}", &mut a, &mut b).unwrap(); + assert_eq!(a, 10); + assert_eq!(b, 20); +} + +#[test] +fn test_bool_parsing() { + // Test boolean parsing + let input = "true false"; + let mut a: bool = false; + let mut b: bool = true; + sscanf!(input, "{} {}", &mut a, &mut b).unwrap(); + assert!(a); + assert!(!b); +} + +#[test] +fn test_char_parsing() { + // Test single character parsing + let input = "a b"; + let mut first: char = ' '; + let mut second: char = ' '; + sscanf!(input, "{} {}", &mut first, &mut second).unwrap(); + assert_eq!(first, 'a'); + assert_eq!(second, 'b'); +}