From 31a1d5cbf97bea81418d79c9831d48702682a499 Mon Sep 17 00:00:00 2001 From: CPunisher <1343316114@qq.com> Date: Fri, 12 Sep 2025 11:11:08 +0800 Subject: [PATCH 1/5] wtf8 --- crates/hstr/Cargo.toml | 1 + crates/hstr/src/lib.rs | 1 + crates/hstr/src/wtf8/mod.rs | 1265 +++++++++++++++++++++++++ crates/hstr/src/wtf8/not_quite_std.rs | 250 +++++ 4 files changed, 1517 insertions(+) create mode 100644 crates/hstr/src/wtf8/mod.rs create mode 100644 crates/hstr/src/wtf8/not_quite_std.rs diff --git a/crates/hstr/Cargo.toml b/crates/hstr/Cargo.toml index 1c9e615d348d..bb178d06a4e3 100644 --- a/crates/hstr/Cargo.toml +++ b/crates/hstr/Cargo.toml @@ -18,6 +18,7 @@ rkyv = ["dep:rkyv"] serde = ["dep:serde"] [dependencies] +arrayvec = { workspace = true } hashbrown = { workspace = true } new_debug_unreachable = { workspace = true } once_cell = { workspace = true } diff --git a/crates/hstr/src/lib.rs b/crates/hstr/src/lib.rs index 6f2c842bcf33..39e370db1348 100644 --- a/crates/hstr/src/lib.rs +++ b/crates/hstr/src/lib.rs @@ -22,6 +22,7 @@ mod global_store; mod tagged_value; #[cfg(test)] mod tests; +pub mod wtf8; /// An immutable string which is cheap to clone, compare, hash, and has small /// size. diff --git a/crates/hstr/src/wtf8/mod.rs b/crates/hstr/src/wtf8/mod.rs new file mode 100644 index 000000000000..43374fdea1a8 --- /dev/null +++ b/crates/hstr/src/wtf8/mod.rs @@ -0,0 +1,1265 @@ +/*! + +Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/). + +This library uses Rust’s type system to maintain +[well-formedness](https://simonsapin.github.io/wtf-8/#well-formed), +like the `String` and `&str` types do for UTF-8. + +Since [WTF-8 must not be used +for interchange](https://simonsapin.github.io/wtf-8/#intended-audience), +this library deliberately does not provide access to the underlying bytes +of WTF-8 strings, +nor can it decode WTF-8 from arbitrary bytes. +WTF-8 strings can be obtained from UTF-8, UTF-16, or code points. 
+ +*/ + +extern crate alloc; + +use alloc::{borrow::Cow, string::String, vec::Vec}; +use core::{ + cmp::Ordering, + fmt, hash, + iter::{FromIterator, IntoIterator}, + mem::transmute, + ops::Deref, + slice, str, +}; +use std::iter::FusedIterator; + +use arrayvec::ArrayVec; + +mod not_quite_std; + +static UTF8_REPLACEMENT_CHARACTER: &[u8] = b"\xEF\xBF\xBD"; + +/// A Unicode code point: from U+0000 to U+10FFFF. +/// +/// Compare with the `char` type, +/// which represents a Unicode scalar value: +/// a code point that is not a surrogate (U+D800 to U+DFFF). +#[derive(Eq, PartialEq, Ord, PartialOrd, Clone)] +pub struct CodePoint { + value: u32, +} + +impl Copy for CodePoint {} + +/// Format the code point as `U+` followed by four to six hexadecimal digits. +/// Example: `U+1F4A9` +impl fmt::Debug for CodePoint { + #[inline] + fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { + write!(formatter, "U+{:04X}", self.value) + } +} + +impl CodePoint { + /// Unsafely create a new `CodePoint` without checking the value. + /// + /// Only use when `value` is known to be less than or equal to 0x10FFFF. + /// + /// # Safety + /// + /// `value` must be less than or equal to 0x10FFFF; the value is stored + /// as-is without any range check. + #[inline] + #[allow(clippy::missing_safety_doc)] + pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint { + CodePoint { value } + } + + /// Create a new `CodePoint` if the value is a valid code point. + /// + /// Return `None` if `value` is above 0x10FFFF. + #[inline] + pub fn from_u32(value: u32) -> Option { + match value { + 0..=0x10ffff => Some(CodePoint { value }), + _ => None, + } + } + + /// Create a new `CodePoint` from a `char`. + /// + /// Since all Unicode scalar values are code points, this always succeeds. + #[inline] + pub fn from_char(value: char) -> CodePoint { + CodePoint { + value: value as u32, + } + } + + /// Return the numeric value of the code point. + #[inline] + pub fn to_u32(&self) -> u32 { + self.value + } + + /// Optionally return a Unicode scalar value for the code point. 
+ /// + /// Return `None` if the code point is a surrogate (from U+D800 to U+DFFF). + #[inline] + pub fn to_char(&self) -> Option { + match self.value { + 0xd800..=0xdfff => None, + _ => Some(unsafe { char::from_u32_unchecked(self.value) }), + } + } + + /// Return a Unicode scalar value for the code point. + /// + /// Return `'\u{FFFD}'` (the replacement character “�”) + /// if the code point is a surrogate (from U+D800 to U+DFFF). + #[inline] + pub fn to_char_lossy(&self) -> char { + self.to_char().unwrap_or('\u{FFFD}') + } +} + +pub struct CharIter(ArrayVec); + +/// Ported from https://github.com/web-infra-dev/oxc/blob/99a4816ce7b6132b2667257984f9d92ae3768f03/crates/oxc_parser/src/lexer/mod.rs#L1349-L1374 +impl IntoIterator for CodePoint { + type IntoIter = CharIter; + type Item = char; + + #[allow(unsafe_code)] + fn into_iter(self) -> Self::IntoIter { + // // TODO: Check if this is correct + // fn to_char(v: u8) -> char { + // char::from_digit(v as _, 16).unwrap_or('0') + // } + + CharIter(match char::from_u32(self.value) { + Some(c) => { + let mut buf = ArrayVec::new(); + // Safety: we can make sure that `buf` has enough capacity + unsafe { + buf.push_unchecked(c); + } + buf + } + None => { + let mut buf = ArrayVec::new(); + + let high = self.value & 0xffff0000 >> 16; + + let low = self.value & 0x0000ffff; + + // The second code unit of a surrogate pair is always in the range from 0xDC00 + // to 0xDFFF, and is called a low surrogate or a trail surrogate. 
+ if !(0xdc00..=0xdfff).contains(&low) { + // Safety: we can make sure that `buf` has enough capacity + unsafe { + buf.push_unchecked('\\'); + buf.push_unchecked('u'); + for c in format!("{high:x}").chars() { + buf.push_unchecked(c); + } + buf.push_unchecked('\\'); + buf.push_unchecked('u'); + for c in format!("{low:x}").chars() { + buf.push_unchecked(c); + } + } + } else { + // `https://tc39.es/ecma262/#sec-utf16decodesurrogatepair` + let astral_code_point = (high - 0xd800) * 0x400 + low - 0xdc00 + 0x10000; + + // Safety: we can make sure that `buf` has enough capacity + unsafe { + buf.push_unchecked('\\'); + buf.push_unchecked('u'); + for c in format!("{astral_code_point:x}").chars() { + buf.push_unchecked(c); + } + } + } + + buf + } + }) + } +} + +impl Iterator for CharIter { + type Item = char; + + fn next(&mut self) -> Option { + if self.0.is_empty() { + None + } else { + Some(self.0.remove(0)) + } + } +} + +impl FusedIterator for CharIter {} + +/// An owned, growable string of well-formed WTF-8 data. +/// +/// Similar to `String`, but can additionally contain surrogate code points +/// if they’re not in a surrogate pair. +#[derive(Eq, PartialEq, Ord, PartialOrd, Clone)] +pub struct Wtf8Buf { + bytes: Vec, +} + +impl Deref for Wtf8Buf { + type Target = Wtf8; + + fn deref(&self) -> &Wtf8 { + unsafe { transmute(&*self.bytes) } + } +} + +/// Format the string with double quotes, +/// and surrogates as `\u` followed by four hexadecimal digits. +/// Example: `"a\u{D800}"` for a string with code points [U+0061, U+D800] +impl fmt::Debug for Wtf8Buf { + #[inline] + fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { + Wtf8::fmt(self, formatter) + } +} + +impl Wtf8Buf { + /// Create an new, empty WTF-8 string. + #[inline] + #[allow(clippy::new_without_default)] + pub fn new() -> Wtf8Buf { + Wtf8Buf { bytes: Vec::new() } + } + + /// Create an new, empty WTF-8 string with pre-allocated capacity for `n` + /// bytes. 
+ #[inline] + pub fn with_capacity(n: usize) -> Wtf8Buf { + Wtf8Buf { + bytes: Vec::with_capacity(n), + } + } + + /// Create a WTF-8 string from a UTF-8 `String`. + /// + /// This takes ownership of the `String` and does not copy. + /// + /// Since WTF-8 is a superset of UTF-8, this always succeeds. + #[inline] + pub fn froming(string: String) -> Wtf8Buf { + Wtf8Buf { + bytes: string.into_bytes(), + } + } + + /// Create a WTF-8 string from a potentially ill-formed UTF-16 slice of + /// 16-bit code units. + /// + /// This is lossless: calling `.to_ill_formed_utf16()` on the resulting + /// string will always return the original code units. + pub fn from_ill_formed_utf16(v: &[u16]) -> Wtf8Buf { + let mut string = Wtf8Buf::with_capacity(v.len()); + for item in not_quite_std::decode_utf16(v.iter().cloned()) { + match item { + Ok(c) => string.push_char(c), + Err(s) => { + // Surrogates are known to be in the code point range. + let code_point = unsafe { CodePoint::from_u32_unchecked(s as u32) }; + // Skip the WTF-8 concatenation check, + // surrogate pairs are already decoded by `decode_utf16` + not_quite_std::push_code_point(&mut string, code_point) + } + } + } + string + } + + /// Reserves capacity for at least `additional` more bytes to be inserted + /// in the given `Wtf8Buf`. + /// The collection may reserve more space to avoid frequent reallocations. + /// + /// # Panics + /// + /// Panics if the new capacity overflows `usize`. + /// + /// # Example + /// + /// ``` + /// use hstr::wtf8::Wtf8Buf; + /// + /// let mut s = Wtf8Buf::new(); + /// s.reserve(10); + /// assert!(s.capacity() >= 10); + /// ``` + #[inline] + pub fn reserve(&mut self, additional: usize) { + self.bytes.reserve(additional) + } + + /// Returns the number of bytes that this string buffer can hold without + /// reallocating. 
+ /// + /// # Example + /// + /// ``` + /// use hstr::wtf8::Wtf8Buf; + /// + /// let s = Wtf8Buf::with_capacity(10); + /// assert!(s.capacity() >= 10); + /// ``` + #[inline] + pub fn capacity(&self) -> usize { + self.bytes.capacity() + } + + /// Append a UTF-8 slice at the end of the string. + #[inline] + pub fn push_str(&mut self, other: &str) { + self.bytes.extend_from_slice(other.as_bytes()) + } + + /// Append a WTF-8 slice at the end of the string. + /// + /// This replaces newly paired surrogates at the boundary + /// with a supplementary code point, + /// like concatenating ill-formed UTF-16 strings effectively would. + #[inline] + pub fn push_wtf8(&mut self, other: &Wtf8) { + match ( + (*self).final_lead_surrogate(), + other.initial_trail_surrogate(), + ) { + // Replace newly paired surrogates by a supplementary code point. + (Some(lead), Some(trail)) => { + // Both surrogates are dropped as 3-byte sequences before the + // decoded supplementary code point is appended. + let len_without_lead_surrogate = self.len() - 3; + self.bytes.truncate(len_without_lead_surrogate); + let other_without_trail_surrogate = &other.bytes[3..]; + // 4 bytes for the supplementary code point + self.bytes.reserve(4 + other_without_trail_surrogate.len()); + self.push_char(decode_surrogate_pair(lead, trail)); + self.bytes.extend_from_slice(other_without_trail_surrogate); + } + _ => self.bytes.extend_from_slice(&other.bytes), + } + } + + /// Append a Unicode scalar value at the end of the string. + #[inline] + pub fn push_char(&mut self, c: char) { + not_quite_std::push_code_point(self, CodePoint::from_char(c)) + } + + /// Append a code point at the end of the string. + /// + /// This replaces newly paired surrogates at the boundary + /// with a supplementary code point, + /// like concatenating ill-formed UTF-16 strings effectively would. 
+ #[inline] + pub fn push(&mut self, code_point: CodePoint) { + if let trail @ 0xdc00..=0xdfff = code_point.to_u32() { + if let Some(lead) = (*self).final_lead_surrogate() { + let len_without_lead_surrogate = self.len() - 3; + self.bytes.truncate(len_without_lead_surrogate); + self.push_char(decode_surrogate_pair(lead, trail as u16)); + return; + } + } + + // No newly paired surrogates at the boundary. + not_quite_std::push_code_point(self, code_point) + } + + /// Shortens a string to the specified length. + /// + /// # Failure + /// + /// Fails if `new_len` > current length, + /// or if `new_len` is not a code point boundary. + #[inline] + pub fn truncate(&mut self, new_len: usize) { + assert!(not_quite_std::is_code_point_boundary(self, new_len)); + self.bytes.truncate(new_len) + } + + /// Consume the WTF-8 string and try to convert it to UTF-8. + /// + /// This does not copy the data. + /// + /// If the contents are not well-formed UTF-8 + /// (that is, if the string contains surrogates), + /// the original WTF-8 string is returned instead. + pub fn into_string(self) -> Result { + match self.next_surrogate(0) { + None => Ok(unsafe { String::from_utf8_unchecked(self.bytes) }), + Some(_) => Err(self), + } + } + + /// Consume the WTF-8 string and convert it lossily to UTF-8. + /// + /// This does not copy the data (but may overwrite parts of it in place). + /// + /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character + /// “�”) + pub fn into_string_lossy(mut self) -> String { + let mut pos = 0; + loop { + match self.next_surrogate(pos) { + Some((surrogate_pos, _)) => { + pos = surrogate_pos + 3; + self.bytes[surrogate_pos..pos].copy_from_slice(UTF8_REPLACEMENT_CHARACTER); + } + None => return unsafe { String::from_utf8_unchecked(self.bytes) }, + } + } + } +} + +impl From<&str> for Wtf8Buf { + /// Create a WTF-8 string from an UTF-8 `&str` slice. + /// + /// This copies the content of the slice. 
+ /// + /// Since WTF-8 is a superset of UTF-8, this always succeeds. + #[inline] + fn from(s: &str) -> Wtf8Buf { + Wtf8Buf { + bytes: s.as_bytes().to_vec(), + } + } +} + +impl Wtf8Buf { + pub const fn as_bytes(&self) -> &[u8] { + self.bytes.as_slice() + } +} + +/// Create a new WTF-8 string from an iterator of code points. +/// +/// This replaces surrogate code point pairs with supplementary code points, +/// like concatenating ill-formed UTF-16 strings effectively would. +impl FromIterator for Wtf8Buf { + fn from_iter>(iterable: T) -> Wtf8Buf { + let mut string = Wtf8Buf::new(); + string.extend(iterable); + string + } +} + +/// Append code points from an iterator to the string. +/// +/// This replaces surrogate code point pairs with supplementary code points, +/// like concatenating ill-formed UTF-16 strings effectively would. +impl Extend for Wtf8Buf { + fn extend>(&mut self, iterable: T) { + let iterator = iterable.into_iter(); + let (low, _high) = iterator.size_hint(); + // Lower bound of one byte per code point (ASCII only) + self.bytes.reserve(low); + for code_point in iterator { + self.push(code_point); + } + } +} + +/// A borrowed slice of well-formed WTF-8 data. +/// +/// Similar to `&str`, but can additionally contain surrogate code points +/// if they’re not in a surrogate pair. 
+pub struct Wtf8 { + bytes: [u8], +} + +// FIXME: https://github.com/rust-lang/rust/issues/18805 +impl PartialEq for Wtf8 { + fn eq(&self, other: &Wtf8) -> bool { + self.bytes.eq(&other.bytes) + } +} + +// FIXME: https://github.com/rust-lang/rust/issues/18805 +impl Eq for Wtf8 {} + +// FIXME: https://github.com/rust-lang/rust/issues/18738 +#[allow(clippy::non_canonical_partial_ord_impl)] +impl PartialOrd for Wtf8 { + #[inline] + fn partial_cmp(&self, other: &Wtf8) -> Option { + self.bytes.partial_cmp(&other.bytes) + } + + #[inline] + fn lt(&self, other: &Wtf8) -> bool { + self.bytes.lt(&other.bytes) + } + + #[inline] + fn le(&self, other: &Wtf8) -> bool { + self.bytes.le(&other.bytes) + } + + #[inline] + fn gt(&self, other: &Wtf8) -> bool { + self.bytes.gt(&other.bytes) + } + + #[inline] + fn ge(&self, other: &Wtf8) -> bool { + self.bytes.ge(&other.bytes) + } +} + +// FIXME: https://github.com/rust-lang/rust/issues/18738 +impl Ord for Wtf8 { + #[inline] + fn cmp(&self, other: &Wtf8) -> Ordering { + self.bytes.cmp(&other.bytes) + } +} + +/// Format the slice with double quotes, +/// and surrogates as `\u` followed by four hexadecimal digits. +/// Example: `"a\u{D800}"` for a slice with code points [U+0061, U+D800] +impl fmt::Debug for Wtf8 { + fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { + formatter.write_str("\"")?; + let mut pos = 0; + loop { + match self.next_surrogate(pos) { + None => break, + Some((surrogate_pos, surrogate)) => { + formatter.write_str(unsafe { + str::from_utf8_unchecked(&self.bytes[pos..surrogate_pos]) + })?; + write!(formatter, "\\u{{{surrogate:X}}}")?; + pos = surrogate_pos + 3; + } + } + } + formatter.write_str(unsafe { str::from_utf8_unchecked(&self.bytes[pos..]) })?; + formatter.write_str("\"") + } +} + +impl Wtf8 { + /// Create a WTF-8 slice from a UTF-8 `&str` slice. + /// + /// Since WTF-8 is a superset of UTF-8, this always succeeds. 
+ #[inline] + pub fn from(value: &str) -> &Wtf8 { + unsafe { transmute(value.as_bytes()) } + } + + /// Return the length, in WTF-8 bytes. + #[inline] + pub fn len(&self) -> usize { + self.bytes.len() + } + + #[inline] + #[must_use] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Return a slice of the given string for the byte range [`begin`..`end`). + /// + /// # Failure + /// + /// Fails when `begin` and `end` do not point to code point boundaries, + /// or point beyond the end of the string. + #[inline] + pub fn slice(&self, begin: usize, end: usize) -> &Wtf8 { + // is_code_point_boundary checks that the index is in [0, .len()] + if begin <= end + && not_quite_std::is_code_point_boundary(self, begin) + && not_quite_std::is_code_point_boundary(self, end) + { + unsafe { not_quite_std::slice_unchecked(self, begin, end) } + } else { + not_quite_std::slice_error_fail(self, begin, end) + } + } + + /// Return a slice of the given string from byte `begin` to its end. + /// + /// # Failure + /// + /// Fails when `begin` is not at a code point boundary, + /// or is beyond the end of the string. + #[inline] + pub fn slice_from(&self, begin: usize) -> &Wtf8 { + // is_code_point_boundary checks that the index is in [0, .len()] + if not_quite_std::is_code_point_boundary(self, begin) { + unsafe { not_quite_std::slice_unchecked(self, begin, self.len()) } + } else { + not_quite_std::slice_error_fail(self, begin, self.len()) + } + } + + /// Return a slice of the given string from its beginning to byte `end`. + /// + /// # Failure + /// + /// Fails when `end` is not at a code point boundary, + /// or is beyond the end of the string. 
+ #[inline] + pub fn slice_to(&self, end: usize) -> &Wtf8 { + // is_code_point_boundary checks that the index is in [0, .len()] + if not_quite_std::is_code_point_boundary(self, end) { + unsafe { not_quite_std::slice_unchecked(self, 0, end) } + } else { + not_quite_std::slice_error_fail(self, 0, end) + } + } + + /// Return the byte at `position` if it is in the ASCII range, + /// or `b'\xFF'` otherwise. + /// + /// # Failure + /// + /// Fails if `position` is not less than the length of the string. + #[inline] + pub fn ascii_byte_at(&self, position: usize) -> u8 { + match self.bytes[position] { + ascii_byte @ 0x00..=0x7f => ascii_byte, + _ => 0xff, + } + } + + /// Return an iterator for the string’s code points. + #[inline] + pub fn code_points(&self) -> Wtf8CodePoints { + Wtf8CodePoints { + bytes: self.bytes.iter(), + } + } + + /// Try to convert the string to UTF-8 and return a `&str` slice. + /// + /// Return `None` if the string contains surrogates. + /// + /// This does not copy the data. + #[inline] + pub fn as_str(&self) -> Option<&str> { + // Well-formed WTF-8 is also well-formed UTF-8 + // if and only if it contains no surrogate. + match self.next_surrogate(0) { + None => Some(unsafe { str::from_utf8_unchecked(&self.bytes) }), + Some(_) => None, + } + } + + /// Lossily convert the string to UTF-8. + /// Return a UTF-8 `&str` slice if the contents are well-formed in UTF-8. + /// + /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character + /// “�”). + /// + /// This only copies the data if necessary (if it contains any surrogate). 
+ pub fn to_string_lossy(&self) -> Cow { + let surrogate_pos = match self.next_surrogate(0) { + None => return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) }), + Some((pos, _)) => pos, + }; + let wtf8_bytes = &self.bytes; + let mut utf8_bytes = Vec::with_capacity(self.len()); + utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]); + utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER); + let mut pos = surrogate_pos + 3; + loop { + match self.next_surrogate(pos) { + Some((surrogate_pos, _)) => { + utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]); + utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER); + pos = surrogate_pos + 3; + } + None => { + utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]); + return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) }); + } + } + } + } + + /// Convert the WTF-8 string to potentially ill-formed UTF-16 + /// and return an iterator of 16-bit code units. + /// + /// This is lossless: + /// calling `Wtf8Buf::from_ill_formed_utf16` on the resulting code units + /// would always return the original WTF-8 string. 
+ #[inline] + pub fn to_ill_formed_utf16(&self) -> IllFormedUtf16CodeUnits { + IllFormedUtf16CodeUnits { + code_points: self.code_points(), + extra: 0, + } + } + + #[inline] + fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> { + let mut iter = self.bytes[pos..].iter(); + loop { + let b = match iter.next() { + None => return None, + Some(&b) => b, + }; + if b < 0x80 { + pos += 1; + } else if b < 0xe0 { + iter.next(); + pos += 2; + } else if b == 0xed { + match (iter.next(), iter.next()) { + (Some(&b2), Some(&b3)) if b2 >= 0xa0 => { + return Some((pos, decode_surrogate(b2, b3))) + } + _ => pos += 3, + } + } else if b < 0xf0 { + iter.next(); + iter.next(); + pos += 3; + } else { + iter.next(); + iter.next(); + iter.next(); + pos += 4; + } + } + } + + #[inline] + fn final_lead_surrogate(&self) -> Option { + let len = self.len(); + if len < 3 { + return None; + } + let seq = &self.bytes[len - 3..]; + if seq[0] == 0xed && 0xa0 <= seq[1] && seq[1] <= 0xaf { + Some(decode_surrogate(seq[1], seq[2])) + } else { + None + } + } + + #[inline] + fn initial_trail_surrogate(&self) -> Option { + let len = self.len(); + if len < 3 { + return None; + } + let seq = &self.bytes[..3]; + if seq[0] == 0xed && 0xb0 <= seq[1] && seq[1] <= 0xbf { + Some(decode_surrogate(seq[1], seq[2])) + } else { + None + } + } +} + +#[inline] +fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 { + // The first byte is assumed to be 0xED + 0xd800 | (second_byte as u16 & 0x3f) << 6 | third_byte as u16 & 0x3f +} + +#[inline] +fn decode_surrogate_pair(lead: u16, trail: u16) -> char { + let code_point = 0x10000 + (((lead as u32 - 0xd800) << 10) | (trail as u32 - 0xdc00)); + unsafe { char::from_u32_unchecked(code_point) } +} + +/// Iterator for the code points of a WTF-8 string. +/// +/// Created with the method `.code_points()`. 
+#[derive(Clone)] +pub struct Wtf8CodePoints<'a> { + bytes: slice::Iter<'a, u8>, +} + +impl<'a> Iterator for Wtf8CodePoints<'a> { + type Item = CodePoint; + + #[inline] + fn next(&mut self) -> Option { + not_quite_std::next_code_point(&mut self.bytes) + .map(|value| unsafe { CodePoint::from_u32_unchecked(value) }) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let (len, _) = self.bytes.size_hint(); + (len.saturating_add(3) / 4, Some(len)) + } +} + +#[derive(Clone)] +pub struct IllFormedUtf16CodeUnits<'a> { + code_points: Wtf8CodePoints<'a>, + extra: u16, +} + +impl<'a> Iterator for IllFormedUtf16CodeUnits<'a> { + type Item = u16; + + #[inline] + fn next(&mut self) -> Option { + not_quite_std::next_utf16_code_unit(self) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let (low, high) = self.code_points.size_hint(); + // every code point gets either one u16 or two u16, + // so this iterator is between 1 or 2 times as + // long as the underlying iterator. + (low, high.and_then(|n| n.checked_mul(2))) + } +} + +impl PartialEq<&'_ Wtf8> for Wtf8Buf { + fn eq(&self, other: &&Wtf8) -> bool { + **self == **other + } +} + +impl PartialEq for &'_ Wtf8 { + fn eq(&self, other: &Wtf8Buf) -> bool { + **self == **other + } +} + +impl hash::Hash for CodePoint { + #[inline] + fn hash(&self, state: &mut H) { + self.value.hash(state) + } +} + +impl hash::Hash for Wtf8Buf { + #[inline] + fn hash(&self, state: &mut H) { + Wtf8::hash(self, state) + } +} + +impl hash::Hash for Wtf8 { + #[inline] + fn hash(&self, state: &mut H) { + state.write(&self.bytes); + 0xfeu8.hash(state) + } +} + +#[cfg(test)] +mod tests { + use alloc::{format, vec}; + use core::mem::transmute; + + use super::*; + + #[test] + fn code_point_from_u32() { + assert!(CodePoint::from_u32(0).is_some()); + assert!(CodePoint::from_u32(0xd800).is_some()); + assert!(CodePoint::from_u32(0x10ffff).is_some()); + assert!(CodePoint::from_u32(0x110000).is_none()); + } + + #[test] + fn 
code_point_to_u32() { + fn c(value: u32) -> CodePoint { + CodePoint::from_u32(value).unwrap() + } + assert_eq!(c(0).to_u32(), 0); + assert_eq!(c(0xd800).to_u32(), 0xd800); + assert_eq!(c(0x10ffff).to_u32(), 0x10ffff); + } + + #[test] + fn code_point_from_char() { + assert_eq!(CodePoint::from_char('a').to_u32(), 0x61); + assert_eq!(CodePoint::from_char('💩').to_u32(), 0x1f4a9); + } + + #[test] + fn code_point_to_string() { + assert_eq!(format!("{:?}", CodePoint::from_char('a')), "U+0061"); + assert_eq!(format!("{:?}", CodePoint::from_char('💩')), "U+1F4A9"); + } + + #[test] + fn code_point_to_char() { + fn c(value: u32) -> CodePoint { + CodePoint::from_u32(value).unwrap() + } + assert_eq!(c(0x61).to_char(), Some('a')); + assert_eq!(c(0x1f4a9).to_char(), Some('💩')); + assert_eq!(c(0xd800).to_char(), None); + } + + #[test] + fn code_point_to_char_lossy() { + fn c(value: u32) -> CodePoint { + CodePoint::from_u32(value).unwrap() + } + assert_eq!(c(0x61).to_char_lossy(), 'a'); + assert_eq!(c(0x1f4a9).to_char_lossy(), '💩'); + assert_eq!(c(0xd800).to_char_lossy(), '\u{FFFD}'); + } + + #[test] + fn wtf8buf_new() { + assert_eq!(Wtf8Buf::new().bytes, b""); + } + + #[test] + fn wtf8buf_from() { + assert_eq!(Wtf8Buf::from("").bytes, b""); + assert_eq!(Wtf8Buf::from("aé 💩").bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + } + + #[test] + fn wtf8buf_froming() { + assert_eq!(Wtf8Buf::froming(String::from("")).bytes, b""); + assert_eq!( + Wtf8Buf::froming(String::from("aé 💩")).bytes, + b"a\xC3\xA9 \xF0\x9F\x92\xA9" + ); + } + + #[test] + fn wtf8buf_from_ill_formed_utf16() { + assert_eq!(Wtf8Buf::from_ill_formed_utf16(&[]).bytes, b""); + assert_eq!( + Wtf8Buf::from_ill_formed_utf16(&[0x61, 0xe9, 0x20, 0xd83d, 0xd83d, 0xdca9]).bytes, + b"a\xC3\xA9 \xED\xA0\xBD\xF0\x9F\x92\xA9" + ); + } + + #[test] + fn wtf8buf_push_str() { + let mut string = Wtf8Buf::new(); + assert_eq!(string.bytes, b""); + string.push_str("aé 💩"); + assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + } + + #[test] + 
fn wtf8buf_push_char() { + let mut string = Wtf8Buf::from("aé "); + assert_eq!(string.bytes, b"a\xC3\xA9 "); + string.push_char('💩'); + assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + } + + #[test] + fn wtf8buf_push() { + let mut string = Wtf8Buf::from("aé "); + assert_eq!(string.bytes, b"a\xC3\xA9 "); + string.push(CodePoint::from_char('💩')); + assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + + fn c(value: u32) -> CodePoint { + CodePoint::from_u32(value).unwrap() + } + + let mut string = Wtf8Buf::new(); + string.push(c(0xd83d)); // lead + string.push(c(0xdca9)); // trail + assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9"); // Magic! + + let mut string = Wtf8Buf::new(); + string.push(c(0xd83d)); // lead + string.push(c(0x20)); // not surrogate + string.push(c(0xdca9)); // trail + assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9"); + + let mut string = Wtf8Buf::new(); + string.push(c(0xd800)); // lead + string.push(c(0xdbff)); // lead + assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF"); + + let mut string = Wtf8Buf::new(); + string.push(c(0xd800)); // lead + string.push(c(0xe000)); // not surrogate + assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80"); + + let mut string = Wtf8Buf::new(); + string.push(c(0xd7ff)); // not surrogate + string.push(c(0xdc00)); // trail + assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80"); + + let mut string = Wtf8Buf::new(); + string.push(c(0x61)); // not surrogate, < 3 bytes + string.push(c(0xdc00)); // trail + assert_eq!(string.bytes, b"\x61\xED\xB0\x80"); + + let mut string = Wtf8Buf::new(); + string.push(c(0xdc00)); // trail + assert_eq!(string.bytes, b"\xED\xB0\x80"); + } + + #[test] + fn wtf8buf_push_wtf8() { + let mut string = Wtf8Buf::from("aé"); + assert_eq!(string.bytes, b"a\xC3\xA9"); + string.push_wtf8(Wtf8::from(" 💩")); + assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + + fn w(value: &[u8]) -> &Wtf8 { + unsafe { transmute(value) } + } + + let mut string = Wtf8Buf::new(); + 
string.push_wtf8(w(b"\xED\xA0\xBD")); // lead + string.push_wtf8(w(b"\xED\xB2\xA9")); // trail + assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9"); // Magic! + + let mut string = Wtf8Buf::new(); + string.push_wtf8(w(b"\xED\xA0\xBD")); // lead + string.push_wtf8(w(b" ")); // not surrogate + string.push_wtf8(w(b"\xED\xB2\xA9")); // trail + assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9"); + + let mut string = Wtf8Buf::new(); + string.push_wtf8(w(b"\xED\xA0\x80")); // lead + string.push_wtf8(w(b"\xED\xAF\xBF")); // lead + assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF"); + + let mut string = Wtf8Buf::new(); + string.push_wtf8(w(b"\xED\xA0\x80")); // lead + string.push_wtf8(w(b"\xEE\x80\x80")); // not surrogate + assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80"); + + let mut string = Wtf8Buf::new(); + string.push_wtf8(w(b"\xED\x9F\xBF")); // not surrogate + string.push_wtf8(w(b"\xED\xB0\x80")); // trail + assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80"); + + let mut string = Wtf8Buf::new(); + string.push_wtf8(w(b"a")); // not surrogate, < 3 bytes + string.push_wtf8(w(b"\xED\xB0\x80")); // trail + assert_eq!(string.bytes, b"\x61\xED\xB0\x80"); + + let mut string = Wtf8Buf::new(); + string.push_wtf8(w(b"\xED\xB0\x80")); // trail + assert_eq!(string.bytes, b"\xED\xB0\x80"); + } + + #[test] + fn wtf8buf_truncate() { + let mut string = Wtf8Buf::from("aé"); + string.truncate(1); + assert_eq!(string.bytes, b"a"); + } + + #[test] + #[should_panic] + fn wtf8buf_truncate_fail_code_point_boundary() { + let mut string = Wtf8Buf::from("aé"); + string.truncate(2); + } + + #[test] + #[should_panic] + fn wtf8buf_truncate_fail_longer() { + let mut string = Wtf8Buf::from("aé"); + string.truncate(4); + } + + #[test] + fn wtf8buf_into_string() { + let mut string = Wtf8Buf::from("aé 💩"); + assert_eq!(string.clone().into_string(), Ok(String::from("aé 💩"))); + string.push(CodePoint::from_u32(0xd800).unwrap()); + assert_eq!(string.clone().into_string(), Err(string)); + } + + 
#[test] + fn wtf8buf_into_string_lossy() { + let mut string = Wtf8Buf::from("aé 💩"); + assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩")); + string.push(CodePoint::from_u32(0xd800).unwrap()); + assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩�")); + } + + #[test] + fn wtf8buf_from_iterator() { + fn f(values: &[u32]) -> Wtf8Buf { + values + .iter() + .map(|&c| CodePoint::from_u32(c).unwrap()) + .collect::() + } + assert_eq!( + f(&[0x61, 0xe9, 0x20, 0x1f4a9]).bytes, + b"a\xC3\xA9 \xF0\x9F\x92\xA9" + ); + + assert_eq!(f(&[0xd83d, 0xdca9]).bytes, b"\xF0\x9F\x92\xA9"); // Magic! + assert_eq!( + f(&[0xd83d, 0x20, 0xdca9]).bytes, + b"\xED\xA0\xBD \xED\xB2\xA9" + ); + assert_eq!(f(&[0xd800, 0xdbff]).bytes, b"\xED\xA0\x80\xED\xAF\xBF"); + assert_eq!(f(&[0xd800, 0xe000]).bytes, b"\xED\xA0\x80\xEE\x80\x80"); + assert_eq!(f(&[0xd7ff, 0xdc00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80"); + assert_eq!(f(&[0x61, 0xdc00]).bytes, b"\x61\xED\xB0\x80"); + assert_eq!(f(&[0xdc00]).bytes, b"\xED\xB0\x80"); + } + + #[test] + fn wtf8buf_extend() { + fn e(initial: &[u32], extended: &[u32]) -> Wtf8Buf { + fn c(value: &u32) -> CodePoint { + CodePoint::from_u32(*value).unwrap() + } + let mut string = initial.iter().map(c).collect::(); + string.extend(extended.iter().map(c)); + string + } + + assert_eq!( + e(&[0x61, 0xe9], &[0x20, 0x1f4a9]).bytes, + b"a\xC3\xA9 \xF0\x9F\x92\xA9" + ); + + assert_eq!(e(&[0xd83d], &[0xdca9]).bytes, b"\xF0\x9F\x92\xA9"); // Magic! 
+ assert_eq!( + e(&[0xd83d, 0x20], &[0xdca9]).bytes, + b"\xED\xA0\xBD \xED\xB2\xA9" + ); + assert_eq!(e(&[0xd800], &[0xdbff]).bytes, b"\xED\xA0\x80\xED\xAF\xBF"); + assert_eq!(e(&[0xd800], &[0xe000]).bytes, b"\xED\xA0\x80\xEE\x80\x80"); + assert_eq!(e(&[0xd7ff], &[0xdc00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80"); + assert_eq!(e(&[0x61], &[0xdc00]).bytes, b"\x61\xED\xB0\x80"); + assert_eq!(e(&[], &[0xdc00]).bytes, b"\xED\xB0\x80"); + } + + #[test] + fn wtf8buf_debug() { + let mut string = Wtf8Buf::from("aé 💩"); + string.push(CodePoint::from_u32(0xd800).unwrap()); + assert_eq!(format!("{string:?}"), r#""aé 💩\u{D800}""#); + } + + #[test] + fn wtf8buf_as_slice() { + assert_eq!(Wtf8Buf::from("aé"), Wtf8::from("aé")); + } + + #[test] + fn wtf8_debug() { + let mut string = Wtf8Buf::from("aé 💩"); + string.push(CodePoint::from_u32(0xd800).unwrap()); + assert_eq!(format!("{:?}", &*string), r#""aé 💩\u{D800}""#); + } + + #[test] + fn wtf8_from() { + assert_eq!(&Wtf8::from("").bytes, b""); + assert_eq!(&Wtf8::from("aé 💩").bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + } + + #[test] + fn wtf8_len() { + assert_eq!(Wtf8::from("").len(), 0); + assert_eq!(Wtf8::from("aé 💩").len(), 8); + } + + #[test] + fn wtf8_slice() { + assert_eq!(&Wtf8::from("aé 💩").slice(1, 4).bytes, b"\xC3\xA9 "); + } + + #[test] + #[should_panic] + fn wtf8_slice_not_code_point_boundary() { + Wtf8::from("aé 💩").slice(2, 4); + } + + #[test] + fn wtf8_slice_from() { + assert_eq!( + &Wtf8::from("aé 💩").slice_from(1).bytes, + b"\xC3\xA9 \xF0\x9F\x92\xA9" + ); + } + + #[test] + #[should_panic] + fn wtf8_slice_from_not_code_point_boundary() { + Wtf8::from("aé 💩").slice_from(2); + } + + #[test] + fn wtf8_slice_to() { + assert_eq!(&Wtf8::from("aé 💩").slice_to(4).bytes, b"a\xC3\xA9 "); + } + + #[test] + #[should_panic] + fn wtf8_slice_to_not_code_point_boundary() { + // Byte 5 lies inside the 4-byte encoding of 💩 (bytes 4..8), + // so `slice_to` itself must reject it. + Wtf8::from("aé 💩").slice_to(5); + } + + #[test] + fn wtf8_ascii_byte_at() { + let slice = Wtf8::from("aé 💩"); + assert_eq!(slice.ascii_byte_at(0), b'a'); + 
assert_eq!(slice.ascii_byte_at(1), b'\xFF'); + assert_eq!(slice.ascii_byte_at(2), b'\xFF'); + assert_eq!(slice.ascii_byte_at(3), b' '); + assert_eq!(slice.ascii_byte_at(4), b'\xFF'); + } + + #[test] + fn wtf8_code_points() { + fn c(value: u32) -> CodePoint { + CodePoint::from_u32(value).unwrap() + } + fn cp(string: &Wtf8Buf) -> Vec> { + string + .code_points() + .map(|c| c.to_char()) + .collect::>() + } + let mut string = Wtf8Buf::from("é "); + assert_eq!(cp(&string), vec![Some('é'), Some(' ')]); + string.push(c(0xd83d)); + assert_eq!(cp(&string), vec![Some('é'), Some(' '), None]); + string.push(c(0xdca9)); + assert_eq!(cp(&string), vec![Some('é'), Some(' '), Some('💩')]); + } + + #[test] + fn wtf8_as_str() { + assert_eq!(Wtf8::from("").as_str(), Some("")); + assert_eq!(Wtf8::from("aé 💩").as_str(), Some("aé 💩")); + let mut string = Wtf8Buf::new(); + string.push(CodePoint::from_u32(0xd800).unwrap()); + assert_eq!(string.as_str(), None); + } + + #[test] + fn wtf8_to_string_lossy() { + assert_eq!(Wtf8::from("").to_string_lossy(), Cow::Borrowed("")); + assert_eq!( + Wtf8::from("aé 💩").to_string_lossy(), + Cow::Borrowed("aé 💩") + ); + let mut string = Wtf8Buf::from("aé 💩"); + string.push(CodePoint::from_u32(0xd800).unwrap()); + assert_eq!(string.to_string_lossy(), { + let o: Cow = Cow::Owned(String::from("aé 💩�")); + o + }); + } + + #[test] + fn wtf8_to_ill_formed_utf16() { + let mut string = Wtf8Buf::from("aé "); + string.push(CodePoint::from_u32(0xd83d).unwrap()); + string.push_char('💩'); + assert_eq!( + string.to_ill_formed_utf16().collect::>(), + vec![0x61, 0xe9, 0x20, 0xd83d, 0xd83d, 0xdca9] + ); + } +} diff --git a/crates/hstr/src/wtf8/not_quite_std.rs b/crates/hstr/src/wtf8/not_quite_std.rs new file mode 100644 index 000000000000..f7482be8d83e --- /dev/null +++ b/crates/hstr/src/wtf8/not_quite_std.rs @@ -0,0 +1,250 @@ +//! The code in this module is copied from Rust standard library +//! (the `std` crate and crates it is a facade for) +//! 
at commit 16d80de231abb2b1756f3951ffd4776d681035eb, +//! with the signature changed to use `Wtf8Buf`, `Wtf8`, and `CodePoint` +//! instead of `String`, `&str`, and `char`. +//! +//! FIXME: if and when this is moved into the standard library, +//! try to avoid the code duplication. +//! Maybe by having private generic code that is monomorphized to UTF-8 and +//! WTF-8? + +use core::{char, mem, slice}; + +use super::{CodePoint, IllFormedUtf16CodeUnits, Wtf8, Wtf8Buf}; + +// UTF-8 ranges and tags for encoding characters +// Copied from 48d5fe9ec560b53b1f5069219b0d62015e1de5ba^:src/libcore/char.rs +const TAG_CONT: u8 = 0b1000_0000; +const TAG_TWO_B: u8 = 0b1100_0000; +const TAG_THREE_B: u8 = 0b1110_0000; +const TAG_FOUR_B: u8 = 0b1111_0000; +const MAX_ONE_B: u32 = 0x80; +const MAX_TWO_B: u32 = 0x800; +const MAX_THREE_B: u32 = 0x10000; + +/// Copied from 48d5fe9ec560b53b1f5069219b0d62015e1de5ba^:src/libcore/char.rs +#[inline] +fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option { + // Marked #[inline] to allow llvm optimizing it away + if code < MAX_ONE_B && !dst.is_empty() { + dst[0] = code as u8; + Some(1) + } else if code < MAX_TWO_B && dst.len() >= 2 { + dst[0] = (code >> 6 & 0x1f) as u8 | TAG_TWO_B; + dst[1] = (code & 0x3f) as u8 | TAG_CONT; + Some(2) + } else if code < MAX_THREE_B && dst.len() >= 3 { + dst[0] = (code >> 12 & 0x0f) as u8 | TAG_THREE_B; + dst[1] = (code >> 6 & 0x3f) as u8 | TAG_CONT; + dst[2] = (code & 0x3f) as u8 | TAG_CONT; + Some(3) + } else if dst.len() >= 4 { + dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; + dst[1] = (code >> 12 & 0x3f) as u8 | TAG_CONT; + dst[2] = (code >> 6 & 0x3f) as u8 | TAG_CONT; + dst[3] = (code & 0x3f) as u8 | TAG_CONT; + Some(4) + } else { + None + } +} + +/// Copied from 48d5fe9ec560b53b1f5069219b0d62015e1de5ba^:src/libcore/char.rs +#[inline] +fn encode_utf16_raw(mut ch: u32, dst: &mut [u16]) -> Option { + // Marked #[inline] to allow llvm optimizing it away + if (ch & 0xffff) == ch && !dst.is_empty() { + // The 
BMP falls through (assuming non-surrogate, as it should) + dst[0] = ch as u16; + Some(1) + } else if dst.len() >= 2 { + // Supplementary planes break into surrogates. + ch -= 0x1_0000; + dst[0] = 0xd800 | ((ch >> 10) as u16); + dst[1] = 0xdc00 | ((ch as u16) & 0x3ff); + Some(2) + } else { + None + } +} + +/// Copied from core::str::next_code_point +#[inline] +pub fn next_code_point(bytes: &mut slice::Iter) -> Option { + // Decode UTF-8 + let x = match bytes.next() { + None => return None, + Some(&next_byte) if next_byte < 128 => return Some(next_byte as u32), + Some(&next_byte) => next_byte, + }; + + // Multibyte case follows + // Decode from a byte combination out of: [[[x y] z] w] + // NOTE: Performance is sensitive to the exact formulation here + let init = utf8_first_byte(x, 2); + let y = unwrap_or_0(bytes.next()); + let mut ch = utf8_acc_cont_byte(init, y); + if x >= 0xe0 { + // [[x y z] w] case + // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid + let z = unwrap_or_0(bytes.next()); + let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); + ch = init << 12 | y_z; + if x >= 0xf0 { + // [x y z w] case + // use only the lower 3 bits of `init` + let w = unwrap_or_0(bytes.next()); + ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); + } + } + + Some(ch) +} + +#[inline] +fn utf8_first_byte(byte: u8, width: u32) -> u32 { + (byte & (0x7f >> width)) as u32 +} + +/// Return the value of `ch` updated with continuation byte `byte`. +#[inline] +fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 { + (ch << 6) | (byte & CONT_MASK) as u32 +} + +#[inline] +fn unwrap_or_0(opt: Option<&u8>) -> u8 { + match opt { + Some(&byte) => byte, + None => 0, + } +} + +/// Mask of the value bits of a continuation byte +const CONT_MASK: u8 = 0b0011_1111; + +/// Copied from String::push +/// This does **not** include the WTF-8 concatenation check. 
+#[inline] +pub fn push_code_point(string: &mut Wtf8Buf, code_point: CodePoint) { + let cur_len = string.len(); + // This may use up to 4 bytes. + string.reserve(4); + + unsafe { + // Attempt to not use an intermediate buffer by just pushing bytes + // directly onto this string. + let slice = slice::from_raw_parts_mut(string.bytes.as_mut_ptr().add(cur_len), 4); + let used = encode_utf8_raw(code_point.to_u32(), slice).unwrap_or(0); + string.bytes.set_len(cur_len + used); + } +} + +/// Copied from core::str::StrPrelude::is_char_boundary +#[inline] +pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool { + if index == slice.len() { + return true; + } + match slice.bytes.get(index) { + None => false, + Some(&b) => !(128u8..192u8).contains(&b), + } +} + +/// Copied from core::str::raw::slice_unchecked +#[inline] +pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 { + mem::transmute(slice::from_raw_parts( + s.bytes.as_ptr().add(begin), + end - begin, + )) +} + +/// Copied from core::str::raw::slice_error_fail +#[inline(never)] +pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! 
{ + assert!(begin <= end); + panic!("index {begin} and/or {end} in {s:?} do not lie on character boundary",); +} + +/// Copied from core::str::Utf16CodeUnits::next +pub fn next_utf16_code_unit(iter: &mut IllFormedUtf16CodeUnits) -> Option { + if iter.extra != 0 { + let tmp = iter.extra; + iter.extra = 0; + return Some(tmp); + } + + let mut buf = [0u16; 2]; + iter.code_points.next().map(|code_point| { + let n = encode_utf16_raw(code_point.to_u32(), &mut buf).unwrap_or(0); + if n == 2 { + iter.extra = buf[1]; + } + buf[0] + }) +} + +/// Copied from src/librustc_unicode/char.rs +pub struct DecodeUtf16 +where + I: Iterator, +{ + iter: I, + buf: Option, +} + +/// Copied from src/librustc_unicode/char.rs +#[inline] +pub fn decode_utf16>(iterable: I) -> DecodeUtf16 { + DecodeUtf16 { + iter: iterable.into_iter(), + buf: None, + } +} + +/// Copied from src/librustc_unicode/char.rs +impl> Iterator for DecodeUtf16 { + type Item = Result; + + fn next(&mut self) -> Option> { + let u = match self.buf.take() { + Some(buf) => buf, + None => self.iter.next()?, + }; + + if !(0xd800..=0xdfff).contains(&u) { + // not a surrogate + Some(Ok(unsafe { char::from_u32_unchecked(u as u32) })) + } else if u >= 0xdc00 { + // a trailing surrogate + Some(Err(u)) + } else { + let u2 = match self.iter.next() { + Some(u2) => u2, + // eof + None => return Some(Err(u)), + }; + if !(0xdc00..=0xdfff).contains(&u2) { + // not a trailing surrogate so we're not a valid + // surrogate pair, so rewind to redecode u2 next time. + self.buf = Some(u2); + return Some(Err(u)); + } + + // all ok, so lets decode it. 
+ let c = (((u - 0xd800) as u32) << 10 | (u2 - 0xdc00) as u32) + 0x1_0000; + Some(Ok(unsafe { char::from_u32_unchecked(c) })) + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let (low, high) = self.iter.size_hint(); + // we could be entirely valid surrogates (2 elements per + // char), or entirely non-surrogates (1 element per char) + (low / 2, high) + } +} From ef7476791862370877a3754c20563acba0b17190 Mon Sep 17 00:00:00 2001 From: CPunisher <1343316114@qq.com> Date: Fri, 12 Sep 2025 18:04:14 +0800 Subject: [PATCH 2/5] Store code point --- Cargo.lock | 1 + crates/hstr/src/dynamic.rs | 31 ++- crates/hstr/src/global_store.rs | 4 +- crates/hstr/src/wtf8/mod.rs | 5 - crates/swc_atoms/src/lib.rs | 8 +- .../swc_ecma_lexer/src/common/lexer/char.rs | 103 +--------- crates/swc_ecma_lexer/src/common/lexer/mod.rs | 191 +++++++----------- crates/swc_ecma_lexer/src/lexer/mod.rs | 4 +- crates/swc_ecma_parser/src/lexer/mod.rs | 20 +- crates/swc_ecma_parser/src/lexer/state.rs | 6 +- 10 files changed, 115 insertions(+), 258 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 320f44cc3a34..c00182f90ecd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2249,6 +2249,7 @@ checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" name = "hstr" version = "2.0.1" dependencies = [ + "arrayvec", "compact_str", "criterion", "hashbrown 0.14.5", diff --git a/crates/hstr/src/dynamic.rs b/crates/hstr/src/dynamic.rs index ad9244b19416..48a96f9da294 100644 --- a/crates/hstr/src/dynamic.rs +++ b/crates/hstr/src/dynamic.rs @@ -70,7 +70,12 @@ impl Default for AtomStore { impl AtomStore { #[inline(always)] pub fn atom<'a>(&mut self, text: impl Into>) -> Atom { - atom_in(self, &text.into()) + self.atom_raw(text.into().as_bytes()) + } + + #[inline(always)] + pub fn atom_raw(&mut self, text: &[u8]) -> Atom { + atom_in(self, text) } fn gc(&mut self) { @@ -94,7 +99,7 @@ pub fn global_atom_store_gc() { }); } -pub(crate) fn global_atom(text: &str) -> Atom { +pub(crate) 
fn global_atom(text: &[u8]) -> Atom { GLOBAL_DATA.with(|global| { let mut store = global.borrow_mut(); @@ -104,7 +109,7 @@ pub(crate) fn global_atom(text: &str) -> Atom { /// This can create any kind of [Atom], although this lives in the `dynamic` /// module. -fn atom_in(storage: S, text: &str) -> Atom +fn atom_in(storage: S, text: &[u8]) -> Atom where S: Storage, { @@ -115,7 +120,7 @@ where let tag = INLINE_TAG_INIT | ((len as u8) << LEN_OFFSET); let mut unsafe_data = TaggedValue::new_tag(tag); unsafe { - unsafe_data.data_mut()[..len].copy_from_slice(text.as_bytes()); + unsafe_data.data_mut()[..len].copy_from_slice(text); } return Atom { unsafe_data }; } @@ -159,31 +164,25 @@ pub(crate) const fn inline_atom(text: &str) -> Option { } trait Storage { - fn insert_entry(self, text: &str, hash: u64) -> Item; + fn insert_entry(self, text: &[u8], hash: u64) -> Item; } impl Storage for &'_ mut AtomStore { - fn insert_entry(self, text: &str, hash: u64) -> Item { + fn insert_entry(self, text: &[u8], hash: u64) -> Item { // If the text is too long, interning is not worth it. 
if text.len() > 512 { - return Item(ThinArc::from_header_and_slice( - Metadata { hash }, - text.as_bytes(), - )); + return Item(ThinArc::from_header_and_slice(Metadata { hash }, text)); } let (entry, _) = self .data .raw_entry_mut() .from_hash(hash, |key| { - key.header.header.hash == hash && key.slice.eq(text.as_bytes()) + key.header.header.hash == hash && key.slice.eq(text) }) .or_insert_with(move || { ( - Item(ThinArc::from_header_and_slice( - Metadata { hash }, - text.as_bytes(), - )), + Item(ThinArc::from_header_and_slice(Metadata { hash }, text)), (), ) }); @@ -192,7 +191,7 @@ impl Storage for &'_ mut AtomStore { } #[inline(always)] -fn calc_hash(text: &str) -> u64 { +fn calc_hash(text: &[u8]) -> u64 { let mut hasher = FxHasher::default(); text.hash(&mut hasher); hasher.finish() diff --git a/crates/hstr/src/global_store.rs b/crates/hstr/src/global_store.rs index 6136c747b6ee..c1fdee923cc0 100644 --- a/crates/hstr/src/global_store.rs +++ b/crates/hstr/src/global_store.rs @@ -6,7 +6,7 @@ macro_rules! 
direct_from_impl { ($T:ty) => { impl From<$T> for Atom { fn from(s: $T) -> Self { - global_atom(&s) + global_atom(s.as_bytes()) } } }; @@ -18,6 +18,6 @@ direct_from_impl!(String); impl From> for crate::Atom { fn from(s: Box) -> Self { - global_atom(&s) + global_atom(s.as_bytes()) } } diff --git a/crates/hstr/src/wtf8/mod.rs b/crates/hstr/src/wtf8/mod.rs index 43374fdea1a8..57e2ed40eec7 100644 --- a/crates/hstr/src/wtf8/mod.rs +++ b/crates/hstr/src/wtf8/mod.rs @@ -122,11 +122,6 @@ impl IntoIterator for CodePoint { #[allow(unsafe_code)] fn into_iter(self) -> Self::IntoIter { - // // TODO: Check if this is correct - // fn to_char(v: u8) -> char { - // char::from_digit(v as _, 16).unwrap_or('0') - // } - CharIter(match char::from_u32(self.value) { Some(c) => { let mut buf = ArrayVec::new(); diff --git a/crates/swc_atoms/src/lib.rs b/crates/swc_atoms/src/lib.rs index 07c584f10c8a..301c8cc9f7ef 100644 --- a/crates/swc_atoms/src/lib.rs +++ b/crates/swc_atoms/src/lib.rs @@ -18,6 +18,7 @@ use std::{ rc::Rc, }; +pub use hstr::wtf8; use once_cell::sync::Lazy; use serde::Serializer; @@ -264,11 +265,16 @@ impl AtomStoreCell { pub fn atom<'a>(&self, s: impl Into>) -> Atom { // evaluate the into before borrowing (see #8362) let s: Cow<'a, str> = s.into(); + self.atom_raw(s.as_bytes()) + } + + #[inline] + pub fn atom_raw(&self, s: &[u8]) -> Atom { // SAFETY: We can skip the borrow check of RefCell because // this API enforces a safe contract. It is slightly faster // to use an UnsafeCell. Note the borrow here is short lived // only to this block. 
- unsafe { (*self.0.get()).atom(s) } + unsafe { Atom((*self.0.get()).0.atom_raw(s)) } } } diff --git a/crates/swc_ecma_lexer/src/common/lexer/char.rs b/crates/swc_ecma_lexer/src/common/lexer/char.rs index 53e433e4d7ee..c46c92452600 100644 --- a/crates/swc_ecma_lexer/src/common/lexer/char.rs +++ b/crates/swc_ecma_lexer/src/common/lexer/char.rs @@ -1,101 +1,4 @@ -use std::iter::FusedIterator; - -use arrayvec::ArrayVec; - -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] -pub struct Char(u32); - -impl From for Char { - fn from(c: char) -> Self { - Char(c as u32) - } -} - -impl From for Char { - fn from(c: u32) -> Self { - Char(c) - } -} - -pub struct CharIter(ArrayVec); - -/// Ported from https://github.com/web-infra-dev/oxc/blob/99a4816ce7b6132b2667257984f9d92ae3768f03/crates/oxc_parser/src/lexer/mod.rs#L1349-L1374 -impl IntoIterator for Char { - type IntoIter = CharIter; - type Item = char; - - #[allow(unsafe_code)] - fn into_iter(self) -> Self::IntoIter { - // // TODO: Check if this is correct - // fn to_char(v: u8) -> char { - // char::from_digit(v as _, 16).unwrap_or('0') - // } - - CharIter(match char::from_u32(self.0) { - Some(c) => { - let mut buf = ArrayVec::new(); - // Safety: we can make sure that `buf` has enough capacity - unsafe { - buf.push_unchecked(c); - } - buf - } - None => { - let mut buf = ArrayVec::new(); - - let high = self.0 & 0xffff0000 >> 16; - - let low = self.0 & 0x0000ffff; - - // The second code unit of a surrogate pair is always in the range from 0xDC00 - // to 0xDFFF, and is called a low surrogate or a trail surrogate. 
- if !(0xdc00..=0xdfff).contains(&low) { - // Safety: we can make sure that `buf` has enough capacity - unsafe { - buf.push_unchecked('\\'); - buf.push_unchecked('u'); - for c in format!("{high:x}").chars() { - buf.push_unchecked(c); - } - buf.push_unchecked('\\'); - buf.push_unchecked('u'); - for c in format!("{low:x}").chars() { - buf.push_unchecked(c); - } - } - } else { - // `https://tc39.es/ecma262/#sec-utf16decodesurrogatepair` - let astral_code_point = (high - 0xd800) * 0x400 + low - 0xdc00 + 0x10000; - - // Safety: we can make sure that `buf` has enough capacity - unsafe { - buf.push_unchecked('\\'); - buf.push_unchecked('u'); - for c in format!("{astral_code_point:x}").chars() { - buf.push_unchecked(c); - } - } - } - - buf - } - }) - } -} - -impl Iterator for CharIter { - type Item = char; - - fn next(&mut self) -> Option { - if self.0.is_empty() { - None - } else { - Some(self.0.remove(0)) - } - } -} - -impl FusedIterator for CharIter {} +use swc_atoms::wtf8::CodePoint; /// Implemented for `char`. 
pub trait CharExt: Copy { @@ -164,10 +67,10 @@ pub trait CharExt: Copy { } } -impl CharExt for Char { +impl CharExt for CodePoint { #[inline(always)] fn to_char(self) -> Option { - char::from_u32(self.0) + CodePoint::to_char(&self) } } diff --git a/crates/swc_ecma_lexer/src/common/lexer/mod.rs b/crates/swc_ecma_lexer/src/common/lexer/mod.rs index 7be808407c43..d36ed93f5320 100644 --- a/crates/swc_ecma_lexer/src/common/lexer/mod.rs +++ b/crates/swc_ecma_lexer/src/common/lexer/mod.rs @@ -1,11 +1,14 @@ use std::borrow::Cow; -use char::{Char, CharExt}; +use char::CharExt; use either::Either::{self, Left, Right}; use num_bigint::BigInt as BigIntValue; use smartstring::{LazyCompact, SmartString}; use state::State; -use swc_atoms::Atom; +use swc_atoms::{ + wtf8::{CodePoint, Wtf8Buf}, + Atom, +}; use swc_common::{ comments::{Comment, CommentKind}, input::{Input, StringInput}, @@ -91,9 +94,13 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { /// We know that the start and the end are valid unsafe fn input_slice(&mut self, start: BytePos, end: BytePos) -> &'a str; fn input_uncons_while(&mut self, f: impl FnMut(char) -> bool) -> &'a str; - fn atom<'b>(&self, s: impl Into>) -> swc_atoms::Atom; + fn atom_raw(&self, bytes: &[u8]) -> swc_atoms::Atom; fn push_error(&mut self, error: crate::error::Error); + fn atom<'b>(&self, s: impl Into>) -> swc_atoms::Atom { + self.atom_raw(s.into().as_bytes()) + } + #[inline(always)] #[allow(clippy::misnamed_getters)] fn had_line_break_before_last(&self) -> bool { @@ -1119,10 +1126,9 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { Ok(Self::Token::str(value, raw, self)) } - fn read_unicode_escape(&mut self) -> LexResult> { + fn read_unicode_escape(&mut self) -> LexResult { debug_assert_eq!(self.cur(), Some('u')); - let mut chars = Vec::with_capacity(4); let mut is_curly = false; self.bump(); // 'u' @@ -1132,89 +1138,39 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { } let state = self.input().cur_pos(); - let c = match 
self.read_int_u32::<16>(if is_curly { 0 } else { 4 }) { - Ok(Some(val)) => { - if 0x0010_ffff >= val { - char::from_u32(val) - } else { - let start = self.cur_pos(); - - self.error( - start, - SyntaxError::BadCharacterEscapeSequence { - expected: if is_curly { - "1-6 hex characters in the range 0 to 10FFFF." - } else { - "4 hex characters" - }, - }, - )? - } - } - _ => { - let start = self.cur_pos(); - - self.error( - start, - SyntaxError::BadCharacterEscapeSequence { - expected: if is_curly { - "1-6 hex characters" - } else { - "4 hex characters" - }, + let Ok(Some(val)) = self.read_int_u32::<16>(if is_curly { 0 } else { 4 }) else { + let start = self.cur_pos(); + self.error( + start, + SyntaxError::BadCharacterEscapeSequence { + expected: if is_curly { + "1-6 hex characters" + } else { + "4 hex characters" }, - )? - } + }, + )? }; - match c { - Some(c) => { - chars.push(c.into()); - } - _ => { - unsafe { - // Safety: state is valid position because we got it from cur_pos() - self.input_mut().reset_to(state); - } - - chars.push(Char::from('\\')); - chars.push(Char::from('u')); - - if is_curly { - chars.push(Char::from('{')); - - for _ in 0..6 { - if let Some(c) = self.input().cur() { - if c == '}' { - break; - } - - self.bump(); - - chars.push(Char::from(c)); - } else { - break; - } - } - - chars.push(Char::from('}')); - } else { - for _ in 0..4 { - if let Some(c) = self.input().cur() { - self.bump(); - - chars.push(Char::from(c)); - } - } - } - } - } + let Some(code_point) = CodePoint::from_u32(val) else { + let start = self.cur_pos(); + self.error( + start, + SyntaxError::BadCharacterEscapeSequence { + expected: if is_curly { + "1-6 hex characters in the range 0 to 10FFFF." + } else { + "4 hex characters" + }, + }, + )? + }; if is_curly && !self.eat(b'}') { - self.error(state, SyntaxError::InvalidUnicodeEscape)? 
+ self.error(state, SyntaxError::InvalidUnicodeEscape)?; } - Ok(chars) + Ok(code_point) } #[cold] @@ -1231,7 +1187,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { fn read_tmpl_token(&mut self, start_of_tpl: BytePos) -> LexResult { let start = self.cur_pos(); - let mut cooked = Ok(String::new()); + let mut cooked = Ok(Wtf8Buf::new()); let mut cooked_slice_start = start; let raw_slice_start = start; @@ -1288,7 +1244,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { Ok(self.atom(s)) } else { consume_cooked!(); - cooked.map(|s| self.atom(s)) + cooked.map(|s| self.atom_raw(s.as_bytes())) }; let end = self.input().cur_pos(); @@ -1313,7 +1269,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { Ok(self.atom(s)) } else { consume_cooked!(); - cooked.map(|s| self.atom(s)) + cooked.map(|s| self.atom_raw(s.as_bytes())) }; let end = self.input().cur_pos(); @@ -1333,7 +1289,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { } if let Ok(ref mut cooked) = cooked { - cooked.push('\n'); + cooked.push_char('\n'); } cooked_slice_start = self.cur_pos(); } @@ -1342,11 +1298,9 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { consume_cooked!(); match self.read_escaped_char(true) { - Ok(Some(chars)) => { + Ok(Some(code_point)) => { if let Ok(ref mut cooked) = cooked { - for c in chars { - cooked.extend(c); - } + cooked.push(code_point); } } Ok(None) => {} @@ -1365,7 +1319,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { /// Read an escaped character for string literal. /// /// In template literal, we should preserve raw string. - fn read_escaped_char(&mut self, in_template: bool) -> LexResult>> { + fn read_escaped_char(&mut self, in_template: bool) -> LexResult> { debug_assert_eq!(self.cur(), Some('\\')); let start = self.cur_pos(); @@ -1403,7 +1357,13 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { self.bump(); // 'x' match self.read_int_u32::<16>(2)? 
{ - Some(val) => return Ok(Some(vec![Char::from(val)])), + Some(val) => { + return { + // Safety: val is in 0x00..=0xFF + let code_point = unsafe { CodePoint::from_u32_unchecked(val) }; + Ok(Some(code_point)) + }; + } None => self.error( start, SyntaxError::BadCharacterEscapeSequence { @@ -1427,7 +1387,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { match self.cur() { Some(next) if next.is_digit(8) => c, // \0 is not an octal literal nor decimal literal. - _ => return Ok(Some(vec!['\u{0000}'.into()])), + _ => return Ok(Some(CodePoint::from_char('\u{0000}'))), } } else { c @@ -1454,7 +1414,9 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { .and_then(|value| value.checked_add(v as u8)); match new_val { Some(val) => val, - None => return Ok(Some(vec![Char::from(value as char)])), + None => { + return Ok(Some(CodePoint::from_char(value as char))) + } } } else { value * 8 + v as u8 @@ -1462,7 +1424,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { self.bump(); } - _ => return Ok(Some(vec![Char::from(value as u32)])), + _ => return Ok(Some(CodePoint::from_char(value as char))), } }}; } @@ -1470,7 +1432,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { one!(false); one!(true); - return Ok(Some(vec![Char::from(value as char)])); + return Ok(Some(CodePoint::from_char(value as char))); } _ => c, }; @@ -1480,7 +1442,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { self.input_mut().bump(); } - Ok(Some(vec![c.into()])) + Ok(Some(CodePoint::from_char(c))) } /// Expects current char to be '/' @@ -1665,24 +1627,19 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { } } - let chars = self.read_unicode_escape()?; - - if let Some(c) = chars.first() { - let valid = if first { - c.is_ident_start() - } else { - c.is_ident_part() - }; + let code_point = self.read_unicode_escape()?; - if !valid { - self.emit_error(start, SyntaxError::InvalidIdentChar); - } - } + let valid = if first { + code_point.is_ident_start() + } else { + 
code_point.is_ident_part() + }; - for c in chars { - buf.extend(c); + if !valid { + self.emit_error(start, SyntaxError::InvalidIdentChar); } + buf.extend(code_point); slice_start = self.cur_pos(); continue; } @@ -1964,7 +1921,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { let mut slice_start = self.input().cur_pos(); - let mut buf: Option = None; + let mut buf: Option = None; loop { let table = if quote == b'"' { @@ -2005,7 +1962,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { self.input_slice(slice_start, value_end) }; buf.push_str(s); - self.atom(&*buf) + self.atom_raw(buf.as_bytes()) } else { let s = unsafe { self.input_slice(slice_start, value_end) }; self.atom(s) @@ -2034,15 +1991,15 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { }; if buf.is_none() { - buf = Some(s.to_string()); + let mut new_buf = Wtf8Buf::with_capacity(s.len()); + new_buf.push_str(s); + buf = Some(new_buf); } else { buf.as_mut().unwrap().push_str(s); } - if let Some(chars) = self.read_escaped_char(false)? { - for c in chars { - buf.as_mut().unwrap().extend(c); - } + if let Some(code_point) = self.read_escaped_char(false)? 
{ + buf.as_mut().unwrap().push(code_point); } slice_start = self.cur_pos(); diff --git a/crates/swc_ecma_lexer/src/lexer/mod.rs b/crates/swc_ecma_lexer/src/lexer/mod.rs index 5833e87f7986..cbf06bdbc1fd 100644 --- a/crates/swc_ecma_lexer/src/lexer/mod.rs +++ b/crates/swc_ecma_lexer/src/lexer/mod.rs @@ -110,8 +110,8 @@ impl<'a> crate::common::lexer::Lexer<'a, TokenAndSpan> for Lexer<'a> { } #[inline(always)] - fn atom<'b>(&self, s: impl Into>) -> swc_atoms::Atom { - self.atoms.atom(s) + fn atom_raw(&self, s: &[u8]) -> swc_atoms::Atom { + self.atoms.atom_raw(s) } } diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index bb0cd16e1c55..391b6242910a 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -2,7 +2,7 @@ use std::{char, iter::FusedIterator, rc::Rc}; -use swc_atoms::AtomStoreCell; +use swc_atoms::{wtf8::Wtf8Buf, AtomStoreCell}; use swc_common::{ comments::Comments, input::{Input, StringInput}, @@ -113,8 +113,8 @@ impl<'a> swc_ecma_lexer::common::lexer::Lexer<'a, TokenAndSpan> for Lexer<'a> { } #[inline(always)] - fn atom<'b>(&self, s: impl Into>) -> swc_atoms::Atom { - self.atoms.atom(s) + fn atom_raw(&self, s: &[u8]) -> swc_atoms::Atom { + self.atoms.atom_raw(s) } } @@ -332,7 +332,7 @@ impl Lexer<'_> { started_with_backtick: bool, ) -> LexResult { debug_assert!(self.cur() == Some(if started_with_backtick { '`' } else { '}' })); - let mut cooked = Ok(String::with_capacity(8)); + let mut cooked = Ok(Wtf8Buf::with_capacity(8)); self.bump(); // `}` or `\`` let mut cooked_slice_start = self.cur_pos(); let raw_slice_start = cooked_slice_start; @@ -357,7 +357,7 @@ impl Lexer<'_> { while let Some(c) = self.cur() { if c == '`' { consume_cooked!(); - let cooked = cooked.map(|cooked| self.atoms.atom(cooked)); + let cooked = cooked.map(|cooked| self.atoms.atom_raw(cooked.as_bytes())); let raw = raw_atom(self); self.bump(); return Ok(if started_with_backtick { @@ -369,7 +369,7 @@ 
impl Lexer<'_> { }); } else if c == '$' && self.input.peek() == Some('{') { consume_cooked!(); - let cooked = cooked.map(|cooked| self.atoms.atom(cooked)); + let cooked = cooked.map(|cooked| self.atoms.atom_raw(cooked.as_bytes())); let raw = raw_atom(self); self.input.bump_bytes(2); return Ok(if started_with_backtick { @@ -383,11 +383,9 @@ impl Lexer<'_> { consume_cooked!(); match self.read_escaped_char(true) { - Ok(Some(chars)) => { + Ok(Some(code_point)) => { if let Ok(ref mut cooked) = cooked { - for c in chars { - cooked.extend(c); - } + cooked.push(code_point); } } Ok(None) => {} @@ -416,7 +414,7 @@ impl Lexer<'_> { self.bump(); if let Ok(ref mut cooked) = cooked { - cooked.push(c); + cooked.push_char(c); } cooked_slice_start = self.cur_pos(); } else { diff --git a/crates/swc_ecma_parser/src/lexer/state.rs b/crates/swc_ecma_parser/src/lexer/state.rs index 18bf6880d597..d9da4d259475 100644 --- a/crates/swc_ecma_parser/src/lexer/state.rs +++ b/crates/swc_ecma_parser/src/lexer/state.rs @@ -553,13 +553,11 @@ impl Lexer<'_> { continue; } self.bump(); // bump 'u' - let Ok(chars) = self.read_unicode_escape() else { + let Ok(code_point) = self.read_unicode_escape() else { self.emit_error(self.cur_pos(), SyntaxError::InvalidUnicodeEscape); break; }; - for c in chars { - v.extend(c); - } + v.extend(code_point); self.token_flags |= swc_ecma_lexer::lexer::TokenFlags::UNICODE; } else { break; From 158455edaa074f7d3d4445368667210a5ff0f463 Mon Sep 17 00:00:00 2001 From: CPunisher <1343316114@qq.com> Date: Fri, 12 Sep 2025 18:21:07 +0800 Subject: [PATCH 3/5] as_wtf8_str --- crates/hstr/src/lib.rs | 15 +++++++++++++++ crates/hstr/src/wtf8/mod.rs | 5 +++++ 2 files changed, 20 insertions(+) diff --git a/crates/hstr/src/lib.rs b/crates/hstr/src/lib.rs index 39e370db1348..38a316c1dd6c 100644 --- a/crates/hstr/src/lib.rs +++ b/crates/hstr/src/lib.rs @@ -284,6 +284,21 @@ impl Atom { _ => unsafe { debug_unreachable!() }, } } + + fn as_wtf8_str(&self) -> &wtf8::Wtf8 { + match 
self.tag() { + DYNAMIC_TAG => unsafe { + let item = crate::dynamic::deref_from(self.unsafe_data); + wtf8::Wtf8::from_bytes(transmute::<&[u8], &'static [u8]>(&item.slice)) + }, + INLINE_TAG => { + let len = (self.unsafe_data.tag() & LEN_MASK) >> LEN_OFFSET; + let src = self.unsafe_data.data(); + wtf8::Wtf8::from_bytes(&src[..(len as usize)]) + } + _ => unsafe { debug_unreachable!() }, + } + } } #[cfg(test)] diff --git a/crates/hstr/src/wtf8/mod.rs b/crates/hstr/src/wtf8/mod.rs index 57e2ed40eec7..5a4c73098b59 100644 --- a/crates/hstr/src/wtf8/mod.rs +++ b/crates/hstr/src/wtf8/mod.rs @@ -539,6 +539,11 @@ impl Wtf8 { unsafe { transmute(value.as_bytes()) } } + #[inline] + pub fn from_bytes(value: &[u8]) -> &Wtf8 { + unsafe { transmute(value) } + } + /// Return the length, in WTF-8 bytes. #[inline] pub fn len(&self) -> usize { From a6ce6f663fd68e16bac706012d055175166efec7 Mon Sep 17 00:00:00 2001 From: CPunisher <1343316114@qq.com> Date: Mon, 15 Sep 2025 11:14:15 +0800 Subject: [PATCH 4/5] shear and doc test --- Cargo.lock | 1 - crates/hstr/src/wtf8/mod.rs | 4 ++-- crates/swc_ecma_lexer/Cargo.toml | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c00182f90ecd..8803bcb6b50d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5577,7 +5577,6 @@ dependencies = [ name = "swc_ecma_lexer" version = "23.0.1" dependencies = [ - "arrayvec", "bitflags 2.6.0", "codspeed-criterion-compat", "either", diff --git a/crates/hstr/src/wtf8/mod.rs b/crates/hstr/src/wtf8/mod.rs index 5a4c73098b59..6a1dd537af2a 100644 --- a/crates/hstr/src/wtf8/mod.rs +++ b/crates/hstr/src/wtf8/mod.rs @@ -277,7 +277,7 @@ impl Wtf8Buf { /// # Example /// /// ``` - /// let mut s = Wtf8Buf::new(); + /// let mut s = hstr::wtf8::Wtf8Buf::new(); /// s.reserve(10); /// assert!(s.capacity() >= 10); /// ``` @@ -292,7 +292,7 @@ impl Wtf8Buf { /// # Example /// /// ``` - /// let s = Wtf8Buf::with_capacity(10); + /// let s = hstr::wtf8::Wtf8Buf::with_capacity(10); /// 
assert!(s.capacity() >= 10); /// ``` #[inline] diff --git a/crates/swc_ecma_lexer/Cargo.toml b/crates/swc_ecma_lexer/Cargo.toml index b7628b4d7079..2fc4907991f9 100644 --- a/crates/swc_ecma_lexer/Cargo.toml +++ b/crates/swc_ecma_lexer/Cargo.toml @@ -25,7 +25,6 @@ typescript = [] verify = ["swc_ecma_visit"] [dependencies] -arrayvec = { workspace = true } bitflags = { workspace = true } either = { workspace = true } num-bigint = { workspace = true } From acf8ffeb98ac62fe888fa19a381a7da43fb95c63 Mon Sep 17 00:00:00 2001 From: CPunisher <1343316114@qq.com> Date: Mon, 15 Sep 2025 11:29:42 +0800 Subject: [PATCH 5/5] as_wtf8_str --- crates/hstr/src/lib.rs | 30 +++++++++++++++--------------- crates/swc_atoms/src/lib.rs | 5 +++++ crates/swc_ecma_codegen/src/lit.rs | 3 +-- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/crates/hstr/src/lib.rs b/crates/hstr/src/lib.rs index 38a316c1dd6c..78fdbd23b703 100644 --- a/crates/hstr/src/lib.rs +++ b/crates/hstr/src/lib.rs @@ -250,6 +250,21 @@ impl Atom { } self.clone() } + + pub fn as_wtf8_str(&self) -> &wtf8::Wtf8 { + match self.tag() { + DYNAMIC_TAG => unsafe { + let item = crate::dynamic::deref_from(self.unsafe_data); + wtf8::Wtf8::from_bytes(transmute::<&[u8], &'static [u8]>(&item.slice)) + }, + INLINE_TAG => { + let len = (self.unsafe_data.tag() & LEN_MASK) >> LEN_OFFSET; + let src = self.unsafe_data.data(); + wtf8::Wtf8::from_bytes(&src[..(len as usize)]) + } + _ => unsafe { debug_unreachable!() }, + } + } } impl Atom { @@ -284,21 +299,6 @@ impl Atom { _ => unsafe { debug_unreachable!() }, } } - - fn as_wtf8_str(&self) -> &wtf8::Wtf8 { - match self.tag() { - DYNAMIC_TAG => unsafe { - let item = crate::dynamic::deref_from(self.unsafe_data); - wtf8::Wtf8::from_bytes(transmute::<&[u8], &'static [u8]>(&item.slice)) - }, - INLINE_TAG => { - let len = (self.unsafe_data.tag() & LEN_MASK) >> LEN_OFFSET; - let src = self.unsafe_data.data(); - wtf8::Wtf8::from_bytes(&src[..(len as usize)]) - } - _ => unsafe { 
debug_unreachable!() }, - } - } } #[cfg(test)] diff --git a/crates/swc_atoms/src/lib.rs b/crates/swc_atoms/src/lib.rs index 301c8cc9f7ef..f9114cc51640 100644 --- a/crates/swc_atoms/src/lib.rs +++ b/crates/swc_atoms/src/lib.rs @@ -76,6 +76,11 @@ impl Atom { pub fn as_str(&self) -> &str { &self.0 } + + #[inline] + pub fn as_wtf8_str(&self) -> &wtf8::Wtf8 { + self.0.as_wtf8_str() + } } impl Deref for Atom { diff --git a/crates/swc_ecma_codegen/src/lit.rs b/crates/swc_ecma_codegen/src/lit.rs index 1ebe6497faf6..15ab750d5f16 100644 --- a/crates/swc_ecma_codegen/src/lit.rs +++ b/crates/swc_ecma_codegen/src/lit.rs @@ -78,8 +78,7 @@ impl MacroNode for Str { if es5_safe && (!emitter.cfg.ascii_only || raw.is_ascii()) - && (!emitter.cfg.inline_script - || !self.raw.as_ref().unwrap().contains("script")) + && (!emitter.cfg.inline_script || !raw.contains("script")) { emitter.wr.write_str_lit(DUMMY_SP, raw)?; return Ok(());