diff --git a/.changeset/chilled-laws-look.md b/.changeset/chilled-laws-look.md new file mode 100644 index 000000000000..4b82e482b08e --- /dev/null +++ b/.changeset/chilled-laws-look.md @@ -0,0 +1,6 @@ +--- +hstr: minor +swc_core: minor +--- + +feat(hstr): Introduce `Wtf8Atom` diff --git a/crates/hstr/src/dynamic.rs b/crates/hstr/src/dynamic.rs index ad9244b19416..475b896f581e 100644 --- a/crates/hstr/src/dynamic.rs +++ b/crates/hstr/src/dynamic.rs @@ -14,7 +14,8 @@ use triomphe::ThinArc; use crate::{ tagged_value::{TaggedValue, MAX_INLINE_LEN}, - Atom, INLINE_TAG, INLINE_TAG_INIT, LEN_OFFSET, TAG_MASK, + wtf8::Wtf8, + Atom, Wtf8Atom, INLINE_TAG, INLINE_TAG_INIT, LEN_OFFSET, TAG_MASK, }; #[derive(PartialEq, Eq)] @@ -73,6 +74,11 @@ impl AtomStore { atom_in(self, &text.into()) } + #[inline(always)] + pub fn wtf8_atom<'a>(&mut self, text: impl Into>) -> Wtf8Atom { + wtf8_atom_in(self, text.into().as_bytes()) + } + fn gc(&mut self) { self.data.retain(|item, _| { let count = ThinArc::strong_count(&item.0); @@ -94,6 +100,14 @@ pub fn global_atom_store_gc() { }); } +pub(crate) fn global_wtf8_atom(text: &[u8]) -> Wtf8Atom { + GLOBAL_DATA.with(|global| { + let mut store = global.borrow_mut(); + + wtf8_atom_in(&mut *store, text) + }) +} + pub(crate) fn global_atom(text: &str) -> Atom { GLOBAL_DATA.with(|global| { let mut store = global.borrow_mut(); @@ -102,9 +116,7 @@ pub(crate) fn global_atom(text: &str) -> Atom { }) } -/// This can create any kind of [Atom], although this lives in the `dynamic` -/// module. 
-fn atom_in(storage: S, text: &str) -> Atom +fn wtf8_atom_in(storage: S, text: &[u8]) -> Wtf8Atom where S: Storage, { @@ -115,9 +127,9 @@ where let tag = INLINE_TAG_INIT | ((len as u8) << LEN_OFFSET); let mut unsafe_data = TaggedValue::new_tag(tag); unsafe { - unsafe_data.data_mut()[..len].copy_from_slice(text.as_bytes()); + unsafe_data.data_mut()[..len].copy_from_slice(text); } - return Atom { unsafe_data }; + return Wtf8Atom { unsafe_data }; } let hash = calc_hash(text); @@ -129,12 +141,22 @@ where NonNull::new_unchecked(entry) }; debug_assert!(0 == ptr.as_ptr() as u8 & TAG_MASK); - Atom { + Wtf8Atom { unsafe_data: TaggedValue::new_ptr(ptr), } } -/// Attempts to construct an Atom but only if it can be constructed inline. +/// This can create any kind of [Atom], although this lives in the `dynamic` +/// module. +fn atom_in(storage: S, text: &str) -> Atom +where + S: Storage, +{ + // SAFETY: `text` is valid UTF-8 + unsafe { Atom::from_wtf8_unchecked(wtf8_atom_in(storage, text.as_bytes())) } +} + +/// Attempts to construct an [Atom] but only if it can be constructed inline. /// This is primarily useful in constant contexts. pub(crate) const fn inline_atom(text: &str) -> Option { let len = text.len(); @@ -159,31 +181,25 @@ pub(crate) const fn inline_atom(text: &str) -> Option { } trait Storage { - fn insert_entry(self, text: &str, hash: u64) -> Item; + fn insert_entry(self, text: &[u8], hash: u64) -> Item; } impl Storage for &'_ mut AtomStore { - fn insert_entry(self, text: &str, hash: u64) -> Item { + fn insert_entry(self, text: &[u8], hash: u64) -> Item { // If the text is too long, interning is not worth it. 
if text.len() > 512 { - return Item(ThinArc::from_header_and_slice( - Metadata { hash }, - text.as_bytes(), - )); + return Item(ThinArc::from_header_and_slice(Metadata { hash }, text)); } let (entry, _) = self .data .raw_entry_mut() .from_hash(hash, |key| { - key.header.header.hash == hash && key.slice.eq(text.as_bytes()) + key.header.header.hash == hash && key.slice.eq(text) }) .or_insert_with(move || { ( - Item(ThinArc::from_header_and_slice( - Metadata { hash }, - text.as_bytes(), - )), + Item(ThinArc::from_header_and_slice(Metadata { hash }, text)), (), ) }); @@ -192,7 +208,7 @@ impl Storage for &'_ mut AtomStore { } #[inline(always)] -fn calc_hash(text: &str) -> u64 { +fn calc_hash(text: &[u8]) -> u64 { let mut hasher = FxHasher::default(); text.hash(&mut hasher); hasher.finish() diff --git a/crates/hstr/src/global_store.rs b/crates/hstr/src/global_store.rs index 6136c747b6ee..0a24833fa199 100644 --- a/crates/hstr/src/global_store.rs +++ b/crates/hstr/src/global_store.rs @@ -1,6 +1,13 @@ -use std::borrow::Cow; +use std::{ + borrow::Cow, + mem::{forget, ManuallyDrop}, +}; -use crate::{dynamic::global_atom, Atom}; +use crate::{ + dynamic::{global_atom, global_wtf8_atom}, + wtf8::{Wtf8, Wtf8Buf}, + Atom, Wtf8Atom, +}; macro_rules! direct_from_impl { ($T:ty) => { @@ -21,3 +28,43 @@ impl From> for crate::Atom { global_atom(&s) } } + +macro_rules! 
direct_from_impl_wtf8 { + ($T:ty) => { + impl From<$T> for Wtf8Atom { + fn from(s: $T) -> Self { + global_wtf8_atom(s.as_bytes()) + } + } + }; +} + +direct_from_impl_wtf8!(&'_ str); +direct_from_impl_wtf8!(Cow<'_, str>); +direct_from_impl_wtf8!(String); +direct_from_impl_wtf8!(&'_ Wtf8); +direct_from_impl_wtf8!(Wtf8Buf); + +impl From<&Atom> for crate::Wtf8Atom { + fn from(s: &Atom) -> Self { + forget(s.clone()); + Wtf8Atom { + unsafe_data: s.unsafe_data, + } + } +} + +impl From for crate::Wtf8Atom { + fn from(s: Atom) -> Self { + let s = ManuallyDrop::new(s); + Wtf8Atom { + unsafe_data: s.unsafe_data, + } + } +} + +impl From> for crate::Wtf8Atom { + fn from(s: Box) -> Self { + global_wtf8_atom(s.as_bytes()) + } +} diff --git a/crates/hstr/src/lib.rs b/crates/hstr/src/lib.rs index 6f2c842bcf33..c0b7642062bb 100644 --- a/crates/hstr/src/lib.rs +++ b/crates/hstr/src/lib.rs @@ -5,7 +5,7 @@ use core::str; use std::{ fmt::{Debug, Display}, hash::Hash, - mem::{self, forget, transmute}, + mem::{self, forget, transmute, ManuallyDrop}, num::NonZeroU8, ops::Deref, str::from_utf8_unchecked, @@ -15,13 +15,21 @@ use debug_unreachable::debug_unreachable; use once_cell::sync::Lazy; pub use crate::dynamic::{global_atom_store_gc, AtomStore}; -use crate::tagged_value::TaggedValue; +use crate::{ + macros::{get_hash, impl_from_alias, partial_eq}, + tagged_value::TaggedValue, +}; mod dynamic; mod global_store; +mod macros; mod tagged_value; #[cfg(test)] mod tests; +pub mod wtf8; +mod wtf8_atom; + +pub use wtf8_atom::Wtf8Atom; /// An immutable string which is cheap to clone, compare, hash, and has small /// size. 
@@ -253,20 +261,7 @@ impl Atom { impl Atom { fn get_hash(&self) -> u64 { - match self.tag() { - DYNAMIC_TAG => { - unsafe { crate::dynamic::deref_from(self.unsafe_data) } - .header - .header - .hash - } - INLINE_TAG => { - // This is passed as input to the caller's `Hasher` implementation, so it's okay - // that this isn't really a hash - self.unsafe_data.hash() - } - _ => unsafe { debug_unreachable!() }, - } + get_hash!(self) } fn as_str(&self) -> &str { @@ -302,30 +297,7 @@ impl Atom { impl PartialEq for Atom { #[inline(never)] fn eq(&self, other: &Self) -> bool { - if self.unsafe_data == other.unsafe_data { - return true; - } - - // If one is inline and the other is not, the length is different. - // If one is static and the other is not, it's different. - if self.tag() != other.tag() { - return false; - } - - if self.is_dynamic() && other.is_dynamic() { - let te = unsafe { crate::dynamic::deref_from(self.unsafe_data) }; - let oe = unsafe { crate::dynamic::deref_from(other.unsafe_data) }; - - if te.header.header.hash != oe.header.header.hash { - return false; - } - - return te.slice == oe.slice; - } - - if self.get_hash() != other.get_hash() { - return false; - } + partial_eq!(self, other); // If the store is different, the string may be the same, even though the // `unsafe_data` is different @@ -358,20 +330,7 @@ impl Clone for Atom { } } -impl Atom { - #[inline] - pub(crate) fn from_alias(alias: TaggedValue) -> Self { - if alias.tag() & TAG_MASK == DYNAMIC_TAG { - unsafe { - let arc = crate::dynamic::restore_arc(alias); - forget(arc.clone()); - forget(arc); - } - } - - Self { unsafe_data: alias } - } -} +impl_from_alias!(Atom); impl Deref for Atom { type Target = str; @@ -443,6 +402,28 @@ where } } +impl Atom { + /// Converts a WTF-8 encoded [Wtf8Atom] to a regular UTF-8 [Atom] without + /// validation. + /// + /// # Safety + /// + /// The caller must ensure that the WTF-8 atom contains only valid UTF-8 + /// data (no unpaired surrogates). 
This function performs no validation + /// and will create an invalid `Atom` if the input contains unpaired + /// surrogates. + /// + /// This is a zero-cost conversion that preserves all internal optimizations + /// (inline storage, precomputed hashes, etc.) since both types have + /// identical internal representation. + pub unsafe fn from_wtf8_unchecked(s: Wtf8Atom) -> Self { + let s = ManuallyDrop::new(s); + Atom { + unsafe_data: s.unsafe_data, + } + } +} + #[cfg(test)] mod macro_tests { diff --git a/crates/hstr/src/macros.rs b/crates/hstr/src/macros.rs new file mode 100644 index 000000000000..4f84747d65e9 --- /dev/null +++ b/crates/hstr/src/macros.rs @@ -0,0 +1,71 @@ +macro_rules! get_hash { + ($self:expr) => { + match $self.tag() { + DYNAMIC_TAG => { + let unsafe_data = $self.unsafe_data; + unsafe { $crate::dynamic::deref_from(unsafe_data) } + .header + .header + .hash + } + INLINE_TAG => { + // This is passed as input to the caller's `Hasher` implementation, so it's okay + // that this isn't really a hash + $self.unsafe_data.hash() + } + _ => unsafe { debug_unreachable!() }, + } + }; +} + +macro_rules! partial_eq { + ($self:expr, $other:expr) => { + if $self.unsafe_data == $other.unsafe_data { + return true; + } + + // If one is inline and the other is not, the length is different. + // If one is static and the other is not, it's different. + if $self.tag() != $other.tag() { + return false; + } + + if $self.is_dynamic() && $other.is_dynamic() { + let te = unsafe { $crate::dynamic::deref_from($self.unsafe_data) }; + let oe = unsafe { $crate::dynamic::deref_from($other.unsafe_data) }; + + if te.header.header.hash != oe.header.header.hash { + return false; + } + + return te.slice == oe.slice; + } + + if $self.get_hash() != $other.get_hash() { + return false; + } + }; +} + +macro_rules! 
impl_from_alias { + ($ty:ty) => { + impl $ty { + #[inline] + pub(crate) fn from_alias(alias: TaggedValue) -> Self { + if alias.tag() & TAG_MASK == DYNAMIC_TAG { + unsafe { + let arc = $crate::dynamic::restore_arc(alias); + forget(arc.clone()); + forget(arc); + } + } + + Self { unsafe_data: alias } + } + } + }; +} + +pub(crate) use get_hash; +pub(crate) use impl_from_alias; +pub(crate) use partial_eq; diff --git a/crates/hstr/src/tests.rs b/crates/hstr/src/tests.rs index ee2a6847959c..50d36898c673 100644 --- a/crates/hstr/src/tests.rs +++ b/crates/hstr/src/tests.rs @@ -1,4 +1,4 @@ -use crate::{Atom, AtomStore}; +use crate::{wtf8::Wtf8, Atom, AtomStore, Wtf8Atom}; fn store_with_atoms(texts: Vec<&str>) -> (AtomStore, Vec) { let mut store = AtomStore::default(); @@ -8,8 +8,22 @@ fn store_with_atoms(texts: Vec<&str>) -> (AtomStore, Vec) { (store, atoms) } +fn store_with_wtf8_atoms(texts: Vec<&Wtf8>) -> (AtomStore, Vec) { + let mut store = AtomStore::default(); + + let atoms = { + texts + .into_iter() + .map(|text| store.wtf8_atom(text)) + .collect() + }; + + (store, atoms) +} + #[test] fn simple_usage() { + // atom let (s, atoms) = store_with_atoms(vec!["Hello, world!", "Hello, world!"]); drop(s); @@ -19,10 +33,25 @@ fn simple_usage() { let a2 = atoms[1].clone(); assert_eq!(a1.unsafe_data, a2.unsafe_data); + + // wtf8_atom + let (s, atoms) = store_with_wtf8_atoms(vec![ + &Wtf8::from_str("Hello, world!"), + &Wtf8::from_str("Hello, world!"), + ]); + + drop(s); + + let a1 = atoms[0].clone(); + + let a2 = atoms[1].clone(); + + assert_eq!(a1.unsafe_data, a2.unsafe_data); } #[test] fn eager_drop() { + // atom let (_, atoms1) = store_with_atoms(vec!["Hello, world!!!!"]); let (_, atoms2) = store_with_atoms(vec!["Hello, world!!!!"]); @@ -38,10 +67,28 @@ fn eager_drop() { ); assert_eq!(a1.get_hash(), a2.get_hash(), "Same string should be equal"); assert_eq!(a1, a2, "Same string should be equal"); + + // wtf8_atom + let (_, atoms1) = 
store_with_wtf8_atoms(vec![&Wtf8::from_str("Hello, world!!!!")]); + let (_, atoms2) = store_with_wtf8_atoms(vec![&Wtf8::from_str("Hello, world!!!!")]); + + dbg!(&atoms1); + dbg!(&atoms2); + + let a1 = atoms1[0].clone(); + let a2 = atoms2[0].clone(); + + assert_ne!( + a1.unsafe_data, a2.unsafe_data, + "Different stores should have different addresses" + ); + assert_eq!(a1.get_hash(), a2.get_hash(), "Same string should be equal"); + assert_eq!(a1, a2, "Same string should be equal"); } #[test] fn store_multiple() { + // atom let (_s1, atoms1) = store_with_atoms(vec!["Hello, world!!!!"]); let (_s2, atoms2) = store_with_atoms(vec!["Hello, world!!!!"]); @@ -54,19 +101,100 @@ fn store_multiple() { ); assert_eq!(a1.get_hash(), a2.get_hash(), "Same string should be equal"); assert_eq!(a1, a2, "Same string should be equal"); + + // wtf8_atom + let (_s1, atoms1) = store_with_wtf8_atoms(vec![&Wtf8::from_str("Hello, world!!!!")]); + let (_s2, atoms2) = store_with_wtf8_atoms(vec![&Wtf8::from_str("Hello, world!!!!")]); + + let a1 = atoms1[0].clone(); + let a2 = atoms2[0].clone(); + + assert_ne!( + a1.unsafe_data, a2.unsafe_data, + "Different stores should have different addresses" + ); + assert_eq!(a1.get_hash(), a2.get_hash(), "Same string should be equal"); + assert_eq!(a1, a2, "Same string should be equal"); } #[test] fn store_ref_count() { + // atom + let (store, atoms) = store_with_atoms(vec!["Hello, world!!!!"]); + + assert_eq!(atoms[0].ref_count(), 2); + drop(store); + assert_eq!(atoms[0].ref_count(), 1); + + // wtf8_atom + let (store, atoms) = store_with_wtf8_atoms(vec![&Wtf8::from_str("Hello, world!!!!")]); + + assert_eq!(atoms[0].ref_count(), 2); + drop(store); + assert_eq!(atoms[0].ref_count(), 1); +} + +#[test] +fn store_ref_count_transitive() { + // transitive &Atom -> Wtf8Atom let (store, atoms) = store_with_atoms(vec!["Hello, world!!!!"]); assert_eq!(atoms[0].ref_count(), 2); drop(store); assert_eq!(atoms[0].ref_count(), 1); + + // Implicit clone here + let wtf8 = 
Wtf8Atom::from(&atoms[0]); + // Ref1 from atoms[0] + // Ref2 from Wtf8Atom::from(&atoms[0]) + assert_eq!(wtf8.ref_count(), 2); + drop(atoms); + assert_eq!(wtf8.ref_count(), 1); + + // transitive Atom -> &Wtf8Atom + let (store, atoms) = store_with_atoms(vec!["Hello, world!!!!"]); + + assert_eq!(atoms[0].ref_count(), 2); + drop(store); + assert_eq!(atoms[0].ref_count(), 1); + + let wtf8 = Wtf8Atom::from(atoms[0].clone()); + assert_eq!(wtf8.ref_count(), 2); + drop(atoms); + assert_eq!(wtf8.ref_count(), 1); +} + +#[test] +fn store_ref_count_transitive_roundtrip() { + // transitive &Atom -> Wtf8Atom -> Atom + let (store, atoms) = store_with_atoms(vec!["Hello, world!!!!"]); + + assert_eq!(atoms[0].ref_count(), 2); + drop(store); + assert_eq!(atoms[0].ref_count(), 1); + + // Implicit clone here + let wtf8 = Wtf8Atom::from(&atoms[0]); + // Ref1 from atoms[0] + // Ref2 from Wtf8Atom::from(&atoms[0]) + assert_eq!(wtf8.ref_count(), 2); + + let atom2 = unsafe { Atom::from_wtf8_unchecked(wtf8.clone()) }; + // Ref1 from atoms[0] + // Ref2 from wtf8 + // Ref3 from atom2 + assert_eq!(atom2.ref_count(), 3); + + drop(atoms); + assert_eq!(atom2.ref_count(), 2); + + drop(wtf8); + assert_eq!(atom2.ref_count(), 1); } #[test] fn store_ref_count_dynamic() { + // atom let (store, atoms) = store_with_atoms(vec!["Hello, world!!!!"]); let a1 = atoms[0].clone(); @@ -81,4 +209,32 @@ fn store_ref_count_dynamic() { drop(a2); assert_eq!(atoms[0].ref_count(), 1); + + // wtf8_atom + let (store, atoms) = store_with_wtf8_atoms(vec![&Wtf8::from_str("Hello, world!!!!")]); + + let a1 = atoms[0].clone(); + let a2 = atoms[0].clone(); + + assert_eq!(atoms[0].ref_count(), 4); + drop(store); + assert_eq!(atoms[0].ref_count(), 3); + + drop(a1); + assert_eq!(atoms[0].ref_count(), 2); + + drop(a2); + assert_eq!(atoms[0].ref_count(), 1); +} + +#[test] +fn wtf8_atom() { + let s = "hello"; + let w = Wtf8Atom::from(s); + assert_eq!(w, Atom::from(s)); + + // Test enough to exceed the small string optimization + let s 
= "abcdefghi"; + let w = Wtf8Atom::from(s); + assert_eq!(w, Atom::from(s)); } diff --git a/crates/hstr/src/wtf8/mod.rs b/crates/hstr/src/wtf8/mod.rs new file mode 100644 index 000000000000..0bba1bf0fc6e --- /dev/null +++ b/crates/hstr/src/wtf8/mod.rs @@ -0,0 +1,1282 @@ +// Copyright (c) 2014 Simon Sapin +// Licensed under the MIT License +// Original source: https://github.com/SimonSapin/rust-wtf8 + +/*! + +Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/). + +This library uses Rust’s type system to maintain +[well-formedness](https://simonsapin.github.io/wtf-8/#well-formed), +like the `String` and `&str` types do for UTF-8. + +Since [WTF-8 must not be used +for interchange](https://simonsapin.github.io/wtf-8/#intended-audience), +this library deliberately does not provide access to the underlying bytes +of WTF-8 strings, +nor can it decode WTF-8 from arbitrary bytes. +WTF-8 strings can be obtained from UTF-8, UTF-16, or code points. + +*/ + +extern crate alloc; + +use alloc::{ + borrow::{Borrow, Cow}, + string::String, + vec::Vec, +}; +use core::{ + cmp::Ordering, + fmt, hash, + iter::{FromIterator, IntoIterator}, + mem::transmute, + ops::Deref, + slice, str, + str::FromStr, +}; + +mod not_quite_std; + +static UTF8_REPLACEMENT_CHARACTER: &[u8] = b"\xEF\xBF\xBD"; + +/// A Unicode code point: from U+0000 to U+10FFFF. +/// +/// Compare with the `char` type, +/// which represents a Unicode scalar value: +/// a code point that is not a surrogate (U+D800 to U+DFFF). +#[derive(Eq, PartialEq, Ord, PartialOrd, Clone)] +pub struct CodePoint { + value: u32, +} + +impl Copy for CodePoint {} + +/// Format the code point as `U+` followed by four to six hexadecimal digits. +/// Example: `U+1F4A9` +impl fmt::Debug for CodePoint { + #[inline] + fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { + write!(formatter, "U+{:04X}", self.value) + } +} + +impl CodePoint { + /// Unsafely create a new `CodePoint` without checking the value. 
+ /// + /// # Safety + /// + /// Only use when `value` is known to be less than or equal to 0x10FFFF. + #[inline] + pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint { + CodePoint { value } + } + + /// Create a new `CodePoint` if the value is a valid code point. + /// + /// Return `None` if `value` is above 0x10FFFF. + #[inline] + pub fn from_u32(value: u32) -> Option { + match value { + 0..=0x10ffff => Some(CodePoint { value }), + _ => None, + } + } + + /// Create a new `CodePoint` from a `char`. + /// + /// Since all Unicode scalar values are code points, this always succeeds. + #[inline] + pub fn from_char(value: char) -> CodePoint { + CodePoint { + value: value as u32, + } + } + + /// Return the numeric value of the code point. + #[inline] + pub fn to_u32(&self) -> u32 { + self.value + } + + /// Optionally return a Unicode scalar value for the code point. + /// + /// Return `None` if the code point is a surrogate (from U+D800 to U+DFFF). + #[inline] + pub fn to_char(&self) -> Option { + match self.value { + 0xd800..=0xdfff => None, + _ => Some(unsafe { char::from_u32_unchecked(self.value) }), + } + } + + /// Return a Unicode scalar value for the code point. + /// + /// Return `'\u{FFFD}'` (the replacement character “�”) + /// if the code point is a surrogate (from U+D800 to U+DFFF). + #[inline] + pub fn to_char_lossy(&self) -> char { + self.to_char().unwrap_or('\u{FFFD}') + } +} + +/// An owned, growable string of well-formed WTF-8 data. +/// +/// Similar to `String`, but can additionally contain surrogate code points +/// if they’re not in a surrogate pair. +#[derive(Eq, PartialEq, Ord, PartialOrd, Clone)] +pub struct Wtf8Buf { + bytes: Vec, +} + +impl Deref for Wtf8Buf { + type Target = Wtf8; + + fn deref(&self) -> &Wtf8 { + unsafe { transmute(&*self.bytes) } + } +} + +/// Format the string with double quotes, +/// and surrogates as `\u` followed by four hexadecimal digits. 
+/// Example: `"a\u{D800}"` for a string with code points [U+0061, U+D800] +impl fmt::Debug for Wtf8Buf { + #[inline] + fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { + Wtf8::fmt(self, formatter) + } +} + +impl Default for Wtf8Buf { + #[inline] + fn default() -> Self { + Self::new() + } +} + +impl FromStr for Wtf8Buf { + type Err = core::convert::Infallible; + + #[inline] + fn from_str(s: &str) -> Result { + Ok(Wtf8Buf { + bytes: s.as_bytes().to_vec(), + }) + } +} + +impl Wtf8Buf { + /// Create a new, empty WTF-8 string. + #[inline] + pub fn new() -> Wtf8Buf { + Wtf8Buf { bytes: Vec::new() } + } + + /// Create a new, empty WTF-8 string with pre-allocated capacity for `n` + /// bytes. + #[inline] + pub fn with_capacity(n: usize) -> Wtf8Buf { + Wtf8Buf { + bytes: Vec::with_capacity(n), + } + } + + /// Create a WTF-8 string from a UTF-8 `String`. + /// + /// This takes ownership of the `String` and does not copy. + /// + /// Since WTF-8 is a superset of UTF-8, this always succeeds. + #[inline] + pub fn from_string(string: String) -> Wtf8Buf { + Wtf8Buf { + bytes: string.into_bytes(), + } + } + + /// Create a WTF-8 string from a UTF-8 `&str` slice. + /// + /// This copies the content of the slice. + /// + /// Since WTF-8 is a superset of UTF-8, this always succeeds. + #[inline] + #[allow(clippy::should_implement_trait)] + pub fn from_str(s: &str) -> Wtf8Buf { + Wtf8Buf { + bytes: s.as_bytes().to_vec(), + } + } + + /// Create a WTF-8 string from a potentially ill-formed UTF-16 slice of + /// 16-bit code units. + /// + /// This is lossless: calling `.to_ill_formed_utf16()` on the resulting + /// string will always return the original code units. + pub fn from_ill_formed_utf16(v: &[u16]) -> Wtf8Buf { + let mut string = Wtf8Buf::with_capacity(v.len()); + for item in not_quite_std::decode_utf16(v.iter().cloned()) { + match item { + Ok(c) => string.push_char(c), + Err(s) => { + // Surrogates are known to be in the code point range. 
+ let code_point = unsafe { CodePoint::from_u32_unchecked(s as u32) }; + // Skip the WTF-8 concatenation check, + // surrogate pairs are already decoded by utf16_items + not_quite_std::push_code_point(&mut string, code_point) + } + } + } + string + } + + /// Reserves capacity for at least `additional` more bytes to be inserted + /// in the given `Wtf8Buf`. + /// The collection may reserve more space to avoid frequent reallocations. + /// + /// # Panics + /// + /// Panics if the new capacity overflows `usize`. + #[inline] + pub fn reserve(&mut self, additional: usize) { + self.bytes.reserve(additional) + } + + /// Returns the number of bytes that this string buffer can hold without + /// reallocating. + #[inline] + pub fn capacity(&self) -> usize { + self.bytes.capacity() + } + + /// Append an UTF-8 slice at the end of the string. + #[inline] + pub fn push_str(&mut self, other: &str) { + self.bytes.extend_from_slice(other.as_bytes()) + } + + /// Append a WTF-8 slice at the end of the string. + /// + /// This replaces newly paired surrogates at the boundary + /// with a supplementary code point, + /// like concatenating ill-formed UTF-16 strings effectively would. + #[inline] + pub fn push_wtf8(&mut self, other: &Wtf8) { + match (self.final_lead_surrogate(), other.initial_trail_surrogate()) { + // Replace newly paired surrogates by a supplementary code point. + (Some(lead), Some(trail)) => { + let len_without_lead_surrogate = self.len() - 3; + self.bytes.truncate(len_without_lead_surrogate); + let other_without_trail_surrogate = &other.bytes[3..]; + // 4 bytes for the supplementary code point + self.bytes.reserve(4 + other_without_trail_surrogate.len()); + self.push_char(decode_surrogate_pair(lead, trail)); + self.bytes.extend_from_slice(other_without_trail_surrogate); + } + _ => self.bytes.extend_from_slice(&other.bytes), + } + } + + /// Append a Unicode scalar value at the end of the string. 
+ #[inline] + pub fn push_char(&mut self, c: char) { + not_quite_std::push_code_point(self, CodePoint::from_char(c)) + } + + /// Append a code point at the end of the string. + /// + /// This replaces newly paired surrogates at the boundary + /// with a supplementary code point, + /// like concatenating ill-formed UTF-16 strings effectively would. + #[inline] + pub fn push(&mut self, code_point: CodePoint) { + if let trail @ 0xdc00..=0xdfff = code_point.to_u32() { + if let Some(lead) = self.final_lead_surrogate() { + let len_without_lead_surrogate = self.len() - 3; + self.bytes.truncate(len_without_lead_surrogate); + self.push_char(decode_surrogate_pair(lead, trail as u16)); + return; + } + } + + // No newly paired surrogates at the boundary. + not_quite_std::push_code_point(self, code_point) + } + + /// Shortens a string to the specified length. + /// + /// # Failure + /// + /// Fails if `new_len` > current length, + /// or if `new_len` is not a code point boundary. + #[inline] + pub fn truncate(&mut self, new_len: usize) { + assert!(not_quite_std::is_code_point_boundary(self, new_len)); + self.bytes.truncate(new_len) + } + + /// Consume the WTF-8 string and try to convert it to UTF-8. + /// + /// This does not copy the data. + /// + /// If the contents are not well-formed UTF-8 + /// (that is, if the string contains surrogates), + /// the original WTF-8 string is returned instead. + pub fn into_string(self) -> Result { + match self.next_surrogate(0) { + None => Ok(unsafe { String::from_utf8_unchecked(self.bytes) }), + Some(_) => Err(self), + } + } + + /// Consume the WTF-8 string and convert it lossily to UTF-8. + /// + /// This does not copy the data (but may overwrite parts of it in place). 
+ /// + /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character + /// “�”) + pub fn into_string_lossy(mut self) -> String { + let mut pos = 0; + loop { + match self.next_surrogate(pos) { + Some((surrogate_pos, _)) => { + pos = surrogate_pos + 3; + self.bytes[surrogate_pos..pos].copy_from_slice(UTF8_REPLACEMENT_CHARACTER); + } + None => return unsafe { String::from_utf8_unchecked(self.bytes) }, + } + } + } +} + +/// Create a new WTF-8 string from an iterator of code points. +/// +/// This replaces surrogate code point pairs with supplementary code points, +/// like concatenating ill-formed UTF-16 strings effectively would. +impl FromIterator for Wtf8Buf { + fn from_iter>(iterable: T) -> Wtf8Buf { + let mut string = Wtf8Buf::new(); + string.extend(iterable); + string + } +} + +/// Append code points from an iterator to the string. +/// +/// This replaces surrogate code point pairs with supplementary code points, +/// like concatenating ill-formed UTF-16 strings effectively would. +impl Extend for Wtf8Buf { + fn extend>(&mut self, iterable: T) { + let iterator = iterable.into_iter(); + let (low, _high) = iterator.size_hint(); + // Lower bound of one byte per code point (ASCII only) + self.bytes.reserve(low); + for code_point in iterator { + self.push(code_point); + } + } +} + +/// A borrowed slice of well-formed WTF-8 data. +/// +/// Similar to `&str`, but can additionally contain surrogate code points +/// if they’re not in a surrogate pair. 
+pub struct Wtf8 { + bytes: [u8], +} + +// FIXME: https://github.com/rust-lang/rust/issues/18805 +impl PartialEq for Wtf8 { + fn eq(&self, other: &Wtf8) -> bool { + self.bytes.eq(&other.bytes) + } +} + +// FIXME: https://github.com/rust-lang/rust/issues/18805 +impl Eq for Wtf8 {} + +// FIXME: https://github.com/rust-lang/rust/issues/18738 +impl PartialOrd for Wtf8 { + #[inline] + fn partial_cmp(&self, other: &Wtf8) -> Option { + Some(self.bytes.cmp(&other.bytes)) + } + + #[inline] + fn lt(&self, other: &Wtf8) -> bool { + self.bytes.lt(&other.bytes) + } + + #[inline] + fn le(&self, other: &Wtf8) -> bool { + self.bytes.le(&other.bytes) + } + + #[inline] + fn gt(&self, other: &Wtf8) -> bool { + self.bytes.gt(&other.bytes) + } + + #[inline] + fn ge(&self, other: &Wtf8) -> bool { + self.bytes.ge(&other.bytes) + } +} + +// FIXME: https://github.com/rust-lang/rust/issues/18738 +impl Ord for Wtf8 { + #[inline] + fn cmp(&self, other: &Wtf8) -> Ordering { + self.bytes.cmp(&other.bytes) + } +} + +/// Format the slice with double quotes, +/// and surrogates as `\u` followed by four hexadecimal digits. +/// Example: `"a\u{D800}"` for a slice with code points [U+0061, U+D800] +impl fmt::Debug for Wtf8 { + fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { + formatter.write_str("\"")?; + let mut pos = 0; + loop { + match self.next_surrogate(pos) { + None => break, + Some((surrogate_pos, surrogate)) => { + formatter.write_str(unsafe { + str::from_utf8_unchecked(&self.bytes[pos..surrogate_pos]) + })?; + write!(formatter, "\\u{{{surrogate:X}}}")?; + pos = surrogate_pos + 3; + } + } + } + formatter.write_str(unsafe { str::from_utf8_unchecked(&self.bytes[pos..]) })?; + formatter.write_str("\"") + } +} + +impl Wtf8 { + /// Create a WTF-8 slice from a UTF-8 `&str` slice. + /// + /// Since WTF-8 is a superset of UTF-8, this always succeeds. 
+ #[inline] + pub const fn from_str(value: &str) -> &Wtf8 { + unsafe { transmute(value.as_bytes()) } + } + + /// Return the length, in WTF-8 bytes. + #[inline] + pub const fn len(&self) -> usize { + self.bytes.len() + } + + /// Return `true` if the string has a length of zero bytes. + #[inline] + pub const fn is_empty(&self) -> bool { + self.bytes.is_empty() + } + + /// Return a slice of the given string for the byte range [`begin`..`end`). + /// + /// # Failure + /// + /// Fails when `begin` and `end` do not point to code point boundaries, + /// or point beyond the end of the string. + #[inline] + pub fn slice(&self, begin: usize, end: usize) -> &Wtf8 { + // is_code_point_boundary checks that the index is in [0, .len()] + if begin <= end + && not_quite_std::is_code_point_boundary(self, begin) + && not_quite_std::is_code_point_boundary(self, end) + { + unsafe { not_quite_std::slice_unchecked(self, begin, end) } + } else { + not_quite_std::slice_error_fail(self, begin, end) + } + } + + /// Return a slice of the given string from byte `begin` to its end. + /// + /// # Failure + /// + /// Fails when `begin` is not at a code point boundary, + /// or is beyond the end of the string. + #[inline] + pub fn slice_from(&self, begin: usize) -> &Wtf8 { + // is_code_point_boundary checks that the index is in [0, .len()] + if not_quite_std::is_code_point_boundary(self, begin) { + unsafe { not_quite_std::slice_unchecked(self, begin, self.len()) } + } else { + not_quite_std::slice_error_fail(self, begin, self.len()) + } + } + + /// Return a slice of the given string from its beginning to byte `end`. + /// + /// # Failure + /// + /// Fails when `end` is not at a code point boundary, + /// or is beyond the end of the string. 
+ #[inline] + pub fn slice_to(&self, end: usize) -> &Wtf8 { + // is_code_point_boundary checks that the index is in [0, .len()] + if not_quite_std::is_code_point_boundary(self, end) { + unsafe { not_quite_std::slice_unchecked(self, 0, end) } + } else { + not_quite_std::slice_error_fail(self, 0, end) + } + } + + /// Return the code point at `position` if it is in the ASCII range, + /// or `b'\xFF'` otherwise. + /// + /// # Failure + /// + /// Fails if `position` is beyond the end of the string. + #[inline] + pub fn ascii_byte_at(&self, position: usize) -> u8 { + match self.bytes[position] { + ascii_byte @ 0x00..=0x7f => ascii_byte, + _ => 0xff, + } + } + + /// Return an iterator for the string’s code points. + #[inline] + pub fn code_points(&self) -> Wtf8CodePoints { + Wtf8CodePoints { + bytes: self.bytes.iter(), + } + } + + /// Try to convert the string to UTF-8 and return a `&str` slice. + /// + /// Return `None` if the string contains surrogates. + /// + /// This does not copy the data. + #[inline] + pub fn as_str(&self) -> Option<&str> { + // Well-formed WTF-8 is also well-formed UTF-8 + // if and only if it contains no surrogate. + match self.next_surrogate(0) { + None => Some(unsafe { str::from_utf8_unchecked(&self.bytes) }), + Some(_) => None, + } + } + + /// Return the underlying WTF-8 bytes. + #[inline] + pub const fn as_bytes(&self) -> &[u8] { + &self.bytes + } + + /// Lossily convert the string to UTF-8. + /// Return a UTF-8 `&str` slice if the contents are well-formed in UTF-8. + /// + /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character + /// “�”). + /// + /// This only copies the data if necessary (if it contains any surrogate). 
+ pub fn to_string_lossy(&self) -> Cow { + let surrogate_pos = match self.next_surrogate(0) { + None => return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) }), + Some((pos, _)) => pos, + }; + let wtf8_bytes = &self.bytes; + let mut utf8_bytes = Vec::with_capacity(self.len()); + utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]); + utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER); + let mut pos = surrogate_pos + 3; + loop { + match self.next_surrogate(pos) { + Some((surrogate_pos, _)) => { + utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]); + utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER); + pos = surrogate_pos + 3; + } + None => { + utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]); + return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) }); + } + } + } + } + + /// Convert the WTF-8 string to potentially ill-formed UTF-16 + /// and return an iterator of 16-bit code units. + /// + /// This is lossless: + /// calling `Wtf8Buf::from_ill_formed_utf16` on the resulting code units + /// would always return the original WTF-8 string. + #[inline] + pub fn to_ill_formed_utf16(&self) -> IllFormedUtf16CodeUnits { + IllFormedUtf16CodeUnits { + code_points: self.code_points(), + extra: 0, + } + } + + /// Create a WTF-8 from a WTF-8 encoded byte slice. + /// + /// # Safety + /// + /// The caller must ensure that `bytes` is a well-formed WTF-8 byte + /// sequence. + /// + /// This means that: + /// - All bytes must form valid UTF-8 sequences OR valid surrogate code + /// point encodings + /// - Surrogate code points may appear unpaired and be encoded separately, + /// but if they are paired, it should be encoded as a single 4-byte UTF-8 + /// sequence. For example, the byte sequence `[0xED, 0xA0, 0x80, 0xED, + /// 0xB0, 0x80]` is not valid WTF-8 because WTF-8 forbids encoding a + /// surrogate pair as two separate 3-byte sequences. 
+ #[inline] + pub const unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Wtf8 { + unsafe { transmute(bytes) } + } + + #[inline] + fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> { + let mut iter = self.bytes[pos..].iter(); + loop { + let b = match iter.next() { + None => return None, + Some(&b) => b, + }; + if b < 0x80 { + pos += 1; + } else if b < 0xe0 { + iter.next(); + pos += 2; + } else if b == 0xed { + match (iter.next(), iter.next()) { + (Some(&b2), Some(&b3)) if b2 >= 0xa0 => { + return Some((pos, decode_surrogate(b2, b3))) + } + _ => pos += 3, + } + } else if b < 0xf0 { + iter.next(); + iter.next(); + pos += 3; + } else { + iter.next(); + iter.next(); + iter.next(); + pos += 4; + } + } + } + + #[inline] + fn final_lead_surrogate(&self) -> Option { + let len = self.len(); + if len < 3 { + return None; + } + let seq = &self.bytes[len - 3..]; + if seq[0] == 0xed && 0xa0 <= seq[1] && seq[1] <= 0xaf { + Some(decode_surrogate(seq[1], seq[2])) + } else { + None + } + } + + #[inline] + fn initial_trail_surrogate(&self) -> Option { + let len = self.len(); + if len < 3 { + return None; + } + let seq = &self.bytes[..3]; + if seq[0] == 0xed && 0xb0 <= seq[1] && seq[1] <= 0xbf { + Some(decode_surrogate(seq[1], seq[2])) + } else { + None + } + } +} + +#[inline] +fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 { + // The first byte is assumed to be 0xED + 0xd800 | (second_byte as u16 & 0x3f) << 6 | third_byte as u16 & 0x3f +} + +#[inline] +fn decode_surrogate_pair(lead: u16, trail: u16) -> char { + let code_point = 0x10000 + (((lead as u32 - 0xd800) << 10) | (trail as u32 - 0xdc00)); + unsafe { char::from_u32_unchecked(code_point) } +} + +/// Iterator for the code points of a WTF-8 string. +/// +/// Created with the method `.code_points()`. 
+#[derive(Clone)]
+pub struct Wtf8CodePoints<'a> {
+    bytes: slice::Iter<'a, u8>,
+}
+
+impl<'a> Iterator for Wtf8CodePoints<'a> {
+    type Item = CodePoint;
+
+    #[inline]
+    fn next(&mut self) -> Option<CodePoint> {
+        not_quite_std::next_code_point(&mut self.bytes).map(|value| {
+            // SAFETY: Wtf8 invariant says `value` is a valid code point
+            unsafe { CodePoint::from_u32_unchecked(value) }
+        })
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        // A code point takes between one and four bytes.
+        let (len, _) = self.bytes.size_hint();
+        (len.saturating_add(3) / 4, Some(len))
+    }
+}
+
+/// Iterator over the potentially ill-formed UTF-16 code units of a WTF-8
+/// string.
+///
+/// Created with the method `.to_ill_formed_utf16()`.
+#[derive(Clone)]
+pub struct IllFormedUtf16CodeUnits<'a> {
+    code_points: Wtf8CodePoints<'a>,
+    /// Second unit of a surrogate pair still to be yielded, or 0 if none.
+    extra: u16,
+}
+
+impl<'a> Iterator for IllFormedUtf16CodeUnits<'a> {
+    type Item = u16;
+
+    #[inline]
+    fn next(&mut self) -> Option<u16> {
+        not_quite_std::next_utf16_code_unit(self)
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let (low, high) = self.code_points.size_hint();
+        // A code unit parked in `extra` is still owed to the caller; without
+        // counting it the upper bound could be `Some(0)` with one item left.
+        let pending = usize::from(self.extra != 0);
+        // every code point gets either one u16 or two u16,
+        // so this iterator is between 1 or 2 times as
+        // long as the underlying iterator.
+        (
+            low.saturating_add(pending),
+            high.and_then(|n| n.checked_mul(2))
+                .and_then(|n| n.checked_add(pending)),
+        )
+    }
+}
+
+impl PartialEq<&Wtf8> for Wtf8Buf {
+    fn eq(&self, other: &&Wtf8) -> bool {
+        **self == **other
+    }
+}
+
+impl PartialEq<Wtf8Buf> for &Wtf8 {
+    fn eq(&self, other: &Wtf8Buf) -> bool {
+        **self == **other
+    }
+}
+
+impl hash::Hash for CodePoint {
+    #[inline]
+    fn hash<H: hash::Hasher>(&self, state: &mut H) {
+        self.value.hash(state)
+    }
+}
+
+impl hash::Hash for Wtf8Buf {
+    #[inline]
+    fn hash<H: hash::Hasher>(&self, state: &mut H) {
+        // Delegate to the slice impl so owned and borrowed forms hash alike.
+        Wtf8::hash(self, state)
+    }
+}
+
+impl hash::Hash for Wtf8 {
+    #[inline]
+    fn hash<H: hash::Hasher>(&self, state: &mut H) {
+        state.write(&self.bytes);
+        // Terminator byte written after the contents.
+        0xfeu8.hash(state)
+    }
+}
+
+impl Borrow<Wtf8> for Wtf8Buf {
+    #[inline]
+    fn borrow(&self) -> &Wtf8 {
+        self
+    }
+}
+
+impl ToOwned for Wtf8 {
+    type Owned = Wtf8Buf;
+
+    #[inline]
+    fn to_owned(&self) -> Wtf8Buf {
+        Wtf8Buf {
+            bytes: self.bytes.to_vec(),
+        }
+    }
+}
+
+impl<'a> From<&'a Wtf8> for Cow<'a, Wtf8> {
+    #[inline]
+    fn from(s: &'a Wtf8) -> Cow<'a, Wtf8> {
+        Cow::Borrowed(s)
+    }
+}
+
+impl<'a> From<&'a str> for &'a Wtf8 {
+    #[inline]
+    fn from(s: &'a str) -> &'a Wtf8 {
+        Wtf8::from_str(s)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use alloc::{format, vec};
+    use core::mem::transmute;
+
+    use super::*;
+
+    #[test]
+    fn code_point_from_u32() {
+        assert!(CodePoint::from_u32(0).is_some());
+        assert!(CodePoint::from_u32(0xd800).is_some());
+        assert!(CodePoint::from_u32(0x10ffff).is_some());
+        assert!(CodePoint::from_u32(0x110000).is_none());
+    }
+
+    #[test]
+    fn code_point_to_u32() {
+        fn c(value: u32) -> CodePoint {
+            CodePoint::from_u32(value).unwrap()
+        }
+        assert_eq!(c(0).to_u32(), 0);
+        assert_eq!(c(0xd800).to_u32(), 0xd800);
+        assert_eq!(c(0x10ffff).to_u32(), 0x10ffff);
+    }
+
+    #[test]
+    fn code_point_from_char() {
+        assert_eq!(CodePoint::from_char('a').to_u32(), 0x61);
+        assert_eq!(CodePoint::from_char('💩').to_u32(), 0x1f4a9);
+    }
+
+    #[test]
+    fn code_point_to_string() {
+        let cp_a = CodePoint::from_char('a');
+        assert_eq!(format!("{cp_a:?}"), "U+0061");
+        let cp_poop = CodePoint::from_char('💩');
+        assert_eq!(format!("{cp_poop:?}"), "U+1F4A9");
+    }
+
+    #[test]
+    fn code_point_to_char() {
+        fn c(value: u32) -> CodePoint {
+            CodePoint::from_u32(value).unwrap()
+        }
+        assert_eq!(c(0x61).to_char(), Some('a'));
+        assert_eq!(c(0x1f4a9).to_char(), Some('💩'));
+        assert_eq!(c(0xd800).to_char(), None);
+    }
+
+    #[test]
+    fn code_point_to_char_lossy() {
+        fn c(value: u32) -> CodePoint {
+            CodePoint::from_u32(value).unwrap()
+        }
+        assert_eq!(c(0x61).to_char_lossy(), 'a');
+        assert_eq!(c(0x1f4a9).to_char_lossy(), '💩');
+        assert_eq!(c(0xd800).to_char_lossy(), '\u{FFFD}');
+    }
+
+    #[test]
+    fn wtf8buf_new() {
+        assert_eq!(Wtf8Buf::new().bytes, b"");
+    }
+
+    #[test]
+    fn wtf8buf_from_str() {
+        assert_eq!(Wtf8Buf::from_str("").bytes, b"");
+        assert_eq!(
+            Wtf8Buf::from_str("aé 💩").bytes,
+            b"a\xC3\xA9 \xF0\x9F\x92\xA9"
+        );
+    }
+
+    #[test]
+    fn wtf8buf_from_string() {
+        assert_eq!(Wtf8Buf::from_string(String::from("")).bytes, b"");
+        assert_eq!(
+            Wtf8Buf::from_string(String::from("aé 💩")).bytes,
+            b"a\xC3\xA9 \xF0\x9F\x92\xA9"
+        );
+    }
+
+    #[test]
+    fn wtf8buf_from_ill_formed_utf16() {
+        assert_eq!(Wtf8Buf::from_ill_formed_utf16(&[]).bytes, b"");
+        assert_eq!(
+            Wtf8Buf::from_ill_formed_utf16(&[0x61, 0xe9, 0x20, 0xd83d, 0xd83d, 0xdca9]).bytes,
+            b"a\xC3\xA9 \xED\xA0\xBD\xF0\x9F\x92\xA9"
+        );
+    }
+
+    #[test]
+    fn wtf8buf_push_str() {
+        let mut string = Wtf8Buf::new();
+        assert_eq!(string.bytes, b"");
+        string.push_str("aé 💩");
+        assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
+    }
+
+    #[test]
+    fn wtf8buf_push_char() {
+        let mut string = Wtf8Buf::from_str("aé ");
+        assert_eq!(string.bytes, b"a\xC3\xA9 ");
+        string.push_char('💩');
+        assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
+    }
+
+    #[test]
+    fn wtf8buf_push() {
+        let mut string = Wtf8Buf::from_str("aé ");
+        assert_eq!(string.bytes, b"a\xC3\xA9 ");
+        string.push(CodePoint::from_char('💩'));
+        assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
+
+        fn c(value: u32) -> CodePoint {
+            CodePoint::from_u32(value).unwrap()
+        }
+
+        let mut string = Wtf8Buf::new();
+        string.push(c(0xd83d)); // lead
+        string.push(c(0xdca9)); // trail
+        assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9"); // Magic!
+
+        let mut string = Wtf8Buf::new();
+        string.push(c(0xd83d)); // lead
+        string.push(c(0x20)); // not surrogate
+        string.push(c(0xdca9)); // trail
+        assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
+
+        let mut string = Wtf8Buf::new();
+        string.push(c(0xd800)); // lead
+        string.push(c(0xdbff)); // lead
+        assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF");
+
+        let mut string = Wtf8Buf::new();
+        string.push(c(0xd800)); // lead
+        string.push(c(0xe000)); // not surrogate
+        assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80");
+
+        let mut string = Wtf8Buf::new();
+        string.push(c(0xd7ff)); // not surrogate
+        string.push(c(0xdc00)); // trail
+        assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80");
+
+        let mut string = Wtf8Buf::new();
+        string.push(c(0x61)); // not surrogate, < 3 bytes
+        string.push(c(0xdc00)); // trail
+        assert_eq!(string.bytes, b"\x61\xED\xB0\x80");
+
+        let mut string = Wtf8Buf::new();
+        string.push(c(0xdc00)); // trail
+        assert_eq!(string.bytes, b"\xED\xB0\x80");
+    }
+
+    #[test]
+    fn wtf8buf_push_wtf8() {
+        let mut string = Wtf8Buf::from_str("aé");
+        assert_eq!(string.bytes, b"a\xC3\xA9");
+        string.push_wtf8(Wtf8::from_str(" 💩"));
+        assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
+
+        fn w(value: &[u8]) -> &Wtf8 {
+            unsafe { transmute(value) }
+        }
+
+        let mut string = Wtf8Buf::new();
+        string.push_wtf8(w(b"\xED\xA0\xBD")); // lead
+        string.push_wtf8(w(b"\xED\xB2\xA9")); // trail
+        assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9"); // Magic!
+
+        let mut string = Wtf8Buf::new();
+        string.push_wtf8(w(b"\xED\xA0\xBD")); // lead
+        string.push_wtf8(w(b" ")); // not surrogate
+        string.push_wtf8(w(b"\xED\xB2\xA9")); // trail
+        assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
+
+        let mut string = Wtf8Buf::new();
+        string.push_wtf8(w(b"\xED\xA0\x80")); // lead
+        string.push_wtf8(w(b"\xED\xAF\xBF")); // lead
+        assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF");
+
+        let mut string = Wtf8Buf::new();
+        string.push_wtf8(w(b"\xED\xA0\x80")); // lead
+        string.push_wtf8(w(b"\xEE\x80\x80")); // not surrogate
+        assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80");
+
+        let mut string = Wtf8Buf::new();
+        string.push_wtf8(w(b"\xED\x9F\xBF")); // not surrogate
+        string.push_wtf8(w(b"\xED\xB0\x80")); // trail
+        assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80");
+
+        let mut string = Wtf8Buf::new();
+        string.push_wtf8(w(b"a")); // not surrogate, < 3 bytes
+        string.push_wtf8(w(b"\xED\xB0\x80")); // trail
+        assert_eq!(string.bytes, b"\x61\xED\xB0\x80");
+
+        let mut string = Wtf8Buf::new();
+        string.push_wtf8(w(b"\xED\xB0\x80")); // trail
+        assert_eq!(string.bytes, b"\xED\xB0\x80");
+    }
+
+    #[test]
+    fn wtf8buf_truncate() {
+        let mut string = Wtf8Buf::from_str("aé");
+        string.truncate(1);
+        assert_eq!(string.bytes, b"a");
+    }
+
+    #[test]
+    #[should_panic]
+    fn wtf8buf_truncate_fail_code_point_boundary() {
+        let mut string = Wtf8Buf::from_str("aé");
+        string.truncate(2);
+    }
+
+    #[test]
+    #[should_panic]
+    fn wtf8buf_truncate_fail_longer() {
+        let mut string = Wtf8Buf::from_str("aé");
+        string.truncate(4);
+    }
+
+    #[test]
+    fn wtf8buf_into_string() {
+        let mut string = Wtf8Buf::from_str("aé 💩");
+        assert_eq!(string.clone().into_string(), Ok(String::from("aé 💩")));
+        string.push(CodePoint::from_u32(0xd800).unwrap());
+        assert_eq!(string.clone().into_string(), Err(string));
+    }
+
+    #[test]
+    fn wtf8buf_into_string_lossy() {
+        let mut string = Wtf8Buf::from_str("aé 💩");
+        assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩"));
+        string.push(CodePoint::from_u32(0xd800).unwrap());
+        assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩�"));
+    }
+
+    #[test]
+    fn wtf8buf_from_iterator() {
+        fn f(values: &[u32]) -> Wtf8Buf {
+            values
+                .iter()
+                .map(|&c| CodePoint::from_u32(c).unwrap())
+                .collect::<Wtf8Buf>()
+        }
+        assert_eq!(
+            f(&[0x61, 0xe9, 0x20, 0x1f4a9]).bytes,
+            b"a\xC3\xA9 \xF0\x9F\x92\xA9"
+        );
+
+        assert_eq!(f(&[0xd83d, 0xdca9]).bytes, b"\xF0\x9F\x92\xA9"); // Magic!
+        assert_eq!(
+            f(&[0xd83d, 0x20, 0xdca9]).bytes,
+            b"\xED\xA0\xBD \xED\xB2\xA9"
+        );
+        assert_eq!(f(&[0xd800, 0xdbff]).bytes, b"\xED\xA0\x80\xED\xAF\xBF");
+        assert_eq!(f(&[0xd800, 0xe000]).bytes, b"\xED\xA0\x80\xEE\x80\x80");
+        assert_eq!(f(&[0xd7ff, 0xdc00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80");
+        assert_eq!(f(&[0x61, 0xdc00]).bytes, b"\x61\xED\xB0\x80");
+        assert_eq!(f(&[0xdc00]).bytes, b"\xED\xB0\x80");
+    }
+
+    #[test]
+    fn wtf8buf_extend() {
+        fn e(initial: &[u32], extended: &[u32]) -> Wtf8Buf {
+            fn c(value: &u32) -> CodePoint {
+                CodePoint::from_u32(*value).unwrap()
+            }
+            let mut string = initial.iter().map(c).collect::<Wtf8Buf>();
+            string.extend(extended.iter().map(c));
+            string
+        }
+
+        assert_eq!(
+            e(&[0x61, 0xe9], &[0x20, 0x1f4a9]).bytes,
+            b"a\xC3\xA9 \xF0\x9F\x92\xA9"
+        );
+
+        assert_eq!(e(&[0xd83d], &[0xdca9]).bytes, b"\xF0\x9F\x92\xA9"); // Magic!
+        assert_eq!(
+            e(&[0xd83d, 0x20], &[0xdca9]).bytes,
+            b"\xED\xA0\xBD \xED\xB2\xA9"
+        );
+        assert_eq!(e(&[0xd800], &[0xdbff]).bytes, b"\xED\xA0\x80\xED\xAF\xBF");
+        assert_eq!(e(&[0xd800], &[0xe000]).bytes, b"\xED\xA0\x80\xEE\x80\x80");
+        assert_eq!(e(&[0xd7ff], &[0xdc00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80");
+        assert_eq!(e(&[0x61], &[0xdc00]).bytes, b"\x61\xED\xB0\x80");
+        assert_eq!(e(&[], &[0xdc00]).bytes, b"\xED\xB0\x80");
+    }
+
+    #[test]
+    fn wtf8buf_debug() {
+        let mut string = Wtf8Buf::from_str("aé 💩");
+        string.push(CodePoint::from_u32(0xd800).unwrap());
+        assert_eq!(format!("{string:?}"), r#""aé 💩\u{D800}""#);
+    }
+
+    #[test]
+    fn wtf8buf_as_slice() {
+        assert_eq!(Wtf8Buf::from_str("aé"), Wtf8::from_str("aé"));
+    }
+
+    #[test]
+    fn wtf8_debug() {
+        let mut string = Wtf8Buf::from_str("aé 💩");
+        string.push(CodePoint::from_u32(0xd800).unwrap());
+        let string_ref = &*string;
+        assert_eq!(format!("{string_ref:?}"), r#""aé 💩\u{D800}""#);
+    }
+
+    #[test]
+    fn wtf8_from_str() {
+        assert_eq!(&Wtf8::from_str("").bytes, b"");
+        assert_eq!(
+            &Wtf8::from_str("aé 💩").bytes,
+            b"a\xC3\xA9 \xF0\x9F\x92\xA9"
+        );
+    }
+
+    #[test]
+    fn wtf8_as_bytes() {
+        assert_eq!(Wtf8::from_str("").as_bytes(), b"");
+        assert_eq!(
+            Wtf8::from_str("aé 💩").as_bytes(),
+            b"a\xC3\xA9 \xF0\x9F\x92\xA9"
+        );
+    }
+
+    #[test]
+    fn wtf8_from_bytes_unchecked() {
+        assert_eq!(unsafe { &Wtf8::from_bytes_unchecked(b"").bytes }, b"");
+        assert_eq!(
+            unsafe { &Wtf8::from_bytes_unchecked(b"a\xC3\xA9 \xF0\x9F\x92\xA9").bytes },
+            b"a\xC3\xA9 \xF0\x9F\x92\xA9"
+        );
+        assert_eq!(
+            unsafe { Wtf8::from_bytes_unchecked(b"a\xC3\xA9 \xF0\x9F\x92\xA9") },
+            Wtf8::from_str("aé 💩")
+        )
+    }
+
+    #[test]
+    fn wtf8_cow() {
+        let s: Cow<'_, Wtf8> = Cow::from(Wtf8::from_str("aé 💩"));
+        assert!(matches!(s, Cow::Borrowed(_)));
+        let owned: Wtf8Buf = s.into_owned();
+        assert_eq!(owned, Wtf8Buf::from_str("aé 💩"));
+    }
+
+    #[test]
+    fn wtf8_len() {
+        assert_eq!(Wtf8::from_str("").len(), 0);
+        assert_eq!(Wtf8::from_str("aé 💩").len(), 8);
+    }
+
+    #[test]
+    fn wtf8_slice() {
+        assert_eq!(&Wtf8::from_str("aé 💩").slice(1, 4).bytes, b"\xC3\xA9 ");
+    }
+
+    #[test]
+    #[should_panic]
+    fn wtf8_slice_not_code_point_boundary() {
+        Wtf8::from_str("aé 💩").slice(2, 4);
+    }
+
+    #[test]
+    fn wtf8_slice_from() {
+        assert_eq!(
+            &Wtf8::from_str("aé 💩").slice_from(1).bytes,
+            b"\xC3\xA9 \xF0\x9F\x92\xA9"
+        );
+    }
+
+    #[test]
+    #[should_panic]
+    fn wtf8_slice_from_not_code_point_boundary() {
+        Wtf8::from_str("aé 💩").slice_from(2);
+    }
+
+    #[test]
+    fn wtf8_slice_to() {
+        assert_eq!(&Wtf8::from_str("aé 💩").slice_to(4).bytes, b"a\xC3\xA9 ");
+    }
+
+    #[test]
+    #[should_panic]
+    fn wtf8_slice_to_not_code_point_boundary() {
+        // Byte 5 is inside the four-byte encoding of '💩'. This must exercise
+        // `slice_to`; it previously called `slice_from` by mistake.
+        Wtf8::from_str("aé 💩").slice_to(5);
+    }
+
+    #[test]
+    fn wtf8_ascii_byte_at() {
+        let slice = Wtf8::from_str("aé 💩");
+        assert_eq!(slice.ascii_byte_at(0), b'a');
+        assert_eq!(slice.ascii_byte_at(1), b'\xFF');
+        assert_eq!(slice.ascii_byte_at(2), b'\xFF');
+        assert_eq!(slice.ascii_byte_at(3), b' ');
+        assert_eq!(slice.ascii_byte_at(4), b'\xFF');
+    }
+
+    #[test]
+    fn wtf8_code_points() {
+        fn c(value: u32) -> CodePoint {
+            CodePoint::from_u32(value).unwrap()
+        }
+        fn cp(string: &Wtf8Buf) -> Vec<Option<char>> {
+            string
+                .code_points()
+                .map(|c| c.to_char())
+                .collect::<Vec<_>>()
+        }
+        let mut string = Wtf8Buf::from_str("é ");
+        assert_eq!(cp(&string), vec![Some('é'), Some(' ')]);
+        string.push(c(0xd83d));
+        assert_eq!(cp(&string), vec![Some('é'), Some(' '), None]);
+        string.push(c(0xdca9));
+        assert_eq!(cp(&string), vec![Some('é'), Some(' '), Some('💩')]);
+    }
+
+    #[test]
+    fn wtf8_as_str() {
+        assert_eq!(Wtf8::from_str("").as_str(), Some(""));
+        assert_eq!(Wtf8::from_str("aé 💩").as_str(), Some("aé 💩"));
+        let mut string = Wtf8Buf::new();
+        string.push(CodePoint::from_u32(0xd800).unwrap());
+        assert_eq!(string.as_str(), None);
+    }
+
+    #[test]
+    fn wtf8_to_string_lossy() {
+        assert_eq!(Wtf8::from_str("").to_string_lossy(), Cow::Borrowed(""));
+        assert_eq!(
+            Wtf8::from_str("aé 💩").to_string_lossy(),
+            Cow::Borrowed("aé 💩")
+        );
+        let mut string = Wtf8Buf::from_str("aé 💩");
+        string.push(CodePoint::from_u32(0xd800).unwrap());
+        assert_eq!(string.to_string_lossy(), {
+            let o: Cow<'_, str> = Cow::Owned(String::from("aé 💩�"));
+            o
+        });
+    }
+
+    #[test]
+    fn wtf8_to_ill_formed_utf16() {
+        let mut string = Wtf8Buf::from_str("aé ");
+        string.push(CodePoint::from_u32(0xd83d).unwrap());
+        string.push_char('💩');
+        assert_eq!(
+            string.to_ill_formed_utf16().collect::<Vec<_>>(),
+            vec![0x61, 0xe9, 0x20, 0xd83d, 0xd83d, 0xdca9]
+        );
+    }
+
+    #[test]
+    fn wtf8_to_ill_formed_utf16_size_hint() {
+        // After the lead half of a pair is yielded, the trail half is still
+        // pending and must be covered by the size hint.
+        let string = Wtf8Buf::from_str("💩");
+        let mut units = string.to_ill_formed_utf16();
+        assert_eq!(units.next(), Some(0xd83d));
+        assert_eq!(units.size_hint(), (1, Some(1)));
+        assert_eq!(units.next(), Some(0xdca9));
+        assert_eq!(units.size_hint(), (0, Some(0)));
+        assert_eq!(units.next(), None);
+    }
+}
diff --git a/crates/hstr/src/wtf8/not_quite_std.rs b/crates/hstr/src/wtf8/not_quite_std.rs
new file mode 100644
index 000000000000..decf38ba5e42
--- /dev/null
+++ b/crates/hstr/src/wtf8/not_quite_std.rs
@@ -0,0 +1,254 @@
+// Copyright (c) 2014 Simon Sapin
+// Licensed under the MIT License
+// Original source: https://github.com/SimonSapin/rust-wtf8
+
+//! The code in this module is copied from Rust standard library
+//! (the `std` crate and crates it is a facade for)
+//! at commit 16d80de231abb2b1756f3951ffd4776d681035eb,
+//! with the signature changed to use `Wtf8Buf`, `Wtf8`, and `CodePoint`
+//! instead of `String`, `&str`, and `char`.
+//!
+//! FIXME: if and when this is moved into the standard library,
+//! try to avoid the code duplication.
+//! Maybe by having private generic code that is monomorphized to UTF-8 and
+//! WTF-8?
+
+use core::{char, mem, slice};
+
+use super::{CodePoint, IllFormedUtf16CodeUnits, Wtf8, Wtf8Buf};
+
+// UTF-8 ranges and tags for encoding characters
+// Copied from 48d5fe9ec560b53b1f5069219b0d62015e1de5ba^:src/libcore/char.rs
+const TAG_CONT: u8 = 0b1000_0000;
+const TAG_TWO_B: u8 = 0b1100_0000;
+const TAG_THREE_B: u8 = 0b1110_0000;
+const TAG_FOUR_B: u8 = 0b1111_0000;
+const MAX_ONE_B: u32 = 0x80;
+const MAX_TWO_B: u32 = 0x800;
+const MAX_THREE_B: u32 = 0x10000;
+
+/// Encode `code` as UTF-8 into `dst`.
+///
+/// Returns the number of bytes written, or `None` if `dst` is too short.
+///
+/// Copied from 48d5fe9ec560b53b1f5069219b0d62015e1de5ba^:src/libcore/char.rs
+#[inline]
+fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option<usize> {
+    // Marked #[inline] to allow llvm optimizing it away
+    if code < MAX_ONE_B && !dst.is_empty() {
+        dst[0] = code as u8;
+        Some(1)
+    } else if code < MAX_TWO_B && dst.len() >= 2 {
+        dst[0] = (code >> 6 & 0x1f) as u8 | TAG_TWO_B;
+        dst[1] = (code & 0x3f) as u8 | TAG_CONT;
+        Some(2)
+    } else if code < MAX_THREE_B && dst.len() >= 3 {
+        dst[0] = (code >> 12 & 0x0f) as u8 | TAG_THREE_B;
+        dst[1] = (code >> 6 & 0x3f) as u8 | TAG_CONT;
+        dst[2] = (code & 0x3f) as u8 | TAG_CONT;
+        Some(3)
+    } else if dst.len() >= 4 {
+        dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
+        dst[1] = (code >> 12 & 0x3f) as u8 | TAG_CONT;
+        dst[2] = (code >> 6 & 0x3f) as u8 | TAG_CONT;
+        dst[3] = (code & 0x3f) as u8 | TAG_CONT;
+        Some(4)
+    } else {
+        None
+    }
+}
+
+/// Encode `ch` as UTF-16 into `dst`.
+///
+/// Returns the number of code units written, or `None` if `dst` is too short.
+///
+/// Copied from 48d5fe9ec560b53b1f5069219b0d62015e1de5ba^:src/libcore/char.rs
+#[inline]
+fn encode_utf16_raw(mut ch: u32, dst: &mut [u16]) -> Option<usize> {
+    // Marked #[inline] to allow llvm optimizing it away
+    if (ch & 0xffff) == ch && !dst.is_empty() {
+        // The BMP falls through (assuming non-surrogate, as it should)
+        dst[0] = ch as u16;
+        Some(1)
+    } else if dst.len() >= 2 {
+        // Supplementary planes break into surrogates.
+        ch -= 0x1_0000;
+        dst[0] = 0xd800 | ((ch >> 10) as u16);
+        dst[1] = 0xdc00 | ((ch as u16) & 0x3ff);
+        Some(2)
+    } else {
+        None
+    }
+}
+
+/// Decode the next code point from `bytes`, or return `None` at the end.
+///
+/// Missing continuation bytes are read as 0, so the input is expected to be
+/// well-formed.
+///
+/// Copied from core::str::next_code_point
+#[inline]
+pub fn next_code_point(bytes: &mut slice::Iter<'_, u8>) -> Option<u32> {
+    // Decode UTF-8
+    let x = match bytes.next() {
+        None => return None,
+        Some(&next_byte) if next_byte < 128 => return Some(next_byte as u32),
+        Some(&next_byte) => next_byte,
+    };
+
+    // Multibyte case follows
+    // Decode from a byte combination out of: [[[x y] z] w]
+    // NOTE: Performance is sensitive to the exact formulation here
+    let init = utf8_first_byte(x, 2);
+    let y = unwrap_or_0(bytes.next());
+    let mut ch = utf8_acc_cont_byte(init, y);
+    if x >= 0xe0 {
+        // [[x y z] w] case
+        // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
+        let z = unwrap_or_0(bytes.next());
+        let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
+        ch = init << 12 | y_z;
+        if x >= 0xf0 {
+            // [x y z w] case
+            // use only the lower 3 bits of `init`
+            let w = unwrap_or_0(bytes.next());
+            ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
+        }
+    }
+
+    Some(ch)
+}
+
+/// Return the value bits of the lead byte of a `width`-byte sequence.
+#[inline]
+fn utf8_first_byte(byte: u8, width: u32) -> u32 {
+    (byte & (0x7f >> width)) as u32
+}
+
+/// Return the value of `ch` updated with continuation byte `byte`.
+#[inline]
+fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
+    (ch << 6) | (byte & CONT_MASK) as u32
+}
+
+/// Return the byte behind `opt`, or 0 when the iterator ran out.
+#[inline]
+fn unwrap_or_0(opt: Option<&u8>) -> u8 {
+    match opt {
+        Some(&byte) => byte,
+        None => 0,
+    }
+}
+
+/// Mask of the value bits of a continuation byte
+const CONT_MASK: u8 = 0b0011_1111;
+
+/// Append the UTF-8 style encoding of `code_point` to `string`.
+///
+/// Copied from String::push
+/// This does **not** include the WTF-8 concatenation check.
+#[inline]
+pub fn push_code_point(string: &mut Wtf8Buf, code_point: CodePoint) {
+    // Encode into an initialized stack buffer and append it. The previous
+    // version built a `&mut [u8]` over the vector's uninitialized spare
+    // capacity, which `slice::from_raw_parts_mut` does not permit.
+    let mut buf = [0u8; 4];
+    // A 4-byte buffer fits every code point, so this is never `None`.
+    let used = encode_utf8_raw(code_point.to_u32(), &mut buf).unwrap_or(0);
+    string.bytes.extend_from_slice(&buf[..used]);
+}
+
+/// Return `true` if `index` is the start of a code point or the end of
+/// `slice`; indices beyond the end return `false`.
+///
+/// Copied from core::str::StrPrelude::is_char_boundary
+#[inline]
+pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool {
+    if index == slice.len() {
+        return true;
+    }
+    match slice.bytes.get(index) {
+        None => false,
+        // Continuation bytes (0b10xx_xxxx) are never a boundary.
+        Some(&b) => !(128u8..192u8).contains(&b),
+    }
+}
+
+/// Return the sub-slice `begin..end` without any checks.
+///
+/// Copied from core::str::raw::slice_unchecked
+///
+/// # Safety
+///
+/// `begin <= end <= s.len()` must hold, and both indices must lie on code
+/// point boundaries of `s`.
+#[inline]
+pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 {
+    // SAFETY: the caller guarantees the range is in bounds and on code point
+    // boundaries, so the sub-slice is valid memory holding well-formed WTF-8.
+    unsafe {
+        mem::transmute(slice::from_raw_parts(
+            s.bytes.as_ptr().add(begin),
+            end - begin,
+        ))
+    }
+}
+
+/// Panic with a message describing an invalid slice range.
+///
+/// Copied from core::str::raw::slice_error_fail
+#[inline(never)]
+pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! {
+    assert!(begin <= end);
+    panic!("index {begin} and/or {end} in {s:?} do not lie on character boundary");
+}
+
+/// Yield the next UTF-16 code unit of `iter`, splitting supplementary code
+/// points into a surrogate pair across two calls.
+///
+/// Copied from core::str::Utf16CodeUnits::next
+pub fn next_utf16_code_unit(iter: &mut IllFormedUtf16CodeUnits<'_>) -> Option<u16> {
+    // Hand out the trail surrogate saved by the previous call first.
+    if iter.extra != 0 {
+        let tmp = iter.extra;
+        iter.extra = 0;
+        return Some(tmp);
+    }
+
+    let mut buf = [0u16; 2];
+    iter.code_points.next().map(|code_point| {
+        let n = encode_utf16_raw(code_point.to_u32(), &mut buf).unwrap_or(0);
+        if n == 2 {
+            iter.extra = buf[1];
+        }
+        buf[0]
+    })
+}
+
+/// Iterator that decodes UTF-16 code units into `char`s, yielding unpaired
+/// surrogates as `Err`.
+///
+/// Copied from src/librustc_unicode/char.rs
+pub struct DecodeUtf16<I>
+where
+    I: Iterator<Item = u16>,
+{
+    iter: I,
+    /// Code unit read ahead while looking for a trail surrogate.
+    buf: Option<u16>,
+}
+
+/// Copied from src/librustc_unicode/char.rs
+#[inline]
+pub fn decode_utf16<I: IntoIterator<Item = u16>>(iterable: I) -> DecodeUtf16<I::IntoIter> {
+    DecodeUtf16 {
+        iter: iterable.into_iter(),
+        buf: None,
+    }
+}
+
+/// Copied from src/librustc_unicode/char.rs
+impl<I: Iterator<Item = u16>> Iterator for DecodeUtf16<I> {
+    type Item = Result<char, u16>;
+
+    fn next(&mut self) -> Option<Result<char, u16>> {
+        let u = match self.buf.take() {
+            Some(buf) => buf,
+            None => self.iter.next()?,
+        };
+
+        if !(0xd800..=0xdfff).contains(&u) {
+            // not a surrogate
+            // SAFETY: any u16 outside the surrogate range is a valid `char`.
+            Some(Ok(unsafe { char::from_u32_unchecked(u as u32) }))
+        } else if u >= 0xdc00 {
+            // a trailing surrogate
+            Some(Err(u))
+        } else {
+            let u2 = match self.iter.next() {
+                Some(u2) => u2,
+                // eof
+                None => return Some(Err(u)),
+            };
+            if !(0xdc00..=0xdfff).contains(&u2) {
+                // not a trailing surrogate so we're not a valid
+                // surrogate pair, so rewind to redecode u2 next time.
+                self.buf = Some(u2);
+                return Some(Err(u));
+            }
+
+            // all ok, so lets decode it.
+            let c = (((u - 0xd800) as u32) << 10 | (u2 - 0xdc00) as u32) + 0x1_0000;
+            // SAFETY: a lead/trail pair decodes into U+10000..=U+10FFFF.
+            Some(Ok(unsafe { char::from_u32_unchecked(c) }))
+        }
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let (low, high) = self.iter.size_hint();
+        // A code unit parked in `buf` always produces one more item on top of
+        // what the inner iterator still holds; without it the upper bound
+        // could under-report.
+        let buffered = usize::from(self.buf.is_some());
+        // we could be entirely valid surrogates (2 elements per
+        // char), or entirely non-surrogates (1 element per char)
+        (low / 2, high.and_then(|n| n.checked_add(buffered)))
+    }
+}
diff --git a/crates/hstr/src/wtf8_atom.rs b/crates/hstr/src/wtf8_atom.rs
new file mode 100644
index 000000000000..7e5f75c4510d
--- /dev/null
+++ b/crates/hstr/src/wtf8_atom.rs
@@ -0,0 +1,201 @@
+use std::{
+    fmt::Debug,
+    hash::Hash,
+    mem::{forget, transmute},
+    ops::Deref,
+};
+
+use debug_unreachable::debug_unreachable;
+
+use crate::{
+    macros::{get_hash, impl_from_alias, partial_eq},
+    tagged_value::TaggedValue,
+    wtf8::Wtf8,
+    DYNAMIC_TAG, INLINE_TAG, LEN_MASK, LEN_OFFSET, TAG_MASK,
+};
+
+/// A WTF-8 encoded atom. This is like [Atom], but can contain unpaired
+/// surrogates.
+///
+/// [Atom]: crate::Atom
+pub struct Wtf8Atom {
+    pub(crate) unsafe_data: TaggedValue,
+}
+
+impl Wtf8Atom {
+    /// Create an atom from any value `Wtf8Atom` implements `From` for.
+    #[inline(always)]
+    pub fn new<S>(s: S) -> Self
+    where
+        Self: From<S>,
+    {
+        Self::from(s)
+    }
+
+    /// Return the storage tag bits of this atom.
+    #[inline(always)]
+    fn tag(&self) -> u8 {
+        self.unsafe_data.tag() & TAG_MASK
+    }
+
+    /// Return true if this is a dynamic Atom.
+    #[inline(always)]
+    fn is_dynamic(&self) -> bool {
+        self.tag() == DYNAMIC_TAG
+    }
+}
+
+impl Default for Wtf8Atom {
+    #[inline(never)]
+    fn default() -> Self {
+        Wtf8Atom::new("")
+    }
+}
+
+/// Immutable, so it's safe to be shared between threads
+unsafe impl Send for Wtf8Atom {}
+
+/// Immutable, so it's safe to be shared between threads
+unsafe impl Sync for Wtf8Atom {}
+
+impl Debug for Wtf8Atom {
+    #[inline]
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Surrogates are shown as U+FFFD because the lossy conversion is used.
+        Debug::fmt(&self.to_string_lossy(), f)
+    }
+}
+
+#[cfg(feature = "serde")]
+impl serde::ser::Serialize for Wtf8Atom {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::ser::Serializer,
+    {
+        serializer.serialize_bytes(self.as_bytes())
+    }
+}
+
+#[cfg(feature = "serde")]
+impl<'de> serde::de::Deserialize<'de> for Wtf8Atom {
+    // NOTE(review): `Serialize` above writes the atom with `serialize_bytes`,
+    // while this impl reads it back with `String::deserialize`. Formats that
+    // distinguish byte strings from text may not round-trip, and an atom
+    // holding an unpaired surrogate cannot be represented as a `String`.
+    // Confirm the intended wire format.
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        String::deserialize(deserializer).map(Self::new)
+    }
+}
+
+impl PartialEq for Wtf8Atom {
+    #[inline(never)]
+    fn eq(&self, other: &Self) -> bool {
+        partial_eq!(self, other);
+
+        // If the store is different, the string may be the same, even though the
+        // `unsafe_data` is different
+        self.as_wtf8() == other.as_wtf8()
+    }
+}
+
+impl Eq for Wtf8Atom {}
+
+impl Hash for Wtf8Atom {
+    #[inline(always)]
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        state.write_u64(self.get_hash());
+    }
+}
+
+impl Drop for Wtf8Atom {
+    #[inline(always)]
+    fn drop(&mut self) {
+        // Only dynamic atoms hold a reference-counted entry to release.
+        if self.is_dynamic() {
+            unsafe { drop(crate::dynamic::restore_arc(self.unsafe_data)) }
+        }
+    }
+}
+
+impl Clone for Wtf8Atom {
+    #[inline(always)]
+    fn clone(&self) -> Self {
+        Self::from_alias(self.unsafe_data)
+    }
+}
+
+impl Deref for Wtf8Atom {
+    type Target = Wtf8;
+
+    #[inline(always)]
+    fn deref(&self) -> &Self::Target {
+        self.as_wtf8()
+    }
+}
+
+impl AsRef<Wtf8> for Wtf8Atom {
+    #[inline(always)]
+    fn as_ref(&self) -> &Wtf8 {
+        self.as_wtf8()
+    }
+}
+
+impl PartialEq<Wtf8> for Wtf8Atom {
+    #[inline]
+    fn eq(&self, other: &Wtf8) -> bool {
+        self.as_wtf8() == other
+    }
+}
+
+impl PartialEq<crate::Atom> for Wtf8Atom {
+    #[inline]
+    fn eq(&self, other: &crate::Atom) -> bool {
+        // An atom containing a surrogate has no `&str` form, so it never
+        // equals a UTF-8 `Atom`.
+        self.as_str() == Some(other.as_str())
+    }
+}
+
+impl PartialEq<&'_ Wtf8> for Wtf8Atom {
+    #[inline]
+    fn eq(&self, other: &&Wtf8) -> bool {
+        self.as_wtf8() == *other
+    }
+}
+
+impl PartialEq<Wtf8Atom> for Wtf8 {
+    #[inline]
+    fn eq(&self, other: &Wtf8Atom) -> bool {
+        self == other.as_wtf8()
+    }
+}
+
+impl Wtf8Atom {
+    pub(super) fn get_hash(&self) -> u64 {
+        get_hash!(self)
+    }
+
+    /// Borrow the atom's contents as a WTF-8 slice, whether it is stored
+    /// inline or in a dynamic entry.
+    fn as_wtf8(&self) -> &Wtf8 {
+        match self.tag() {
+            DYNAMIC_TAG => unsafe {
+                let item = crate::dynamic::deref_from(self.unsafe_data);
+                Wtf8::from_bytes_unchecked(transmute::<&[u8], &'static [u8]>(&item.slice))
+            },
+            INLINE_TAG => {
+                // The length of an inline atom is packed into the tag byte.
+                let len = (self.unsafe_data.tag() & LEN_MASK) >> LEN_OFFSET;
+                let src = self.unsafe_data.data();
+                unsafe { Wtf8::from_bytes_unchecked(&src[..(len as usize)]) }
+            }
+            _ => unsafe { debug_unreachable!() },
+        }
+    }
+}
+
+impl_from_alias!(Wtf8Atom);
+
+#[cfg(test)]
+impl Wtf8Atom {
+    /// Return the strong reference count of a dynamic atom, or 1 otherwise.
+    pub(crate) fn ref_count(&self) -> usize {
+        match self.tag() {
+            DYNAMIC_TAG => {
+                let ptr = unsafe { crate::dynamic::deref_from(self.unsafe_data) };
+
+                triomphe::ThinArc::strong_count(&ptr.0)
+            }
+            _ => 1,
+        }
+    }
+}