Skip to content

Commit 8cfd47b

Browse files
authored
feat(hstr): Introduce Wtf8Atom (#11104)
**Description:** Continues from #11085. This PR adds `Wtf8Atom` to represent strings containing unpaired surrogates (i.e. lone surrogates) in Rust. **Related issue:** Reimplements a part of #10987.
1 parent bdee12c commit 8cfd47b

File tree

9 files changed

+2091
-77
lines changed

9 files changed

+2091
-77
lines changed

.changeset/chilled-laws-look.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
hstr: minor
3+
swc_core: minor
4+
---
5+
6+
feat(hstr): Introduce `Wtf8Atom`

crates/hstr/src/dynamic.rs

Lines changed: 36 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ use triomphe::ThinArc;
1414

1515
use crate::{
1616
tagged_value::{TaggedValue, MAX_INLINE_LEN},
17-
Atom, INLINE_TAG, INLINE_TAG_INIT, LEN_OFFSET, TAG_MASK,
17+
wtf8::Wtf8,
18+
Atom, Wtf8Atom, INLINE_TAG, INLINE_TAG_INIT, LEN_OFFSET, TAG_MASK,
1819
};
1920

2021
#[derive(PartialEq, Eq)]
@@ -73,6 +74,11 @@ impl AtomStore {
7374
atom_in(self, &text.into())
7475
}
7576

77+
#[inline(always)]
78+
pub fn wtf8_atom<'a>(&mut self, text: impl Into<Cow<'a, Wtf8>>) -> Wtf8Atom {
79+
wtf8_atom_in(self, text.into().as_bytes())
80+
}
81+
7682
fn gc(&mut self) {
7783
self.data.retain(|item, _| {
7884
let count = ThinArc::strong_count(&item.0);
@@ -94,6 +100,14 @@ pub fn global_atom_store_gc() {
94100
});
95101
}
96102

103+
pub(crate) fn global_wtf8_atom(text: &[u8]) -> Wtf8Atom {
104+
GLOBAL_DATA.with(|global| {
105+
let mut store = global.borrow_mut();
106+
107+
wtf8_atom_in(&mut *store, text)
108+
})
109+
}
110+
97111
pub(crate) fn global_atom(text: &str) -> Atom {
98112
GLOBAL_DATA.with(|global| {
99113
let mut store = global.borrow_mut();
@@ -102,9 +116,7 @@ pub(crate) fn global_atom(text: &str) -> Atom {
102116
})
103117
}
104118

105-
/// This can create any kind of [Atom], although this lives in the `dynamic`
106-
/// module.
107-
fn atom_in<S>(storage: S, text: &str) -> Atom
119+
fn wtf8_atom_in<S>(storage: S, text: &[u8]) -> Wtf8Atom
108120
where
109121
S: Storage,
110122
{
@@ -115,9 +127,9 @@ where
115127
let tag = INLINE_TAG_INIT | ((len as u8) << LEN_OFFSET);
116128
let mut unsafe_data = TaggedValue::new_tag(tag);
117129
unsafe {
118-
unsafe_data.data_mut()[..len].copy_from_slice(text.as_bytes());
130+
unsafe_data.data_mut()[..len].copy_from_slice(text);
119131
}
120-
return Atom { unsafe_data };
132+
return Wtf8Atom { unsafe_data };
121133
}
122134

123135
let hash = calc_hash(text);
@@ -129,12 +141,22 @@ where
129141
NonNull::new_unchecked(entry)
130142
};
131143
debug_assert!(0 == ptr.as_ptr() as u8 & TAG_MASK);
132-
Atom {
144+
Wtf8Atom {
133145
unsafe_data: TaggedValue::new_ptr(ptr),
134146
}
135147
}
136148

137-
/// Attempts to construct an Atom but only if it can be constructed inline.
149+
/// This can create any kind of [Atom], although this lives in the `dynamic`
150+
/// module.
151+
fn atom_in<S>(storage: S, text: &str) -> Atom
152+
where
153+
S: Storage,
154+
{
155+
// SAFETY: `text` is valid UTF-8
156+
unsafe { Atom::from_wtf8_unchecked(wtf8_atom_in(storage, text.as_bytes())) }
157+
}
158+
159+
/// Attempts to construct an [Atom] but only if it can be constructed inline.
138160
/// This is primarily useful in constant contexts.
139161
pub(crate) const fn inline_atom(text: &str) -> Option<Atom> {
140162
let len = text.len();
@@ -159,31 +181,25 @@ pub(crate) const fn inline_atom(text: &str) -> Option<Atom> {
159181
}
160182

161183
trait Storage {
162-
fn insert_entry(self, text: &str, hash: u64) -> Item;
184+
fn insert_entry(self, text: &[u8], hash: u64) -> Item;
163185
}
164186

165187
impl Storage for &'_ mut AtomStore {
166-
fn insert_entry(self, text: &str, hash: u64) -> Item {
188+
fn insert_entry(self, text: &[u8], hash: u64) -> Item {
167189
// If the text is too long, interning is not worth it.
168190
if text.len() > 512 {
169-
return Item(ThinArc::from_header_and_slice(
170-
Metadata { hash },
171-
text.as_bytes(),
172-
));
191+
return Item(ThinArc::from_header_and_slice(Metadata { hash }, text));
173192
}
174193

175194
let (entry, _) = self
176195
.data
177196
.raw_entry_mut()
178197
.from_hash(hash, |key| {
179-
key.header.header.hash == hash && key.slice.eq(text.as_bytes())
198+
key.header.header.hash == hash && key.slice.eq(text)
180199
})
181200
.or_insert_with(move || {
182201
(
183-
Item(ThinArc::from_header_and_slice(
184-
Metadata { hash },
185-
text.as_bytes(),
186-
)),
202+
Item(ThinArc::from_header_and_slice(Metadata { hash }, text)),
187203
(),
188204
)
189205
});
@@ -192,7 +208,7 @@ impl Storage for &'_ mut AtomStore {
192208
}
193209

194210
#[inline(always)]
195-
fn calc_hash(text: &str) -> u64 {
211+
fn calc_hash(text: &[u8]) -> u64 {
196212
let mut hasher = FxHasher::default();
197213
text.hash(&mut hasher);
198214
hasher.finish()

crates/hstr/src/global_store.rs

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
1-
use std::borrow::Cow;
1+
use std::{
2+
borrow::Cow,
3+
mem::{forget, ManuallyDrop},
4+
};
25

3-
use crate::{dynamic::global_atom, Atom};
6+
use crate::{
7+
dynamic::{global_atom, global_wtf8_atom},
8+
wtf8::{Wtf8, Wtf8Buf},
9+
Atom, Wtf8Atom,
10+
};
411

512
macro_rules! direct_from_impl {
613
($T:ty) => {
@@ -21,3 +28,43 @@ impl From<Box<str>> for crate::Atom {
2128
global_atom(&s)
2229
}
2330
}
31+
32+
macro_rules! direct_from_impl_wtf8 {
33+
($T:ty) => {
34+
impl From<$T> for Wtf8Atom {
35+
fn from(s: $T) -> Self {
36+
global_wtf8_atom(s.as_bytes())
37+
}
38+
}
39+
};
40+
}
41+
42+
direct_from_impl_wtf8!(&'_ str);
43+
direct_from_impl_wtf8!(Cow<'_, str>);
44+
direct_from_impl_wtf8!(String);
45+
direct_from_impl_wtf8!(&'_ Wtf8);
46+
direct_from_impl_wtf8!(Wtf8Buf);
47+
48+
impl From<&Atom> for crate::Wtf8Atom {
49+
fn from(s: &Atom) -> Self {
50+
forget(s.clone());
51+
Wtf8Atom {
52+
unsafe_data: s.unsafe_data,
53+
}
54+
}
55+
}
56+
57+
impl From<Atom> for crate::Wtf8Atom {
58+
fn from(s: Atom) -> Self {
59+
let s = ManuallyDrop::new(s);
60+
Wtf8Atom {
61+
unsafe_data: s.unsafe_data,
62+
}
63+
}
64+
}
65+
66+
impl From<Box<str>> for crate::Wtf8Atom {
67+
fn from(s: Box<str>) -> Self {
68+
global_wtf8_atom(s.as_bytes())
69+
}
70+
}

crates/hstr/src/lib.rs

Lines changed: 35 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use core::str;
55
use std::{
66
fmt::{Debug, Display},
77
hash::Hash,
8-
mem::{self, forget, transmute},
8+
mem::{self, forget, transmute, ManuallyDrop},
99
num::NonZeroU8,
1010
ops::Deref,
1111
str::from_utf8_unchecked,
@@ -15,13 +15,21 @@ use debug_unreachable::debug_unreachable;
1515
use once_cell::sync::Lazy;
1616

1717
pub use crate::dynamic::{global_atom_store_gc, AtomStore};
18-
use crate::tagged_value::TaggedValue;
18+
use crate::{
19+
macros::{get_hash, impl_from_alias, partial_eq},
20+
tagged_value::TaggedValue,
21+
};
1922

2023
mod dynamic;
2124
mod global_store;
25+
mod macros;
2226
mod tagged_value;
2327
#[cfg(test)]
2428
mod tests;
29+
pub mod wtf8;
30+
mod wtf8_atom;
31+
32+
pub use wtf8_atom::Wtf8Atom;
2533

2634
/// An immutable string which is cheap to clone, compare, hash, and has small
2735
/// size.
@@ -253,20 +261,7 @@ impl Atom {
253261

254262
impl Atom {
255263
fn get_hash(&self) -> u64 {
256-
match self.tag() {
257-
DYNAMIC_TAG => {
258-
unsafe { crate::dynamic::deref_from(self.unsafe_data) }
259-
.header
260-
.header
261-
.hash
262-
}
263-
INLINE_TAG => {
264-
// This is passed as input to the caller's `Hasher` implementation, so it's okay
265-
// that this isn't really a hash
266-
self.unsafe_data.hash()
267-
}
268-
_ => unsafe { debug_unreachable!() },
269-
}
264+
get_hash!(self)
270265
}
271266

272267
fn as_str(&self) -> &str {
@@ -302,30 +297,7 @@ impl Atom {
302297
impl PartialEq for Atom {
303298
#[inline(never)]
304299
fn eq(&self, other: &Self) -> bool {
305-
if self.unsafe_data == other.unsafe_data {
306-
return true;
307-
}
308-
309-
// If one is inline and the other is not, the length is different.
310-
// If one is static and the other is not, it's different.
311-
if self.tag() != other.tag() {
312-
return false;
313-
}
314-
315-
if self.is_dynamic() && other.is_dynamic() {
316-
let te = unsafe { crate::dynamic::deref_from(self.unsafe_data) };
317-
let oe = unsafe { crate::dynamic::deref_from(other.unsafe_data) };
318-
319-
if te.header.header.hash != oe.header.header.hash {
320-
return false;
321-
}
322-
323-
return te.slice == oe.slice;
324-
}
325-
326-
if self.get_hash() != other.get_hash() {
327-
return false;
328-
}
300+
partial_eq!(self, other);
329301

330302
// If the store is different, the string may be the same, even though the
331303
// `unsafe_data` is different
@@ -358,20 +330,7 @@ impl Clone for Atom {
358330
}
359331
}
360332

361-
impl Atom {
362-
#[inline]
363-
pub(crate) fn from_alias(alias: TaggedValue) -> Self {
364-
if alias.tag() & TAG_MASK == DYNAMIC_TAG {
365-
unsafe {
366-
let arc = crate::dynamic::restore_arc(alias);
367-
forget(arc.clone());
368-
forget(arc);
369-
}
370-
}
371-
372-
Self { unsafe_data: alias }
373-
}
374-
}
333+
impl_from_alias!(Atom);
375334

376335
impl Deref for Atom {
377336
type Target = str;
@@ -443,6 +402,28 @@ where
443402
}
444403
}
445404

405+
impl Atom {
406+
/// Converts a WTF-8 encoded [Wtf8Atom] to a regular UTF-8 [Atom] without
407+
/// validation.
408+
///
409+
/// # Safety
410+
///
411+
/// The caller must ensure that the WTF-8 atom contains only valid UTF-8
412+
/// data (no unpaired surrogates). This function performs no validation
413+
/// and will create an invalid `Atom` if the input contains unpaired
414+
/// surrogates.
415+
///
416+
/// This is a zero-cost conversion that preserves all internal optimizations
417+
/// (inline storage, precomputed hashes, etc.) since both types have
418+
/// identical internal representation.
419+
pub unsafe fn from_wtf8_unchecked(s: Wtf8Atom) -> Self {
420+
let s = ManuallyDrop::new(s);
421+
Atom {
422+
unsafe_data: s.unsafe_data,
423+
}
424+
}
425+
}
426+
446427
#[cfg(test)]
447428
mod macro_tests {
448429

0 commit comments

Comments
 (0)