From 470e7e03d6b282996671f9944a53f7541c045e3a Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Thu, 3 Jul 2025 20:57:17 +1200 Subject: [PATCH 01/51] First hint of a dedicated unsafe module appears --- .cargo/config.toml | 5 ++ src/SUMMARY.md | 9 +++ src/unsafe-deep-dive/Cargo.toml | 0 .../foundations/data-structures-are-safe.md | 17 +++++ src/unsafe-deep-dive/foundations/welcome.md | 1 + .../foundations/what-is-unsafe.md | 74 +++++++++++++++++++ .../foundations/when-is-unsafe-used.md | 43 +++++++++++ src/unsafe-deep-dive/setup.md | 20 +++++ src/unsafe-deep-dive/welcome.md | 33 +++++++++ 9 files changed, 202 insertions(+) create mode 100644 src/unsafe-deep-dive/Cargo.toml create mode 100644 src/unsafe-deep-dive/foundations/data-structures-are-safe.md create mode 100644 src/unsafe-deep-dive/foundations/welcome.md create mode 100644 src/unsafe-deep-dive/foundations/what-is-unsafe.md create mode 100644 src/unsafe-deep-dive/foundations/when-is-unsafe-used.md create mode 100644 src/unsafe-deep-dive/setup.md create mode 100644 src/unsafe-deep-dive/welcome.md diff --git a/.cargo/config.toml b/.cargo/config.toml index c8e543626304..7f3477723396 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -2,6 +2,11 @@ # We use this alias for task automation in the project. # See README in xtask directory. xtask = "run --package xtask --" +install-tools = "run --package xtask -- install-tools" +web-tests = "run --package xtask -- web-tests" +rust-tests = "run --package xtask -- rust-tests" +serve = "run --package xtask -- serve" +build-book = "run --package xtask -- build" [env] # To provide an anchor to the root of the workspace when working with paths. diff --git a/src/SUMMARY.md b/src/SUMMARY.md index 4caccd9258b9..3d166e82513e 100644 --- a/src/SUMMARY.md +++ b/src/SUMMARY.md @@ -427,6 +427,15 @@ - [Broadcast Chat Application](concurrency/async-exercises/chat-app.md) - [Solutions](concurrency/async-exercises/solutions.md) +# Writing Unsafe + +- [Welcome](unsafe-deep-dive/welcome.md) +- [Setup](unsafe-deep-dive/setup.md) +- [Foundations](unsafe-deep-dive/foundations/welcome.md) + - [What is unsafe?](unsafe-deep-dive/foundations/what-is-unsafe.md) + - [When is unsafe used?](unsafe-deep-dive/foundations/when-is-unsafe-used.md) + - [Data structures are safe](unsafe-deep-dive/foundations/data-structures-are-safe.md) + --- # Final Words diff --git a/src/unsafe-deep-dive/Cargo.toml b/src/unsafe-deep-dive/Cargo.toml new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/src/unsafe-deep-dive/foundations/data-structures-are-safe.md b/src/unsafe-deep-dive/foundations/data-structures-are-safe.md new file mode 100644 index 000000000000..54d2623cb076 --- /dev/null +++ b/src/unsafe-deep-dive/foundations/data-structures-are-safe.md @@ -0,0 +1,17 @@ +# Data structures are safe + +Data structures are inert. They cannot do any harm by themselves. + +It's possible to create a valid program with raw pointer entirely in safe code: + +```rust +fn main() { + let n: i64 = 12345; + let safe = &n as *const _; + println!("{safe:p}"); +} +``` + +However, using them in an unsafe way. + +Consider a raw pointer to an integer, i.e. `*const i64`. \ No newline at end of file diff --git a/src/unsafe-deep-dive/foundations/welcome.md b/src/unsafe-deep-dive/foundations/welcome.md new file mode 100644 index 000000000000..97f12474267a --- /dev/null +++ b/src/unsafe-deep-dive/foundations/welcome.md @@ -0,0 +1 @@ +# Foundations diff --git a/src/unsafe-deep-dive/foundations/what-is-unsafe.md b/src/unsafe-deep-dive/foundations/what-is-unsafe.md new file mode 100644 index 000000000000..51c0a9301377 --- /dev/null +++ b/src/unsafe-deep-dive/foundations/what-is-unsafe.md @@ -0,0 +1,74 @@ +# What is “unsafety”? + +The Rust reference is intentionally vague about what "unsafe" is. + + + +Let's create a working definition together. + +
+ +## Definitions from authoritative docs: + +stdlib's keyword: + +> Code or interfaces whose memory safety cannot be verified by the type system. +> +> ... +> +> Here are the abilities Unsafe Rust has in addition to Safe Rust: +> +> - Dereference raw pointers +> - Implement unsafe traits +> - Call unsafe functions +> - Mutate statics (including external ones) +> - Access fields of unions + +From the [reference](https://doc.rust-lang.org/reference/unsafety.html) + + +> The following language level features cannot be used in the safe subset of Rust: +> +> - Dereferencing a raw pointer. +> - Reading or writing a mutable or external static variable. +> - Accessing a field of a union, other than to assign to it. +> - Calling an unsafe function (including an intrinsic or foreign function). +> - Calling a safe function marked with a target_feature from a function that does not have a target_feature attribute enabling the same features (see attributes.codegen.target_feature.safety-restrictions). +> - Implementing an unsafe trait. +> - Declaring an extern block. +> - Applying an unsafe attribute to an item. + + + +## Group exercise + +> You may have a group of learners who are not familiar with each other yet. +> This is a way for you to gather some data about their confidence levels and +> the phsychological safety that they're feeling. + + +### Part 1: Informal definition + +> Use this to guage the confidence level of the group. If they are uncertain, then tailor the next section to be more directed. + +Ask the class: **By raising your hand, indicate if you would feel comfortable defining unsafe?** + +If anyone's feeling confident, allow them to try to explain. + + +### Part 2: Evidence gathering + +Ask the class to spend 3-5 minutes. + +- Find a a use of the unsafe keyword. What contract/invariant/pre-condition is being established or satisfied. +- Write down terms that need to be defined (unsafe, memory safety, soundness, undefined behavior) + + +### Part 3: Write a working definition + + +### Part 4: Remarks + +Mention that we'll be reviewing our definition at the end of the day. + +
\ No newline at end of file diff --git a/src/unsafe-deep-dive/foundations/when-is-unsafe-used.md b/src/unsafe-deep-dive/foundations/when-is-unsafe-used.md new file mode 100644 index 000000000000..c4b34472e164 --- /dev/null +++ b/src/unsafe-deep-dive/foundations/when-is-unsafe-used.md @@ -0,0 +1,43 @@ +# When is unsafe used? + +The unsafe keyword indicates that the programmer is responsible for upholding +Rust's safety guarantees. + +The keyword has two roles: + +- define pre-conditions must be satisfied +- verify that those defined pre-conditions are satisfied + + +## Further references + +- [The unsafe keyword chapter of the Rust Reference](https://doc.rust-lang.org/reference/unsafe-keyword.html) + +
+ +Places where pre-conditions can be defined (Role 1) + + - [unsafe functions] (`unsafe fn { ... }`). Example: `get_unchecked` method on slices, which requires callers to verify that the index is in-bounds. + - unsafe traits (`unsafe trait`). Examples: [`Send`] and [`Sync`] marker traits in the standard library. + +Places where pre-conditions must be satisfied (Role 2) + +- unsafe blocks (`unafe { ... }`) +- implementing unsafe traits (`unsafe impl`) +- access external items (`unsafe extern`) +- adding [unsafe attributes](https://doc.rust-lang.org/reference/attributes.html) o an item. Examples: [`export_name`], [`link_section`] and [`no_mangle`]. Usage: `#[unsafe(no_mangle)]` + + +[unsafe functions]: https://doc.rust-lang.org/reference/unsafe-keyword.html#unsafe-functions-unsafe-fn +[unsafe traits]: https://doc.rust-lang.org/reference/unsafe-keyword.html#unsafe-traits-unsafe-trait +[`export_name`]: https://doc.rust-lang.org/reference/abi.html#the-export_name-attribute +[`link_section`]: https://doc.rust-lang.org/reference/abi.html#the-link_section-attribute +[`no_mangle`]: https://doc.rust-lang.org/reference/abi.html#the-no_mangle-attribute + +[`Send`]: https://doc.rust-lang.org/std/marker/trait.Send.html +[`Sync`]: https://doc.rust-lang.org/std/marker/trait.Sync.html + + +
+ + diff --git a/src/unsafe-deep-dive/setup.md b/src/unsafe-deep-dive/setup.md new file mode 100644 index 000000000000..7376979b7a09 --- /dev/null +++ b/src/unsafe-deep-dive/setup.md @@ -0,0 +1,20 @@ +# Setting Up + +You should have a Rust compiler installed that supports the 2024 edition of the language, +which is any version of rustc higher than 1.84. + +```bash +$ rustc --version +rustc 1.87 +``` + + + \ No newline at end of file diff --git a/src/unsafe-deep-dive/welcome.md b/src/unsafe-deep-dive/welcome.md new file mode 100644 index 000000000000..8096d5a24a70 --- /dev/null +++ b/src/unsafe-deep-dive/welcome.md @@ -0,0 +1,33 @@ +# Welcome to Unsafe Rust + +> IMPORTANT: THIS MODULE IS IN AN EARLY STAGE OF DEVELOPMENT +> +> Please do not consider this module of Comprehensive Rust to be complete. +> Your feedback and comments, and especially your concerns, are very welcome. + +This is an extension to the Unsafe Rust session that's part of the Rust Fundametals. +That section discusses its mechanics. This section provides a more thorough understanding +of how to work with `unsafe` and produce your own. + +What you'll learn: + +- what the terms undefined behavior, soundness and safety mean +- why the `unsafe` keyword exists in the Rust language +- how to write your own code using `unsafe` safely +- how to review `unsafe` code + +introduction to `unsafe` + +Much of the what we'll cover will be to provide a mental model. + + That is, we're attempting +developing a language-agnostic + +It can also be used as a standalone course to teach software engineers how +to review unsafe code, with a particular focus on making it clear that ther + +## Links to other sections of the course. + +- _Rust Fundamentals_ (the module of Comprehensive rust) includes a session on unsafe in its last day. Use that if you want to +- _Rust in Chromium_ discusses how to [interoperate with C++](../chromium/interoperability-with-cpp.md). +- _Bare Metal Rust_ uses unsafe heavily to interact with the underlying host. If you are using \ No newline at end of file From dfd5057e9224e41d18cb01a2eb14a79e54628e95 Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Fri, 4 Jul 2025 23:13:21 +1200 Subject: [PATCH 02/51] Use shorter working title --- src/SUMMARY.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/SUMMARY.md b/src/SUMMARY.md index 3d166e82513e..0592fbfd5f94 100644 --- a/src/SUMMARY.md +++ b/src/SUMMARY.md @@ -427,7 +427,9 @@ - [Broadcast Chat Application](concurrency/async-exercises/chat-app.md) - [Solutions](concurrency/async-exercises/solutions.md) -# Writing Unsafe +--- + +# Unsafe - [Welcome](unsafe-deep-dive/welcome.md) - [Setup](unsafe-deep-dive/setup.md) From b82890952ac77eefae5b072c2572ee7e5affb2aa Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Fri, 4 Jul 2025 23:50:16 +1200 Subject: [PATCH 03/51] Reword the introduction --- .../foundations/data-structures-are-safe.md | 2 +- .../foundations/what-is-unsafe.md | 34 +++++++------ .../foundations/when-is-unsafe-used.md | 17 ++++--- src/unsafe-deep-dive/setup.md | 9 ++-- src/unsafe-deep-dive/welcome.md | 48 +++++++++++-------- 5 files changed, 56 insertions(+), 54 deletions(-) diff --git a/src/unsafe-deep-dive/foundations/data-structures-are-safe.md b/src/unsafe-deep-dive/foundations/data-structures-are-safe.md index 54d2623cb076..cd5136d5f94a 100644 --- a/src/unsafe-deep-dive/foundations/data-structures-are-safe.md +++ b/src/unsafe-deep-dive/foundations/data-structures-are-safe.md @@ -14,4 +14,4 @@ fn main() { However, using them in an unsafe way. -Consider a raw pointer to an integer, i.e. `*const i64`. \ No newline at end of file +Consider a raw pointer to an integer, i.e. `*const i64`. diff --git a/src/unsafe-deep-dive/foundations/what-is-unsafe.md b/src/unsafe-deep-dive/foundations/what-is-unsafe.md index 51c0a9301377..098ac69ff6ee 100644 --- a/src/unsafe-deep-dive/foundations/what-is-unsafe.md +++ b/src/unsafe-deep-dive/foundations/what-is-unsafe.md @@ -1,8 +1,6 @@ # What is “unsafety”? -The Rust reference is intentionally vague about what "unsafe" is. - - +The Rust reference is intentionally vague about what "unsafe" is. Let's create a working definition together. @@ -26,49 +24,49 @@ stdlib's keyword: From the [reference](https://doc.rust-lang.org/reference/unsafety.html) - -> The following language level features cannot be used in the safe subset of Rust: +> The following language level features cannot be used in the safe subset of +> Rust: > > - Dereferencing a raw pointer. > - Reading or writing a mutable or external static variable. > - Accessing a field of a union, other than to assign to it. > - Calling an unsafe function (including an intrinsic or foreign function). -> - Calling a safe function marked with a target_feature from a function that does not have a target_feature attribute enabling the same features (see attributes.codegen.target_feature.safety-restrictions). +> - Calling a safe function marked with a target_feature from a function that +> does not have a target_feature attribute enabling the same features (see +> attributes.codegen.target_feature.safety-restrictions). > - Implementing an unsafe trait. > - Declaring an extern block. > - Applying an unsafe attribute to an item. - - ## Group exercise -> You may have a group of learners who are not familiar with each other yet. +> You may have a group of learners who are not familiar with each other yet. > This is a way for you to gather some data about their confidence levels and > the phsychological safety that they're feeling. - ### Part 1: Informal definition -> Use this to guage the confidence level of the group. If they are uncertain, then tailor the next section to be more directed. +> Use this to guage the confidence level of the group. If they are uncertain, +> then tailor the next section to be more directed. -Ask the class: **By raising your hand, indicate if you would feel comfortable defining unsafe?** +Ask the class: **By raising your hand, indicate if you would feel comfortable +defining unsafe?** If anyone's feeling confident, allow them to try to explain. - ### Part 2: Evidence gathering Ask the class to spend 3-5 minutes. -- Find a a use of the unsafe keyword. What contract/invariant/pre-condition is being established or satisfied. -- Write down terms that need to be defined (unsafe, memory safety, soundness, undefined behavior) - +- Find a a use of the unsafe keyword. What contract/invariant/pre-condition is + being established or satisfied. +- Write down terms that need to be defined (unsafe, memory safety, soundness, + undefined behavior) ### Part 3: Write a working definition - ### Part 4: Remarks Mention that we'll be reviewing our definition at the end of the day. - \ No newline at end of file + diff --git a/src/unsafe-deep-dive/foundations/when-is-unsafe-used.md b/src/unsafe-deep-dive/foundations/when-is-unsafe-used.md index c4b34472e164..088b73a38ee6 100644 --- a/src/unsafe-deep-dive/foundations/when-is-unsafe-used.md +++ b/src/unsafe-deep-dive/foundations/when-is-unsafe-used.md @@ -8,7 +8,6 @@ The keyword has two roles: - define pre-conditions must be satisfied - verify that those defined pre-conditions are satisfied - ## Further references - [The unsafe keyword chapter of the Rust Reference](https://doc.rust-lang.org/reference/unsafe-keyword.html) @@ -17,27 +16,27 @@ The keyword has two roles: Places where pre-conditions can be defined (Role 1) - - [unsafe functions] (`unsafe fn { ... }`). Example: `get_unchecked` method on slices, which requires callers to verify that the index is in-bounds. - - unsafe traits (`unsafe trait`). Examples: [`Send`] and [`Sync`] marker traits in the standard library. +- [unsafe functions] (`unsafe fn { ... }`). Example: `get_unchecked` method on + slices, which requires callers to verify that the index is in-bounds. +- unsafe traits (`unsafe trait`). Examples: [`Send`] and [`Sync`] marker traits + in the standard library. Places where pre-conditions must be satisfied (Role 2) - unsafe blocks (`unafe { ... }`) - implementing unsafe traits (`unsafe impl`) - access external items (`unsafe extern`) -- adding [unsafe attributes](https://doc.rust-lang.org/reference/attributes.html) o an item. Examples: [`export_name`], [`link_section`] and [`no_mangle`]. Usage: `#[unsafe(no_mangle)]` - +- adding + [unsafe attributes](https://doc.rust-lang.org/reference/attributes.html) o an + item. Examples: [`export_name`], [`link_section`] and [`no_mangle`]. Usage: + `#[unsafe(no_mangle)]` [unsafe functions]: https://doc.rust-lang.org/reference/unsafe-keyword.html#unsafe-functions-unsafe-fn [unsafe traits]: https://doc.rust-lang.org/reference/unsafe-keyword.html#unsafe-traits-unsafe-trait [`export_name`]: https://doc.rust-lang.org/reference/abi.html#the-export_name-attribute [`link_section`]: https://doc.rust-lang.org/reference/abi.html#the-link_section-attribute [`no_mangle`]: https://doc.rust-lang.org/reference/abi.html#the-no_mangle-attribute - [`Send`]: https://doc.rust-lang.org/std/marker/trait.Send.html [`Sync`]: https://doc.rust-lang.org/std/marker/trait.Sync.html - - - diff --git a/src/unsafe-deep-dive/setup.md b/src/unsafe-deep-dive/setup.md index 7376979b7a09..d9dddb2a21de 100644 --- a/src/unsafe-deep-dive/setup.md +++ b/src/unsafe-deep-dive/setup.md @@ -1,20 +1,19 @@ # Setting Up -You should have a Rust compiler installed that supports the 2024 edition of the language, -which is any version of rustc higher than 1.84. +You should have a Rust compiler installed that supports the 2024 edition of the +language, which is any version of rustc higher than 1.84. ```bash $ rustc --version rustc 1.87 ``` - \ No newline at end of file +--> diff --git a/src/unsafe-deep-dive/welcome.md b/src/unsafe-deep-dive/welcome.md index 8096d5a24a70..7f6521fbe49d 100644 --- a/src/unsafe-deep-dive/welcome.md +++ b/src/unsafe-deep-dive/welcome.md @@ -1,33 +1,39 @@ # Welcome to Unsafe Rust -> IMPORTANT: THIS MODULE IS IN AN EARLY STAGE OF DEVELOPMENT +> IMPORTANT: THIS MODULE IS IN AN EARLY STAGE OF DEVELOPMENT > -> Please do not consider this module of Comprehensive Rust to be complete. -> Your feedback and comments, and especially your concerns, are very welcome. +> Please do not consider this module of Comprehensive Rust to be complete. With +> that in mind, your feedback, comments, and especially your concerns, are very +> welcome. +> +> To comment on this module's development, please use the +> [GitHub issue tracker]. -This is an extension to the Unsafe Rust session that's part of the Rust Fundametals. -That section discusses its mechanics. This section provides a more thorough understanding -of how to work with `unsafe` and produce your own. +[GitHub issue tracker]: https://github.com/google/comprehensive-rust/issues -What you'll learn: +The `unsafe` keyword is a misunderstood, but at times essential, part of the +Rust programming language. -- what the terms undefined behavior, soundness and safety mean -- why the `unsafe` keyword exists in the Rust language -- how to write your own code using `unsafe` safely -- how to review `unsafe` code +By the end of this deep dive, you'll know how to work with `unsafe` code, review +others' changes that include the `unsafe` keyword, and produce your own. -introduction to `unsafe` +What you'll learn: -Much of the what we'll cover will be to provide a mental model. +- What the terms undefined behavior, soundness, and safety mean +- Why the `unsafe` keyword exists in the Rust language +- How to write your own code using `unsafe` safely +- How to review `unsafe` code - That is, we're attempting -developing a language-agnostic +## Links to other sections of the course -It can also be used as a standalone course to teach software engineers how -to review unsafe code, with a particular focus on making it clear that ther +The `unsafe` keyword has treatment -## Links to other sections of the course. +- _Rust Fundamentals_, the main module of Comprehensive Rust, includes a session + on [Unsafe Rust] in its last day. +- _Rust in Chromium_ discusses how to [interoperate with C++]. Consult that + material if you are looking into FFI. +- _Bare Metal Rust_ uses unsafe heavily to interact with the underlying host, + among other things. -- _Rust Fundamentals_ (the module of Comprehensive rust) includes a session on unsafe in its last day. Use that if you want to -- _Rust in Chromium_ discusses how to [interoperate with C++](../chromium/interoperability-with-cpp.md). -- _Bare Metal Rust_ uses unsafe heavily to interact with the underlying host. If you are using \ No newline at end of file +[interoperate with C++]: ../chromium/interoperability-with-cpp.md +[Unsafe Rust]: /unsafe-rust.html From b4be7b9bee36fc72bef78dc5a2380b0a72cecf8b Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Fri, 4 Jul 2025 23:50:59 +1200 Subject: [PATCH 04/51] Mention the unsafe deep dive in the notes for instructors --- src/running-the-course/course-structure.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/running-the-course/course-structure.md b/src/running-the-course/course-structure.md index d47e56d06f4d..5a88b9f43fdd 100644 --- a/src/running-the-course/course-structure.md +++ b/src/running-the-course/course-structure.md @@ -72,6 +72,13 @@ cargo run {{%course outline Concurrency}} +### Unsafe (Work in Progress) + +The [Unsafe](../unsafe-deep-dive/welcome.md) deep dive is a two day class on +using the `unsafe` keyword correctly. It covers the fundamentals of what Rust's +safety guarantees are and why `unsafe` is needed, reviewing `unsafe` code, using +FFI and building data structures. + ## Format The course is meant to be very interactive and we recommend letting the From 38da590c9e4845ae356641236559af6c151b7540 Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Fri, 4 Jul 2025 23:51:49 +1200 Subject: [PATCH 05/51] Use shorter working title --- src/SUMMARY.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/SUMMARY.md b/src/SUMMARY.md index 0592fbfd5f94..bf970ed9a6a5 100644 --- a/src/SUMMARY.md +++ b/src/SUMMARY.md @@ -427,9 +427,7 @@ - [Broadcast Chat Application](concurrency/async-exercises/chat-app.md) - [Solutions](concurrency/async-exercises/solutions.md) ---- - -# Unsafe +# Writing Unsafe - [Welcome](unsafe-deep-dive/welcome.md) - [Setup](unsafe-deep-dive/setup.md) From de10892a3586daa803c0cf15c7121d15b102ed75 Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Sat, 5 Jul 2025 00:59:20 +1200 Subject: [PATCH 06/51] Sketch of opening --- src/SUMMARY.md | 12 +++++++++-- .../welcome.md => foundations.md} | 0 .../foundations/actions-might-not-be.md | 15 ++++++++++++++ .../foundations/data-structures-are-safe.md | 10 +++++++--- .../foundations/less-powerful.md | 18 +++++++++++++++++ .../foundations/what-is-unsafe.md | 19 +++++++++++++++++- src/unsafe-deep-dive/motivations.md | 18 +++++++++++++++++ .../motivations/data-structures.md | 20 +++++++++++++++++++ src/unsafe-deep-dive/motivations/interop.md | 3 +++ .../motivations/perfomance.md | 6 ++++++ src/unsafe-deep-dive/welcome.md | 2 +- 11 files changed, 116 insertions(+), 7 deletions(-) rename src/unsafe-deep-dive/{foundations/welcome.md => foundations.md} (100%) create mode 100644 src/unsafe-deep-dive/foundations/actions-might-not-be.md create mode 100644 src/unsafe-deep-dive/foundations/less-powerful.md create mode 100644 src/unsafe-deep-dive/motivations.md create mode 100644 src/unsafe-deep-dive/motivations/data-structures.md create mode 100644 src/unsafe-deep-dive/motivations/interop.md create mode 100644 src/unsafe-deep-dive/motivations/perfomance.md diff --git a/src/SUMMARY.md b/src/SUMMARY.md index bf970ed9a6a5..723e2abf1e9f 100644 --- a/src/SUMMARY.md +++ b/src/SUMMARY.md @@ -427,14 +427,22 @@ - [Broadcast Chat Application](concurrency/async-exercises/chat-app.md) - [Solutions](concurrency/async-exercises/solutions.md) -# Writing Unsafe +--- + +# Unsafe - [Welcome](unsafe-deep-dive/welcome.md) - [Setup](unsafe-deep-dive/setup.md) -- [Foundations](unsafe-deep-dive/foundations/welcome.md) +- [Motivations](unsafe-deep-dive/motivations.md) + - [Interop](unsafe-deep-dive/motivations/interop.md) + - [Data Structures](unsafe-deep-dive/motivations/data-structures.md) + - [Performance](unsafe-deep-dive/motivations/perfomance.md) +- [Foundations](unsafe-deep-dive/foundations.md) - [What is unsafe?](unsafe-deep-dive/foundations/what-is-unsafe.md) - [When is unsafe used?](unsafe-deep-dive/foundations/when-is-unsafe-used.md) - [Data structures are safe](unsafe-deep-dive/foundations/data-structures-are-safe.md) + - [Actions might not be](unsafe-deep-dive/foundations/actions-might-not-be.md) + - [Less powerful than it seems](unsafe-deep-dive/foundations/less-powerful.md) --- diff --git a/src/unsafe-deep-dive/foundations/welcome.md b/src/unsafe-deep-dive/foundations.md similarity index 100% rename from src/unsafe-deep-dive/foundations/welcome.md rename to src/unsafe-deep-dive/foundations.md diff --git a/src/unsafe-deep-dive/foundations/actions-might-not-be.md b/src/unsafe-deep-dive/foundations/actions-might-not-be.md new file mode 100644 index 000000000000..d181bec18f52 --- /dev/null +++ b/src/unsafe-deep-dive/foundations/actions-might-not-be.md @@ -0,0 +1,15 @@ +# ... but actions on them might not be + +```rust +fn main() { + let n: i64 = 12345; + let safe = &n as *const _; + println!("{safe:p}"); +} +``` + +
+ +Modify the example to de-reference `safe` without an `unsafe` block. + +
diff --git a/src/unsafe-deep-dive/foundations/data-structures-are-safe.md b/src/unsafe-deep-dive/foundations/data-structures-are-safe.md index cd5136d5f94a..427c6fb35c3f 100644 --- a/src/unsafe-deep-dive/foundations/data-structures-are-safe.md +++ b/src/unsafe-deep-dive/foundations/data-structures-are-safe.md @@ -1,4 +1,4 @@ -# Data structures are safe +# Data structures are safe ... Data structures are inert. They cannot do any harm by themselves. @@ -12,6 +12,10 @@ fn main() { } ``` -However, using them in an unsafe way. +
-Consider a raw pointer to an integer, i.e. `*const i64`. +Consider a raw pointer to an integer, i.e. the value `safe` is the raw pointer +type `*const i64`. Raw pointers can be out-of-bounds, misaligned, or refer to +null. But the unsafe keyword is not required when creating them. + +
diff --git a/src/unsafe-deep-dive/foundations/less-powerful.md b/src/unsafe-deep-dive/foundations/less-powerful.md new file mode 100644 index 000000000000..87e475154be9 --- /dev/null +++ b/src/unsafe-deep-dive/foundations/less-powerful.md @@ -0,0 +1,18 @@ +# Less powerful than it seems + +The `unsafe` keyword + +```rust +use std::mem::transmute; + +let orig = b"RUST"; +let n: i32 = unsafe { transmute(orig) }; + +println!("{}") +``` + +
+ +Try to c different + +
diff --git a/src/unsafe-deep-dive/foundations/what-is-unsafe.md b/src/unsafe-deep-dive/foundations/what-is-unsafe.md index 098ac69ff6ee..f0dfe64c190a 100644 --- a/src/unsafe-deep-dive/foundations/what-is-unsafe.md +++ b/src/unsafe-deep-dive/foundations/what-is-unsafe.md @@ -1,11 +1,28 @@ # What is “unsafety”? -The Rust reference is intentionally vague about what "unsafe" is. +Rust is intentionally vague about what "unsafe" means. Let's create a working definition together.
+## Clarifying "intentionally vague" + +It's likely that you'll have an audience member object to the phrase +"intentionally vague". Perhaps refer to this statement from the standard library +when discussing the [safety requirements of raw pointers]. + +> Many functions in [this module] take raw pointers as arguments and read from +> or write to them. For this to be safe, these pointers must be _valid_ for the +> given access. +> +> ... +> +> The precise rules for validity are not determined yet. + +[this module]: https://doc.rust-lang.org/std/ptr/index.html +[safety requirements of raw pointers]: https://doc.rust-lang.org/std/ptr/index.html#safety + ## Definitions from authoritative docs: stdlib's keyword: diff --git a/src/unsafe-deep-dive/motivations.md b/src/unsafe-deep-dive/motivations.md new file mode 100644 index 000000000000..6a5523244328 --- /dev/null +++ b/src/unsafe-deep-dive/motivations.md @@ -0,0 +1,18 @@ +# Motivations + +We know that writing code without the guarantees that Rust provides ... + +> “Use-after-free (UAF), integer overflows, and out of bounds (OOB) reads/writes +> comprise 90% of vulnerabilities with OOB being the most common.” +> +> --— **Jeff Vander Stoep and Chong Zang**, Google. "Queue the Hardening +> Enhancements" + +... so why is `unsafe` part of the language? + +
+ +The `unsafe` keyword exists because there is no compiler technology available +today that makes it obsolete. Compilers cannot verify everything. + +
diff --git a/src/unsafe-deep-dive/motivations/data-structures.md b/src/unsafe-deep-dive/motivations/data-structures.md new file mode 100644 index 000000000000..ff18dbb05f47 --- /dev/null +++ b/src/unsafe-deep-dive/motivations/data-structures.md @@ -0,0 +1,20 @@ +# Data Structures + +Some families of data structures, are impossible to create in safe Rust. + +- graphs +- bit stuffing +- self-referential types + +
+ +Graphs: General-purpose graphs cannot be created as they may need to represent +cycles. Cycles are impossible for the type system to reason about. + +Bit stuffing: Overloading bits with multiple meanings, such as the NaN bits in +`f64` for some other purpose or higher-order bits on `x86_64` platforms, + +Self-referential types are too hard for the borrow checker to verify. (note to +self: citation needed) + +
diff --git a/src/unsafe-deep-dive/motivations/interop.md b/src/unsafe-deep-dive/motivations/interop.md new file mode 100644 index 000000000000..fdf29ecdbf4a --- /dev/null +++ b/src/unsafe-deep-dive/motivations/interop.md @@ -0,0 +1,3 @@ +# Interop + +Rust has no understanding of code generated from other languages. diff --git a/src/unsafe-deep-dive/motivations/perfomance.md b/src/unsafe-deep-dive/motivations/perfomance.md new file mode 100644 index 000000000000..d40e3f66a8eb --- /dev/null +++ b/src/unsafe-deep-dive/motivations/perfomance.md @@ -0,0 +1,6 @@ +# Performance + +> Stub for now + +It's easy to think of performance as the main reason for unsafe, but high +performance code makes up the minority of unsafe blocks. diff --git a/src/unsafe-deep-dive/welcome.md b/src/unsafe-deep-dive/welcome.md index 7f6521fbe49d..f6a0aa6d9e8f 100644 --- a/src/unsafe-deep-dive/welcome.md +++ b/src/unsafe-deep-dive/welcome.md @@ -36,4 +36,4 @@ The `unsafe` keyword has treatment among other things. [interoperate with C++]: ../chromium/interoperability-with-cpp.md -[Unsafe Rust]: /unsafe-rust.html +[Unsafe Rust]: ../unsafe-rust.html From 72e073959081890ebdd0e7c94dff37921c6606ce Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Tue, 8 Jul 2025 21:53:43 +1200 Subject: [PATCH 07/51] Shorten comment so that it can fit on slides Co-authored-by: Dmitri Gribenko --- src/unsafe-deep-dive/foundations/data-structures-are-safe.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unsafe-deep-dive/foundations/data-structures-are-safe.md b/src/unsafe-deep-dive/foundations/data-structures-are-safe.md index 427c6fb35c3f..75efe3a4f7cc 100644 --- a/src/unsafe-deep-dive/foundations/data-structures-are-safe.md +++ b/src/unsafe-deep-dive/foundations/data-structures-are-safe.md @@ -2,7 +2,7 @@ Data structures are inert. They cannot do any harm by themselves. -It's possible to create a valid program with raw pointer entirely in safe code: +Safe Rust code can create raw pointers: ```rust fn main() { From 7fdbb45eaf55a2c864741c0fe973b4e9e46f55d1 Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Tue, 8 Jul 2025 22:06:16 +1200 Subject: [PATCH 08/51] Add intrusive data structures --- src/unsafe-deep-dive/motivations/data-structures.md | 1 + 1 file changed, 1 insertion(+) diff --git a/src/unsafe-deep-dive/motivations/data-structures.md b/src/unsafe-deep-dive/motivations/data-structures.md index ff18dbb05f47..bb07ca8fc246 100644 --- a/src/unsafe-deep-dive/motivations/data-structures.md +++ b/src/unsafe-deep-dive/motivations/data-structures.md @@ -5,6 +5,7 @@ Some families of data structures, are impossible to create in safe Rust. - graphs - bit stuffing - self-referential types +- intrusive data structures
From 65d60494863936af3a04b16f78052350e410f131 Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Tue, 8 Jul 2025 22:06:52 +1200 Subject: [PATCH 09/51] Use term `bit twiddling` --- src/unsafe-deep-dive/motivations/data-structures.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/unsafe-deep-dive/motivations/data-structures.md b/src/unsafe-deep-dive/motivations/data-structures.md index bb07ca8fc246..ed8d98dd4da1 100644 --- a/src/unsafe-deep-dive/motivations/data-structures.md +++ b/src/unsafe-deep-dive/motivations/data-structures.md @@ -3,7 +3,7 @@ Some families of data structures, are impossible to create in safe Rust. - graphs -- bit stuffing +- bit twiddling - self-referential types - intrusive data structures @@ -12,8 +12,10 @@ Some families of data structures, are impossible to create in safe Rust. Graphs: General-purpose graphs cannot be created as they may need to represent cycles. Cycles are impossible for the type system to reason about. -Bit stuffing: Overloading bits with multiple meanings, such as the NaN bits in -`f64` for some other purpose or higher-order bits on `x86_64` platforms, +Bit twiddling: Overloading bits with multiple meanings. Examples include using +the NaN bits in `f64` for some other purpose or the higher-order bits of +pointers on `x86_64` platforms. This is somewhat common when writing language +interpreters to keep representations within the word size the target platform. Self-referential types are too hard for the borrow checker to verify. (note to self: citation needed) From a30d51838e40ec4fa964d30d5cbb95c9433096dd Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Tue, 8 Jul 2025 22:13:39 +1200 Subject: [PATCH 10/51] Grammar - remove spurious comma --- src/unsafe-deep-dive/motivations/data-structures.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unsafe-deep-dive/motivations/data-structures.md b/src/unsafe-deep-dive/motivations/data-structures.md index ed8d98dd4da1..6d3958213153 100644 --- a/src/unsafe-deep-dive/motivations/data-structures.md +++ b/src/unsafe-deep-dive/motivations/data-structures.md @@ -1,6 +1,6 @@ # Data Structures -Some families of data structures, are impossible to create in safe Rust. +Some families of data structures are impossible to create in safe Rust. - graphs - bit twiddling From b01492413c12a1d1dad0de2474059a1ae79173c1 Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Tue, 8 Jul 2025 22:24:50 +1200 Subject: [PATCH 11/51] Fix grammar --- src/unsafe-deep-dive/foundations/what-is-unsafe.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/unsafe-deep-dive/foundations/what-is-unsafe.md b/src/unsafe-deep-dive/foundations/what-is-unsafe.md index f0dfe64c190a..944e2a67408f 100644 --- a/src/unsafe-deep-dive/foundations/what-is-unsafe.md +++ b/src/unsafe-deep-dive/foundations/what-is-unsafe.md @@ -75,8 +75,8 @@ If anyone's feeling confident, allow them to try to explain. Ask the class to spend 3-5 minutes. -- Find a a use of the unsafe keyword. What contract/invariant/pre-condition is - being established or satisfied. +- Find a use of the unsafe keyword. What contract/invariant/pre-condition is + being established or satisfied? - Write down terms that need to be defined (unsafe, memory safety, soundness, undefined behavior) From af340b0745bc40aef365116c5431d34ab13275ce Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Tue, 8 Jul 2025 22:28:21 +1200 Subject: [PATCH 12/51] Fix spelling --- src/unsafe-deep-dive/foundations/what-is-unsafe.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/unsafe-deep-dive/foundations/what-is-unsafe.md b/src/unsafe-deep-dive/foundations/what-is-unsafe.md index 944e2a67408f..9399c6bad506 100644 --- a/src/unsafe-deep-dive/foundations/what-is-unsafe.md +++ b/src/unsafe-deep-dive/foundations/what-is-unsafe.md @@ -59,11 +59,11 @@ From the [reference](https://doc.rust-lang.org/reference/unsafety.html) > You may have a group of learners who are not familiar with each other yet. > This is a way for you to gather some data about their confidence levels and -> the phsychological safety that they're feeling. +> the psychological safety that they're feeling. ### Part 1: Informal definition -> Use this to guage the confidence level of the group. If they are uncertain, +> Use this to gauge the confidence level of the group. If they are uncertain, > then tailor the next section to be more directed. Ask the class: **By raising your hand, indicate if you would feel comfortable From 6c28cfe80f06ba06fe5700764d7b30f020d664a9 Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Tue, 8 Jul 2025 22:39:42 +1200 Subject: [PATCH 13/51] Change emphasis to what unsafe enables --- .../foundations/what-is-unsafe.md | 45 ++++++++++--------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/src/unsafe-deep-dive/foundations/what-is-unsafe.md b/src/unsafe-deep-dive/foundations/what-is-unsafe.md index 9399c6bad506..393182bf96e7 100644 --- a/src/unsafe-deep-dive/foundations/what-is-unsafe.md +++ b/src/unsafe-deep-dive/foundations/what-is-unsafe.md @@ -1,31 +1,14 @@ # What is “unsafety”? -Rust is intentionally vague about what "unsafe" means. +Unsafe Rust is a superset of Safe Rust. -Let's create a working definition together. +Let's create a list of things that are enabled by the `unsafe` keyword.
-## Clarifying "intentionally vague" - -It's likely that you'll have an audience member object to the phrase -"intentionally vague". Perhaps refer to this statement from the standard library -when discussing the [safety requirements of raw pointers]. - -> Many functions in [this module] take raw pointers as arguments and read from -> or write to them. For this to be safe, these pointers must be _valid_ for the -> given access. -> -> ... -> -> The precise rules for validity are not determined yet. - -[this module]: https://doc.rust-lang.org/std/ptr/index.html -[safety requirements of raw pointers]: https://doc.rust-lang.org/std/ptr/index.html#safety - ## Definitions from authoritative docs: -stdlib's keyword: +From the [unsafe keyword's documentation](): > Code or interfaces whose memory safety cannot be verified by the type system. > @@ -86,4 +69,26 @@ Ask the class to spend 3-5 minutes. Mention that we'll be reviewing our definition at the end of the day. +## Note: Avoid detailed discussion about precise semantics of memory safety + +It's possible that the group will slide into a discussion about the precise +semantics of what memory safety actually is and how define pointer validity. +This isn't a productive line of discussion. It can undermine confidence in less +experienced learners. + +Perhaps refer people who wish to discuss this to the discussion within the +official [documentation for pointer types] (excerpt below) as a place for +further research. + +> Many functions in [this module] take raw pointers as arguments and read from +> or write to them. For this to be safe, these pointers must be _valid_ for the +> given access. +> +> ... +> +> The precise rules for validity are not determined yet. + +[this module]: https://doc.rust-lang.org/std/ptr/index.html +[documentation for pointer types]: https://doc.rust-lang.org/std/ptr/index.html#safety +
From c4c7382e9e4a41dc4089118a5adf25717cf40a1e Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Wed, 9 Jul 2025 18:20:29 +1200 Subject: [PATCH 14/51] Add TODO marker --- src/unsafe-deep-dive/motivations/perfomance.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unsafe-deep-dive/motivations/perfomance.md b/src/unsafe-deep-dive/motivations/perfomance.md index d40e3f66a8eb..7370375f43cb 100644 --- a/src/unsafe-deep-dive/motivations/perfomance.md +++ b/src/unsafe-deep-dive/motivations/perfomance.md @@ -1,6 +1,6 @@ # Performance -> Stub for now +> TODO: Stub for now It's easy to think of performance as the main reason for unsafe, but high performance code makes up the minority of unsafe blocks. From 92af3d006f66b53e2707a8869712bd7147ed261f Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Wed, 9 Jul 2025 19:02:42 +1200 Subject: [PATCH 15/51] Finish draft of "less powrful than it seems" --- .../foundations/less-powerful.md | 36 +++++++++++++++++-- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/src/unsafe-deep-dive/foundations/less-powerful.md b/src/unsafe-deep-dive/foundations/less-powerful.md index 87e475154be9..80e18b1721f6 100644 --- a/src/unsafe-deep-dive/foundations/less-powerful.md +++ b/src/unsafe-deep-dive/foundations/less-powerful.md @@ -1,6 +1,6 @@ # Less powerful than it seems -The `unsafe` keyword +The `unsafe` keyword does not allow you to break Rust. ```rust use std::mem::transmute; @@ -8,11 +8,41 @@ use std::mem::transmute; let orig = b"RUST"; let n: i32 = unsafe { transmute(orig) }; -println!("{}") +println!("{n}") ```
-Try to c different +## Suggested outline + +- Request that someone explains what `std::mem::transmute` does +- Discuss why it doesn't compile +- Make + +## Expected compiler output + +``` + Compiling playground v0.0.1 (/playground) +error[E0512]: cannot transmute between types of different sizes, or dependently-sized types + --> src/main.rs:5:27 + | +5 | let n: i32 = unsafe { transmute(orig) }; + | ^^^^^^^^^ + | + = note: source type: `&[u8; 4]` (64 bits) + = note: target type: `i32` (32 bits) +``` + +## Suggested change + +```diff +- let n: i32 = unsafe { transmute(orig) }; ++ let n: i64 = unsafe { transmute(orig) }; +``` + +## Notes on less familiar Rust + +- the `b` prefix on a string literal marks it as byte slice (`&[u8]`) rather + than a string slice (`&str`)
From d1b856caa96f70397c32ec12087cdfddca888ef9 Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Wed, 9 Jul 2025 19:38:07 +1200 Subject: [PATCH 16/51] Add deeplink to blog post --- src/unsafe-deep-dive/motivations.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/unsafe-deep-dive/motivations.md b/src/unsafe-deep-dive/motivations.md index 6a5523244328..564cac9dfc73 100644 --- a/src/unsafe-deep-dive/motivations.md +++ b/src/unsafe-deep-dive/motivations.md @@ -5,8 +5,8 @@ We know that writing code without the guarantees that Rust provides ... > “Use-after-free (UAF), integer overflows, and out of bounds (OOB) reads/writes > comprise 90% of vulnerabilities with OOB being the most common.” > -> --— **Jeff Vander Stoep and Chong Zang**, Google. "Queue the Hardening -> Enhancements" +> --— **Jeff Vander Stoep and Chong Zang**, Google. +> "[Queue the Hardening Enhancements](https://security.googleblog.com/2019/05/queue-hardening-enhancements.html)" ... so why is `unsafe` part of the language? From df1a9365c8fbfc109839a44de01ddd0a7066e59e Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Wed, 9 Jul 2025 19:49:50 +1200 Subject: [PATCH 17/51] Replace the word "misunderstood" with something clearer --- src/unsafe-deep-dive/welcome.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/unsafe-deep-dive/welcome.md b/src/unsafe-deep-dive/welcome.md index f6a0aa6d9e8f..ac946a3fb005 100644 --- a/src/unsafe-deep-dive/welcome.md +++ b/src/unsafe-deep-dive/welcome.md @@ -11,8 +11,9 @@ [GitHub issue tracker]: https://github.com/google/comprehensive-rust/issues -The `unsafe` keyword is a misunderstood, but at times essential, part of the -Rust programming language. +The `unsafe` keyword is easy to type, but difficult to master. When used +appropriately, it forms a useful and indeed essential part of the Rust +programming language. By the end of this deep dive, you'll know how to work with `unsafe` code, review others' changes that include the `unsafe` keyword, and produce your own. From 16701fa94eb4da970d12b1b618ae2e9402f00153 Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Mon, 14 Jul 2025 14:00:58 +1200 Subject: [PATCH 18/51] Clarify course description --- src/running-the-course/course-structure.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/running-the-course/course-structure.md b/src/running-the-course/course-structure.md index 5a88b9f43fdd..b41b08829063 100644 --- a/src/running-the-course/course-structure.md +++ b/src/running-the-course/course-structure.md @@ -74,10 +74,10 @@ cargo run ### Unsafe (Work in Progress) -The [Unsafe](../unsafe-deep-dive/welcome.md) deep dive is a two day class on -using the `unsafe` keyword correctly. It covers the fundamentals of what Rust's +The [Unsafe](../unsafe-deep-dive/welcome.md) deep dive is a two-day class on +the *unsafe* Rust language. It covers the fundamentals of what Rust's safety guarantees are and why `unsafe` is needed, reviewing `unsafe` code, using -FFI and building data structures. +FFI, and building data structures that the borrow checker would normally reject. ## Format From 594a73f574fd47a96fc07820ef2672a7e3b9d98a Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Mon, 14 Jul 2025 14:30:21 +1200 Subject: [PATCH 19/51] Add timing directives --- src/running-the-course/course-structure.md | 2 ++ src/unsafe-deep-dive/foundations/actions-might-not-be.md | 4 ++++ src/unsafe-deep-dive/foundations/less-powerful.md | 4 ++++ src/unsafe-deep-dive/foundations/what-is-unsafe.md | 4 ++++ src/unsafe-deep-dive/foundations/when-is-unsafe-used.md | 4 ++++ src/unsafe-deep-dive/motivations.md | 6 ++++++ src/unsafe-deep-dive/motivations/data-structures.md | 4 ++++ src/unsafe-deep-dive/motivations/interop.md | 4 ++++ src/unsafe-deep-dive/setup.md | 4 ++++ src/unsafe-deep-dive/welcome.md | 6 ++++++ 10 files changed, 42 insertions(+) diff --git a/src/running-the-course/course-structure.md b/src/running-the-course/course-structure.md index b41b08829063..6ae8af4b9fa0 100644 --- a/src/running-the-course/course-structure.md +++ b/src/running-the-course/course-structure.md @@ -79,6 +79,8 @@ the *unsafe* Rust language. It covers the fundamentals of what Rust's safety guarantees are and why `unsafe` is needed, reviewing `unsafe` code, using FFI, and building data structures that the borrow checker would normally reject. +{{%course outline Unsafe}} + ## Format The course is meant to be very interactive and we recommend letting the diff --git a/src/unsafe-deep-dive/foundations/actions-might-not-be.md b/src/unsafe-deep-dive/foundations/actions-might-not-be.md index d181bec18f52..fd9f60d790e6 100644 --- a/src/unsafe-deep-dive/foundations/actions-might-not-be.md +++ b/src/unsafe-deep-dive/foundations/actions-might-not-be.md @@ -1,3 +1,7 @@ +--- +minutes: 2 +--- + # ... but actions on them might not be ```rust diff --git a/src/unsafe-deep-dive/foundations/less-powerful.md b/src/unsafe-deep-dive/foundations/less-powerful.md index 80e18b1721f6..14752a4bf191 100644 --- a/src/unsafe-deep-dive/foundations/less-powerful.md +++ b/src/unsafe-deep-dive/foundations/less-powerful.md @@ -1,3 +1,7 @@ +--- +minutes: 10 +--- + # Less powerful than it seems The `unsafe` keyword does not allow you to break Rust. diff --git a/src/unsafe-deep-dive/foundations/what-is-unsafe.md b/src/unsafe-deep-dive/foundations/what-is-unsafe.md index 393182bf96e7..8af083ac3fac 100644 --- a/src/unsafe-deep-dive/foundations/what-is-unsafe.md +++ b/src/unsafe-deep-dive/foundations/what-is-unsafe.md @@ -1,3 +1,7 @@ +--- +minutes: 6 +--- + # What is “unsafety”? Unsafe Rust is a superset of Safe Rust. diff --git a/src/unsafe-deep-dive/foundations/when-is-unsafe-used.md b/src/unsafe-deep-dive/foundations/when-is-unsafe-used.md index 088b73a38ee6..262ed90a73e4 100644 --- a/src/unsafe-deep-dive/foundations/when-is-unsafe-used.md +++ b/src/unsafe-deep-dive/foundations/when-is-unsafe-used.md @@ -1,3 +1,7 @@ +--- +minutes: 2 +--- + # When is unsafe used? The unsafe keyword indicates that the programmer is responsible for upholding diff --git a/src/unsafe-deep-dive/motivations.md b/src/unsafe-deep-dive/motivations.md index 564cac9dfc73..bffd117f1470 100644 --- a/src/unsafe-deep-dive/motivations.md +++ b/src/unsafe-deep-dive/motivations.md @@ -1,3 +1,7 @@ +--- +minutes: 1 +--- + # Motivations We know that writing code without the guarantees that Rust provides ... @@ -10,6 +14,8 @@ We know that writing code without the guarantees that Rust provides ... ... so why is `unsafe` part of the language? +{{%session outline}} +
The `unsafe` keyword exists because there is no compiler technology available diff --git a/src/unsafe-deep-dive/motivations/data-structures.md b/src/unsafe-deep-dive/motivations/data-structures.md index 6d3958213153..d15c904839e5 100644 --- a/src/unsafe-deep-dive/motivations/data-structures.md +++ b/src/unsafe-deep-dive/motivations/data-structures.md @@ -1,3 +1,7 @@ +--- +minutes: 10 +--- + # Data Structures Some families of data structures are impossible to create in safe Rust. diff --git a/src/unsafe-deep-dive/motivations/interop.md b/src/unsafe-deep-dive/motivations/interop.md index fdf29ecdbf4a..6fb3146e8a5f 100644 --- a/src/unsafe-deep-dive/motivations/interop.md +++ b/src/unsafe-deep-dive/motivations/interop.md @@ -1,3 +1,7 @@ +--- +minutes: 2 +--- + # Interop Rust has no understanding of code generated from other languages. diff --git a/src/unsafe-deep-dive/setup.md b/src/unsafe-deep-dive/setup.md index d9dddb2a21de..8dc83f4cfb8b 100644 --- a/src/unsafe-deep-dive/setup.md +++ b/src/unsafe-deep-dive/setup.md @@ -1,3 +1,7 @@ +--- +minutes: 2 +--- + # Setting Up You should have a Rust compiler installed that supports the 2024 edition of the diff --git a/src/unsafe-deep-dive/welcome.md b/src/unsafe-deep-dive/welcome.md index f6a0aa6d9e8f..b1b73578907c 100644 --- a/src/unsafe-deep-dive/welcome.md +++ b/src/unsafe-deep-dive/welcome.md @@ -1,3 +1,9 @@ +--- +course: Unsafe +session: Day 1 Morning +target_minutes: 300 +--- + # Welcome to Unsafe Rust > IMPORTANT: THIS MODULE IS IN AN EARLY STAGE OF DEVELOPMENT From 7dc18c95d5995b4a08228d0a6d229229864e7aa9 Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Mon, 14 Jul 2025 16:52:35 +1200 Subject: [PATCH 20/51] Expand setup instructions --- src/unsafe-deep-dive/setup.md | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/src/unsafe-deep-dive/setup.md b/src/unsafe-deep-dive/setup.md index 8dc83f4cfb8b..b4bea8be0b43 100644 --- a/src/unsafe-deep-dive/setup.md +++ b/src/unsafe-deep-dive/setup.md @@ -4,10 +4,12 @@ minutes: 2 # Setting Up +## Local Rust installation + You should have a Rust compiler installed that supports the 2024 edition of the language, which is any version of rustc higher than 1.84. -```bash +```console $ rustc --version rustc 1.87 ``` @@ -21,3 +23,26 @@ We recommend that you install the [Bazel build system](https://bazel.build/insta This will allow you to easily compile project that combine multiple languages. --> + +## (Optional) Create a local instance of the course + +Having a local version of the course material is useful in case of any interruptions with the network and makes it easy to access it later. + +```console +$ git clone --depth=1 https://github.com/google/comprehensive-rust.git +Cloning into 'comprehensive-rust'... +... +$ cd comprehensive-rust +$ cargo install-tools +... +$ cargo serve # then open http://127.0.0.1:3000/ in a browser +``` + + +
+ +Ask everyone to confirm that everyone is able to execute `rustc` with a version older that 1.87. + +For those people who do not, tell them that we'll resolve that in the break. + +
\ No newline at end of file From 8472b441501147e2da6f1322fead42acd5b3460a Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Mon, 14 Jul 2025 16:53:31 +1200 Subject: [PATCH 21/51] Minor: apply formatting --- src/unsafe-deep-dive/setup.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/unsafe-deep-dive/setup.md b/src/unsafe-deep-dive/setup.md index b4bea8be0b43..2d9feeb5938d 100644 --- a/src/unsafe-deep-dive/setup.md +++ b/src/unsafe-deep-dive/setup.md @@ -16,8 +16,8 @@ rustc 1.87 + +If an unsafe block has multiple safety conditions that can be assessed +independently, then it's likely that each of those conditions should be in its +own block. + +
diff --git a/src/unsafe-deep-dive/mechanics/safety-comments.md b/src/unsafe-deep-dive/mechanics/safety-comments.md new file mode 100644 index 000000000000..91d7a6642155 --- /dev/null +++ b/src/unsafe-deep-dive/mechanics/safety-comments.md @@ -0,0 +1,14 @@ +# Safety comments + +// TODO: Expand + +
+ +An effective safety comment is falsifiable. That is, there should be something +empirical that people can point to and check. + +Note that Clippy's lint for safety comments does little more than check that the +string SAFETY: appears before the `unsafe` keyword. There is no further +validation. + +
From fdaf9cd77c9bc6f423a3389a66137095aced3600 Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Thu, 17 Jul 2025 15:11:04 +1200 Subject: [PATCH 35/51] Init mechanics segment --- src/SUMMARY.md | 1 + src/unsafe-deep-dive/mechanics.md | 6 ++++++ 2 files changed, 7 insertions(+) create mode 100644 src/unsafe-deep-dive/mechanics.md diff --git a/src/SUMMARY.md b/src/SUMMARY.md index 1dca1f14df59..0f61ccd54c2f 100644 --- a/src/SUMMARY.md +++ b/src/SUMMARY.md @@ -454,6 +454,7 @@ - [Data structures are safe](unsafe-deep-dive/foundations/data-structures-are-safe.md) - [Actions might not be](unsafe-deep-dive/foundations/actions-might-not-be.md) - [Less powerful than it seems](unsafe-deep-dive/foundations/less-powerful.md) +- [Mechanics](unsafe-deep-dive/mechanics.md) --- diff --git a/src/unsafe-deep-dive/mechanics.md b/src/unsafe-deep-dive/mechanics.md new file mode 100644 index 000000000000..bdb424728c8d --- /dev/null +++ b/src/unsafe-deep-dive/mechanics.md @@ -0,0 +1,6 @@ +# Mechanics + +We've seen several examples of `unsafe` blocks in this course. Let's look at +what's involved in creating a well-written one. + +{{% segment outline}} From 9c070c9fc6ada01e83aa6c139da115332a606ee2 Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Thu, 17 Jul 2025 18:19:48 +1200 Subject: [PATCH 36/51] Start of Mechanics segment --- src/SUMMARY.md | 4 ++ src/unsafe-deep-dive/mechanics.md | 6 +++ .../mechanics/narrow-scope.md | 45 +++++++++++++++++ .../representing-booleans-extension.md | 50 +++++++++++++++++++ .../mechanics/representing-booleans.md | 32 ++++++++++++ .../mechanics/safety-comments.md | 14 ++++++ 6 files changed, 151 insertions(+) create mode 100644 src/unsafe-deep-dive/mechanics/narrow-scope.md create mode 100644 src/unsafe-deep-dive/mechanics/representing-booleans-extension.md create mode 100644 src/unsafe-deep-dive/mechanics/representing-booleans.md create mode 100644 src/unsafe-deep-dive/mechanics/safety-comments.md diff --git a/src/SUMMARY.md b/src/SUMMARY.md index 0f61ccd54c2f..a8bb0fb1de12 100644 --- a/src/SUMMARY.md +++ b/src/SUMMARY.md @@ -455,6 +455,10 @@ - [Actions might not be](unsafe-deep-dive/foundations/actions-might-not-be.md) - [Less powerful than it seems](unsafe-deep-dive/foundations/less-powerful.md) - [Mechanics](unsafe-deep-dive/mechanics.md) + - [Narrow scope](unsafe-deep-dive/mechanics/narrow-scope.md) + - [Safety comments](unsafe-deep-dive/mechanics/safety-comments.md) + - [Example: Representing Booleans](unsafe-deep-dive/mechanics/representing-booleans.md) + - [Extension](unsafe-deep-dive/mechanics/representing-booleans-extension.md) --- diff --git a/src/unsafe-deep-dive/mechanics.md b/src/unsafe-deep-dive/mechanics.md index bdb424728c8d..1181a594fcc4 100644 --- a/src/unsafe-deep-dive/mechanics.md +++ b/src/unsafe-deep-dive/mechanics.md @@ -4,3 +4,9 @@ We've seen several examples of `unsafe` blocks in this course. Let's look at what's involved in creating a well-written one. {{% segment outline}} + +
+ +Inform the class that we will be doing a number + +
diff --git a/src/unsafe-deep-dive/mechanics/narrow-scope.md b/src/unsafe-deep-dive/mechanics/narrow-scope.md new file mode 100644 index 000000000000..a0e7a2909971 --- /dev/null +++ b/src/unsafe-deep-dive/mechanics/narrow-scope.md @@ -0,0 +1,45 @@ +# Keep unsafe narrow + +Compare these two code examples: + +```rust +fn main() { + let raw = b"Crab"; + + // SAFETY: `raw` has the static lifetime of valid UTF-8 data and therefore `ptr` is valid + let crab = unsafe { + let ptr = raw.as_ptr(); + let bytes = std::slice::from_raw_parts(ptr, 4); + std::str::from_utf8_unchecked(bytes) + }; + + println!("{crab}"); +} +``` + +```rust +fn main() { + let raw = b"Crab"; + let ptr = raw.as_ptr(); + + // SAFETY: `raw` has the static lifetime and therefore `ptr` is valid + let bytes = unsafe { std::slice::from_raw_parts(ptr, 4) }; + + // SAFETY: We created `raw` with valid UTF-8 data + let crab = unsafe { std::str::from_utf8_unchecked(bytes) }; + + println!("{crab}"); +} +``` + +
+ +Unsafe blocks should have a narrow lens. + + + +If an unsafe block has multiple safety conditions that can be assessed +independently, then it's likely that each of those conditions should be in its +own block. + +
diff --git a/src/unsafe-deep-dive/mechanics/representing-booleans-extension.md b/src/unsafe-deep-dive/mechanics/representing-booleans-extension.md new file mode 100644 index 000000000000..cb36840cfdc3 --- /dev/null +++ b/src/unsafe-deep-dive/mechanics/representing-booleans-extension.md @@ -0,0 +1,50 @@ +# Extension + +Create a similar data structure for Rust's [`char`] type. A `char` occupies 4 +bytes, but not all 4 bytes sequences are valid as `char`. + +[`char`]: https://doc.rust-lang.org/std/primitive.char.html + +Here is some starter code: + +```rust +struct Char; + +impl TryFrom for Char { + type Error = u32; + + fn try_from(x: u32) -> std::result::Result>::Error> { + todo!() // Attempt conversion, returning Err(x) when invalid + } +} + +#[test] +fn repr_matches() { + use std::alloc::Layout; + + assert_eq!(Layout::new::(), Layout::new::()); +} + +#[test] +fn conversion() { + for i in u32::MIN..=u32::MAX { + let res = Char::try_from(i); + + match i { + 0..=0xD7FF | 0xE000..=0x10FFFF => assert!(res.is_ok()), + _ => assert!(res.is_err()), + }; + } +} +``` + +
+ +Representation: + +From Rust's documentation: + +> `char` is guaranteed to have the same size, alignment, and function call ABI +> as `u32` on all platforms. + +
diff --git a/src/unsafe-deep-dive/mechanics/representing-booleans.md b/src/unsafe-deep-dive/mechanics/representing-booleans.md new file mode 100644 index 000000000000..de1b46b9e8a2 --- /dev/null +++ b/src/unsafe-deep-dive/mechanics/representing-booleans.md @@ -0,0 +1,32 @@ +# Example: Representing Booleans + +To Boolean values must match a precise representation to avoid undefined +behavior. + + + + + + + + + + + + + + +
Bit patternRust type
00000001true
00000000false
Other patternsUndefined
+ +You have two tasks in this exercise. + +- First, create Rust struct that represents a Boolean value and a function that + create a value of your type from `u8` with no overhead cost while ensuring + that undefined behavior is impossible. +- Secondly, review someone else's implementation. + +Starter code: + +```rust +struct Boolean(u8); +``` diff --git a/src/unsafe-deep-dive/mechanics/safety-comments.md b/src/unsafe-deep-dive/mechanics/safety-comments.md new file mode 100644 index 000000000000..91d7a6642155 --- /dev/null +++ b/src/unsafe-deep-dive/mechanics/safety-comments.md @@ -0,0 +1,14 @@ +# Safety comments + +// TODO: Expand + +
+ +An effective safety comment is falsifiable. That is, there should be something +empirical that people can point to and check. + +Note that Clippy's lint for safety comments does little more than check that the +string SAFETY: appears before the `unsafe` keyword. There is no further +validation. + +
From bf646f8e16e8bfd97950513a01b1b5368cd62633 Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Mon, 21 Jul 2025 18:33:58 +1200 Subject: [PATCH 37/51] Expand unsafe deep dive --- src/SUMMARY.md | 10 +- src/unsafe-deep-dive/mechanics/case-study.md | 20 ++++ .../guideline-invariant-checklist.md | 11 ++ .../mechanics/guideline-safety-checklist.md | 52 +++++++++ src/unsafe-deep-dive/mechanics/guidelines.md | 10 ++ .../representing-booleans-extension.md | 2 +- .../mechanics/representing-booleans.md | 108 +++++++++++++++++- .../representing-only-even-numbers.md | 1 + .../mechanics/safety-comments.md | 51 ++++++++- 9 files changed, 254 insertions(+), 11 deletions(-) create mode 100644 src/unsafe-deep-dive/mechanics/case-study.md create mode 100644 src/unsafe-deep-dive/mechanics/guideline-invariant-checklist.md create mode 100644 src/unsafe-deep-dive/mechanics/guideline-safety-checklist.md create mode 100644 src/unsafe-deep-dive/mechanics/guidelines.md create mode 100644 src/unsafe-deep-dive/mechanics/representing-only-even-numbers.md diff --git a/src/SUMMARY.md b/src/SUMMARY.md index a8bb0fb1de12..95b65a3b5da3 100644 --- a/src/SUMMARY.md +++ b/src/SUMMARY.md @@ -455,10 +455,14 @@ - [Actions might not be](unsafe-deep-dive/foundations/actions-might-not-be.md) - [Less powerful than it seems](unsafe-deep-dive/foundations/less-powerful.md) - [Mechanics](unsafe-deep-dive/mechanics.md) - - [Narrow scope](unsafe-deep-dive/mechanics/narrow-scope.md) - - [Safety comments](unsafe-deep-dive/mechanics/safety-comments.md) - [Example: Representing Booleans](unsafe-deep-dive/mechanics/representing-booleans.md) - - [Extension](unsafe-deep-dive/mechanics/representing-booleans-extension.md) + - [Extension:](unsafe-deep-dive/mechanics/representing-booleans-extension.md) + - [Extension: Representing "Only Even Numbers"](unsafe-deep-dive/mechanics/representing-only-even-numbers.md) + - [Case Study](unsafe-deep-dive/mechanics/case-study.md) + - [Guidelines](unsafe-deep-dive/mechanics/guidelines.md) + - [Narrow scope](unsafe-deep-dive/mechanics/narrow-scope.md) + - [Safety comments](unsafe-deep-dive/mechanics/safety-comments.md) + - [Safety checklist](unsafe-deep-dive/mechanics/guideline-safety-checklist.md) --- diff --git a/src/unsafe-deep-dive/mechanics/case-study.md b/src/unsafe-deep-dive/mechanics/case-study.md new file mode 100644 index 000000000000..5f59102e6391 --- /dev/null +++ b/src/unsafe-deep-dive/mechanics/case-study.md @@ -0,0 +1,20 @@ +# Case Study: Lesser-known parts of std::mem + +As a group, we'll study some parts of Rust's memory management functionality: + +- `std::mem::TransmuteFrom` trait and its `Assume` struct +- `std::mem::discriminant` +- `std::mem::forget_unsized` +- `std::mem::MaybeUninit` + +
+ +Split learners into small groups. After a few minutes, they should be able to +answer the following questions: + +- What does it do? +- How does the documentation describe the safety contract, if any? Can that + documentation be improved? +- + +
diff --git a/src/unsafe-deep-dive/mechanics/guideline-invariant-checklist.md b/src/unsafe-deep-dive/mechanics/guideline-invariant-checklist.md new file mode 100644 index 000000000000..92daf61efdd4 --- /dev/null +++ b/src/unsafe-deep-dive/mechanics/guideline-invariant-checklist.md @@ -0,0 +1,11 @@ +# Invariant Checklist + +When writing and reviewing `unsafe` code, we should make sure that there are no. + +- Validity +- Alignment +- "Business Rules", i.e. all values must be even numbers + +
+ +
diff --git a/src/unsafe-deep-dive/mechanics/guideline-safety-checklist.md b/src/unsafe-deep-dive/mechanics/guideline-safety-checklist.md new file mode 100644 index 000000000000..c2e877e1b769 --- /dev/null +++ b/src/unsafe-deep-dive/mechanics/guideline-safety-checklist.md @@ -0,0 +1,52 @@ +# Safety checklist + +When writing and reviewing `unsafe` code, we should make sure that we've +considered the following considerations _and documented_ what callers must later +uphold: + +- Validity +- Alignment +- Lifetimes +- Ownership +- Platform +- Compliance with specifications + +
+ +**Validity** + +Callers must ensure that values must match some bit-pattern. + +**Alignment** + +Callers must ensure that values are correctly aligned. + +**Lifetimes** + +Do callers need to verify that a referent must exist before/after/during? + +**Ownership** + +Can this function generate confusion about ownership? + +> _Aside:_ Memory leaks +> +> A discussion about leaking memory may arise here. If calling a function +> removes all ownership information, then . +> +> Memory leaking is not strictly a memory safety concern. However, it's often a +> problem in practice, especially if it is unintentional. +> +> Therefore, this should at least be documented. If it's possible to mishandle +> the API and cause an unintentional leak, then there is a case for an unsafe +> block. + +**Platform** + +Callers must be wary of platform-specific behavior. + +**Compliance with specifications** + +"Business Rules", i.e. all values must be even numbers. + +
diff --git a/src/unsafe-deep-dive/mechanics/guidelines.md b/src/unsafe-deep-dive/mechanics/guidelines.md new file mode 100644 index 000000000000..a48f3450f4b1 --- /dev/null +++ b/src/unsafe-deep-dive/mechanics/guidelines.md @@ -0,0 +1,10 @@ +# Guidelines + +Ther + +
+ +The next few slides are intended as reference material. You should have covered +the material in the discussion. + +
diff --git a/src/unsafe-deep-dive/mechanics/representing-booleans-extension.md b/src/unsafe-deep-dive/mechanics/representing-booleans-extension.md index cb36840cfdc3..ce75da82e9c2 100644 --- a/src/unsafe-deep-dive/mechanics/representing-booleans-extension.md +++ b/src/unsafe-deep-dive/mechanics/representing-booleans-extension.md @@ -40,7 +40,7 @@ fn conversion() {
-Representation: +## Representation From Rust's documentation: diff --git a/src/unsafe-deep-dive/mechanics/representing-booleans.md b/src/unsafe-deep-dive/mechanics/representing-booleans.md index de1b46b9e8a2..80e09233bedd 100644 --- a/src/unsafe-deep-dive/mechanics/representing-booleans.md +++ b/src/unsafe-deep-dive/mechanics/representing-booleans.md @@ -1,7 +1,6 @@ -# Example: Representing Booleans +# Example: Representing Boolean values -To Boolean values must match a precise representation to avoid undefined -behavior. +Boolean values must match a precise representation to avoid undefined behavior: @@ -20,13 +19,110 @@ behavior. You have two tasks in this exercise. -- First, create Rust struct that represents a Boolean value and a function that - create a value of your type from `u8` with no overhead cost while ensuring - that undefined behavior is impossible. +- First, + - Create Rust type, `Boolean` type that represents a Boolean value in a + spec-compliant way + - The first create values of your type from `u8` with no overhead cost while + ensuring that undefined behavior is impossible. - Secondly, review someone else's implementation. Starter code: +Part 1 involves a `Boolean`, which is a type that can be + +
+ +Admittedly, there isn't much starter code. + +## Discussion + +- The critical point in these reviews is that learners accurately describe the + contract that callers need to uphold when converting from `u8`. It should be + well described in a Safety section of the docstring. +- Functions should have an `#[inline(always)]` annotation as Rust's `Copy` trait + involves memcpy. We want the compiler to erase the function call + +> _Aside: TransmuteFrom trait_ +> +> The standard library contains a nightly feature, `transmutability` which +> defines the [`std::mem::TransmuteFrom`] trait for performing this kind of +> operation. This is one of the outputs from the [Safe Transmute Project] within +> the Rust compiler team. + +[`transmutability`]: https://github.com/rust-lang/rust/issues/99571 +[Safe Transmute Project]: https://github.com/rust-lang/project-safe-transmute +[`std::mem::TransmuteFrom`]: https://doc.rust-lang.org/std/mem/trait.TransmuteFrom.html + +### Picking a data structure + +**Newtype wrapping u8** + +The orthodox strategy will be to wrap `u8` in a struct: + ```rust struct Boolean(u8); ``` + +This ensures that the representation is the same as `u8`. + +**Newtype wrapping bool** + +Hopefully, you will have some learners will wrap `bool` as a newtype: + +```rust +struct Boolean(bool); +``` + +At first, this may look like a bit of a cheat code for the exercise. It won't +avoid the need to convert from `u8`, however. + +Wrapping `bool` includes the bonus that you can guarantee--in so far as you can +guarantee Rust's own behavior--that `Boolean` is spec-compliant with `bool`. + +It may also look redundant - why bother creating a new type when it doesn't +perform as a `bool`? Because it gives us complete control over the trait system. + +**Union** + +An alternative strategy would be to use a `union`: + +```rust +union Byte { + u8, + bool, +} +``` + +This isn't advised. It means that the value will _never_ be able to be +considered safe to access. Callers will need to ensure that they comply with the +rules at every interaction with the type. + +**Typestate** + +Some advanced programmers may attempt to encode Boolean values as zero-sized +types in the type system. If you receive questions about this, gently nudge them +back to including the byte. + +```rust +struct True; +struct False; +``` + +There are a couple of reasons for this. First, zero-sized types do not obey the +width and alignment requirements of the spec for `bool`. Secondly, they're very +difficult to work with in practice. + +If they wish to make use of the typestate pattern, then a possible alternative +would be to . + +```rust +struct Boolean(bool); +struct True(bool); +struct False(bool); +``` + +## Code review + +Suggest that there be some advice + +
diff --git a/src/unsafe-deep-dive/mechanics/representing-only-even-numbers.md b/src/unsafe-deep-dive/mechanics/representing-only-even-numbers.md new file mode 100644 index 000000000000..c37d762e7c9d --- /dev/null +++ b/src/unsafe-deep-dive/mechanics/representing-only-even-numbers.md @@ -0,0 +1 @@ +# Extension: Representing "Only Even Numbers" diff --git a/src/unsafe-deep-dive/mechanics/safety-comments.md b/src/unsafe-deep-dive/mechanics/safety-comments.md index 91d7a6642155..daf68b87f147 100644 --- a/src/unsafe-deep-dive/mechanics/safety-comments.md +++ b/src/unsafe-deep-dive/mechanics/safety-comments.md @@ -1,9 +1,58 @@ # Safety comments -// TODO: Expand +When defining unsafe functions, provide a `Safety` section in the docstring: + +```rust,editable +/// Compress `data`, overwriting its memory and updating the length of the slice. +unsafe fn compress_inplace(data: &mut [u8]) { + todo!(); +} +``` + +When using an unsafe block, document how you have upheld your side of the +contract: + +```rust,editable +unsafe { + std::mem::transmute::(x) +} +```
+## Code + +```rust +/// Compress `data`, overwriting its memory and updating the length of the slice. +/// +/// ## Safety +/// +/// Callers must ensure that the data's compressed form is shorter than the +/// original. As a heuristic, this function should not be used on a buffer +/// that has fewer than 256 bytes. +unsafe fn compress_inplace(data: &mut [u8]) { + todo!(); +} +``` + +```rust +/// SAFETY: We control the generation of `x` and can ensure that it's 4 bytes wide +unsafe { + std::mem::transmute::(x) +} +``` + +> _Aside: In-place compression_ +> +> Creating an algorithm that does in-place compression is likely to nerd snipe 1 +> or two people. Avoid getting distracted. +> +> You could mention that it's possible to use a stack-allocated tmp buffer +> rather than something on the heap. If the implementation uses a static buffer, +> the comment must be updated to mention that the code is not thread-safe. + +## Discussion + An effective safety comment is falsifiable. That is, there should be something empirical that people can point to and check. From 496b0c64c13f457ddf71e5464796aaaca8175e1a Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Wed, 23 Jul 2025 16:05:08 +1200 Subject: [PATCH 38/51] Finish Guidelines intro slide --- src/unsafe-deep-dive/mechanics/guidelines.md | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/unsafe-deep-dive/mechanics/guidelines.md b/src/unsafe-deep-dive/mechanics/guidelines.md index a48f3450f4b1..be0af0a9c616 100644 --- a/src/unsafe-deep-dive/mechanics/guidelines.md +++ b/src/unsafe-deep-dive/mechanics/guidelines.md @@ -1,10 +1,17 @@ # Guidelines -Ther +> WORK IN PROGRESS +> +> These guidelines should not be interpreted as authoritative or official. + +Specific advice on creating well-written unsafe Rust code.
-The next few slides are intended as reference material. You should have covered -the material in the discussion. +The next few slides are intended as reference material. You do not need to spend +much time here – the intent is to tell people that these guidelines exist. + +You should have covered discussed most of the points in the preceding +discussion.
From 9b316f63f2c272799f05cf2034261ac1d2c42416 Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Wed, 23 Jul 2025 16:10:33 +1200 Subject: [PATCH 39/51] Refactor file names for consistency --- src/SUMMARY.md | 6 +-- .../guideline-invariant-checklist.md | 47 +++++++++++++++-- ...row-scope.md => guideline-narrow-scope.md} | 0 .../mechanics/guideline-safety-checklist.md | 52 ------------------- ...mments.md => guideline-safety-comments.md} | 0 src/unsafe-deep-dive/mechanics/narrow.md | 45 ---------------- 6 files changed, 47 insertions(+), 103 deletions(-) rename src/unsafe-deep-dive/mechanics/{narrow-scope.md => guideline-narrow-scope.md} (100%) delete mode 100644 src/unsafe-deep-dive/mechanics/guideline-safety-checklist.md rename src/unsafe-deep-dive/mechanics/{safety-comments.md => guideline-safety-comments.md} (100%) delete mode 100644 src/unsafe-deep-dive/mechanics/narrow.md diff --git a/src/SUMMARY.md b/src/SUMMARY.md index 95b65a3b5da3..f86b8ebf8894 100644 --- a/src/SUMMARY.md +++ b/src/SUMMARY.md @@ -460,9 +460,9 @@ - [Extension: Representing "Only Even Numbers"](unsafe-deep-dive/mechanics/representing-only-even-numbers.md) - [Case Study](unsafe-deep-dive/mechanics/case-study.md) - [Guidelines](unsafe-deep-dive/mechanics/guidelines.md) - - [Narrow scope](unsafe-deep-dive/mechanics/narrow-scope.md) - - [Safety comments](unsafe-deep-dive/mechanics/safety-comments.md) - - [Safety checklist](unsafe-deep-dive/mechanics/guideline-safety-checklist.md) + - [Narrow scope](unsafe-deep-dive/mechanics/guideline-narrow-scope.md) + - [Safety comments](unsafe-deep-dive/mechanics/guideline-safety-comments.md) + - [Invariant checklist](unsafe-deep-dive/mechanics/guideline-invariant-checklist.md) --- diff --git a/src/unsafe-deep-dive/mechanics/guideline-invariant-checklist.md b/src/unsafe-deep-dive/mechanics/guideline-invariant-checklist.md index 92daf61efdd4..c2e877e1b769 100644 --- a/src/unsafe-deep-dive/mechanics/guideline-invariant-checklist.md +++ b/src/unsafe-deep-dive/mechanics/guideline-invariant-checklist.md @@ -1,11 +1,52 @@ -# Invariant Checklist +# Safety checklist -When writing and reviewing `unsafe` code, we should make sure that there are no. +When writing and reviewing `unsafe` code, we should make sure that we've +considered the following considerations _and documented_ what callers must later +uphold: - Validity - Alignment -- "Business Rules", i.e. all values must be even numbers +- Lifetimes +- Ownership +- Platform +- Compliance with specifications
+**Validity** + +Callers must ensure that values must match some bit-pattern. + +**Alignment** + +Callers must ensure that values are correctly aligned. + +**Lifetimes** + +Do callers need to verify that a referent must exist before/after/during? + +**Ownership** + +Can this function generate confusion about ownership? + +> _Aside:_ Memory leaks +> +> A discussion about leaking memory may arise here. If calling a function +> removes all ownership information, then . +> +> Memory leaking is not strictly a memory safety concern. However, it's often a +> problem in practice, especially if it is unintentional. +> +> Therefore, this should at least be documented. If it's possible to mishandle +> the API and cause an unintentional leak, then there is a case for an unsafe +> block. + +**Platform** + +Callers must be wary of platform-specific behavior. + +**Compliance with specifications** + +"Business Rules", i.e. all values must be even numbers. +
diff --git a/src/unsafe-deep-dive/mechanics/narrow-scope.md b/src/unsafe-deep-dive/mechanics/guideline-narrow-scope.md similarity index 100% rename from src/unsafe-deep-dive/mechanics/narrow-scope.md rename to src/unsafe-deep-dive/mechanics/guideline-narrow-scope.md diff --git a/src/unsafe-deep-dive/mechanics/guideline-safety-checklist.md b/src/unsafe-deep-dive/mechanics/guideline-safety-checklist.md deleted file mode 100644 index c2e877e1b769..000000000000 --- a/src/unsafe-deep-dive/mechanics/guideline-safety-checklist.md +++ /dev/null @@ -1,52 +0,0 @@ -# Safety checklist - -When writing and reviewing `unsafe` code, we should make sure that we've -considered the following considerations _and documented_ what callers must later -uphold: - -- Validity -- Alignment -- Lifetimes -- Ownership -- Platform -- Compliance with specifications - -
- -**Validity** - -Callers must ensure that values must match some bit-pattern. - -**Alignment** - -Callers must ensure that values are correctly aligned. - -**Lifetimes** - -Do callers need to verify that a referent must exist before/after/during? - -**Ownership** - -Can this function generate confusion about ownership? - -> _Aside:_ Memory leaks -> -> A discussion about leaking memory may arise here. If calling a function -> removes all ownership information, then . -> -> Memory leaking is not strictly a memory safety concern. However, it's often a -> problem in practice, especially if it is unintentional. -> -> Therefore, this should at least be documented. If it's possible to mishandle -> the API and cause an unintentional leak, then there is a case for an unsafe -> block. - -**Platform** - -Callers must be wary of platform-specific behavior. - -**Compliance with specifications** - -"Business Rules", i.e. all values must be even numbers. - -
diff --git a/src/unsafe-deep-dive/mechanics/safety-comments.md b/src/unsafe-deep-dive/mechanics/guideline-safety-comments.md similarity index 100% rename from src/unsafe-deep-dive/mechanics/safety-comments.md rename to src/unsafe-deep-dive/mechanics/guideline-safety-comments.md diff --git a/src/unsafe-deep-dive/mechanics/narrow.md b/src/unsafe-deep-dive/mechanics/narrow.md deleted file mode 100644 index a0e7a2909971..000000000000 --- a/src/unsafe-deep-dive/mechanics/narrow.md +++ /dev/null @@ -1,45 +0,0 @@ -# Keep unsafe narrow - -Compare these two code examples: - -```rust -fn main() { - let raw = b"Crab"; - - // SAFETY: `raw` has the static lifetime of valid UTF-8 data and therefore `ptr` is valid - let crab = unsafe { - let ptr = raw.as_ptr(); - let bytes = std::slice::from_raw_parts(ptr, 4); - std::str::from_utf8_unchecked(bytes) - }; - - println!("{crab}"); -} -``` - -```rust -fn main() { - let raw = b"Crab"; - let ptr = raw.as_ptr(); - - // SAFETY: `raw` has the static lifetime and therefore `ptr` is valid - let bytes = unsafe { std::slice::from_raw_parts(ptr, 4) }; - - // SAFETY: We created `raw` with valid UTF-8 data - let crab = unsafe { std::str::from_utf8_unchecked(bytes) }; - - println!("{crab}"); -} -``` - -
- -Unsafe blocks should have a narrow lens. - - - -If an unsafe block has multiple safety conditions that can be assessed -independently, then it's likely that each of those conditions should be in its -own block. - -
From 101a071f27bbc70797f6ea153b8b9f501dd932aa Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Wed, 23 Jul 2025 16:22:13 +1200 Subject: [PATCH 40/51] Continue merging stray conflicts --- src/SUMMARY.md | 4 +- ...eans-extension.md => representing-char.md} | 0 src/unsafe-deep-dive/motivations/interop.md | 67 +------------------ .../motivations/perfomance.md | 10 --- 4 files changed, 3 insertions(+), 78 deletions(-) rename src/unsafe-deep-dive/mechanics/{representing-booleans-extension.md => representing-char.md} (100%) delete mode 100644 src/unsafe-deep-dive/motivations/perfomance.md diff --git a/src/SUMMARY.md b/src/SUMMARY.md index f86b8ebf8894..a253b047a021 100644 --- a/src/SUMMARY.md +++ b/src/SUMMARY.md @@ -456,8 +456,8 @@ - [Less powerful than it seems](unsafe-deep-dive/foundations/less-powerful.md) - [Mechanics](unsafe-deep-dive/mechanics.md) - [Example: Representing Booleans](unsafe-deep-dive/mechanics/representing-booleans.md) - - [Extension:](unsafe-deep-dive/mechanics/representing-booleans-extension.md) - - [Extension: Representing "Only Even Numbers"](unsafe-deep-dive/mechanics/representing-only-even-numbers.md) + - [Extension: Representing Char](unsafe-deep-dive/mechanics/representing-char.md) + - [Extension: Representing "Only Even Numbers"](unsafe-deep-dive/mechanics/representing-only-even-numbers.md) - [Case Study](unsafe-deep-dive/mechanics/case-study.md) - [Guidelines](unsafe-deep-dive/mechanics/guidelines.md) - [Narrow scope](unsafe-deep-dive/mechanics/guideline-narrow-scope.md) diff --git a/src/unsafe-deep-dive/mechanics/representing-booleans-extension.md b/src/unsafe-deep-dive/mechanics/representing-char.md similarity index 100% rename from src/unsafe-deep-dive/mechanics/representing-booleans-extension.md rename to src/unsafe-deep-dive/mechanics/representing-char.md diff --git a/src/unsafe-deep-dive/motivations/interop.md b/src/unsafe-deep-dive/motivations/interop.md index 25df39e13812..6096a3b40fe2 100644 --- a/src/unsafe-deep-dive/motivations/interop.md +++ b/src/unsafe-deep-dive/motivations/interop.md @@ -6,11 +6,7 @@ minutes: 5 > an introduction to the motivations only, rather than to be an elaborate > discussion of the whole problem. -<<<<<<< HEAD -# Interop -======= # Interoperability ->>>>>>> ebcff61ee0e91066888289fb2c51beb0e36d4a62 Language interoperability allows you to: @@ -19,11 +15,7 @@ Language interoperability allows you to: However, this requires unsafe. -<<<<<<< HEAD -```rust,editable -======= ```rust,editable,ignore ->>>>>>> ebcff61ee0e91066888289fb2c51beb0e36d4a62 unsafe extern "C" { safe fn random() -> libc::c_long; } @@ -41,11 +33,7 @@ hasn't compiled, so it delegates that responsibility to you through the unsafe keyword. The code example we're seeing shows how to call the random function provided by -<<<<<<< HEAD -libc within Rust. -======= libc within Rust. libc is available to scripts in the Rust Playground. ->>>>>>> ebcff61ee0e91066888289fb2c51beb0e36d4a62 This uses Rust's _foreign function interface_. @@ -63,27 +51,16 @@ parsing all take energy and time. rely on its symbols, including `random`, being available to our program. - _What is the "safe" keyword?_\ It allows callers to call the function without needing to wrap that call in -<<<<<<< HEAD - `unsafe`. The [`safe` function qualifier] was introduced in the 2024 edition - of Rust and can only be used within `extern` blocks. It was introduced because - `unsafe` became a mandatory qualifier for `extern` blocks in that edition. -======= `unsafe`. The [`safe` function qualifier][safe] was introduced in the 2024 edition of Rust and can only be used within `extern` blocks. It was introduced because `unsafe` became a mandatory qualifier for `extern` blocks in that edition. ->>>>>>> ebcff61ee0e91066888289fb2c51beb0e36d4a62 - _What is the [`std::ffi::c_long`] type?_\ According to the C standard, an integer that's at least 32 bits wide. On today's systems, It's an `i32` on Windows and an `i64` on Linux. -<<<<<<< HEAD -[`safe` keyword]: https://doc.rust-lang.org/reference/safe-keyword.html -[`std::ffi::c_long`]: https://doc.rust-lang.org/std/ffi/type.c_long.html -======= -[`std::ffi::c_long`]: https://doc.rust-lang.org/std/ffi/type.c_long.html [safe]: https://doc.rust-lang.org/stable/edition-guide/rust-2024/unsafe-extern.html ->>>>>>> ebcff61ee0e91066888289fb2c51beb0e36d4a62 +[`std::ffi::c_long`]: https://doc.rust-lang.org/std/ffi/type.c_long.html ## Consideration: type safety @@ -116,14 +93,6 @@ fn main() { > } > ``` -<<<<<<< HEAD -It's also possible to completely erase the type. Stress that the Rust compiler -will trust that the wrapper is telling the truth. - -```rust -unsafe extern "C" { - safe fn random() -> [u8; 64]; -======= It's also possible to completely ignore the intended type and create undefined behavior in multiple ways. The code below produces output most of the time, but generally results in a stack overflow. It may also produce illegal `char` @@ -139,7 +108,6 @@ Stress that the Rust compiler will trust that the wrapper is telling the truth. ```rust,ignore unsafe extern "C" { safe fn random() -> [char; 2]; ->>>>>>> ebcff61ee0e91066888289fb2c51beb0e36d4a62 } fn main() { @@ -153,11 +121,7 @@ fn main() { > ```diff > unsafe extern "C" { > - safe fn random() -> libc::c_long; -<<<<<<< HEAD -> + safe fn random() -> [u8; 64]; -======= > + safe fn random() -> [char; 2]; ->>>>>>> ebcff61ee0e91066888289fb2c51beb0e36d4a62 > } > > fn main() { @@ -168,17 +132,6 @@ fn main() { > } > ``` -<<<<<<< HEAD -Mention that type safety is generally not a large concern in practice. -Auto-generated wrappers, i.e. those produced by bindgen and related tools, are -excellent at reading header files and producing values of the correct type. - -## Consideration: Ownership and lifetime management - -While libc's `random` function doesn't use pointers, may do. This creates the -possibility that interacting with another programming language introduce -unsoundness. -======= > Attempting to print a `[char; 2]` from randomly generated input will often > produce strange output, including: > @@ -200,7 +153,6 @@ header files and producing values of the correct type. While libc's `random` function doesn't use pointers, many do. This creates many more possibilities for unsoundness. ->>>>>>> ebcff61ee0e91066888289fb2c51beb0e36d4a62 - both sides might attempt to free the memory (double free) - both sides can attempt to write to the data @@ -208,9 +160,6 @@ more possibilities for unsoundness. For example, some C libraries expose functions that write to static buffers that are re-used between calls. -<<<<<<< HEAD -```rust -======= ```rust,ignore ->>>>>>> ebcff61ee0e91066888289fb2c51beb0e36d4a62 use std::ffi::{CStr, c_char}; use std::time::{SystemTime, UNIX_EPOCH}; unsafe extern "C" { /// Create a formatted time based on time `t`, including trailing newline. -<<<<<<< HEAD -======= /// Read `man 3 ctime` details. ->>>>>>> ebcff61ee0e91066888289fb2c51beb0e36d4a62 fn ctime(t: *const libc::time_t) -> *const c_char; } @@ -259,15 +204,6 @@ fn main() { } ``` -<<<<<<< HEAD -Bonus points: can anyone spot the lifetime bug? `format_timestamp()` should -return a `&'static str`. - -## Consideration: Representation mismatch - -Different programming languages have made design decisions and this can create -impedance mismatches between different domains. -======= > _Aside:_ Lifetimes in the `format_timestamp()` function > > Neither `'a`, nor `'static`, correctly describe the lifetime of the string @@ -278,7 +214,6 @@ impedance mismatches between different domains. Different programming languages have made different design decisions and this can create impedance mismatches between different domains. ->>>>>>> ebcff61ee0e91066888289fb2c51beb0e36d4a62 Consider string handling. C++ defines `std::string`, which has an incompatible memory layout with Rust's `String` type. `String` also requires text to be diff --git a/src/unsafe-deep-dive/motivations/perfomance.md b/src/unsafe-deep-dive/motivations/perfomance.md deleted file mode 100644 index 0b32e8600afa..000000000000 --- a/src/unsafe-deep-dive/motivations/perfomance.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -minutes: 5 ---- - -# Performance - -> TODO: Stub for now - -It's easy to think of performance as the main reason for unsafe, but high -performance code makes up the minority of unsafe blocks. From 2580b8afe86bf34a8fc103a488f3363459393d29 Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Wed, 23 Jul 2025 16:29:53 +1200 Subject: [PATCH 41/51] Remove 'only even numbers' example --- src/SUMMARY.md | 1 - src/unsafe-deep-dive/mechanics/representing-only-even-numbers.md | 1 - 2 files changed, 2 deletions(-) delete mode 100644 src/unsafe-deep-dive/mechanics/representing-only-even-numbers.md diff --git a/src/SUMMARY.md b/src/SUMMARY.md index a253b047a021..33387410b4c9 100644 --- a/src/SUMMARY.md +++ b/src/SUMMARY.md @@ -457,7 +457,6 @@ - [Mechanics](unsafe-deep-dive/mechanics.md) - [Example: Representing Booleans](unsafe-deep-dive/mechanics/representing-booleans.md) - [Extension: Representing Char](unsafe-deep-dive/mechanics/representing-char.md) - - [Extension: Representing "Only Even Numbers"](unsafe-deep-dive/mechanics/representing-only-even-numbers.md) - [Case Study](unsafe-deep-dive/mechanics/case-study.md) - [Guidelines](unsafe-deep-dive/mechanics/guidelines.md) - [Narrow scope](unsafe-deep-dive/mechanics/guideline-narrow-scope.md) diff --git a/src/unsafe-deep-dive/mechanics/representing-only-even-numbers.md b/src/unsafe-deep-dive/mechanics/representing-only-even-numbers.md deleted file mode 100644 index c37d762e7c9d..000000000000 --- a/src/unsafe-deep-dive/mechanics/representing-only-even-numbers.md +++ /dev/null @@ -1 +0,0 @@ -# Extension: Representing "Only Even Numbers" From a879818fdb6d7c1873b00a40cc88b08ed9947ac1 Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Wed, 23 Jul 2025 16:39:59 +1200 Subject: [PATCH 42/51] Improve instructor notes --- src/unsafe-deep-dive/mechanics/case-study.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/unsafe-deep-dive/mechanics/case-study.md b/src/unsafe-deep-dive/mechanics/case-study.md index 5f59102e6391..2004e04acfeb 100644 --- a/src/unsafe-deep-dive/mechanics/case-study.md +++ b/src/unsafe-deep-dive/mechanics/case-study.md @@ -9,12 +9,17 @@ As a group, we'll study some parts of Rust's memory management functionality:
-Split learners into small groups. After a few minutes, they should be able to -answer the following questions: +Split learners into small groups and ask them to look into the implementation of +one of the types above. -- What does it do? +You may need to show learner how to view the source code of the standard +library. + +After a few minutes, they should be able to answer the following questions: + +- What is the purpose of the function/type/trait? - How does the documentation describe the safety contract, if any? Can that documentation be improved? -- +- Were there any interesting parts in its implementation?
From 05a68eebf875ed5d2544c577b01dafccc4e5237e Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Wed, 23 Jul 2025 17:27:35 +1200 Subject: [PATCH 43/51] Expand Mechanics segment --- src/SUMMARY.md | 8 +++- .../mechanics/case-study-rawvec.md | 47 +++++++++++++++++++ .../mechanics/case-study-std-mem.md | 1 + .../mechanics/case-study-unsafe-cell.md | 1 + src/unsafe-deep-dive/mechanics/example-ffi.md | 1 + .../mechanics/guideline-narrow-scope.md | 4 ++ .../mechanics/guideline-portal-types.md | 16 +++++++ .../mechanics/guideline-reuse-preexisting.md | 23 +++++++++ .../mechanics/guideline-safety-comments.md | 4 ++ .../mechanics/guideline-smart-constructors.md | 43 +++++++++++++++++ 10 files changed, 147 insertions(+), 1 deletion(-) create mode 100644 src/unsafe-deep-dive/mechanics/case-study-rawvec.md create mode 100644 src/unsafe-deep-dive/mechanics/case-study-std-mem.md create mode 100644 src/unsafe-deep-dive/mechanics/case-study-unsafe-cell.md create mode 100644 src/unsafe-deep-dive/mechanics/example-ffi.md create mode 100644 src/unsafe-deep-dive/mechanics/guideline-portal-types.md create mode 100644 src/unsafe-deep-dive/mechanics/guideline-reuse-preexisting.md create mode 100644 src/unsafe-deep-dive/mechanics/guideline-smart-constructors.md diff --git a/src/SUMMARY.md b/src/SUMMARY.md index 33387410b4c9..a5727efe2bf9 100644 --- a/src/SUMMARY.md +++ b/src/SUMMARY.md @@ -457,8 +457,14 @@ - [Mechanics](unsafe-deep-dive/mechanics.md) - [Example: Representing Booleans](unsafe-deep-dive/mechanics/representing-booleans.md) - [Extension: Representing Char](unsafe-deep-dive/mechanics/representing-char.md) - - [Case Study](unsafe-deep-dive/mechanics/case-study.md) + - [Example: FFI](unsafe-deep-dive/mechanics/example-ffi.md) + - [Case Study: RawVec](unsafe-deep-dive/mechanics/case-study-rawvec.md) + - [Case Study: std::mem](unsafe-deep-dive/mechanics/case-study-std-mem.md) + - [Case Study: UnsafeCell](unsafe-deep-dive/mechanics/case-study-unsafe-cell.md) - [Guidelines](unsafe-deep-dive/mechanics/guidelines.md) + - [Portal types](unsafe-deep-dive/mechanics/guideline-portal-types.md) + - [Smart constructors](unsafe-deep-dive/mechanics/guideline-smart-constructors.md) + - [Reuse pre-existing code](unsafe-deep-dive/mechanics/guideline-reuse-preexisting.md) - [Narrow scope](unsafe-deep-dive/mechanics/guideline-narrow-scope.md) - [Safety comments](unsafe-deep-dive/mechanics/guideline-safety-comments.md) - [Invariant checklist](unsafe-deep-dive/mechanics/guideline-invariant-checklist.md) diff --git a/src/unsafe-deep-dive/mechanics/case-study-rawvec.md b/src/unsafe-deep-dive/mechanics/case-study-rawvec.md new file mode 100644 index 000000000000..7f3fd504f4ea --- /dev/null +++ b/src/unsafe-deep-dive/mechanics/case-study-rawvec.md @@ -0,0 +1,47 @@ +--- +minutes: 15 +--- + +# Case Study: RawVec + +> WORK IN PROGRESS +> +> This section is likely to receive significant alterations before completion +> and may even be removed entirely. + +Many important collections in the standard library, such as `Vec`, `String` +and `Deque` rely on a private inner type called `RawVec`. + +Why is that inner type used? + +```rust,ignore +// https://doc.rust-lang.org/src/alloc/vec/mod.rs.html +// std::alloc +pub struct Vec { + buf: RawVec, + len: usize, +} +``` + +```rust,ignore +// std::raw_vec +pub(crate) struct RawVec { + inner: RawVecInner, + _marker: PhantomData, +} +``` + +The [implementation of `RawVec` is described in the Rustonomicon][rv]. + +[rv]: https://doc.rust-lang.org/nomicon/vec/vec-raw.html + +
+ +`Vec` is normally described as being a struct with three fields: length, +capacity, and pointer to an underlying buffer. Once you dig into the +implementation details, you'll notice that + +Because Rust won't allow self-referential types, RawVec in the type system is +used to contain the capacity and pointer. + +
diff --git a/src/unsafe-deep-dive/mechanics/case-study-std-mem.md b/src/unsafe-deep-dive/mechanics/case-study-std-mem.md new file mode 100644 index 000000000000..89b9a15afd40 --- /dev/null +++ b/src/unsafe-deep-dive/mechanics/case-study-std-mem.md @@ -0,0 +1 @@ +# Case Study: std::mem diff --git a/src/unsafe-deep-dive/mechanics/case-study-unsafe-cell.md b/src/unsafe-deep-dive/mechanics/case-study-unsafe-cell.md new file mode 100644 index 000000000000..cb5a9e1d7417 --- /dev/null +++ b/src/unsafe-deep-dive/mechanics/case-study-unsafe-cell.md @@ -0,0 +1 @@ +# Case Study: UnsafeCell diff --git a/src/unsafe-deep-dive/mechanics/example-ffi.md b/src/unsafe-deep-dive/mechanics/example-ffi.md new file mode 100644 index 000000000000..c4954da2011b --- /dev/null +++ b/src/unsafe-deep-dive/mechanics/example-ffi.md @@ -0,0 +1 @@ +# Example: FFI diff --git a/src/unsafe-deep-dive/mechanics/guideline-narrow-scope.md b/src/unsafe-deep-dive/mechanics/guideline-narrow-scope.md index a0e7a2909971..b5f4812f6548 100644 --- a/src/unsafe-deep-dive/mechanics/guideline-narrow-scope.md +++ b/src/unsafe-deep-dive/mechanics/guideline-narrow-scope.md @@ -1,3 +1,7 @@ +--- +minutes: 3 +--- + # Keep unsafe narrow Compare these two code examples: diff --git a/src/unsafe-deep-dive/mechanics/guideline-portal-types.md b/src/unsafe-deep-dive/mechanics/guideline-portal-types.md new file mode 100644 index 000000000000..6f775bbefb21 --- /dev/null +++ b/src/unsafe-deep-dive/mechanics/guideline-portal-types.md @@ -0,0 +1,16 @@ +--- +minutes: 2 +--- + +# Portal types + +> TODO(timclicks): expand + +Create a safe type that wraps a type that performs unsafe operations. The safe +type makes the unsafe type impossible to misuse. The wrapper acts as a portal to +the world of unsafe. + +Examples: + +- `std::collections::Vec` wraps `std::alloc::RawVec` +- The "sys crate" pattern diff --git a/src/unsafe-deep-dive/mechanics/guideline-reuse-preexisting.md b/src/unsafe-deep-dive/mechanics/guideline-reuse-preexisting.md new file mode 100644 index 000000000000..b831a654efe2 --- /dev/null +++ b/src/unsafe-deep-dive/mechanics/guideline-reuse-preexisting.md @@ -0,0 +1,23 @@ +--- +minutes: 3 +--- + +# Reuse pre-existing code + +> TODO(timclicks): expand + +Avoid re-implementing: + +- Interior mutability – `Cell` and `UnsafeCell` +- Wrapping NULL pointers safely – `Option<&mut T>` + +
+ +When we are writing code, it can be tempting to write everything from scratch. +Check whether pre-existing solutions exist already. In particular, the standard +library offers excellent defaults for memory management. + +If you find yourself writing a better implementation of these types, then +consider submitting them to the Rust project. + +
diff --git a/src/unsafe-deep-dive/mechanics/guideline-safety-comments.md b/src/unsafe-deep-dive/mechanics/guideline-safety-comments.md index daf68b87f147..a137734c70a4 100644 --- a/src/unsafe-deep-dive/mechanics/guideline-safety-comments.md +++ b/src/unsafe-deep-dive/mechanics/guideline-safety-comments.md @@ -1,3 +1,7 @@ +--- +minutes: 2 +--- + # Safety comments When defining unsafe functions, provide a `Safety` section in the docstring: diff --git a/src/unsafe-deep-dive/mechanics/guideline-smart-constructors.md b/src/unsafe-deep-dive/mechanics/guideline-smart-constructors.md new file mode 100644 index 000000000000..691ffb017c72 --- /dev/null +++ b/src/unsafe-deep-dive/mechanics/guideline-smart-constructors.md @@ -0,0 +1,43 @@ +# Smart constructors + +> TODO(timclicks): Think of a better type name; expand details + +```rust,ignore +impl ForeignRefCount { + fn new(...) { + // .. + } + + unsafe fn incr(&mut self) { + // ... + } + + unsafe fn decr(&mut self) { + // ... + } +} +``` + +```rust,ignore +impl ForeignRefCount { + unsafe fn new_unchchecked(...) { + // .. + } + + fn incr(&mut self) { + // ... + } + + fn decr(&mut self) { + // ... + } +} +``` + +
+ +It is tedious to check invariants at every call during an object's life. +Instead, you can provide a `new_unchecked` method which provides an opportunity +for the invariants to be checked once and then later relied upon. + +
From 3f5f5c737e4210f97900f86d8e830f5c783bd75e Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Fri, 25 Jul 2025 19:15:32 +1200 Subject: [PATCH 44/51] WIP: major re-write in progress --- .../mechanics/representing-booleans.md | 116 ++++++++++++++++++ 1 file changed, 116 insertions(+) diff --git a/src/unsafe-deep-dive/mechanics/representing-booleans.md b/src/unsafe-deep-dive/mechanics/representing-booleans.md index 80e09233bedd..533527eceb91 100644 --- a/src/unsafe-deep-dive/mechanics/representing-booleans.md +++ b/src/unsafe-deep-dive/mechanics/representing-booleans.md @@ -1,5 +1,121 @@ +--- +minutes: 15 +--- + # Example: Representing Boolean values +> TODO(timclicks): split this content into multiple sub-sections + +One of the terms that we introduced earlier was _undefined behavior_. This +exercise aims to discuss what undefined behavior actually is and how it can +arise. + +High performance code is particularly prone to introducing undefined behavior +into a program, because it will typically find every corner that's possible to +cut. + +We don't want to have undefined behavior in our code, because it makes the code +_unsound_. Unsound code can crash abruptly or produce unexpected results, +because compilers are written with the assumption that undefined behavior does +not exist. + +Safe Rust does not permit undefined behavior. + +It becomes impossible to reason about, . + +In fact, compilers are engineered to assume that undefined behavior never +exists. + +We are going to work through an example of how undefined behavior can be +introduced in an attempt to improve performance. + +--- + +## Part 1 + +How are the Boolean values `true` and `false` represented by programming +languages? + +Many languages, including Rust and C, encode Boolean values as an integer, +where: + +- 1 represents truth or positivity +- 0 represents falsehood or negativity + +### Exercise: + +Define a type that represents a bool + +--- + +### + +Or in Rust syntax: + +```rust +struct Boolean(u8); + +const true: Boolean = Boolean(1); +const false: Boolean = Boolean(0); +``` + +>> Instructor Notes +> +> We define a type here so that there is no confusion in the type system between +> `u8` and `Boolean`. + +From a theoretical perspective, the two states `true` and `false` be represented +by a single bit. However, the smallest integer available is `u8`, which has 254 +additional states. + +## Exercise + +Implement two conversion functions, `byte_to_boolean()` and `boolean_to_byte()`: + +```rust +struct Boolean(u8); + +fn byte_to_boolean(b: u8) -> Boolean { + todo!(); +} + +fn boolean_to_byte(b: Boolean) -> u8 { + todo!(); +} +``` + +## Discussion + +Should this function be marked as unsafe? + +```rust +struct Boolean(u8); + +fn byte_to_boolean(b: u8) -> Boolean { + match b { + 0 => false, + _ => true, + } +} +``` + +## + +This example demonstrates how the search for high performance can . Software +engineers can find themselves wanting to exploit characteristics of the +operating environment, + +CPUs + +> Well, actually... +> +> CPUs don't really have a concept of a Boolean value. Instead, they have +> Boolean operations. + +In Rust, the conventional way to think of them is something like this: + +They're encoded as + Boolean values must match a precise representation to avoid undefined behavior:
From 003ae722b66310eea48827fd9ee16e87bc78c73c Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Mon, 28 Jul 2025 18:02:19 +1200 Subject: [PATCH 45/51] WIP: continuing rewrite; checking in progress --- src/SUMMARY.md | 3 + .../mechanics/representing-booleans.md | 284 ++++++++++++++++-- .../understanding-unsafety.md | 10 + .../understanding-unsafety/out-of-bounds.md | 50 +++ .../undefined-behavior.md | 267 ++++++++++++++++ 5 files changed, 590 insertions(+), 24 deletions(-) create mode 100644 src/unsafe-deep-dive/understanding-unsafety.md create mode 100644 src/unsafe-deep-dive/understanding-unsafety/out-of-bounds.md create mode 100644 src/unsafe-deep-dive/understanding-unsafety/undefined-behavior.md diff --git a/src/SUMMARY.md b/src/SUMMARY.md index a5727efe2bf9..a9539b5b308c 100644 --- a/src/SUMMARY.md +++ b/src/SUMMARY.md @@ -454,6 +454,9 @@ - [Data structures are safe](unsafe-deep-dive/foundations/data-structures-are-safe.md) - [Actions might not be](unsafe-deep-dive/foundations/actions-might-not-be.md) - [Less powerful than it seems](unsafe-deep-dive/foundations/less-powerful.md) +- [Understanding Unsafety](unsafe-deep-dive/understanding-unsafety.md) + - [Undefined behavior](unsafe-deep-dive/understanding-unsafety/undefined-behavior.md) + - [Out of bounds](unsafe-deep-dive/understanding-unsafety/out-of-bounds.md) - [Mechanics](unsafe-deep-dive/mechanics.md) - [Example: Representing Booleans](unsafe-deep-dive/mechanics/representing-booleans.md) - [Extension: Representing Char](unsafe-deep-dive/mechanics/representing-char.md) diff --git a/src/unsafe-deep-dive/mechanics/representing-booleans.md b/src/unsafe-deep-dive/mechanics/representing-booleans.md index 533527eceb91..bbc85938cd42 100644 --- a/src/unsafe-deep-dive/mechanics/representing-booleans.md +++ b/src/unsafe-deep-dive/mechanics/representing-booleans.md @@ -10,45 +10,260 @@ One of the terms that we introduced earlier was _undefined behavior_. This exercise aims to discuss what undefined behavior actually is and how it can arise. -High performance code is particularly prone to introducing undefined behavior -into a program, because it will typically find every corner that's possible to -cut. +High performance code is particularly prone to accidentally introducing +undefined behavior into a program, because its authors are typically very +interested in finding ways to cut corners. + +--- + +## What's wrong with undefined behavior? + +C++ compilers will typically (*) compile this code without warnings, and will +run without error signaling an error: + +```cpp +#include + +int axiom_increment_is_greater(int x) { + return x + 1 > x; +} + +int main() { + int a = 2147483647; + assert(axiom_increment_is_greater(a)); +} +``` + +Equivalent Rust programs produce different output: + +```rust,editable +fn axiom_increment_is_greater(x: i32) -> bool { + x + 1 > x +} + +fn main() { + let a = 2147483647; + assert!(axiom_increment_is_greater(a)); +} +``` + +(*) We can't be certain. That's one of the problems. + +
We don't want to have undefined behavior in our code, because it makes the code -_unsound_. Unsound code can crash abruptly or produce unexpected results, -because compilers are written with the assumption that undefined behavior does -not exist. +_unsound_. + +Unsound code can crash abruptly or produce unexpected results, because compilers +are written with the assumption that undefined behavior does not exist. They +will create optimizations that could be completely contrary to your +expectations. + +In this example, assume that we're creating some sort of proof assistant that +makes deductions based on mathematical axioms. One of the axioms that we want to +encode is that an integer's increment is always greater than the integer itself: -Safe Rust does not permit undefined behavior. +gcc v13.2, clang v16.0.0 and msvc v19.0 [all compile the C++ code to][asm] the +following assembly when optimizations are enabled ( `-O2`): + +```asm +axiom_increment_is_greater(int): + mov eax, 1 + ret +``` -It becomes impossible to reason about, . +[asm]: https://godbolt.org/z/q4MMY8vxs -In fact, compilers are engineered to assume that undefined behavior never -exists. +That is, while it looks like they'll always return `true`, the code also +produces undefined behavior. When `x` is 2^32-1 and is incremented, it enters an +undefined state. The operation produces a number that is outside of the range of +a 32-bit signed integer. + +Integer overflow for signed integers is _undefined_. In the conventional twos +complement representation, increment often wraps to -(2^31)-1 `i32::MIN`. + +Rust takes a stricter approach. When integer oveflow is signaled by the CPU, a +panic is induced. This allows Safe Rust to be free of undefined behaviour. + +
+ +--- + +## Rust keeps undefined behavior out... + +...but, unsafe provides a way for it to get back in. + +
We are going to work through an example of how undefined behavior can be introduced in an attempt to improve performance. +
+ --- -## Part 1 +## Booleans + +A typical representation: + +- 1 => truth/positivity +- 0 => falsehood/negativity + +
+ +Just as integers can have their quirks, so do Boolean data types. How are the Boolean values `true` and `false` represented by programming languages? -Many languages, including Rust and C, encode Boolean values as an integer, +Many languages, including Rust and C++, encode Boolean values as an integer, where: - 1 represents truth or positivity - 0 represents falsehood or negativity -### Exercise: +However, there is an impedance mismatch because even the smallest integer (a +single byte) can represent many more numbers than the two that are required. + +> Aside: Not a universal definition +> +> Programming language designers are free to have their own representations, or +> not include a Boolean type in their language at all. +> +> CPUs do not have a Boolean datatype, rather they have Boolean operations that +> are performed against operands that are typically integers. + +
+ +--- + +## Exercise + +Define a type that represents a `bool` and conversion two conversion functions +to convert between a `u8` and your new type and back again. + +
+ +
+ +--- + +## Code review 1 + +Critique this code and suggest improvements, if any: + +```rust,editable +struct Boolean(u8); + +fn byte_to_boolean(b: u8) -> Boolean { + Boolean(b) +} + +fn boolean_to_byte(boolean: Boolean) -> u8 { + boolean.0 +} + +fn boolean_to_bool(boolean: Boolean) -> bool { + match b.0 { + 0 => false, + _ => true, + } +} +``` + +
+ +Which function should be `unsafe`? It could either be at the "constructor" +(`byte_to_boolean`) or when the Boolean is converted to a Rust-native `bool` +(`boolean_to_bool`). + +
+ +--- + +## Code review 2 + +```rust,editable +struct Boolean(bool); + +fn byte_to_boolean(b: u8) -> Boolean { + match b.0 { + 0 => Boolean(false), + _ => Boolean(true), + } +} + +fn boolean_to_byte(boolean: Boolean) -> u8 { + boolean.0 as u8 +} + +fn boolean_to_bool(boolean: Boolean) -> bool { + boolean.0 +} +``` + +
+ +In this version, we mask the error. All non-zero inputs are coerced to `true`. +We store the internal field of the `Boolean` struct as a `bool` to make as much +use of Rust's type system as possible. + +However, this `byte_to_boolean` is not zero-cost. There is still a `match` +operation that's required. + +
+ +--- + +## Code review 3 + +```rust,editable +#[repr(C)] +union Boolean { + raw: u8, + rust: bool, +} + +fn byte_to_boolean(b: u8) -> Boolean { + Boolean { raw: b } +} + +fn boolean_to_byte(boolean: Boolean) -> u8 { + unsafe { boolean.rust } +} + +fn boolean_to_bool(boolean: Boolean) -> bool { + unsafe { boolean.raw } +} +``` + +--- + +## Code review 4 + +```rust,editable +struct Boolean(bool); -Define a type that represents a bool +fn byte_to_boolean(b: u8) -> Boolean { + let b: bool = unsafe { sys::mem::transmute(b) }; + + Boolean(b) +} + +fn boolean_to_byte(boolean: Boolean) -> u8 { + boolean.0 as u8 +} + +fn boolean_to_bool(boolean: Boolean) -> bool { + boolean.0 +} +``` --- -### +## + +--- Or in Rust syntax: @@ -68,6 +283,29 @@ From a theoretical perspective, the two states `true` and `false` be represented by a single bit. However, the smallest integer available is `u8`, which has 254 additional states. +This is a similar problem to the mismatch casting from a `i64` to `i32`, but +there is a significant difference. When converting an integer from a 64-bit type +to a 32-bit type, there is not enough space in the narrower type for all +possible input values. They can't all fit. In the case of casting from `u8` to +`bool`, the number of bits isn't the issue. It's the standard that imposes the +additional restrictions. + +Depending on one's perspective, this either presents an opportunity or a +challenge. + +Moreover, [Rust (following C) imposes the following restrictions][ref-bool] on +its `bool` type: + +> The value `false` has the bit pattern `0x00` and the value `true` has the bit +> pattern `0x01`. It is _undefined behavior_ for an object with the boolean type +> to have any other bit pattern. [emphasis added] + +Many CPUs, don't strictly have a "Boolean type". They have Boolean operations. + +- For true, CPUs ask. Does this value match + +[ref-bool]: https://doc.rust-lang.org/reference/types/boolean.html + ## Exercise Implement two conversion functions, `byte_to_boolean()` and `boolean_to_byte()`: @@ -99,7 +337,13 @@ fn byte_to_boolean(b: u8) -> Boolean { } ``` -## +--- + +> Note: Content following this comment is from a previous revisions and is being +> retained temporarily. + +> TODO(timclicks): Review the following content for anything useful that should +> be retained. This example demonstrates how the search for high performance can . Software engineers can find themselves wanting to exploit characteristics of the @@ -114,8 +358,6 @@ CPUs In Rust, the conventional way to think of them is something like this: -They're encoded as - Boolean values must match a precise representation to avoid undefined behavior:
@@ -142,14 +384,8 @@ You have two tasks in this exercise. ensuring that undefined behavior is impossible. - Secondly, review someone else's implementation. -Starter code: - -Part 1 involves a `Boolean`, which is a type that can be -
-Admittedly, there isn't much starter code. - ## Discussion - The critical point in these reviews is that learners accurately describe the diff --git a/src/unsafe-deep-dive/understanding-unsafety.md b/src/unsafe-deep-dive/understanding-unsafety.md new file mode 100644 index 000000000000..ba4c6352b065 --- /dev/null +++ b/src/unsafe-deep-dive/understanding-unsafety.md @@ -0,0 +1,10 @@ +--- +minutes: 1 +--- + +# Understanding Unsafety + +We've introduced a few technical terms, such as _undefined behavior_. Let's take +a good look at what they actually mean. + +{{%segment outline}} diff --git a/src/unsafe-deep-dive/understanding-unsafety/out-of-bounds.md b/src/unsafe-deep-dive/understanding-unsafety/out-of-bounds.md new file mode 100644 index 000000000000..e757a9ad8bae --- /dev/null +++ b/src/unsafe-deep-dive/understanding-unsafety/out-of-bounds.md @@ -0,0 +1,50 @@ +# Out of bounds access + +The way that we often think of memory as application programmers, as a linear +block of a space that we can reserve space from and give back to, is somewhat of +an illusion. + +--- + +## A motivating example + +```cpp +int numbers[10] = {}; + +bool numbers_contains(int n) +{ + for (int i = 0; i <= 10; i++) { + if (table[i] == v) return true; + } + return false; +} +``` + +> Derived from the Undefined Behavior chapter from cppreference.com +> + +
+ +The `numbers` array contains no members, and therefore should be false for all +inputs. However, gcc13 with -O2 optimizes this code to ensure that it returns +true for all cases. + +```asm +numbers_contains(int): + mov eax, 1 + ret +numbers: + .zero 16 +``` + +
+ +--- + +Hello there + +
+ +More details + +
diff --git a/src/unsafe-deep-dive/understanding-unsafety/undefined-behavior.md b/src/unsafe-deep-dive/understanding-unsafety/undefined-behavior.md new file mode 100644 index 000000000000..87062dbff044 --- /dev/null +++ b/src/unsafe-deep-dive/understanding-unsafety/undefined-behavior.md @@ -0,0 +1,267 @@ +--- +minutes: 15 +--- + +# Example: Representing Boolean values + +> TODO(timclicks): split this content into multiple sub-sections + +One of the terms that we introduced earlier was _undefined behavior_. This +exercise aims to discuss what undefined behavior actually is and how it can +arise. + +High performance code is particularly prone to introducing undefined behavior +into a program, because it will typically find every corner that's possible to +cut. + +We don't want to have undefined behavior in our code, because it makes the code +_unsound_. Unsound code can crash abruptly or produce unexpected results, +because compilers are written with the assumption that undefined behavior does +not exist. + +Safe Rust does not permit undefined behavior. + +It becomes impossible to reason about, . + +In fact, compilers are engineered to assume that undefined behavior never +exists. + +We are going to work through an example of how undefined behavior can be +introduced in an attempt to improve performance. + +--- + +## Part 1 + +How are the Boolean values `true` and `false` represented by programming +languages? + +Many languages, including Rust and C, encode Boolean values as an integer, +where: + +- 1 represents truth or positivity +- 0 represents falsehood or negativity + +### Exercise: + +Define a type that represents a bool + +--- + +Or in Rust syntax: + +```rust +struct Boolean(u8); + +const true: Boolean = Boolean(1); +const false: Boolean = Boolean(0); +``` + +>> Instructor Notes +> +> We define a type here so that there is no confusion in the type system between +> `u8` and `Boolean`. + +From a theoretical perspective, the two states `true` and `false` be represented +by a single bit. However, the smallest integer available is `u8`, which has 254 +additional states. + +This is a similar problem to the mismatch casting from a `i64` to `i32`, but +there is a significant difference. When converting an integer from a 64-bit type +to a 32-bit type, there is not enough space in the narrower type for all +possible input values. They can't all fit. In the case of casting from `u8` to +`bool`, the number of bits isn't the issue. It's the standard that imposes the +additional restrictions. + +is a mismatch when casting between a `u8` and a `bool`. + +That means, to covert from an integer to bool. + +Depending on one's perspective, this either presents an opportunity or a +challenge. + +Moreover, [Rust (following C) imposes the following restrictions][ref-bool] on +its `bool` type: + +> The value `false` has the bit pattern `0x00` and the value `true` has the bit +> pattern `0x01`. It is _undefined behavior_ for an object with the boolean type +> to have any other bit pattern. [emphasis added] + +Many CPUs, don't strictly have a "Boolean type". They have Boolean operations. + +[ref-bool]: https://doc.rust-lang.org/reference/types/boolean.html + +## Exercise + +Implement two conversion functions, `byte_to_boolean()` and `boolean_to_byte()`: + +```rust +struct Boolean(u8); + +fn byte_to_boolean(b: u8) -> Boolean { + todo!(); +} + +fn boolean_to_byte(b: Boolean) -> u8 { + todo!(); +} +``` + +## Discussion + +Should this function be marked as unsafe? + +```rust +struct Boolean(u8); + +fn byte_to_boolean(b: u8) -> Boolean { + match b { + 0 => false, + _ => true, + } +} +``` + +## + +This example demonstrates how the search for high performance can . Software +engineers can find themselves wanting to exploit characteristics of the +operating environment, + +CPUs + +> Well, actually... +> +> CPUs don't really have a concept of a Boolean value. Instead, they have +> Boolean operations. + +In Rust, the conventional way to think of them is something like this: + +They're encoded as + +Boolean values must match a precise representation to avoid undefined behavior: + +
+ + + + + + + + + + + + +
Bit patternRust type
00000001true
00000000false
Other patternsUndefined
+ +You have two tasks in this exercise. + +- First, + - Create Rust type, `Boolean` type that represents a Boolean value in a + spec-compliant way + - The first create values of your type from `u8` with no overhead cost while + ensuring that undefined behavior is impossible. +- Secondly, review someone else's implementation. + +Starter code: + +Part 1 involves a `Boolean`, which is a type that can be + +
+ +Admittedly, there isn't much starter code. + +## Discussion + +- The critical point in these reviews is that learners accurately describe the + contract that callers need to uphold when converting from `u8`. It should be + well described in a Safety section of the docstring. +- Functions should have an `#[inline(always)]` annotation as Rust's `Copy` trait + involves memcpy. We want the compiler to erase the function call + +> _Aside: TransmuteFrom trait_ +> +> The standard library contains a nightly feature, `transmutability` which +> defines the [`std::mem::TransmuteFrom`] trait for performing this kind of +> operation. This is one of the outputs from the [Safe Transmute Project] within +> the Rust compiler team. + +[`transmutability`]: https://github.com/rust-lang/rust/issues/99571 +[Safe Transmute Project]: https://github.com/rust-lang/project-safe-transmute +[`std::mem::TransmuteFrom`]: https://doc.rust-lang.org/std/mem/trait.TransmuteFrom.html + +### Picking a data structure + +**Newtype wrapping u8** + +The orthodox strategy will be to wrap `u8` in a struct: + +```rust +struct Boolean(u8); +``` + +This ensures that the representation is the same as `u8`. + +**Newtype wrapping bool** + +Hopefully, you will have some learners will wrap `bool` as a newtype: + +```rust +struct Boolean(bool); +``` + +At first, this may look like a bit of a cheat code for the exercise. It won't +avoid the need to convert from `u8`, however. + +Wrapping `bool` includes the bonus that you can guarantee--in so far as you can +guarantee Rust's own behavior--that `Boolean` is spec-compliant with `bool`. + +It may also look redundant - why bother creating a new type when it doesn't +perform as a `bool`? Because it gives us complete control over the trait system. + +**Union** + +An alternative strategy would be to use a `union`: + +```rust +union Byte { + u8, + bool, +} +``` + +This isn't advised. It means that the value will _never_ be able to be +considered safe to access. Callers will need to ensure that they comply with the +rules at every interaction with the type. + +**Typestate** + +Some advanced programmers may attempt to encode Boolean values as zero-sized +types in the type system. If you receive questions about this, gently nudge them +back to including the byte. + +```rust +struct True; +struct False; +``` + +There are a couple of reasons for this. First, zero-sized types do not obey the +width and alignment requirements of the spec for `bool`. Secondly, they're very +difficult to work with in practice. + +If they wish to make use of the typestate pattern, then a possible alternative +would be to . + +```rust +struct Boolean(bool); +struct True(bool); +struct False(bool); +``` + +## Code review + +Suggest that there be some advice + +
From ad96bac83104a0545deb06f641527d65e758decc Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Wed, 30 Jul 2025 13:33:55 +1200 Subject: [PATCH 46/51] WIP: bringing in rawvec case study notes --- .../mechanics/case-study-rawvec.md | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/unsafe-deep-dive/mechanics/case-study-rawvec.md b/src/unsafe-deep-dive/mechanics/case-study-rawvec.md index 7f3fd504f4ea..ae1fc19a6928 100644 --- a/src/unsafe-deep-dive/mechanics/case-study-rawvec.md +++ b/src/unsafe-deep-dive/mechanics/case-study-rawvec.md @@ -29,6 +29,17 @@ pub(crate) struct RawVec { inner: RawVecInner
, _marker: PhantomData, } + +struct RawVecInner { + ptr: Unique, + /// Never used for ZSTs; it's `capacity()`'s responsibility to return usize::MAX in that case. + /// + /// # Safety + ///s + /// `cap` must be in the `0..=isize::MAX` range. + cap: Cap, + alloc: A, +} ``` The [implementation of `RawVec` is described in the Rustonomicon][rv]. @@ -39,9 +50,13 @@ The [implementation of `RawVec` is described in the Rustonomicon][rv]. `Vec` is normally described as being a struct with three fields: length, capacity, and pointer to an underlying buffer. Once you dig into the -implementation details, you'll notice that +implementation details, you'll notice that things are much more complicated. + +`RawVec` provides a barrier between Safe and Unsafe. + +`RawVec` -Because Rust won't allow self-referential types, RawVec in the type system is -used to contain the capacity and pointer. +`RawVecInner` contains the actual pointer and capacity of the underlying +buffer.
From e406d0e35046304d5962e00bfebf546d2b6db553 Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Wed, 30 Jul 2025 15:24:02 +1200 Subject: [PATCH 47/51] Updating intro to UB chapter --- .../undefined-behavior.md | 528 ++++++++++++++---- 1 file changed, 405 insertions(+), 123 deletions(-) diff --git a/src/unsafe-deep-dive/understanding-unsafety/undefined-behavior.md b/src/unsafe-deep-dive/understanding-unsafety/undefined-behavior.md index 87062dbff044..e125e014f91b 100644 --- a/src/unsafe-deep-dive/understanding-unsafety/undefined-behavior.md +++ b/src/unsafe-deep-dive/understanding-unsafety/undefined-behavior.md @@ -1,5 +1,5 @@ --- -minutes: 15 +minutes: 30 --- # Example: Representing Boolean values @@ -10,187 +10,259 @@ One of the terms that we introduced earlier was _undefined behavior_. This exercise aims to discuss what undefined behavior actually is and how it can arise. -High performance code is particularly prone to introducing undefined behavior -into a program, because it will typically find every corner that's possible to -cut. +High performance code is particularly prone to accidentally introducing +undefined behavior into a program, because its authors are typically very +interested in finding ways to cut corners. -We don't want to have undefined behavior in our code, because it makes the code -_unsound_. Unsound code can crash abruptly or produce unexpected results, -because compilers are written with the assumption that undefined behavior does -not exist. +--- -Safe Rust does not permit undefined behavior. +## What's wrong with undefined behavior? -It becomes impossible to reason about, . +C++ compilers will typically (*) compile this code without warnings, and will +run without error signaling an error: -In fact, compilers are engineered to assume that undefined behavior never -exists. +```cpp +#include -We are going to work through an example of how undefined behavior can be -introduced in an attempt to improve performance. +int axiom_increment_is_greater(int x) { + return x + 1 > x; +} ---- +int main() { + int a = 2147483647; + assert(axiom_increment_is_greater(a)); +} +``` -## Part 1 +Equivalent Rust programs produce different output: -How are the Boolean values `true` and `false` represented by programming -languages? +```rust,editable +fn axiom_increment_is_greater(x: i32) -> bool { + x + 1 > x +} -Many languages, including Rust and C, encode Boolean values as an integer, -where: +fn main() { + let a = 2147483647; + assert!(axiom_increment_is_greater(a)); +} +``` -- 1 represents truth or positivity -- 0 represents falsehood or negativity +(*) We can't be certain. That's one of the problems. -### Exercise: +
-Define a type that represents a bool +We don't want to have undefined behavior in our code, because it makes the code +_unsound_. ---- +Unsound code can crash abruptly or produce unexpected results, because compilers +are written with the assumption that undefined behavior does not exist. They +will create optimizations that could be completely contrary to your +expectations. -Or in Rust syntax: +In this example, assume that we're creating some sort of proof assistant that +makes deductions based on mathematical axioms. One of the axioms that we want to +encode is that an integer's increment is always greater than the integer itself: -```rust -struct Boolean(u8); +gcc v13.2, clang v16.0.0 and msvc v19.0 [all compile the C++ code to][asm] the +following assembly when optimizations are enabled ( `-O2`): -const true: Boolean = Boolean(1); -const false: Boolean = Boolean(0); +```asm +axiom_increment_is_greater(int): + mov eax, 1 + ret ``` ->> Instructor Notes -> -> We define a type here so that there is no confusion in the type system between -> `u8` and `Boolean`. +[asm]: https://godbolt.org/z/q4MMY8vxs -From a theoretical perspective, the two states `true` and `false` be represented -by a single bit. However, the smallest integer available is `u8`, which has 254 -additional states. +That is, while it looks like they'll always return `true`, the code also +produces undefined behavior. When `x` is 2^32-1 and is incremented, it enters an +undefined state. The operation produces a number that is outside of the range of +a 32-bit signed integer. -This is a similar problem to the mismatch casting from a `i64` to `i32`, but -there is a significant difference. When converting an integer from a 64-bit type -to a 32-bit type, there is not enough space in the narrower type for all -possible input values. They can't all fit. In the case of casting from `u8` to -`bool`, the number of bits isn't the issue. It's the standard that imposes the -additional restrictions. +Integer overflow for signed integers is _undefined_. In the conventional twos +complement representation, increment often wraps to -(2^31)-1 `i32::MIN`. -is a mismatch when casting between a `u8` and a `bool`. +Rust takes a stricter approach. When integer oveflow is signaled by the CPU, a +panic is induced. This allows Safe Rust to be free of undefined behaviour. -That means, to covert from an integer to bool. +
-Depending on one's perspective, this either presents an opportunity or a -challenge. +--- -Moreover, [Rust (following C) imposes the following restrictions][ref-bool] on -its `bool` type: +## Rust keeps undefined behavior out... -> The value `false` has the bit pattern `0x00` and the value `true` has the bit -> pattern `0x01`. It is _undefined behavior_ for an object with the boolean type -> to have any other bit pattern. [emphasis added] +...but, unsafe provides a way for it to get back in. -Many CPUs, don't strictly have a "Boolean type". They have Boolean operations. +
-[ref-bool]: https://doc.rust-lang.org/reference/types/boolean.html +We are going to work through an example of how undefined behavior can be +introduced in an attempt to improve performance. -## Exercise +
-Implement two conversion functions, `byte_to_boolean()` and `boolean_to_byte()`: +--- -```rust -struct Boolean(u8); +## Booleans -fn byte_to_boolean(b: u8) -> Boolean { - todo!(); -} +A typical representation: -fn boolean_to_byte(b: Boolean) -> u8 { - todo!(); -} -``` +- 1 => truth/positivity +- 0 => falsehood/negativity + +
## Discussion -Should this function be marked as unsafe? +### Encoding -```rust -struct Boolean(u8); +Just as integers can have their quirks, so do Boolean data types. -fn byte_to_boolean(b: u8) -> Boolean { - match b { - 0 => false, - _ => true, - } -} -``` +How are the Boolean values `true` and `false` represented by programming +languages? -## +Many languages, including Rust and C++, encode Boolean values as an integer, +where: -This example demonstrates how the search for high performance can . Software -engineers can find themselves wanting to exploit characteristics of the -operating environment, +- 1 represents truth or positivity +- 0 represents falsehood or negativity -CPUs +However, there is an impedance mismatch because even the smallest integer (a +single byte) can represent many more numbers than the two that are required. -> Well, actually... +> Aside: Not a universal definition +> +> Programming language designers are free to have their own representations, or +> not include a Boolean type in their language at all. > -> CPUs don't really have a concept of a Boolean value. Instead, they have -> Boolean operations. +> CPUs do not have a Boolean datatype, rather they have Boolean operations that +> are performed against operands that are typically integers. + +As the input space is larger than the output space, this can cause problems. +Allowing any byte to represent "true", except for `0x01`, is undefined. -In Rust, the conventional way to think of them is something like this: +[Rust (following C) imposes the following restrictions][ref-bool] on its `bool` +type: + +> The value `false` has the bit pattern `0x00` and the value `true` has the bit +> pattern `0x01`. It is _undefined behavior_ for an object with the boolean type +> to have any other bit pattern. [emphasis added] + +[ref-bool]: https://doc.rust-lang.org/reference/types/boolean.html -They're encoded as +Depending on one's perspective, this either presents an opportunity or a +difficulty. -Boolean values must match a precise representation to avoid undefined behavior: +
- - - - - - - - - - - - - -
Bit patternRust type
00000001true
00000000false
Other patternsUndefined
+--- -You have two tasks in this exercise. +## Exercise -- First, - - Create Rust type, `Boolean` type that represents a Boolean value in a - spec-compliant way - - The first create values of your type from `u8` with no overhead cost while - ensuring that undefined behavior is impossible. -- Secondly, review someone else's implementation. +- Define a type that represents a Boolean value +- A zero-cost conversion function from `u8` to your new type +- A zero-cost conversion function from your new type to `bool` -Starter code: +```rust,editable +fn byte_to_boolean(byte: u8) -> Boolean { + todo!("convert from u8") +} -Part 1 involves a `Boolean`, which is a type that can be +fn boolean_to_bool(b: Boolean) -> bool { + todo!("convert to Rust's bool") +} +```
-Admittedly, there isn't much starter code. +Tell the group that they will need start by defining the `Boolean` type that's +provided in the type signature themselves. (This is not included in the sample +code so that the audience is not biased using a `struct`) -## Discussion +This exercise should be completed quite quickly – no more than 3 minutes +– because we will soon review several examples ourselves. + +### Recommended guidance -- The critical point in these reviews is that learners accurately describe the - contract that callers need to uphold when converting from `u8`. It should be - well described in a Safety section of the docstring. -- Functions should have an `#[inline(always)]` annotation as Rust's `Copy` trait - involves memcpy. We want the compiler to erase the function call +- User-defined Booleans should occupy a single byte the same space. This + precludes using an `enum`. +- The following function annotations are likely to be needed: + - `unsafe` on the `byte_to_boolean` function + - `#[inline]` +- Rust's `Copy` trait involves memcpy and is therefore _not_ zero-cost -> _Aside: TransmuteFrom trait_ +> _Aside: Possible upcoming language feature - the TransmuteFrom trait_ > -> The standard library contains a nightly feature, `transmutability` which +> The standard library contains a nightly feature, [`transmutability`] which > defines the [`std::mem::TransmuteFrom`] trait for performing this kind of > operation. This is one of the outputs from the [Safe Transmute Project] within > the Rust compiler team. +[`std::mem::TransmuteFrom`]: https://doc.rust-lang.org/std/mem/trait.TransmuteFrom.html [`transmutability`]: https://github.com/rust-lang/rust/issues/99571 [Safe Transmute Project]: https://github.com/rust-lang/project-safe-transmute -[`std::mem::TransmuteFrom`]: https://doc.rust-lang.org/std/mem/trait.TransmuteFrom.html + +### Questions to raise + +- How would we indicate to callers that they can cause undefined behavior by + calling `byte_to_boolean` with invalid inputs? + - Safety comments. You could briefly mention safety comments and raise + questions about what learners would expect to see if they were reviewing + code. + - Adding assertions. While not a complete solution, you can suggest that + learners add assertions under debug and/or test. + +### Partial solution focusing on assertions + +```rust +fn is_valid_bool_repr(byte: u8) -> bool { + (byte >> 1) != 0 +} + +fn byte_to_boolean(byte: u8) -> Boolean { + if cfg!(debug_assertions) || cfg!(test) { + assert!(is_valid_bool_repr(byte), "input must be 0x00 or 0x01") + } + + todo!("convert from u8") +} +``` + +### Full solution + +```rust +struct Boolean(bool); + +fn is_valid_bool_repr(byte: u8) -> bool { + (byte >> 1) != 0 +} + +/// Create a `Boolean` from a `u8` +/// +/// ## Safety +/// +/// This function produces undefined bahavior when `byte` is neither 0 nor 1. +unsafe fn byte_to_boolean(byte: u8) -> Boolean { + if cfg!(debug_assertions) || cfg!(test) { + assert!(is_valid_bool_repr(byte), "input must be 0x00 or 0x01") + } + + // SAFETY: Valid for all valid inputs into this function + let b = unsafe { std::mem::transmute(byte) }; + Boolean(b) +} + +fn boolean_to_byte(b: Boolean) -> bool { + b.0 +} + +fn main() { + let t = 123; + let ub = unsafe { byte_to_boolean(t) }; + if boolean_to_byte(ub) { + println!(r"¯\_(ツ)_/¯"); + } +} +``` ### Picking a data structure @@ -252,16 +324,226 @@ width and alignment requirements of the spec for `bool`. Secondly, they're very difficult to work with in practice. If they wish to make use of the typestate pattern, then a possible alternative -would be to . +would be to create three independent types. This creates an ergonomic problem, +but might that might be justified if you only want to permit a follow-on +function from being only called from a "true" value. ```rust -struct Boolean(bool); struct True(bool); struct False(bool); ``` -## Code review +
+ +--- + +## Code reviews + +We'll now be critiquing other implementations of the previous exercise. + +
+ +The critical point in these reviews is that learners accurately describe the +contract that callers need to uphold when converting from `u8`. It should be +well described in a Safety section of the docstring. + +
+ +--- + +## Code review 1 + +Critique this code and suggest improvements, if any: + +```rust,editable +struct Boolean(u8); + +fn byte_to_boolean(b: u8) -> Boolean { + Boolean(b) +} + +fn boolean_to_byte(boolean: Boolean) -> u8 { + boolean.0 +} + +fn boolean_to_bool(boolean: Boolean) -> bool { + match b.0 { + 0 => false, + _ => true, + } +} +``` + +
+ +Which function should be `unsafe`? It could either be at the "constructor" +(`byte_to_boolean`) or when the Boolean is converted to a Rust-native `bool` +(`boolean_to_bool`). + +
+ +--- + +## Code review 2 + +```rust,editable +struct Boolean(bool); + +fn byte_to_boolean(b: u8) -> Boolean { + match b.0 { + 0 => Boolean(false), + _ => Boolean(true), + } +} + +fn boolean_to_byte(boolean: Boolean) -> u8 { + boolean.0 as u8 +} + +fn boolean_to_bool(boolean: Boolean) -> bool { + boolean.0 +} +``` + +
+ +In this version, we mask the error. All non-zero inputs are coerced to `true`. +We store the internal field of the `Boolean` struct as a `bool` to make as much +use of Rust's type system as possible. -Suggest that there be some advice +However, this `byte_to_boolean` is not zero-cost. There is still a `match` +operation that's required.
+ +--- + +## Code review 3 + +```rust,editable +#[repr(C)] +union Boolean { + raw: u8, + rust: bool, +} + +fn byte_to_boolean(b: u8) -> Boolean { + Boolean { raw: b } +} + +fn boolean_to_byte(boolean: Boolean) -> u8 { + unsafe { boolean.rust } +} + +fn boolean_to_bool(boolean: Boolean) -> bool { + unsafe { boolean.raw } +} +``` + +--- + +## Code review 4 + +```rust,editable +struct Boolean(bool); + +fn byte_to_boolean(b: u8) -> Boolean { + let b: bool = unsafe { sys::mem::transmute(b) }; + + Boolean(b) +} + +fn boolean_to_byte(boolean: Boolean) -> u8 { + boolean.0 as u8 +} + +fn boolean_to_bool(boolean: Boolean) -> bool { + boolean.0 +} +``` + +--- + +## Scratch Space + +> Note: Content following this comment is from a previous revisions and is being +> retained temporarily. + +> TODO(timclicks): Review the following content for anything useful that should +> be retained. + +--- + +Or in Rust syntax: + +```rust +struct Boolean(u8); + +const true: Boolean = Boolean(1); +const false: Boolean = Boolean(0); +``` + +>> Instructor Notes +> +> We define a type here so that there is no confusion in the type system between +> `u8` and `Boolean`. + +From a theoretical perspective, the two states `true` and `false` be represented +by a single bit. However, the smallest integer available is `u8`, which has 254 +additional states. + +This is a similar problem to the mismatch casting from a `i64` to `i32`, but +there is a significant difference. When converting an integer from a 64-bit type +to a 32-bit type, there is not enough space in the narrower type for all +possible input values. They can't all fit. In the case of casting from `u8` to +`bool`, the number of bits isn't the issue. It's the standard that imposes the +additional restrictions. + +Depending on one's perspective, this either presents an opportunity or a +challenge. + +Moreover, [Rust (following C) imposes the following restrictions][ref-bool] on +its `bool` type: + +> The value `false` has the bit pattern `0x00` and the value `true` has the bit +> pattern `0x01`. It is _undefined behavior_ for an object with the boolean type +> to have any other bit pattern. [emphasis added] + +Many CPUs, don't strictly have a "Boolean type". They have Boolean operations. + +- For true, CPUs ask. Does this value match + +[ref-bool]: https://doc.rust-lang.org/reference/types/boolean.html + +## Exercise + +Implement two conversion functions, `byte_to_boolean()` and `boolean_to_byte()`: + +```rust +struct Boolean(u8); + +fn byte_to_boolean(b: u8) -> Boolean { + todo!(); +} + +fn boolean_to_byte(b: Boolean) -> u8 { + todo!(); +} +``` + +## Discussion + +Should this function be marked as unsafe? + +```rust +struct Boolean(u8); + +fn byte_to_boolean(b: u8) -> Boolean { + match b { + 0 => false, + _ => true, + } +} +``` + +--- From f1536000b4f65f3deac5dff6f87b4cd6237d9c14 Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Thu, 7 Aug 2025 21:01:03 +1200 Subject: [PATCH 48/51] WIP: initialization chapter --- src/SUMMARY.md | 1 + .../understanding-unsafety/initialization.md | 498 ++++++++++++++++++ 2 files changed, 499 insertions(+) create mode 100644 src/unsafe-deep-dive/understanding-unsafety/initialization.md diff --git a/src/SUMMARY.md b/src/SUMMARY.md index a9539b5b308c..3b0a59677ce8 100644 --- a/src/SUMMARY.md +++ b/src/SUMMARY.md @@ -457,6 +457,7 @@ - [Understanding Unsafety](unsafe-deep-dive/understanding-unsafety.md) - [Undefined behavior](unsafe-deep-dive/understanding-unsafety/undefined-behavior.md) - [Out of bounds](unsafe-deep-dive/understanding-unsafety/out-of-bounds.md) + - [Initialization](unsafe-deep-dive/understanding-unsafety/initialization.md) - [Mechanics](unsafe-deep-dive/mechanics.md) - [Example: Representing Booleans](unsafe-deep-dive/mechanics/representing-booleans.md) - [Extension: Representing Char](unsafe-deep-dive/mechanics/representing-char.md) diff --git a/src/unsafe-deep-dive/understanding-unsafety/initialization.md b/src/unsafe-deep-dive/understanding-unsafety/initialization.md new file mode 100644 index 000000000000..7d611e19ce7e --- /dev/null +++ b/src/unsafe-deep-dive/understanding-unsafety/initialization.md @@ -0,0 +1,498 @@ +# Initialization + +> TODO(timclicks): split this content into multiple sub-sections + +--- + +> All runtime-allocated memory in a Rust program begins its life as +> uninitialized. +> +> — +> [The Rustonomicon](https://doc.rust-lang.org/nomicon/uninitialized.html) + +
+ +Validity related to other concepts that we've seen before, such as _undefined +behavior_. Validity is a precondition for well-defined behavior. + +This segment of the course describes what initialization is and some of its +related concepts, such as _alignment_ and _validity_, and how they relate to one +that we've seen before: _undefined behavior_. + +The primary focus of the segment though is to introduce the +`std::mem::MaybeUninit` type. Its role is to allow programmers to interact with +memory that is uninitialized and convert it to some initialized state. + +To get this to work, we'll work through several code examples and other +exercises. + +--- + +```rust,editable +fn mystery() -> u32 { + let mut x: u32; + + unsafe { x } +} + +fn main() { + let a = mystery(); + println!("{a}") +} +``` + +
+ +What is the value of `x`? + +**Action:** Pause and await for people's responses. + +We can't know. + +This is a case of an _uninitialized_ value. When we define the variable on line +2, the compiler makes space for an integer on the stack, however it makes no +guarantees that there is a valid value there. + +**Action:** Attempt compilation. + +**Action:** Suggested change: + +```rust +use std::mem; + +fn mystery() -> u32 { + let mut x: u32 = unsafe { mem::MaybeUninit::uninit().assume_init() }; + + x +} + +fn main() { + let a = mystery(); + println!("{a}") +} +``` + +Initialization transforms that a value's bytes from an undetermined state to +something that's guaranteed to be valid. + +As we've seen from the Boolean case, not every bit pattern is a valid value in +Rust's `bool` type. + +When a value uninitialized, it's impossible to know what'. + +Rust requires every variable is _valid_. An important part of validity is +ensuring that values are initialized before use. + +Getting this wrong is so unsafe that you cannot simply use the `unsafe` keyword +to convince Rust to compile your code. + +
+ +--- + +## Validity + +- What is validity? +- Why is it important? + +
+ +This segment of the course describes what that means and why it's important. + +Validity related to other concepts that we've seen before, such as _undefined +behavior_. Validity is a precondition for well-defined behavior. + +
+ +--- + +## Validity + +\ +\ +\ + +Bit patterns \ + +Valid values \ + + +
+ +Data types define what it means to be _valid_. For some types, such as integers, +every bit pattern is a valid type. For many others though, there are some +patterns which are not. + +In Rust, references are not allowed to be NULL and `char` values must be valid +Unicode scalar values. + +Outside of bit patterns, there are also other considerations. For example, many +types impose rules that must be enforced that extend past. The way to find these +rules is by the documentation. Therefore, we're also going to spend time +examining docs. + +
+ +--- + +## Why `MaybeUninit`? + +```rust,editable +``` + +
+ +Rust requires every variable to be initialized before use. More generally, +compilers assume that all variables are properly initialized. + +But for FFI and for creating high performance data structures—sometimes +referred to as getting stuff done—we need the ability to describe +uninitialized buffers. + +
+ +--- + +## Why care about initialization? + +```rust,editable +fn create_1mb_buffer() -> Vec { + vec![0; 1_000_000] +} +``` + +
+ +You're probably aware that this code allocates a new block of memory. It also +has a second phase that is slightly more subtle. After allocation, every byte +has its bits set to zero. + +However, there are cases where this second step is unnecessary. For example, if +we're using this buffer for I/O, then we're going to overwrite the memory with +whatever data that is going to be provided. + +
+ +--- + +## Case study: selective initialization + +```rust +use std::mem::MaybeUninit; + +/// Builds a sparse row where only certain positions have values +struct ArrayFastBuilder { + data: [MaybeUninit; N], + initialized: [bool; N], + count: usize, +} + +impl ArrayFastBuilder { + fn new() -> Self { + Self { + data: unsafe { MaybeUninit::uninit().assume_init() }, + initialized: [false; N], + count: 0, + } + } + + fn set(&mut self, index: usize, value: f64) -> Result<(), &'static str> { + if index >= N { + return Err("Index out of bounds"); + } + + if !self.initialized[index] { + self.count += 1; + } + + self.data[index] = MaybeUninit::new(value); + self.initialized[index] = true; + Ok(()) + } + + fn get(&self, index: usize) -> Option { + if index < N && self.initialized[index] { + Some(unsafe { self.data[index].assume_init() }) + } else { + None + } + } + + fn into_array(self, default: f64) -> [f64; N] { + let mut result: [MaybeUninit; N] = std::array::from_fn(|i| { + if self.initialized[i] { + self.data[i] // Already initialized + } else { + MaybeUninit::new(default) + } + }); + + unsafe { + std::ptr::read( + &result as *const [MaybeUninit; N] as *const [f64; N], + ) + } + } + + fn into_sparse_vec(self) -> Vec<(usize, f64)> { + let mut result = Vec::with_capacity(self.count); + + for (i, is_init) in self.initialized.iter().enumerate() { + if *is_init { + let value = unsafe { self.data[i].assume_init() }; + result.push((i, value)); + } + } + + result + } +} +``` + +
+ +Here is an application of what we just saw. `ArrayFastBuilder` reserves space on +the stack for the contents, but skips avoids zeroing that array when it is +created. + +
+ +--- + +## What is the contract? + +Whenever we're creating unsafe code, we need to consider what the contract is. + +What does `assume_init(self)` mean? What do we need to do to guarantee that +initialization it is no longer an assumption. + +
+ +What is this code asking of us? What are the expectations that we need to +satisfy? If we don't know the expectations, where would we find them? + +
+ +--- + +## Layout guarantees + +The following program runs sucessfully for `u64` values. Is that the case for +all possible types `T`? + +```rust,editable +use std::mem::MaybeUninit; + +fn main() { + + let u = MaybeUninit::uninit(); + + assert_eq!(size_of::>(), size_of::()); + assert_eq!(align_of::>(), align_of::()); +} +``` + +Look through the documentation for `MaybeUninit` to verify your assumptions. + +
+ +Another way to ask this is to check whether guarantees does `MaybeUninit` +provide about its memory layout? + +Here is [the relevant quote][q] from the Layout section of the docs: + +> `MaybeUninit` is guaranteed to have the same size, alignment, and ABI as +> `T`. + +[q]: https://doc.rust-lang.org/std/mem/union.MaybeUninit.html#layout-1 + +
+ +--- + +## What about safety when panicking? + +```rust +``` + +
+ +Rust's drop behavior presents a challenge during panics. In situations where +there is partially-initiated values, dropping causes undefined behavior. + +
+ +--- + +## Questions for review + +Where should the safety comment be? What kinds of tests can we perform. Fuzzing. + +--- + +## Exercise: Vec + +Look up the documentation for `assume_init` and describe why this creates +undefined behavior: + +```rust +use std::mem::MaybeUninit; + +fn main() { + let x = MaybeUninit::>::uninit(); + let x_ = unsafe { x.assume_init() }; + + println!("{x_:?}") +} +``` + +
+ +Many types have additional invariants that need to be upheld. For example, +`Vec` has a different representation when it's first created with `::new()` +compared to after its first entry is inserted. It lazily allocates memory and +there is no allocation involved until space is actually needed. + +From the [doc comment of `assume_init()`][docs]: + +> It is up to the caller to guarantee that the `MaybeUninit` really is in an +> initialized state. Calling this when the content is not yet fully initialized +> causes immediate undefined behavior. The type-level documentation contains +> more information about this initialization invariant. +> +> On top of that, **remember that most types have additional invariants beyond +> merely being considered initialized at the type level**. For example, a +> 1-initialized `Vec` is considered initialized (under the current +> implementation; this does not constitute a stable guarantee) because the only +> requirement the compiler knows about it is that the data pointer must be +> non-null. Creating such a `Vec` does not cause immediate undefined +> behavior, but will cause undefined behavior with most safe operations +> (including dropping it). +> +> _Emphasis added_ + +[docs]: https://doc.rust-lang.org/std/mem/union.MaybeUninit.html#method.assume_init + +### Extension exercise + +Ask the class to think of other types that require special handling: + +- `char` outside the range of a Unicode scalar + (`[0x0000..=0xD7FF, 0xE000..=0x10FFFF]`) +- References, (NULL is a valid pointer, but not a valid reference) +- Types backed by `Vec<_>`, including `String`. +- Pinned types, i.e. `Pin` +- Non-zero types, i.e. `NonZeroU32`, etc + +
+ +--- + +## MaybeUninit use case: initializing a struct field by field + +```rust +use std::mem::MaybeUninit; +use std::ptr::addr_of_mut; + +#[derive(Debug, PartialEq)] +pub struct FileFormat { + marker: [u8; 4], + len: u32, + data: Vec, +} + +fn main() { + let rfc = { + let mut uninit: MaybeUninit = MaybeUninit::uninit(); + let ptr = uninit.as_mut_ptr(); + + unsafe { + addr_of_mut!((*ptr).name).write([b'R', b'F', b'C', b'1']); + } + + unsafe { + addr_of_mut!((*ptr).len).write(3); + } + + unsafe { + addr_of_mut!((*ptr).list).write(vec![0, 1, 2]); + } + + unsafe { uninit.assume_init() } + }; + + assert_eq!( + rfc, + FileFormat { + name: b"RFC1", + len: 3 + data: vec![0, 1, 2] + } + ); +} +``` + +--- + +## Use case: partial initialization + +```rust,editable +use std::mem::MaybeUninit; + +const SIZE: usize = 10_000_000; + +fn with_zeroing() -> Vec { + let mut vec = vec![0u8; SIZE]; + for i in 0..SIZE { + vec[i] = (i % 256) as u8; + } + vec +} + +fn without_zeroing() -> Vec { + let mut vec = Vec::with_capacity(SIZE); + unsafe { + let ptr = vec.as_mut_ptr(); + for i in 0..SIZE { + ptr.add(i).write((i % 256) as u8); + } + vec.set_len(SIZE); + } + vec +} +``` + +
+ +
+ +--- + +## SCRATCH SPACE + +Key APIs: + +- `MaybeUninit::uninit()`: create an uninitialized value +- `MaybeUninit::zeroed()`: create a zeroed, but possibly invalid, value +- `MaybeUninit::write(val: T)`: write a new value in-place + +- `unsafe fn assume_init(self) -> T` — extract the initialized value +- `as_ptr(self) -> &T` / `as_mut_ptr()`: raw pointers to the underlyin storage + +Safety contract: Calling assume_init on uninitialized data is UB. + +- `MaybeUninit` is Rust's way to describe memory that is of a potentially + invalid state. You are expected to bring the contents of the memory to a valid + state, then call `assume_init() + + `. that will eventually + hold a`T`, but isn't ready yet. +- It acts as a **contract** with the compiler: "This space is for a `T`, but + it's empty/uninitialized for now." +- The actual initialization (writing a `T` into that memory) and the final + declaration that it's ready (`assume_init()`) are usually `unsafe` operations. + This puts the burden of correctness on the programmer, ensuring that the + memory truly holds a valid `T` before Rust starts trusting it. From 7261123cb70fe05fc850017a6103d97ab122a7d6 Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Tue, 12 Aug 2025 20:46:23 +1200 Subject: [PATCH 49/51] WIP: starting a virtual mem intro --- .../understanding-unsafety/initialization.md | 155 ++++++++++++++---- 1 file changed, 126 insertions(+), 29 deletions(-) diff --git a/src/unsafe-deep-dive/understanding-unsafety/initialization.md b/src/unsafe-deep-dive/understanding-unsafety/initialization.md index 7d611e19ce7e..a008c909331d 100644 --- a/src/unsafe-deep-dive/understanding-unsafety/initialization.md +++ b/src/unsafe-deep-dive/understanding-unsafety/initialization.md @@ -4,6 +4,131 @@ --- +## Addressing data + +```rust +static s: &str = "_"; + +fn main() { + let l = 123; + let h = Box::new(123); + + println!("{:p}", &l); + println!("{:p}", s); + println!("{:p}", &*h); +} +``` + +
+ +All data stored in a program lives at an _address_, a number which the operating +system can use to retrieve or store data at that address. + +Local variables, such as `l`, are stored on the "stack". Memory addresses on the +stack are quite high. (When executed, the program probably prints out a value +near `0x7fffffffffff`) + +Static variables are lower + +Functions also stored in memory. In Rust, the keyword `fn` signifies a function +pointer. Its address can also be printed. + +### Questions + +- Q: Why does addresses printed a not start at 1?\ + A: The kernel reserves half of a process's address space for itself in the + lower half. + +### Variable mapping + +- `l` - L for _local_ - stored on the "stack" +- `h` - H for _heap_ +- `f` - F for _function_ +- `s` - S for _static_ + +
+ +--- + +## Memory lifecycle + +Unpaged + +Mapped but unallocated + +Allocated + +Allocated and "available" (uninitialized) + +Allocated and "active" (ininitialized) + +Call to free + +Deallocated but mapped + +Unpaged + +
+ +Variables, the data that is used to represent them, have a surprisingly complex +lifecycle. + +Operating systems, programming languages and hardware cooperate to programs with +convenient access to data stored on physical devices, such as RAM chips. +Programs are provided with a façade, an imaginary array of bytes addressed from +1 to _n_, that allows them to store and retrieve data. + +This imaginary array of bytes is called the _virtual address space_ and this +setup is called _virtual memory_. + +Each operating system process has its own virtual address space, meaning that +the same address means different things in different processes. Another way of +thinking about this is that process believes that it has exclusive access to the +data available to the machine. + +The operating system kernel is responsible for mapping between these virtual +memory addresses that your program understands to something that the hardware +understands. + +To do this bookkeeping, the kernel stores information in its own data structures +and relies on concept of a _memory page_. Pages are typically 4 KB in size +(although this can be tuned). + +Virtual memory is complex and has many stages. + +The kernel understands physical memory addresses. User-space programs only have +access to virtual memory. + +The details are complex and we don't want to turn this class into a +graduate-&spy;level computer architecture course. However, understanding this +system is useful, because it explains why programmers use uninitialized memory +for performance-critical code. + +The mapping between memory addresses and the pages themselves is also stored +within memory, in a data structure that is called TLB. TLB expands to +"thread-local buffer", which is a name that has persisted for historical +reasons. + +The CPU provides the operating system with privileged instructions for +interacting with hardware, including main memory. + +Rust's ownership model adds its own characteristics to this overall model. The +data is likely to still be present in the original location, after variables are +moved, however this is inaccessible to the program. + +## References + +An extensive introduction background + +Drepper, Ulrich (2007) "What every programmer should know about memory" + +The Linux kernel provides extensive documentation about how virtual memory works +on each platform https://www.kernel.org/doc/html/v5.8/x86/x86_64/mm.html + +
+ +--- + > All runtime-allocated memory in a Rust program begins its life as > uninitialized. > @@ -280,7 +405,7 @@ satisfy? If we don't know the expectations, where would we find them? ## Layout guarantees -The following program runs sucessfully for `u64` values. Is that the case for +The following program runs successfully for `u64` values. Is that the case for all possible types `T`? ```rust,editable @@ -468,31 +593,3 @@ fn without_zeroing() -> Vec {
- ---- - -## SCRATCH SPACE - -Key APIs: - -- `MaybeUninit::uninit()`: create an uninitialized value -- `MaybeUninit::zeroed()`: create a zeroed, but possibly invalid, value -- `MaybeUninit::write(val: T)`: write a new value in-place - -- `unsafe fn assume_init(self) -> T` — extract the initialized value -- `as_ptr(self) -> &T` / `as_mut_ptr()`: raw pointers to the underlyin storage - -Safety contract: Calling assume_init on uninitialized data is UB. - -- `MaybeUninit` is Rust's way to describe memory that is of a potentially - invalid state. You are expected to bring the contents of the memory to a valid - state, then call `assume_init() - - `. that will eventually - hold a`T`, but isn't ready yet. -- It acts as a **contract** with the compiler: "This space is for a `T`, but - it's empty/uninitialized for now." -- The actual initialization (writing a `T` into that memory) and the final - declaration that it's ready (`assume_init()`) are usually `unsafe` operations. - This puts the burden of correctness on the programmer, ensuring that the - memory truly holds a valid `T` before Rust starts trusting it. From a0d526af8534bc2324ff8986d961c455d2ffa582 Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Wed, 13 Aug 2025 14:39:27 +1200 Subject: [PATCH 50/51] WIP: more content --- .../understanding-unsafety/initialization.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/unsafe-deep-dive/understanding-unsafety/initialization.md b/src/unsafe-deep-dive/understanding-unsafety/initialization.md index a008c909331d..32f3f9ea9be5 100644 --- a/src/unsafe-deep-dive/understanding-unsafety/initialization.md +++ b/src/unsafe-deep-dive/understanding-unsafety/initialization.md @@ -91,8 +91,15 @@ memory addresses that your program understands to something that the hardware understands. To do this bookkeeping, the kernel stores information in its own data structures -and relies on concept of a _memory page_. Pages are typically 4 KB in size -(although this can be tuned). +and relies on concept of a _memory page_. Pages allow the CPU, kernel and +storage hardware to improve their coordination, by referring to reduce the +number of lookups when memory addresses are nearby. Pages also allow groups of +memory addresses to be given attributes, such as write or execution. + +You may be familiar with the term _segmentation fault_, often shortened to _seg +fault_. This term arises because each page is a _segment_ of the very large +virtual address space. Only a small fraction of the address space is given a +page. Virtual memory is complex and has many stages. From 95dbe845e5561af59f0a868011b242a7cc89496e Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Wed, 13 Aug 2025 17:02:17 +1200 Subject: [PATCH 51/51] More content on memory lifecycle --- .../understanding-unsafety/initialization.md | 188 +++++++++++------- 1 file changed, 112 insertions(+), 76 deletions(-) diff --git a/src/unsafe-deep-dive/understanding-unsafety/initialization.md b/src/unsafe-deep-dive/understanding-unsafety/initialization.md index 32f3f9ea9be5..b87e07340dcc 100644 --- a/src/unsafe-deep-dive/understanding-unsafety/initialization.md +++ b/src/unsafe-deep-dive/understanding-unsafety/initialization.md @@ -4,75 +4,26 @@ --- -## Addressing data - -```rust -static s: &str = "_"; - -fn main() { - let l = 123; - let h = Box::new(123); - - println!("{:p}", &l); - println!("{:p}", s); - println!("{:p}", &*h); -} -``` - -
- -All data stored in a program lives at an _address_, a number which the operating -system can use to retrieve or store data at that address. - -Local variables, such as `l`, are stored on the "stack". Memory addresses on the -stack are quite high. (When executed, the program probably prints out a value -near `0x7fffffffffff`) - -Static variables are lower - -Functions also stored in memory. In Rust, the keyword `fn` signifies a function -pointer. Its address can also be printed. - -### Questions - -- Q: Why does addresses printed a not start at 1?\ - A: The kernel reserves half of a process's address space for itself in the - lower half. - -### Variable mapping - -- `l` - L for _local_ - stored on the "stack" -- `h` - H for _heap_ -- `f` - F for _function_ -- `s` - S for _static_ - -
- ---- - ## Memory lifecycle -Unpaged - -Mapped but unallocated - -Allocated - -Allocated and "available" (uninitialized) - -Allocated and "active" (ininitialized) - -Call to free - -Deallocated but mapped - -Unpaged +- Unpaged +- Mapped but unallocated +- Allocated +- Allocated and "available" (uninitialized) +- Allocated and "active" (initialized) +- Deallocated but mapped +- Unpaged
Variables, the data that is used to represent them, have a surprisingly complex lifecycle. +The details are complex and we don't want to turn this class into a +graduate-&spy;level computer architecture course. However, understanding this +system is useful, because it explains why programmers use uninitialized memory +for performance-critical code. + Operating systems, programming languages and hardware cooperate to programs with convenient access to data stored on physical devices, such as RAM chips. Programs are provided with a façade, an imaginary array of bytes addressed from @@ -91,26 +42,68 @@ memory addresses that your program understands to something that the hardware understands. To do this bookkeeping, the kernel stores information in its own data structures -and relies on concept of a _memory page_. Pages allow the CPU, kernel and -storage hardware to improve their coordination, by referring to reduce the -number of lookups when memory addresses are nearby. Pages also allow groups of -memory addresses to be given attributes, such as write or execution. +and relies on concept of a _memory page_. Pages allow components within the +computer to work together, including the OS kernel, the OS process, the +program's threads, the CPU, and storage hardware. Pages allow sections of the +phyiscal memory to be reserved for specific purposes and for security +restrictions to be enforced. also allow groups of memory addresses to be given +attributes, such as write or execution. + +to improve their coordination, by referring to reduce the number of lookups when +memory addresses are nearby. You may be familiar with the term _segmentation fault_, often shortened to _seg fault_. This term arises because each page is a _segment_ of the very large virtual address space. Only a small fraction of the address space is given a page. -Virtual memory is complex and has many stages. +Virtual memory is complex and has many stages. We'll skip over most of them to +allow us to build a general mental model of what's happening at runtime during a +variable's lifecycle: + +- Memory starts as _unmapped_ and available to OS processes that require it. The + operating system knows that there is available space on the hardware, but the + process's virtual address space does not yet include a mapping to it. + +As space to store data is needed, memory transitions from the unmapped state: + +- Memory is then _mapped_ by the OS. The operation system maps a portion of the + available space on the hardware to the process's virtual address space. +- The program's allocator then _allocates_ memory. +- This allocated memory then becomes available to the program, but is in an + _uninitialized_ state. +- When the variables are created within that memory and are guaranteed to be + _valid_, the memory is said to be _initialized_. + +As space for data decreases, memory reverts to the unmapped state: + +- After some time, the variable's lifetime ends. It has been moved or dropped. + The memory for the variable in the original position may not have been + modified though, however it is now invalid to access. Accessing those bytes at + this point is _undefined behavior_ . +- At some point, the unused memory is _deallocated_. This memory addresses + remain mapped. +- Later on, when the memory page is no longer being used, the operating system + may remove the page from the mapping table, allowing other processes to make + use of the hardware. + +Accessing uninitialized data is undefined behavior and a very serious safety +hazard. + +### Other notes + +When virtual machines and hypervisors are involved, additional layers of mapping +are involved. + +Unless your operating system or allocator provides specific guarantees, memory +provided to a program is not necessarily in a clean state. + +Allocators: The allocator is part of the program itself. The operating system is +agnostic to how The kernel understands physical memory addresses. User-space programs only have access to virtual memory. -The details are complex and we don't want to turn this class into a -graduate-&spy;level computer architecture course. However, understanding this -system is useful, because it explains why programmers use uninitialized memory -for performance-critical code. - The mapping between memory addresses and the pages themselves is also stored within memory, in a data structure that is called TLB. TLB expands to "thread-local buffer", which is a name that has persisted for historical @@ -123,19 +116,62 @@ Rust's ownership model adds its own characteristics to this overall model. The data is likely to still be present in the original location, after variables are moved, however this is inaccessible to the program. -## References +
+ +--- + +## Addressing data + +```rust +static s: &str = "_"; + +fn main() { + let l = 123; + let h = Box::new(123); + + println!("{:p}", &l); + println!("{:p}", s); + println!("{:p}", &*h); +} +``` + +
+ +All data stored in a program lives at an _address_, a number which the operating +system can use to retrieve or store data at that address. + +Local variables, such as `l`, are stored on the "stack". Memory addresses on the +stack are quite high. (When executed, the program probably prints out a value +near `0x7fffffffffff`) + +Static variables are lower + +Functions also stored in memory. In Rust, the keyword `fn` signifies a function +pointer. Its address can also be printed. + +### Questions -An extensive introduction background +- Q: Why does addresses printed a not start at 1?\ + A: The kernel reserves half of a process's address space for itself in the + lower half. -Drepper, Ulrich (2007) "What every programmer should know about memory" +### Variable mapping -The Linux kernel provides extensive documentation about how virtual memory works -on each platform https://www.kernel.org/doc/html/v5.8/x86/x86_64/mm.html +- `l` - L for _local_ - stored on the "stack" +- `h` - H for _heap_ +- `f` - F for _function_ +- `s` - S for _static_
--- +## Memory lifecycle - stack + +- Allocation: + +--- + > All runtime-allocated memory in a Rust program begins its life as > uninitialized. >