From de8064a14c8914ff306a160212c2ddbe538906b5 Mon Sep 17 00:00:00 2001 From: Miguel Young de la Sota Date: Mon, 30 Dec 2024 19:31:34 -0800 Subject: [PATCH 1/9] chore: Bump MSRV --- .github/workflows/rust.yml | 2 +- rust-toolchain.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 34574af..b25d1d5 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -7,7 +7,7 @@ on: env: CARGO_TERM_COLOR: always - NIGHTLY: 'nightly-2023-09-30' + NIGHTLY: 'nightly-2025-01-01' jobs: check_lints: diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 1c8cfba..6cc20ea 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,3 +1,3 @@ [toolchain] -channel = "1.75.0" +channel = "1.83.0" profile = "default" From 581c5e49a19cc6b7e974f15f560faf9d5c95df06 Mon Sep 17 00:00:00 2001 From: Miguel Young de la Sota Date: Mon, 30 Dec 2024 19:38:38 -0800 Subject: [PATCH 2/9] ilex: Remove `SpanId` from public API --- ilex/src/file/mod.rs | 28 +++++++--- ilex/src/lib.rs | 2 +- ilex/src/rt/mod.rs | 2 +- ilex/src/testing/recognize.rs | 2 +- ilex/src/token/mod.rs | 98 ++++++++++++++++++----------------- ilex/tests/json.rs | 1 - 6 files changed, 76 insertions(+), 57 deletions(-) diff --git a/ilex/src/file/mod.rs b/ilex/src/file/mod.rs index cbed544..06d03a0 100644 --- a/ilex/src/file/mod.rs +++ b/ilex/src/file/mod.rs @@ -3,7 +3,6 @@ use std::cell::RefCell; use std::fmt; use std::fmt::Write; -use std::iter; use std::ops::Bound; use std::ops::Index; use std::ops::RangeBounds; @@ -121,7 +120,7 @@ pub struct Span { /// This type is just a numeric ID; in order to do anything with it, you'll /// need to call one of the functions in [`Spanned`]. #[derive(Copy, Clone)] -pub struct SpanId(u32); +pub(crate) struct SpanId(u32); impl fmt::Debug for SpanId { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { @@ -436,12 +435,29 @@ impl<'ctx> Comments<'ctx> { } } -impl<'a> IntoIterator for &'a Comments<'_> { - type Item = SpanId; - type IntoIter = iter::Copied>; +impl<'a, 'ctx> IntoIterator for &'a Comments<'ctx> { + type Item = Span; + type IntoIter = CommentsIter<'a, 'ctx>; fn into_iter(self) -> Self::IntoIter { - unsafe { &*self.slice.1 }.iter().copied() + CommentsIter { + iter: unsafe { &*self.slice.1 }.iter(), + ctx: self.ctx, + } + } +} + +// Iterator over a [`Comments`]. 
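+//
+// Each call to `next()` resolves the next interned `SpanId` against the
+// owning `Context`, so callers of the public API only ever see resolved
+// `Span`s.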
+pub struct CommentsIter<'a, 'ctx> { + iter: slice::Iter<'a, SpanId>, + ctx: &'ctx Context, +} + +impl<'a, 'ctx> Iterator for CommentsIter<'a, 'ctx> { + type Item = Span; + + fn next(&mut self) -> Option { + self.iter.next().map(|s| s.span(self.ctx)) } } diff --git a/ilex/src/lib.rs b/ilex/src/lib.rs index e781288..06f2088 100644 --- a/ilex/src/lib.rs +++ b/ilex/src/lib.rs @@ -271,7 +271,7 @@ pub use { crate::{ file::Context, file::File, - file::{Span, SpanId, Spanned}, + file::{Span, Spanned}, report::{Fatal, Report}, rule::Rule, spec::{Lexeme, Spec, SpecBuilder}, diff --git a/ilex/src/rt/mod.rs b/ilex/src/rt/mod.rs index 631bd05..96a56ec 100644 --- a/ilex/src/rt/mod.rs +++ b/ilex/src/rt/mod.rs @@ -83,7 +83,7 @@ pub enum Kind { Keyword, Ident(SpanId), Quoted { - content: Vec, + content: Vec>, open: SpanId, close: SpanId, }, diff --git a/ilex/src/testing/recognize.rs b/ilex/src/testing/recognize.rs index b1a2626..2027509 100644 --- a/ilex/src/testing/recognize.rs +++ b/ilex/src/testing/recognize.rs @@ -83,7 +83,7 @@ impl Matcher { state.match_options("suffix", suffix.as_ref(), tok.suffix()); } (Kind::Quoted { delims, content, prefix, suffix }, Any::Quoted(tok)) => { - let (open, close) = tok.delimiters(); + let [open, close] = tok.delimiters(); state.match_spans("open quote", &delims.0, open); state.match_spans("close quote", &delims.1, close); state.match_options("prefix", prefix.as_ref(), tok.prefix()); diff --git a/ilex/src/token/mod.rs b/ilex/src/token/mod.rs index 2682a84..0f9dde7 100644 --- a/ilex/src/token/mod.rs +++ b/ilex/src/token/mod.rs @@ -365,18 +365,18 @@ pub struct Bracket<'lex> { impl<'lex> Bracket<'lex> { /// Returns this token's open delimiter. - pub fn open(self) -> SpanId { - self.open + pub fn open(self) -> Span { + self.open.span(self.ctx) } /// Returns this token's close delimiter. - pub fn close(self) -> SpanId { - self.close + pub fn close(self) -> Span { + self.close.span(self.ctx) } /// Returns this token's quote delimiters. - pub fn delimiters(self) -> [SpanId; 2] { - [self.open, self.close] + pub fn delimiters(self) -> [Span; 2] { + [self.open(), self.close()] } /// Returns a cursor over this bracket's internal tokens (not including the @@ -455,16 +455,16 @@ pub struct Ident<'lex> { impl<'lex> Ident<'lex> { /// Returns this token's name span. - pub fn name(self) -> SpanId { + pub fn name(self) -> Span { match &self.tok.kind { - &Kind::Ident(name) => name, + &Kind::Ident(name) => name.span(self.ctx), _ => panic!("non-lexer::Kind::Ident inside of Ident"), } } /// Returns this token's prefix. - pub fn prefix(self) -> Option { - self.tok.prefix + pub fn prefix(self) -> Option { + self.tok.prefix.map(|s| s.span(self.ctx)) } /// Checks whether this identifier has a particular prefix. @@ -473,8 +473,8 @@ impl<'lex> Ident<'lex> { } /// Returns this token's suffix. - pub fn suffix(&self) -> Option { - self.tok.suffix + pub fn suffix(&self) -> Option { + self.tok.suffix.map(|s| s.span(self.ctx)) } /// Checks whether this identifier has a particular prefix. @@ -589,13 +589,13 @@ impl<'lex> Digital<'lex> { } /// Returns the span corresponding to [`Digital::sign()`]. - pub fn sign_span(self) -> Option { - self.rt_blocks().sign.map(|(_, sp)| sp) + pub fn sign_span(self) -> Option { + self.rt_blocks().sign.map(|(_, sp)| sp.span(self.ctx)) } /// Returns the point-separated digit chunks of this digital literal. 
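   ///
   /// For example, with `.` as the point, a literal like `1_000.123` yields
   /// one span per digit block: `1_000` and `123`.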
- pub fn digit_blocks(self) -> impl Iterator + 'lex { - self.digit_slice().iter().copied() + pub fn digit_blocks(self) -> impl Iterator + 'lex { + self.digit_slice().iter().map(|s| s.span(self.ctx)) } /// Returns the exponents of this digital literal, if it any. @@ -612,12 +612,12 @@ impl<'lex> Digital<'lex> { } /// Returns this token's prefix. - pub fn prefix(self) -> Option { + pub fn prefix(self) -> Option { if self.idx > 0 { - return self.rt_blocks().prefix; + return self.rt_blocks().prefix.map(|s| s.span(self.ctx)); } - self.tok.prefix + self.tok.prefix.map(|s| s.span(self.ctx)) } /// Checks whether this identifier has a particular prefix. @@ -626,13 +626,13 @@ impl<'lex> Digital<'lex> { } /// Returns this token's suffix. - pub fn suffix(&self) -> Option { + pub fn suffix(&self) -> Option { if self.idx > 0 { // Exponent tokens never have a suffix. return None; } - self.tok.suffix + self.tok.suffix.map(|s| s.span(self.ctx)) } /// Checks whether this identifier has a particular prefix. @@ -968,19 +968,21 @@ pub struct Quoted<'lex> { impl<'lex> Quoted<'lex> { /// Returns this token's open delimiter. - pub fn open(self) -> SpanId { - self.delimiters().0 + pub fn open(self) -> Span { + self.delimiters()[0] } /// Returns this token's close delimiter. - pub fn close(self) -> SpanId { - self.delimiters().0 + pub fn close(self) -> Span { + self.delimiters()[1] } /// Returns this token's quote delimiters. - pub fn delimiters(self) -> (SpanId, SpanId) { + pub fn delimiters(self) -> [Span; 2] { match &self.tok.kind { - &Kind::Quoted { open, close, .. } => (open, close), + &Kind::Quoted { open, close, .. } => { + [open.span(self.ctx), close.span(self.ctx)] + } _ => panic!("non-lexer::Kind::Quoted inside of Quoted"), } } @@ -995,22 +997,27 @@ impl<'lex> Quoted<'lex> { /// strings. [`Quoted::to_utf8()`] helps with the common case of doing this for /// UTF-8 strings. pub fn raw_content(self) -> impl Iterator + 'lex { - self.content_slice().iter().copied() + self.content_slice().iter().map(|c| match c { + Content::Lit(s) => Content::Lit(s.span(self.ctx)), + Content::Esc(s, e) => { + Content::Esc(s.span(self.ctx), e.map(|e| e.span(self.ctx))) + } + }) } /// Returns the unique single [`Content`] of this token, if it is unique. pub fn unique_content(self) -> Option { - match self.content_slice() { - [unique] => Some(*unique), - _ => None, + if self.content_slice().len() == 1 { + return self.raw_content().next(); } + None } /// Constructs a UTF-8 string in the "obvious way", using this token and a /// mapping function for escapes. pub fn to_utf8( self, - mut decode_esc: impl FnMut(SpanId, Option, &mut String), + mut decode_esc: impl FnMut(Span, Option, &mut String), ) -> String { let total = self .raw_content() @@ -1030,7 +1037,7 @@ impl<'lex> Quoted<'lex> { buf } - fn content_slice(self) -> &'lex [Content] { + fn content_slice(self) -> &'lex [Content] { match &self.tok.kind { Kind::Quoted { content, .. } => content, _ => panic!("non-lexer::Kind::Quoted inside of Quoted"), @@ -1038,8 +1045,8 @@ impl<'lex> Quoted<'lex> { } /// Returns this token's prefix. - pub fn prefix(self) -> Option { - self.tok.prefix + pub fn prefix(self) -> Option { + self.tok.prefix.map(|s| s.span(self.ctx)) } /// Checks whether this identifier has a particular prefix. @@ -1048,8 +1055,8 @@ impl<'lex> Quoted<'lex> { } /// Returns this token's suffix. 
- pub fn suffix(self) -> Option { - self.tok.suffix + pub fn suffix(self) -> Option { + self.tok.suffix.map(|s| s.span(self.ctx)) } /// Checks whether this identifier has a particular prefix. @@ -1063,31 +1070,28 @@ impl<'lex> Quoted<'lex> { /// The "span type" is configurable; this type is used by multiple parts of /// the library. #[derive(Copy, Clone, Debug)] -pub enum Content { +pub enum Content { /// A literal chunk, i.e. UTF-8 text directly from the source file. - Lit(SpanId), + Lit(Span), /// An escape sequence, which may have associated data (e.g. the `NN` from a /// `\xNN`). - Esc(SpanId, Option), + Esc(Span, Option), } -impl Content { +impl Content { /// Literal contents. - pub fn lit(chunk: impl Into) -> Self { + pub fn lit(chunk: impl Into) -> Self { Self::Lit(chunk.into()) } /// Escaped contents. - pub fn esc(chunk: impl Into) -> Self { + pub fn esc(chunk: impl Into) -> Self { Self::Esc(chunk.into(), None) } /// Escaped contents. - pub fn esc_with_data( - chunk: impl Into, - data: impl Into, - ) -> Self { + pub fn esc_with_data(chunk: impl Into, data: impl Into) -> Self { Self::Esc(chunk.into(), Some(data.into())) } } diff --git a/ilex/tests/json.rs b/ilex/tests/json.rs index 652d6b0..718b243 100644 --- a/ilex/tests/json.rs +++ b/ilex/tests/json.rs @@ -11,7 +11,6 @@ use ilex::token; use ilex::token::Content as C; use ilex::token::Cursor; use ilex::Lexeme; -use ilex::Spanned; #[ilex::spec] struct JsonSpec { From e31d50bd9007599c1d17dc7253f940059c3c7965 Mon Sep 17 00:00:00 2001 From: Miguel Young de la Sota Date: Wed, 1 Jan 2025 13:10:37 -0800 Subject: [PATCH 3/9] ilex: Add helpers for dealing with auxiliary lexemes --- ilex/src/rt/dfa.rs | 4 ++-- ilex/src/spec.rs | 43 +++++++++++++++++++++++++++++-------------- 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/ilex/src/rt/dfa.rs b/ilex/src/rt/dfa.rs index 8553c18..40362ec 100644 --- a/ilex/src/rt/dfa.rs +++ b/ilex/src/rt/dfa.rs @@ -103,7 +103,7 @@ impl Dfa { let id = dfa.match_pattern(lexer.cache(), state, i); if id.as_usize() < self.non_close_rules { Lexeme2 { - lexeme: Lexeme::new(id.as_u32()), + lexeme: Lexeme::new(id.as_i32()), is_close: false, } } else { @@ -127,7 +127,7 @@ pub fn compile(rules: &[Any]) -> Dfa { let mut closers = Vec::new(); for (lexeme, rule) in rules.iter().enumerate() { - let lexeme = Lexeme::new(lexeme as u32); + let lexeme = Lexeme::new(lexeme as i32); let rule = compile_rule(rule); patterns.push(rule.pat); if let Some(close) = rule.close { diff --git a/ilex/src/spec.rs b/ilex/src/spec.rs index 78ffdc8..f3f0177 100644 --- a/ilex/src/spec.rs +++ b/ilex/src/spec.rs @@ -4,6 +4,7 @@ use std::cmp::Ordering; use std::fmt; use std::fmt::Display; use std::hash::Hash; +use std::i32; use std::marker::PhantomData; use byteyarn::yarn; @@ -23,14 +24,14 @@ use crate::rule::Rule; /// be used to distinguish what rule a [`Token`][crate::token::Token] came from. #[repr(transparent)] pub struct Lexeme { - id: u32, + id: i32, _ph: PhantomData, } impl Lexeme { /// Returns the unique lexeme for the end-of-file marker. pub fn eof() -> Self { - Self::new(!0) + Self::new(i32::MAX) } } @@ -40,6 +41,22 @@ impl Lexeme { Lexeme::new(self.id) } + /// Returns whether this is the EOF lexeme. + pub fn is_eof(self) -> bool { + self == Lexeme::eof() + } + + /// Returns whether this is an auxiliary token that users should never + /// actually see. + pub(crate) fn is_aux(self) -> bool { + self.id < 0 + } + + /// Returns whether this lexeme can have comments attached to it. 
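+  ///
+  /// Auxiliary lexemes never carry comments, and comments cannot be
+  /// attached to other comments.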
+ pub(crate) fn can_have_comments(self, spec: &Spec) -> bool { + !self.is_aux() && !matches!(spec.rule(self.any()), rule::Any::Comment(_)) + } + /// Converts this lexeme into an index. pub(crate) fn index(self) -> usize { self.id as usize @@ -51,14 +68,9 @@ impl Lexeme { } /// Creates a new lexeme. - pub(crate) fn new(id: u32) -> Self { + pub(crate) fn new(id: i32) -> Self { Self { id, _ph: PhantomData } } - - /// Creates a new lexeme. - pub fn z() -> Self { - Self { id: 0, _ph: PhantomData } - } } impl fmt::Debug for Lexeme { @@ -176,13 +188,16 @@ impl SpecBuilder { name: impl Into, rule: R, ) -> Lexeme { - if self.rules.len() == (u32::MAX as usize) - 2 { - panic!("ilex: ran out of lexeme ids") + if self.rules.len() == (i32::MAX as usize) { + panic!( + "ilex: grammars with more than {} lexemes are unsupported", + i32::MAX + ) } self.names.push(name.into()); self.rules.push(rule.into()); - Lexeme::new(self.rules.len() as u32 - 1) + Lexeme::new(self.rules.len() as i32 - 1) } #[doc(hidden)] @@ -206,8 +221,8 @@ impl Clone for Lexeme { impl Copy for Lexeme {} -impl PartialEq> for Lexeme { - fn eq(&self, other: &Lexeme) -> bool { +impl PartialEq> for Lexeme { + fn eq(&self, other: &Lexeme) -> bool { self.id == other.id } } @@ -225,7 +240,7 @@ impl Ord for Lexeme { impl Hash for Lexeme { fn hash(&self, state: &mut H) { - u32::hash(&self.id, state) + i32::hash(&self.id, state) } } From ee42575a3425dbb51d53d131ad65c48c97856b97 Mon Sep 17 00:00:00 2001 From: Miguel Young de la Sota Date: Wed, 1 Jan 2025 14:41:27 -0800 Subject: [PATCH 4/9] ilex: Implement a smaller token layout --- ilex/src/file/context.rs | 28 --- ilex/src/file/mod.rs | 70 +----- ilex/src/report/builtin.rs | 1 - ilex/src/report/diagnostic.rs | 2 +- ilex/src/rt/dfa.rs | 2 +- ilex/src/rt/emit2.rs | 259 ++++++++++---------- ilex/src/rt/lexer.rs | 262 +++++++++++--------- ilex/src/rt/mod.rs | 123 +++++----- ilex/src/spec.rs | 6 +- ilex/src/testing/recognize.rs | 13 +- ilex/src/token/mod.rs | 432 ++++++++++++++++++--------------- ilex/src/token/stream.rs | 443 ++++++++++++++++++++++++---------- 12 files changed, 906 insertions(+), 735 deletions(-) diff --git a/ilex/src/file/context.rs b/ilex/src/file/context.rs index b17e416..69d2baa 100644 --- a/ilex/src/file/context.rs +++ b/ilex/src/file/context.rs @@ -1,8 +1,6 @@ -use std::collections::HashMap; use std::fs; use std::sync::Arc; use std::sync::RwLock; -use std::sync::RwLockReadGuard; use camino::Utf8Path; use camino::Utf8PathBuf; @@ -34,7 +32,6 @@ pub struct State { files: Vec<(Utf8PathBuf, String)>, ranges: Vec, - comments: HashMap<(u32, u32), Vec>, } unsafe impl Send for Context {} @@ -152,31 +149,6 @@ impl Context { state.ranges[span.0 as usize] } - pub(crate) fn lookup_comments( - &self, - file: File, - offset: usize, - ) -> (RwLockReadGuard, *const [SpanId]) { - let state = self.state.read().unwrap(); - let ptr = state - .comments - .get(&(file.idx as u32, offset as u32)) - .map(|x| x.as_slice()) - .unwrap_or_default() as *const [SpanId]; - (state, ptr) - } - - pub(crate) fn add_comment(&self, file: File, offset: usize, comment: SpanId) { - self - .state - .write() - .unwrap() - .comments - .entry((file.idx as u32, offset as u32)) - .or_default() - .push(comment) - } - /// Creates a new synthetic span with the given contents. 
pub(crate) fn new_span(&self, range: Span) -> SpanId { let mut state = self.state.write().unwrap(); diff --git a/ilex/src/file/mod.rs b/ilex/src/file/mod.rs index 06d03a0..736be62 100644 --- a/ilex/src/file/mod.rs +++ b/ilex/src/file/mod.rs @@ -7,8 +7,6 @@ use std::ops::Bound; use std::ops::Index; use std::ops::RangeBounds; use std::ptr; -use std::slice; -use std::sync::RwLockReadGuard; use camino::Utf8Path; @@ -199,19 +197,6 @@ impl Span { (self.end - self.start) as usize } - /// Gets the comment associated with this span, if any. - /// - /// # Panics - /// - /// May panic if this span is not owned by `ctx` (or it may produce an - /// unexpected result). - pub fn comments(self, ctx: &Context) -> Comments { - Comments { - slice: ctx.lookup_comments(self.file(ctx), self.start()), - ctx, - } - } - /// Returns a subspan of this range. /// /// # Panics @@ -250,8 +235,7 @@ impl Span { /// Splits this range in two at `at`. /// /// # Panics - /// - /// Panics if `at` is larger than the length of this range. + /// /// Panics if `at` is larger than the length of this range. pub fn split_at(self, at: usize) -> (Span, Span) { (self.subspan(..at), self.subspan(at..)) } @@ -314,12 +298,6 @@ impl Span { } Some(self.intern(ctx)) } - - /// Sets the comment associated with a given span. The comment must itself - /// be specified as a span. - pub(crate) fn append_comment_span(self, ctx: &Context, comment: SpanId) { - ctx.add_comment(self.file(ctx), self.start(), comment) - } } /// A syntax element which contains a span. @@ -359,11 +337,6 @@ pub trait Spanned { fn text<'ctx>(&self, ctx: &'ctx Context) -> &'ctx str { self.span(ctx).text(ctx) } - - /// Forwards to [`SpanId::comments()`]. - fn comments<'ctx>(&self, ctx: &'ctx Context) -> Comments<'ctx> { - self.span(ctx).comments(ctx) - } } impl Spanned for SpanId { @@ -420,47 +393,6 @@ impl fmt::Debug for Span { } } -/// An iterator over the comment spans attached to a [`SpanId`]. -pub struct Comments<'ctx> { - slice: (RwLockReadGuard<'ctx, context::State>, *const [SpanId]), - ctx: &'ctx Context, -} - -impl<'ctx> Comments<'ctx> { - /// Adapts this iterator to return just the text contents of each [`SpanId`]. - pub fn as_strings(&self) -> impl Iterator { - unsafe { &*self.slice.1 } - .iter() - .map(|span| span.text(self.ctx)) - } -} - -impl<'a, 'ctx> IntoIterator for &'a Comments<'ctx> { - type Item = Span; - type IntoIter = CommentsIter<'a, 'ctx>; - - fn into_iter(self) -> Self::IntoIter { - CommentsIter { - iter: unsafe { &*self.slice.1 }.iter(), - ctx: self.ctx, - } - } -} - -// Iterator over a [`Comments`]. -pub struct CommentsIter<'a, 'ctx> { - iter: slice::Iter<'a, SpanId>, - ctx: &'ctx Context, -} - -impl<'a, 'ctx> Iterator for CommentsIter<'a, 'ctx> { - type Item = Span; - - fn next(&mut self) -> Option { - self.iter.next().map(|s| s.span(self.ctx)) - } -} - #[track_caller] fn cast + fmt::Debug>(value: T) -> u32 { value diff --git a/ilex/src/report/builtin.rs b/ilex/src/report/builtin.rs index 5008b72..4ff5102 100644 --- a/ilex/src/report/builtin.rs +++ b/ilex/src/report/builtin.rs @@ -68,7 +68,6 @@ impl Builtins<'_> { #[track_caller] pub(crate) fn extra_chars<'a>( &self, - unexpected_in: impl Into>, at: impl Spanned, ) -> Diagnostic { diff --git a/ilex/src/report/diagnostic.rs b/ilex/src/report/diagnostic.rs index 33b1010..9ec1153 100644 --- a/ilex/src/report/diagnostic.rs +++ b/ilex/src/report/diagnostic.rs @@ -13,7 +13,7 @@ use crate::report::Report; /// almost always temporaries, e.g. 
/// /// ``` -/// # fn x(report: &ilex::Report, span: ilex::SpanId) { +/// # fn x(report: &ilex::Report, span: ilex::Span) { /// report.error("my error message") /// .saying(span, "this is bad code"); /// # } diff --git a/ilex/src/rt/dfa.rs b/ilex/src/rt/dfa.rs index 40362ec..759ad1f 100644 --- a/ilex/src/rt/dfa.rs +++ b/ilex/src/rt/dfa.rs @@ -69,7 +69,7 @@ impl Dfa { /// length, plus potential lexical interpretations of that range. pub fn search(&self, lexer: &mut Lexer) -> Option { let dfa = &self.engine; - let haystack = lexer.rest(); + let haystack = lexer.text(lexer.cursor()..); let mut state = dfa .start_state(lexer.cache(), &start::Config::new().anchored(Anchored::Yes)) diff --git a/ilex/src/rt/emit2.rs b/ilex/src/rt/emit2.rs index 02b2b6a..3cc19aa 100644 --- a/ilex/src/rt/emit2.rs +++ b/ilex/src/rt/emit2.rs @@ -22,9 +22,6 @@ use crate::rule::Comment; use crate::rule::Quoted; use crate::spec::Lexeme; use crate::spec::Spec; -use crate::token; -use crate::token::Content; -use crate::token::Cursor; use super::dfa::Lexeme2; use super::unicode::is_xid; @@ -39,11 +36,11 @@ pub fn emit(lexer: &mut Lexer) { }; let start = lexer.cursor(); - lexer.advance(match_.len); - let range = lexer.span(start..lexer.cursor()); + let end = start + match_.len; + let range = lexer.span(start..end); let span = range.intern(ctx); let text = range.text(ctx); - lexer.advance(match_.extra); + let end = end + match_.extra; // Now we have to decide which of `candidates` is the best one, i.e., // the one with no errors. The following things are explicitly *not* @@ -198,13 +195,10 @@ pub fn emit(lexer: &mut Lexer) { } let best = best.unwrap_or(match_.candidates[0]); - let [sign, pre, range, suf] = + let [sign, prefix, range, suffix] = find_affixes_partial(range, lexer.spec(), best, ctx); let text = range.text(ctx); - let prefix = pre.intern_nonempty(ctx); - let suffix = suf.intern_nonempty(ctx); - let mirrored = match lexer.spec().rule(best.lexeme) { Any::Bracket(bracket) | Any::Comment(Comment { bracket, .. }) @@ -249,7 +243,7 @@ pub fn emit(lexer: &mut Lexer) { _ => None, }; - let mut generated_token = true; + let mut emitted = true; if best.is_close { let Some(opener) = &mirrored else { bug!("found is_close Lexeme2 corresponding to rule without brackets") @@ -262,19 +256,14 @@ pub fn emit(lexer: &mut Lexer) { }; lexer.builtins().unopened(opener, found, span); - generated_token = false; + lexer.add_token(rt::UNEXPECTED, end - start, None); + emitted = false; } else { // Now we have repeat the process from the 'verify, but now we know what kind // of token we're going to create. match lexer.spec().rule(best.lexeme) { - Any::Keyword(..) => lexer.add_token(rt::Token { - kind: rt::Kind::Keyword, - span, - lexeme: best.lexeme, - prefix, - suffix, - }), + Any::Keyword(..) => lexer.add_token(best.lexeme, range.len(), None), Any::Bracket(..) => { // Construct the closer. @@ -282,35 +271,35 @@ pub fn emit(lexer: &mut Lexer) { best.lexeme.cast(), mirrored.clone().unwrap().immortalize(), ); - lexer.add_token(rt::Token { - kind: rt::Kind::Open { offset_to_close: !0 }, - span, - lexeme: best.lexeme, - prefix, - suffix, - }); + lexer.add_token( + best.lexeme, + range.len(), + Some(rt::Kind::Offset { cursor: 0, meta: 0 }), + ); } + #[allow(clippy::almost_swapped)] Any::Comment(rule) => { // Comments aren't real tokens. - generated_token = false; + emitted = false; + let mut cursor = end; // The span we created only contains the open bracket for the comment. // We still need to lex the comment to the end. 
let mut depth = 1; let close = mirrored.clone().unwrap().immortalize(); - while let Some(c) = lexer.rest().chars().next() { - if rule.can_nest && lexer.rest().starts_with(text) { + while let Some(c) = lexer.text(cursor..).chars().next() { + if rule.can_nest && lexer.text(cursor..).starts_with(text) { depth += 1; - lexer.advance(text.len()); - } else if lexer.rest().starts_with(close.as_str()) { + cursor += text.len(); + } else if lexer.text(cursor..).starts_with(close.as_str()) { depth -= 1; - lexer.advance(close.len()); + cursor += close.len(); if depth == 0 { break; } } else { - lexer.advance(c.len_utf8()); + cursor += c.len_utf8(); } } @@ -321,8 +310,7 @@ pub fn emit(lexer: &mut Lexer) { .unclosed(span, &close, Lexeme::eof(), lexer.eof()); } - let span = lexer.intern(start..lexer.cursor()); - lexer.add_comment(span); + lexer.add_token(best.lexeme, cursor - lexer.cursor(), None); } Any::Ident(rule) => { @@ -342,16 +330,20 @@ pub fn emit(lexer: &mut Lexer) { } } - lexer.add_token(rt::Token { - kind: rt::Kind::Ident(range.intern(ctx)), - span, - lexeme: best.lexeme, - prefix, - suffix, - }); + lexer.add_token(rt::PREFIX, prefix.len(), None); + lexer.add_token(best.lexeme, range.len(), None); + lexer.add_token(rt::SUFFIX, suffix.len(), None); } Any::Digital(rule) => { + lexer.add_token(rt::PREFIX, prefix.len(), None); + lexer.add_token( + best.lexeme, + sign.len() + range.len(), + Some(rt::Kind::Digital(rt::Digital::default())), + ); + lexer.add_token(rt::SUFFIX, suffix.len(), None); + let sign_text = sign.text(ctx); let sign = sign.intern_nonempty(ctx).map(|span| { for (text, value) in &rule.mant.signs { @@ -363,7 +355,7 @@ pub fn emit(lexer: &mut Lexer) { }); let mut chunks = vec![DigitBlocks { - prefix, + prefix: prefix.intern_nonempty(ctx), sign, blocks: Vec::new(), which_exp: !0, @@ -489,23 +481,27 @@ pub fn emit(lexer: &mut Lexer) { .unwrap() .blocks .push(range.subspan(block_start..).intern(ctx)); - let mant = chunks.remove(0); - let tok = rt::Token { - kind: rt::Kind::Digital { digits: mant, exponents: chunks }, - span, - lexeme: best.lexeme, - prefix, - suffix, + + let Some(rt::Kind::Digital(meta)) = lexer + .stream_mut() + .last_meta_mut() + .and_then(|m| m.kind.as_mut()) + else { + bug!("missing rt::Digital in digital token"); + }; + meta.digits = mant; + meta.exponents = chunks; + + let Some(rt::Kind::Digital(meta)) = + lexer.stream().last_meta().and_then(|m| m.kind.as_ref()) + else { + bug!("missing rt::Digital in digital token"); }; - let token = Cursor::fake_token(lexer.file(), lexer.spec(), &tok); // This happens later so we have access to the full spans of // the digit blocks. 
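        // The validation below checks separator placement and digit
        // validity for every block, which requires each block's span to be
        // final.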
- let rt::Kind::Digital { digits, exponents } = &tok.kind else { - unreachable!() - }; - for chunk in iter::once(digits).chain(exponents) { + for chunk in iter::once(&meta.digits).chain(&meta.exponents) { let digits = rule .exps .get(chunk.which_exp) @@ -536,7 +532,7 @@ pub fn emit(lexer: &mut Lexer) { let range = block.span(ctx); let mut text = block.text(ctx); - if range.is_empty() { + if range.is_empty() && chunk.prefix.is_some() { let prefix = chunk.prefix.unwrap(); lexer .builtins() @@ -567,7 +563,7 @@ pub fn emit(lexer: &mut Lexer) { if !c.is_digit(digits.radix as u32) { lexer.builtins().unexpected( Expected::Literal(c.into()), - token, + lexer.stream().last_token(), lexer.span(cursor..cursor + c.len_utf8()), ) .remark( @@ -581,70 +577,69 @@ pub fn emit(lexer: &mut Lexer) { } } } - - lexer.add_token(tok); } Any::Quoted(rule) => { let close = mirrored.clone().unwrap().immortalize(); - let mut chunk_start = lexer.cursor(); - let mut content = Vec::new(); + let mut chunk_start = end; + let mut cursor = end; + let mut marks = vec![chunk_start as u32]; let uq_end = loop { - if lexer.rest().starts_with(close.as_str()) { - let end = lexer.cursor(); - lexer.advance(close.len()); + if lexer.text(cursor..).starts_with(close.as_str()) { + let end = cursor; + cursor += close.len(); if end > chunk_start { - content.push(Content::Lit(lexer.intern(chunk_start..end))); + marks.push(end as u32); } break Some(end); } - let (esc, rule) = match rule.escapes.longest_prefix(lexer.rest()) { + let rest = lexer.text(cursor..); + let (esc, rule) = match rule.escapes.longest_prefix(rest) { Some(e) => e, - None => match lexer.rest().chars().next() { + None => match rest.chars().next() { Some(c) => { - lexer.advance(c.len_utf8()); + cursor += c.len_utf8(); continue; } None => break None, }, }; - if lexer.cursor() > chunk_start { - content - .push(Content::Lit(lexer.intern(chunk_start..lexer.cursor()))); - } + // Push unconditionally: this ensures that chunks of text are always + // between escapes, even if the literal chunks are empty. + marks.push(cursor as u32); - let esc_start = lexer.cursor(); - lexer.advance(esc.len()); - let esc = lexer.intern(esc_start..lexer.cursor()); - let value = match rule { + let esc_start = cursor; + cursor += esc.len(); + let esc_end = cursor; + let mark = match rule { rule::Escape::Invalid => { lexer.builtins().invalid_escape( - lexer.span(esc_start..lexer.cursor()), + lexer.span(esc_start..cursor), "invalid escape sequence", ); - None + [cursor; 3] } - rule::Escape::Basic => None, + rule::Escape::Basic => [cursor; 3], rule::Escape::Fixed(chars) => { - let arg_start = lexer.cursor(); + let arg_start = cursor; let mut count = 0; for _ in 0..*chars { // TRICKY: We have just skipped over \x. If we were to take *any* // characters, we would lex `"\x" ` as being `\x` with arg `" `. // So, we want to check for a closer on *every* loop iteration, and // break out if we *see* it: we should not consume it. 
- if lexer.rest().starts_with(close.as_str()) { + if lexer.text(cursor..).starts_with(close.as_str()) { break; } - match lexer.rest().chars().next() { - Some(c) => lexer.advance(c.len_utf8()), + match lexer.text(cursor..).chars().next() { + Some(c) => cursor += c.len_utf8(), None => break, } count += 1; @@ -652,7 +647,7 @@ pub fn emit(lexer: &mut Lexer) { if count != *chars { lexer.builtins().invalid_escape( - lexer.span(esc_start..lexer.cursor()), + lexer.span(esc_start..cursor), f!( "expected exactly {chars} character{} here", plural(*chars) @@ -660,80 +655,78 @@ pub fn emit(lexer: &mut Lexer) { ); } - Some(lexer.intern(arg_start..lexer.cursor())) + [arg_start, cursor, cursor] } rule::Escape::Bracketed(open, close) => 'delim: { - if !lexer.rest().starts_with(open.as_str()) { + if !lexer.text(cursor..).starts_with(open.as_str()) { lexer.builtins().invalid_escape( - lexer.span(esc_start..lexer.cursor()), + lexer.span(esc_start..cursor), f!("expected a `{open}`"), ); - break 'delim None; + break 'delim [cursor; 3]; } else { - lexer.advance(open.len()); + cursor += open.len() } - let arg_start = lexer.cursor(); - let Some(len) = lexer.rest().find(close.as_str()) else { + let arg_start = cursor; + let Some(len) = lexer.text(..cursor).find(close.as_str()) else { lexer.builtins().invalid_escape( - lexer.span(esc_start..lexer.cursor()), + lexer.span(esc_start..cursor), f!("expected a `{close}`"), ); - break 'delim None; + break 'delim [arg_start, cursor, cursor]; }; - lexer.advance(len + close.len()); - Some(lexer.intern(arg_start..lexer.cursor() - close.len())) + cursor += len + close.len(); + [arg_start, arg_start + len, cursor] } }; - content.push(Content::Esc(esc, value)); - chunk_start = lexer.cursor(); + marks.push(esc_end as u32); + marks.extend(mark.iter().map(|&x| x as u32)); + chunk_start = cursor; }; - let uq_end = uq_end.unwrap_or_else(|| { + if uq_end.is_none() { lexer .builtins() .unclosed(span, &close, Lexeme::eof(), lexer.eof()); - lexer.cursor() - }); + } // We have to parse the suffix ourselves explicitly! let suf = rule .affixes .suffixes() .iter() - .filter(|y| lexer.rest().starts_with(y.as_str())) + .filter(|y| lexer.text(cursor..).starts_with(y.as_str())) .map(|y| y.len()) .max() .unwrap_or_else(|| { + let found = match lexer.text(cursor..).chars().next() { + Some(n) => Expected::Literal(n.into()), + None => Lexeme::eof().into(), + }; + lexer.builtins().expected( rule .affixes .suffixes() .iter() .map(|y| Expected::Literal(y.aliased())), - Expected::Literal("fixme".into()), - lexer.span(lexer.cursor()..lexer.cursor()), + found, + lexer.span(cursor..cursor), ); 0 }); - let suf_start = lexer.cursor(); - lexer.advance(suf); - let suffix = lexer.span(suf_start..lexer.cursor()).intern_nonempty(ctx); - - lexer.add_token(rt::Token { - kind: rt::Kind::Quoted { - content, - open: range.intern(ctx), - close: lexer.intern(uq_end..suf_start), - }, - span: lexer.intern(span.span(ctx).start()..lexer.cursor()), - lexeme: best.lexeme, - prefix, - suffix, - }); + + lexer.add_token(rt::PREFIX, prefix.len(), None); + lexer.add_token( + best.lexeme, + cursor - lexer.cursor(), + Some(rt::Kind::Quoted(rt::Quoted { marks })), + ); + lexer.add_token(rt::SUFFIX, suf, None); } } } @@ -745,12 +738,8 @@ pub fn emit(lexer: &mut Lexer) { // and diagnose that. 
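  // Any extra characters consumed past the token proper are diagnosed here
  // instead of being re-lexed as a fresh token.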
if match_.extra > 0 { - let expected = if generated_token { - Expected::Token(token::Cursor::fake_token( - lexer.file(), - lexer.spec(), - lexer.last_token(), - )) + let expected = if emitted { + Expected::Token(lexer.stream().last_token()) } else if let Some(mirrored) = &mirrored { if best.is_close { Expected::Literal(yarn!("{mirrored} ... {text}")) @@ -767,22 +756,16 @@ pub fn emit(lexer: &mut Lexer) { .extra_chars(expected, lexer.span(start..start + match_.extra)); } - let prev = lexer.rest().chars().next_back(); + let rest = lexer.text(lexer.cursor()..); + let prev = rest.chars().next_back(); if prev.is_some_and(is_xid) { - let xids = lexer - .rest() - .find(|c| !is_xid(c)) - .unwrap_or(lexer.rest().len()); + let xids = rest.find(|c| !is_xid(c)).unwrap_or(rest.len()); if xids > 0 { let start = lexer.cursor(); - lexer.advance(xids); - - let expected = if generated_token { - Expected::Token(token::Cursor::fake_token( - lexer.file(), - lexer.spec(), - lexer.last_token(), - )) + lexer.add_token(rt::UNEXPECTED, xids, None); + + let expected = if emitted { + Expected::Token(lexer.stream().last_token()) } else if let Some(mirrored) = &mirrored { if best.is_close { Expected::Literal(yarn!("{mirrored} ... {text}")) @@ -795,7 +778,7 @@ pub fn emit(lexer: &mut Lexer) { lexer .builtins() - .extra_chars(expected, lexer.span(start..lexer.cursor())); + .extra_chars(expected, lexer.span(start..start + xids)); } } } diff --git a/ilex/src/rt/lexer.rs b/ilex/src/rt/lexer.rs index 2e1025a..aa97450 100644 --- a/ilex/src/rt/lexer.rs +++ b/ilex/src/rt/lexer.rs @@ -1,4 +1,5 @@ use std::mem; +use std::num::NonZeroU32; use std::ops::Index; use std::ops::RangeBounds; @@ -10,15 +11,16 @@ use crate::file::Context; use crate::file::File; use crate::file::Span; use crate::file::SpanId; -use crate::file::Spanned; use crate::report::Builtins; use crate::report::Report; use crate::rt; use crate::rule; +use crate::rule::Any; use crate::rule::Bracket; use crate::spec::Lexeme; use crate::spec::Spec; use crate::token; +use crate::token::Stream; use super::unicode::is_xid; @@ -26,13 +28,11 @@ use super::unicode::is_xid; /// operation. pub struct Lexer<'a, 'ctx> { report: &'a Report, - spec: &'ctx Spec, - file: File<'ctx>, + stream: Stream<'ctx>, cursor: usize, - tokens: Vec, closers: Vec, - comments: Vec, + comments: Vec, eof: SpanId, cache: Cache, @@ -42,6 +42,7 @@ pub struct Lexer<'a, 'ctx> { pub struct Closer { lexeme: Lexeme, open_idx: usize, + meta_idx: usize, original_open_idx: usize, // For diagnostics. close: Yarn, } @@ -50,27 +51,22 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> { /// Creates a new lexer. pub fn new(file: File<'ctx>, report: &'a Report, spec: &'ctx Spec) -> Self { Lexer { - eof: file.span(file.len()..file.len()).intern(file.context()), - cache: Cache::new(&spec.dfa().engine), - - file, report, - spec, + stream: Stream { + file, + spec, + toks: Vec::new(), + meta_idx: Vec::new(), + meta: Vec::new(), + }, cursor: 0, - tokens: Vec::new(), closers: Vec::new(), comments: Vec::new(), - } - } - - pub fn advance(&mut self, by: usize) { - assert!( - self.cursor.saturating_add(by) <= self.text(..).len(), - "ilex: advanced cursor beyond the end of text; this is a bug" - ); - self.cursor += by; + eof: file.span(file.len()..file.len()).intern(file.context()), + cache: Cache::new(&spec.dfa().engine), + } } /// Returns the report for diagnostics. @@ -78,9 +74,18 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> { self.report } + /// Returns the stream this lexer is building. 
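+  ///
+  /// The stream is still under construction here; it is finalized and
+  /// returned by [`Lexer::finish()`].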
+ pub fn stream(&self) -> &Stream<'ctx> { + &self.stream + } + + pub(crate) fn stream_mut(&mut self) -> &mut Stream<'ctx> { + &mut self.stream + } + /// Returns the spec we're lexing against. pub fn spec(&self) -> &'ctx Spec { - self.spec + self.stream.spec() } /// Returns the diagnostics builtins. @@ -90,7 +95,7 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> { /// Returns the spec we're lexing against. pub fn file(&self) -> File<'ctx> { - self.file + self.stream.file() } /// Returns a slice of the current file being lexed. @@ -98,7 +103,7 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> { where str: Index, { - self.file.text(range) + self.file().text(range) } /// Returns the current cursor position. @@ -106,11 +111,6 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> { self.cursor } - /// Returns everything after the current cursor position. - pub fn rest(&self) -> &'ctx str { - self.text(self.cursor..) - } - /// Returns the EOF span. pub fn eof(&self) -> SpanId { self.eof @@ -118,12 +118,22 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> { /// Creates a new range in the current file. pub fn span(&self, range: impl RangeBounds) -> Span { - self.file.span(range) + self.file().span(range) } /// Creates a new range in the current file and bakes it. pub fn intern(&self, range: impl RangeBounds) -> SpanId { - self.file.span(range).intern(self.ctx()) + self.file().span(range).intern(self.ctx()) + } + + // Returns the span of the token at the given index. + pub fn lookup_span(&self, idx: usize) -> Span { + let end = self.stream.toks[idx].end as usize; + let start = self.stream.toks[..idx] + .last() + .map(|p| p.end as usize) + .unwrap_or(0); + self.file().span(start..end) } /// Creates a new span in the current file with the given range. @@ -135,26 +145,22 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> { &mut self.cache } - pub fn last_token(&self) -> &rt::Token { - self.tokens.last().unwrap() - } - /// Pushes a closer. pub fn push_closer(&mut self, lexeme: Lexeme, close: Yarn) { self.closers.push(Closer { lexeme, close, - open_idx: self.tokens.len(), - original_open_idx: self.tokens.len(), + open_idx: self.stream.toks.len(), + meta_idx: self.stream.meta_idx.len(), + original_open_idx: self.stream.toks.len(), }); } /// Pops a closer, if it is time for it. pub fn pop_closer(&mut self) { - let idx = self - .closers - .iter() - .rposition(|close| self.rest().starts_with(close.close.as_str())); + let idx = self.closers.iter().rposition(|close| { + self.text(self.cursor()..).starts_with(close.close.as_str()) + }); let Some(idx) = idx else { return }; let len = self.closers.len(); @@ -167,43 +173,43 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> { } let start = self.cursor(); - self.advance(close.close.len()); + let mut end = start + close.close.len(); - let close_idx = self.tokens.len(); - let offset_to_open = (close_idx - close.open_idx) as u32; + let close_idx = self.stream.toks.len(); + let meta_idx = self.stream.meta.len(); + let offset = (close_idx - close.open_idx) as i32; + let meta_offset = (meta_idx - close.meta_idx) as i32; - match &mut self.tokens[close.open_idx].kind { - rt::Kind::Open { offset_to_close, .. 
} => { - *offset_to_close = offset_to_open - } - _ => { - panic!("ilex: lexer.closers.last().open_idx did not point to an rt::Kind::Open; this is a bug") - } - } - let open_sp = self.tokens[close.open_idx].span; + let Some(rt::Kind::Offset { cursor, meta }) = + &mut self.stream.meta[close.meta_idx].kind + else { + bug!("ilex: lexer.closers.last().open_idx did not point to an rt::Kind::Open") + }; + *cursor += offset; + *meta += meta_offset; - let prev = self.rest().chars().next_back(); + let open_sp = self.lookup_span(close.open_idx); + + let rest = self.text(end..); + let prev = rest.chars().next_back(); if prev.is_some_and(is_xid) { - let xids = self - .rest() - .find(|c| !is_xid(c)) - .unwrap_or(self.rest().len()); + let xids = rest.find(|c| !is_xid(c)).unwrap_or(rest.len()); if xids > 0 { - let start = self.cursor(); - self.advance(xids); + let start = end; + end += xids; - let span = self.span(start..self.cursor()); + let span = self.span(start..end); self.builtins().extra_chars( self.spec().rule_name_or( close.lexeme.any(), - f!("{} ... {}", open_sp.text(self.file.context()), close.close), + f!("{} ... {}", open_sp.text(self.file().context()), close.close), ), span, ); } } - let span = self.span(start..self.cursor).intern(self.ctx()); + let span = self.span(start..end); if idx != self.closers.len() { // This is a so-called "mixed delimiter", and an error we need to // diagnose. @@ -215,76 +221,110 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> { ); } - let full_span = - self.intern(open_sp.span(self.ctx()).start()..self.cursor()); - self.add_token(rt::Token { - kind: rt::Kind::Close { full_span, offset_to_open }, - span, - lexeme: close.lexeme.any(), - prefix: None, - suffix: None, - }); + self.add_token( + close.lexeme.any(), + end - start, + Some(rt::Kind::Offset { cursor: -offset, meta: -meta_offset }), + ); } - /// Adds a new token, draining all of the saved-up comments. - pub fn add_token(&mut self, tok: rt::Token) { - let span = tok.span.span(self.ctx()); - for comment in self.comments.drain(..) { - span.append_comment_span(self.file.context(), comment); + /// Adds a new token. + pub fn add_token( + &mut self, + lexeme: Lexeme, + len: usize, + kind: Option, + ) { + if lexeme.is_aux() { + if len == 0 { + return; + } + + if let Some(prev) = self.stream.toks.last_mut() { + if prev.lexeme == lexeme { + prev.end += len as u32; + self.cursor += len; + return; + } + } } - self.tokens.push(tok); - } + let new_len = self.cursor.saturating_add(len); + let total_len = self.text(..).len(); - /// Adds a new token, draining all of the saved-up comments. - pub fn add_comment(&mut self, span: SpanId) { - self.comments.push(span); - } + debug_assert!( + new_len <= total_len, + "ilex: advanced cursor beyond the end of text ({new_len} > {total_len}); this is a bug" + ); - /// Adds new unexpected tokens, starting from `start`. This may generate - /// multiple tokens, since it does not include whitespace in them. - pub fn add_unexpected(&mut self, mut start: usize, end: usize) { - let mut idx = start; - // Can't use a for loop, since that takes ownership of the iterator - // and that makes the self. calls below a problem. - while let Some(c) = self.text(idx..end).chars().next() { - if c.is_whitespace() { - if idx > start { - let span = self.span(start..idx); - self.builtins().unexpected_token(span); + if cfg!(debug_assertions) && !lexeme.is_eof() && !lexeme.is_aux() { + match self.spec().rule(lexeme) { + Any::Bracket(_) if !matches!(kind, Some(rt::Kind::Offset { .. 
})) => { + bug!("missing rt::Metadata::Offset on bracket rule") + } + Any::Digital(_) if !matches!(kind, Some(rt::Kind::Digital(_))) => { + bug!("missing rt::Metadata::Digital on digital rule") + } + Any::Quoted(_) if !matches!(kind, Some(rt::Kind::Quoted(_))) => { + bug!("missing rt::Metadata::Quoted on quoted rule") } - start = idx + c.len_utf8(); + _ => {} } + } + + let start = self.cursor(); + self + .stream + .toks + .push(rt::Token { lexeme, end: (start + len) as u32 }); + + let mut meta = rt::Metadata { kind, comments: Vec::new() }; + + if lexeme.can_have_comments(self.spec()) { + meta.comments = mem::take(&mut self.comments); + } - idx += c.len_utf8(); + if meta.kind.is_some() || !meta.comments.is_empty() { + self.stream.meta_idx.push(token::Id( + NonZeroU32::new(self.stream.toks.len() as u32).unwrap(), + )); + self.stream.meta.push(meta); } - if idx > start { - let span = self.span(start..idx); - self.builtins().unexpected_token(span); + if !lexeme.is_eof() + && !lexeme.is_aux() + && matches!(self.spec().rule(lexeme), rule::Any::Comment(_)) + { + self.comments.push(token::Id( + NonZeroU32::new(self.stream.toks.len() as u32).unwrap(), + )); } + + self.cursor += len; + } + + pub fn skip_whitespace(&mut self) -> bool { + let len = self + .text(self.cursor()..) + .chars() + .take_while(|c| c.is_whitespace()) + .map(char::len_utf8) + .sum(); + + self.add_token(rt::WHITESPACE, len, None); + len > 0 } pub fn finish(mut self) -> token::Stream<'ctx> { - self.add_token(rt::Token { - kind: rt::Kind::Eof, - span: self.eof, - lexeme: Lexeme::eof().cast(), - prefix: None, - suffix: None, - }); + self.add_token(Lexeme::eof().any(), 0, None); for close in mem::take(&mut self.closers) { - let open = self.tokens[close.original_open_idx].span; + let open = self.lookup_span(close.original_open_idx); self .builtins() .unclosed(open, &close.close, Lexeme::eof(), self.eof()); } - token::Stream { - file: self.file, - spec: self.spec, - toks: self.tokens, - } + self.stream } } diff --git a/ilex/src/rt/mod.rs b/ilex/src/rt/mod.rs index 96a56ec..420f3d2 100644 --- a/ilex/src/rt/mod.rs +++ b/ilex/src/rt/mod.rs @@ -1,5 +1,7 @@ //! The lexer runtime. +use std::cell::Cell; + use crate::file::File; use crate::file::SpanId; use crate::report::Fatal; @@ -9,7 +11,6 @@ use crate::rule::Sign; use crate::spec::Lexeme; use crate::spec::Spec; use crate::token; -use crate::token::Content; mod emit2; pub mod lexer; @@ -26,41 +27,39 @@ pub fn lex<'ctx>( ) -> Result, Fatal> { let mut lexer = lexer::Lexer::new(file, report, spec); - let mut unexpected_start = None; - while let Some(next) = lexer.rest().chars().next() { - if !next.is_whitespace() { - let start = lexer.cursor(); - - lexer.pop_closer(); - if lexer.cursor() != start { - if let Some(ustart) = unexpected_start.take() { - lexer.add_unexpected(ustart, start); - } - - continue; - } - - emit2::emit(&mut lexer); - if lexer.cursor() != start { - if let Some(ustart) = unexpected_start.take() { - lexer.add_unexpected(ustart, start); - } - - continue; - } - - // We failed to make progress. Skip this character and start an - // "unexpected" token. 
- if unexpected_start.is_none() { - unexpected_start = Some(lexer.cursor()); - } + let unexpected = Cell::new(None); + let diagnose_unexpected = |end: usize| { + let Some(start) = unexpected.take() else { return }; + report + .builtins(spec) + .unexpected_token(file.span(start..end)); + }; + + loop { + let start = lexer.cursor(); + if lexer.skip_whitespace() { + diagnose_unexpected(start); } - lexer.advance(next.len_utf8()); - } + let start = lexer.cursor(); + let Some(next) = lexer.text(lexer.cursor()..).chars().next() else { break }; + + lexer.pop_closer(); + if lexer.cursor() > start { + diagnose_unexpected(start); + continue; + } - if let Some(start) = unexpected_start { - lexer.add_unexpected(start, lexer.cursor()); + emit2::emit(&mut lexer); + if lexer.cursor() > start { + diagnose_unexpected(start); + continue; + } + + lexer.add_token(UNEXPECTED, next.len_utf8(), None); + if unexpected.get().is_none() { + unexpected.set(Some(start)) + } } report.fatal_or(lexer.finish()) @@ -69,41 +68,51 @@ pub fn lex<'ctx>( /// The internal representation of a token inside of a token stream. #[derive(Clone)] pub struct Token { - pub kind: Kind, - pub span: SpanId, pub lexeme: Lexeme, - pub prefix: Option, - pub suffix: Option, + pub end: u32, +} +#[derive(Clone, Default)] +pub struct Metadata { + pub kind: Option, + pub comments: Vec, } -/// A pared-down token kind. #[derive(Clone)] pub enum Kind { - Eof, - Keyword, - Ident(SpanId), - Quoted { - content: Vec>, - open: SpanId, - close: SpanId, - }, - Digital { - digits: DigitBlocks, - exponents: Vec, - }, - Open { - offset_to_close: u32, - }, - Close { - offset_to_open: u32, - full_span: SpanId, - }, + Quoted(Quoted), + Digital(Digital), + Offset { cursor: i32, meta: i32 }, } #[derive(Clone)] +pub struct Quoted { + // Offsets for the components of the string. First mark is the end of the + // open quote; following are alternating marks for textual and escape content. + // Adjacent escapes are separated by empty text content. + // + // Each text component consists of one mark, its end. Each escape consists of + // four marks, which refer to the end of the escape sequence prefix, the start of extra data, its end, and the + // end of the whole escape. This means that when we encounter \xNN, the + // positions of the marks are \x||NN||. When we encounter \u{NN}, the positions + // are \u|{|NN|}|. For \n, the positions are \n||||. + pub marks: Vec, +} + +#[derive(Clone, Default)] +pub struct Digital { + pub digits: DigitBlocks, + pub exponents: Vec, +} + +#[derive(Clone, Default)] pub struct DigitBlocks { pub prefix: Option, pub sign: Option<(Sign, SpanId)>, pub blocks: Vec, pub which_exp: usize, } + +pub const WHITESPACE: Lexeme = Lexeme::new(-1); +pub const UNEXPECTED: Lexeme = Lexeme::new(-2); +pub const PREFIX: Lexeme = Lexeme::new(-3); +pub const SUFFIX: Lexeme = Lexeme::new(-4); diff --git a/ilex/src/spec.rs b/ilex/src/spec.rs index f3f0177..0a9fe3b 100644 --- a/ilex/src/spec.rs +++ b/ilex/src/spec.rs @@ -54,7 +54,9 @@ impl Lexeme { /// Returns whether this lexeme can have comments attached to it. pub(crate) fn can_have_comments(self, spec: &Spec) -> bool { - !self.is_aux() && !matches!(spec.rule(self.any()), rule::Any::Comment(_)) + !self.is_aux() + && (self.is_eof() + || !matches!(spec.rule(self.any()), rule::Any::Comment(_))) } /// Converts this lexeme into an index. @@ -68,7 +70,7 @@ impl Lexeme { } /// Creates a new lexeme. 
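   ///
   /// Negative IDs are reserved for auxiliary lexemes (whitespace, affixes,
   /// and unexpected characters); see `is_aux()`.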
- pub(crate) fn new(id: i32) -> Self { + pub(crate) const fn new(id: i32) -> Self { Self { id, _ph: PhantomData } } } diff --git a/ilex/src/testing/recognize.rs b/ilex/src/testing/recognize.rs index 2027509..94e4223 100644 --- a/ilex/src/testing/recognize.rs +++ b/ilex/src/testing/recognize.rs @@ -16,6 +16,7 @@ use crate::testing::Text; use crate::token; use crate::token::Any; use crate::token::Sign; +use crate::token::Token; pub struct Matcher { pub which: Option>, @@ -65,15 +66,9 @@ impl Matcher { ) { state.match_spans("token span", &self.span, Spanned::span(&tok, ctx)); - zip_eq( - "comments", - state, - &self.comments, - &tok.comments(ctx), - |state, t, s| { - state.match_spans("comment", t, s); - }, - ); + zip_eq("comments", state, &self.comments, tok.comments(), |state, t, s| { + state.match_spans("comment", t, s); + }); match (&self.kind, tok) { (Kind::Eof, Any::Eof(..)) | (Kind::Keyword, Any::Keyword(..)) => {} diff --git a/ilex/src/token/mod.rs b/ilex/src/token/mod.rs index 0f9dde7..91bd055 100644 --- a/ilex/src/token/mod.rs +++ b/ilex/src/token/mod.rs @@ -11,7 +11,8 @@ //! value. They all implement [`Token`]. use std::fmt; -use std::marker::PhantomData; +use std::iter; +use std::num::NonZeroU32; use std::ops::RangeBounds; use std::panic::Location; @@ -28,7 +29,6 @@ use crate::fp; use crate::report::Report; use crate::rt; use crate::rt::DigitBlocks; -use crate::rt::Kind; use crate::rule; use crate::spec::Lexeme; use crate::spec::Spec; @@ -39,9 +39,31 @@ mod stream; pub use stream::switch::switch; pub use stream::switch::Switch; +pub use stream::Comments; pub use stream::Cursor; pub use stream::Stream; +/// A token ID. +/// +/// An [`Id`] is a lightweight handle to some token, which can be converted +/// back into that token using the corresponding [`Stream`]. +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug)] +pub struct Id(pub(crate) NonZeroU32); + +impl Id { + fn idx(self) -> usize { + self.0.get() as usize - 1 + } + + fn prev(self) -> Option { + NonZeroU32::new(self.0.get() - 1).map(Self) + } + + fn next(self) -> Option { + self.0.checked_add(1).map(Self) + } +} + /// A token type. All types in [`ilex::token`][crate::token] implement this /// trait. pub trait Token<'lex>: @@ -50,14 +72,39 @@ pub trait Token<'lex>: /// The token this rule was parsed from. type Rule: rule::Rule; + /// The ID of this token. + fn id(self) -> Id; + + /// The token stream that owns this token. + fn stream(self) -> &'lex Stream<'lex>; + /// The context that owns this token. - fn context(self) -> &'lex Context; + fn context(self) -> &'lex Context { + self.stream().context() + } /// The spec that lexed this token. - fn spec(self) -> &'lex Spec; + fn spec(self) -> &'lex Spec { + self.stream().spec() + } /// Returns this token's [`Lexeme`]. - fn lexeme(self) -> Lexeme; + fn lexeme(self) -> Lexeme { + self.stream().lookup_token(self.id()).lexeme.cast() + } + + /// Returns an iterator over the attacked to this token. + fn comments(self) -> Comments<'lex> { + let stream = self.stream(); + Comments { + stream, + comments: stream + .lookup_meta(self.id()) + .map(|m| m.comments.as_slice()) + .unwrap_or(&[]) + .iter(), + } + } /// The rule inside of [`Token::spec()`] that this token refers to. 
/// @@ -97,36 +144,25 @@ pub enum Any<'lex> { impl<'lex> Token<'lex> for Any<'lex> { type Rule = rule::Any; - fn lexeme(self) -> Lexeme { - match self { - Self::Eof(tok) => tok.lexeme().any(), - Self::Bracket(tok) => tok.lexeme().any(), - Self::Keyword(tok) => tok.lexeme().any(), - Self::Ident(tok) => tok.lexeme().any(), - Self::Digital(tok) => tok.lexeme().any(), - Self::Quoted(tok) => tok.lexeme().any(), - } - } - - fn context(self) -> &'lex Context { + fn id(self) -> Id { match self { - Self::Eof(tok) => tok.context(), - Self::Bracket(tok) => tok.context(), - Self::Keyword(tok) => tok.context(), - Self::Ident(tok) => tok.context(), - Self::Digital(tok) => tok.context(), - Self::Quoted(tok) => tok.context(), + Self::Eof(tok) => tok.id(), + Self::Bracket(tok) => tok.id(), + Self::Keyword(tok) => tok.id(), + Self::Ident(tok) => tok.id(), + Self::Digital(tok) => tok.id(), + Self::Quoted(tok) => tok.id(), } } - fn spec(self) -> &'lex Spec { + fn stream(self) -> &'lex Stream<'lex> { match self { - Self::Eof(tok) => tok.spec, - Self::Bracket(tok) => tok.spec, - Self::Keyword(tok) => tok.spec, - Self::Ident(tok) => tok.spec, - Self::Digital(tok) => tok.spec, - Self::Quoted(tok) => tok.spec, + Self::Eof(tok) => tok.stream(), + Self::Bracket(tok) => tok.stream(), + Self::Keyword(tok) => tok.stream(), + Self::Ident(tok) => tok.stream(), + Self::Digital(tok) => tok.stream(), + Self::Quoted(tok) => tok.stream(), } } @@ -236,20 +272,27 @@ impl Spanned for Any<'_> { /// comments within. #[derive(Copy, Clone)] pub struct Eof<'lex> { - span: SpanId, - ctx: &'lex Context, - spec: &'lex Spec, + stream: &'lex Stream<'lex>, + id: Id, } impl<'lex> Token<'lex> for Eof<'lex> { type Rule = rule::Eof; + fn id(self) -> Id { + self.id + } + + fn stream(self) -> &'lex Stream<'lex> { + self.stream + } + fn context(self) -> &'lex Context { - self.ctx + self.stream.context() } fn spec(self) -> &'lex Spec { - self.spec + self.stream.spec() } fn lexeme(self) -> Lexeme { @@ -277,13 +320,13 @@ impl<'lex> TryFrom> for Eof<'lex> { impl fmt::Debug for Eof<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "Eof({:?})", self.span) + write!(f, "Eof({:?})", self.span(self.context())) } } impl Spanned for Eof<'_> { - fn span(&self, ctx: &Context) -> Span { - self.span.span(ctx) + fn span(&self, _: &Context) -> Span { + self.stream.lookup_span_no_affix(self.id) } } @@ -294,26 +337,19 @@ impl Spanned for Eof<'_> { /// fixed string. #[derive(Copy, Clone)] pub struct Keyword<'lex> { - lexeme: Lexeme, - ctx: &'lex Context, - spec: &'lex Spec, - span: SpanId, - _ph: PhantomData<&'lex rt::Token>, + stream: &'lex Stream<'lex>, + id: Id, } impl<'lex> Token<'lex> for Keyword<'lex> { type Rule = rule::Keyword; - fn context(self) -> &'lex Context { - self.ctx - } - - fn spec(self) -> &'lex Spec { - self.spec + fn id(self) -> Id { + self.id } - fn lexeme(self) -> Lexeme { - self.lexeme + fn stream(self) -> &'lex Stream<'lex> { + self.stream } #[doc(hidden)] @@ -337,13 +373,13 @@ impl<'lex> TryFrom> for Keyword<'lex> { impl fmt::Debug for Keyword<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "Keyword({:?})", self.span) + write!(f, "Keyword({:?})", self.span(self.stream.context())) } } impl Spanned for Keyword<'_> { - fn span(&self, ctx: &Context) -> Span { - self.span.span(ctx) + fn span(&self, _: &Context) -> Span { + self.stream.lookup_span_no_affix(self.id) } } @@ -354,24 +390,20 @@ impl Spanned for Keyword<'_> { /// *trees*, like Rust does. 
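///
/// For example, in `foo(bar, baz)` the parentheses lex as a single bracket
/// token whose contents are the tokens for `bar`, `,`, and `baz`.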
#[derive(Copy, Clone)] pub struct Bracket<'lex> { - span: SpanId, - open: SpanId, - close: SpanId, - lexeme: Lexeme, - ctx: &'lex Context, - spec: &'lex Spec, + open: Id, + close: Id, contents: Cursor<'lex>, } impl<'lex> Bracket<'lex> { /// Returns this token's open delimiter. pub fn open(self) -> Span { - self.open.span(self.ctx) + self.contents.stream().lookup_span_no_affix(self.open) } /// Returns this token's close delimiter. pub fn close(self) -> Span { - self.close.span(self.ctx) + self.contents.stream().lookup_span_no_affix(self.close) } /// Returns this token's quote delimiters. @@ -391,16 +423,12 @@ impl<'lex> Bracket<'lex> { impl<'lex> Token<'lex> for Bracket<'lex> { type Rule = rule::Bracket; - fn context(self) -> &'lex Context { - self.ctx - } - - fn spec(self) -> &'lex Spec { - self.spec + fn id(self) -> Id { + self.open } - fn lexeme(self) -> Lexeme { - self.lexeme + fn stream(self) -> &'lex Stream<'lex> { + self.contents().stream() } #[doc(hidden)] @@ -433,69 +461,66 @@ impl<'lex> IntoIterator for Bracket<'lex> { impl fmt::Debug for Bracket<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.debug_struct("Bracket") - .field("delimiters", &f!("({:?}, {:?})", self.open, self.close)) + .field("delimiters", &f!("({:?}, {:?})", self.open(), self.close())) .field("contents", &self.contents) .finish() } } impl Spanned for Bracket<'_> { - fn span(&self, ctx: &Context) -> Span { - self.span.span(ctx) + fn span(&self, _: &Context) -> Span { + let [a, b] = self.delimiters(); + self.contents.stream().file().span(a.start()..b.end()) } } /// A identifier, i.e., a self-delimiting word like `foo` or `黒猫`. #[derive(Copy, Clone)] pub struct Ident<'lex> { - tok: &'lex rt::Token, - ctx: &'lex Context, - spec: &'lex Spec, + stream: &'lex Stream<'lex>, + id: Id, } impl<'lex> Ident<'lex> { /// Returns this token's name span. pub fn name(self) -> Span { - match &self.tok.kind { - &Kind::Ident(name) => name.span(self.ctx), - _ => panic!("non-lexer::Kind::Ident inside of Ident"), - } + self.stream.lookup_span_no_affix(self.id) } /// Returns this token's prefix. pub fn prefix(self) -> Option { - self.tok.prefix.map(|s| s.span(self.ctx)) + self.stream.lookup_prefix(self.id) } /// Checks whether this identifier has a particular prefix. pub fn has_prefix(&self, expected: &str) -> bool { - self.prefix().is_some_and(|s| s.text(self.ctx) == expected) + self + .prefix() + .is_some_and(|s| s.text(self.context()) == expected) } /// Returns this token's suffix. pub fn suffix(&self) -> Option { - self.tok.suffix.map(|s| s.span(self.ctx)) + self.stream.lookup_suffix(self.id) } /// Checks whether this identifier has a particular prefix. 
pub fn has_suffix(&self, expected: &str) -> bool { - self.suffix().is_some_and(|s| s.text(self.ctx) == expected) + self + .suffix() + .is_some_and(|s| s.text(self.context()) == expected) } } impl<'lex> Token<'lex> for Ident<'lex> { type Rule = rule::Ident; - fn context(self) -> &'lex Context { - self.ctx - } - - fn spec(self) -> &'lex Spec { - self.spec + fn id(self) -> Id { + self.id } - fn lexeme(self) -> Lexeme { - self.tok.lexeme.cast() + fn stream(self) -> &'lex Stream<'lex> { + self.stream } #[doc(hidden)] @@ -520,11 +545,12 @@ impl<'lex> TryFrom> for Ident<'lex> { impl fmt::Debug for Ident<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { if self.prefix().is_none() && self.suffix().is_none() { - return write!(f, "Ident({:?})", self.tok.span); + return write!(f, "Ident({:?})", self.name()); } let mut f = f.debug_struct("Ident"); - f.field("span", &self.tok.span).field("name", &self.name()); + f.field("span", &self.span(self.context())) + .field("name", &self.name()); if let Some(prefix) = self.prefix() { f.field("prefix", &prefix); @@ -539,8 +565,8 @@ impl fmt::Debug for Ident<'_> { } impl Spanned for Ident<'_> { - fn span(&self, ctx: &Context) -> Span { - self.tok.span.span(ctx) + fn span(&self, _: &Context) -> Span { + self.stream.lookup_span_with_affixes(self.id) } } @@ -561,10 +587,10 @@ impl Spanned for Ident<'_> { /// others). #[derive(Copy, Clone)] pub struct Digital<'lex> { - tok: &'lex rt::Token, + stream: &'lex Stream<'lex>, + id: Id, + meta: &'lex rt::Digital, idx: usize, - ctx: &'lex Context, - spec: &'lex Spec, } impl<'lex> Digital<'lex> { @@ -590,12 +616,13 @@ impl<'lex> Digital<'lex> { /// Returns the span corresponding to [`Digital::sign()`]. pub fn sign_span(self) -> Option { - self.rt_blocks().sign.map(|(_, sp)| sp.span(self.ctx)) + self.rt_blocks().sign.map(|(_, sp)| sp.span(self.context())) } /// Returns the point-separated digit chunks of this digital literal. pub fn digit_blocks(self) -> impl Iterator + 'lex { - self.digit_slice().iter().map(|s| s.span(self.ctx)) + let ctx = self.context(); + self.digit_slice().iter().map(|s| s.span(ctx)) } /// Returns the exponents of this digital literal, if it any. @@ -603,26 +630,28 @@ impl<'lex> Digital<'lex> { /// Calling `exponents()` on any of the returned tokens will yield all /// exponents that follow. pub fn exponents(self) -> impl Iterator> { - (self.idx..self.exponent_slice().len()).map(move |idx| Self { - tok: self.tok, - ctx: self.ctx, + (self.idx..self.meta.exponents.len()).map(move |idx| Self { + stream: self.stream, + id: self.id, + meta: self.meta, idx: idx + 1, - spec: self.spec, }) } /// Returns this token's prefix. pub fn prefix(self) -> Option { if self.idx > 0 { - return self.rt_blocks().prefix.map(|s| s.span(self.ctx)); + return self.rt_blocks().prefix.map(|s| s.span(self.context())); } - self.tok.prefix.map(|s| s.span(self.ctx)) + self.stream.lookup_prefix(self.id) } /// Checks whether this identifier has a particular prefix. pub fn has_prefix(&self, expected: &str) -> bool { - self.prefix().is_some_and(|s| s.text(self.ctx) == expected) + self + .prefix() + .is_some_and(|s| s.text(self.context()) == expected) } /// Returns this token's suffix. @@ -632,12 +661,14 @@ impl<'lex> Digital<'lex> { return None; } - self.tok.suffix.map(|s| s.span(self.ctx)) + self.stream.lookup_suffix(self.id) } /// Checks whether this identifier has a particular prefix. 
pub fn has_suffix(&self, expected: &str) -> bool { - self.suffix().is_some_and(|s| s.text(self.ctx) == expected) + self + .suffix() + .is_some_and(|s| s.text(self.context()) == expected) } /// Parses this token as an integer. @@ -653,7 +684,7 @@ impl<'lex> Digital<'lex> { N: Bounded + PartialOrd + FromRadix + fmt::Display, { for extra in self.digit_blocks().skip(1) { - report.builtins(self.spec).unexpected( + report.builtins(self.spec()).unexpected( "extra digits", self.lexeme(), extra, @@ -662,7 +693,7 @@ impl<'lex> Digital<'lex> { for extra in self.exponents() { report - .builtins(self.spec) + .builtins(self.spec()) .unexpected("exponent", self.lexeme(), extra); } @@ -686,7 +717,7 @@ impl<'lex> Digital<'lex> { self .digit_blocks() .map(|span| { - let text = span.text(self.ctx); + let text = span.text(self.context()); let buf; let text = if !rule.separator.is_empty() && text.contains(&*rule.separator) { @@ -738,7 +769,7 @@ impl<'lex> Digital<'lex> { range: impl RangeBounds, report: &Report, ) -> Result { - let fp: Fp = self.parse_fp(self.ctx, report, false)?; + let fp: Fp = self.parse_fp(self.context(), report, false)?; if !fp.__is_finite() || !range.contains(&fp) { report.builtins(self.spec()).literal_out_of_range( @@ -764,7 +795,7 @@ impl<'lex> Digital<'lex> { range: impl RangeBounds, report: &Report, ) -> Result { - let fp: Fp = self.parse_fp(self.ctx, report, true)?; + let fp: Fp = self.parse_fp(self.context(), report, true)?; if !fp.__is_finite() || !range.contains(&fp) { report.builtins(self.spec()).literal_out_of_range( @@ -792,19 +823,11 @@ impl<'lex> Digital<'lex> { &self.rt_blocks().blocks } - fn exponent_slice(self) -> &'lex [DigitBlocks] { - match &self.tok.kind { - Kind::Digital { exponents, .. } => exponents, - _ => panic!("non-lexer::Kind::Digital inside of Digital"), - } - } - fn rt_blocks(&self) -> &'lex DigitBlocks { - match &self.tok.kind { - Kind::Digital { digits, .. } if self.idx == 0 => digits, - Kind::Digital { exponents, .. } => &exponents[self.idx - 1], - _ => panic!("non-lexer::Kind::Digital inside of Digital"), + if self.idx == 0 { + return &self.meta.digits; } + &self.meta.exponents[self.idx - 1] } } @@ -894,16 +917,12 @@ impl_radix! { impl<'lex> Token<'lex> for Digital<'lex> { type Rule = rule::Digital; - fn context(self) -> &'lex Context { - self.ctx + fn id(self) -> Id { + self.id } - fn spec(self) -> &'lex Spec { - self.spec - } - - fn lexeme(self) -> Lexeme { - self.tok.lexeme.cast() + fn stream(self) -> &'lex Stream<'lex> { + self.stream } #[doc(hidden)] @@ -928,7 +947,7 @@ impl<'lex> TryFrom> for Digital<'lex> { impl fmt::Debug for Digital<'_> { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { let mut f = f.debug_struct("Digital"); - f.field("span", &self.tok.span) + f.field("span", &self.span(self.context())) .field("radix", &self.radix()) .field("digits", &self.digit_slice()); @@ -953,17 +972,17 @@ impl fmt::Debug for Digital<'_> { } impl Spanned for Digital<'_> { - fn span(&self, ctx: &Context) -> Span { - self.tok.span.span(ctx) + fn span(&self, _: &Context) -> Span { + self.stream.lookup_span_with_affixes(self.id) } } /// A quoted literal. #[derive(Copy, Clone)] pub struct Quoted<'lex> { - tok: &'lex rt::Token, - ctx: &'lex Context, - spec: &'lex Spec, + stream: &'lex Stream<'lex>, + id: Id, + meta: &'lex rt::Quoted, } impl<'lex> Quoted<'lex> { @@ -979,12 +998,17 @@ impl<'lex> Quoted<'lex> { /// Returns this token's quote delimiters. pub fn delimiters(self) -> [Span; 2] { - match &self.tok.kind { - &Kind::Quoted { open, close, .. 
} => { - [open.span(self.ctx), close.span(self.ctx)] - } - _ => panic!("non-lexer::Kind::Quoted inside of Quoted"), - } + let span = self.stream.lookup_span_no_affix(self.id); + [ + self + .stream + .file() + .span(span.start()..*self.meta.marks.first().unwrap() as usize), + self + .stream + .file() + .span(*self.meta.marks.last().unwrap() as usize..span.end()), + ] } /// Returns the raw content of this token. @@ -997,20 +1021,58 @@ impl<'lex> Quoted<'lex> { /// strings. [`Quoted::to_utf8()`] helps with the common case of doing this for /// UTF-8 strings. pub fn raw_content(self) -> impl Iterator + 'lex { - self.content_slice().iter().map(|c| match c { - Content::Lit(s) => Content::Lit(s.span(self.ctx)), - Content::Esc(s, e) => { - Content::Esc(s.span(self.ctx), e.map(|e| e.span(self.ctx))) - } + let file = self.stream.file(); + let mut next = self.meta.marks[0]; + let mut is_escape = false; + let mut marks = &self.meta.marks[1..]; + + iter::from_fn(move || loop { + return match is_escape { + false => { + let start = next; + let &[end, ref rest @ ..] = marks else { + return None; + }; + + next = end; + marks = rest; + is_escape = true; + + if start == end { + continue; + } + + let span = file.span(start as usize..end as usize); + Some(Content::Lit(span)) + } + true => { + let start = next; + let &[esc_end, data_start, data_end, end, ref rest @ ..] = marks + else { + return None; + }; + + next = end; + marks = rest; + is_escape = false; + + let span = file.span(start as usize..esc_end as usize); + let data = (data_start != data_end) + .then(|| file.span(data_start as usize..data_end as usize)); + Some(Content::Esc(span, data)) + } + }; }) } - /// Returns the unique single [`Content`] of this token, if it is unique. - pub fn unique_content(self) -> Option { - if self.content_slice().len() == 1 { - return self.raw_content().next(); + /// Returns the unique single literal content of this token, if it is unique. + pub fn literal(self) -> Option { + if self.meta.marks.len() > 2 { + return None; } - None + let start = *self.meta.marks.first().unwrap(); + let end = *self.meta.marks.last().unwrap(); + Some(self.stream.file().span(start as usize..end as usize)) } /// Constructs a UTF-8 string in the "obvious way", using this token and a @@ -1022,7 +1084,7 @@ impl<'lex> Quoted<'lex> { let total = self .raw_content() .map(|c| match c { - Content::Lit(sp) => sp.text(self.ctx).len(), + Content::Lit(sp) => sp.text(self.context()).len(), Content::Esc(..) => 1, }) .sum(); @@ -1030,38 +1092,35 @@ impl<'lex> Quoted<'lex> { let mut buf = String::with_capacity(total); for chunk in self.raw_content() { match chunk { - Content::Lit(sp) => buf.push_str(sp.text(self.ctx)), + Content::Lit(sp) => buf.push_str(sp.text(self.context())), Content::Esc(sp, data) => decode_esc(sp, data, &mut buf), } } buf } - fn content_slice(self) -> &'lex [Content] { - match &self.tok.kind { - Kind::Quoted { content, .. } => content, - _ => panic!("non-lexer::Kind::Quoted inside of Quoted"), - } - } - /// Returns this token's prefix. pub fn prefix(self) -> Option { - self.tok.prefix.map(|s| s.span(self.ctx)) + self.stream.lookup_prefix(self.id) } /// Checks whether this identifier has a particular prefix. - pub fn has_prefix(self, expected: &str) -> bool { - self.prefix().is_some_and(|s| s.text(self.ctx) == expected) + pub fn has_prefix(&self, expected: &str) -> bool { + self + .prefix() + .is_some_and(|s| s.text(self.context()) == expected) } /// Returns this token's suffix. 
- pub fn suffix(self) -> Option { - self.tok.suffix.map(|s| s.span(self.ctx)) + pub fn suffix(&self) -> Option { + self.stream.lookup_suffix(self.id) } /// Checks whether this identifier has a particular prefix. - pub fn has_suffix(self, expected: &str) -> bool { - self.suffix().is_some_and(|s| s.text(self.ctx) == expected) + pub fn has_suffix(&self, expected: &str) -> bool { + self + .suffix() + .is_some_and(|s| s.text(self.context()) == expected) } } @@ -1099,16 +1158,12 @@ impl Content { impl<'lex> Token<'lex> for Quoted<'lex> { type Rule = rule::Quoted; - fn context(self) -> &'lex Context { - self.ctx - } - - fn spec(self) -> &'lex Spec { - self.spec + fn id(self) -> Id { + self.id } - fn lexeme(self) -> Lexeme { - self.tok.lexeme.cast() + fn stream(self) -> &'lex Stream<'lex> { + self.stream } #[doc(hidden)] @@ -1133,9 +1188,10 @@ impl<'lex> TryFrom> for Quoted<'lex> { impl fmt::Debug for Quoted<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let mut f = f.debug_struct("Quoted"); - f.field("span", &self.tok.span) + f.field("span", &self.span(self.context())) .field("delimiters", &self.delimiters()) - .field("content", &self.content_slice()); + // TODO: get rid of this collect(). + .field("content", &self.raw_content().collect::>()); if let Some(prefix) = self.prefix() { f.field("prefix", &prefix); @@ -1150,23 +1206,19 @@ impl fmt::Debug for Quoted<'_> { } impl Spanned for Quoted<'_> { - fn span(&self, ctx: &Context) -> Span { - self.tok.span.span(ctx) + fn span(&self, _: &Context) -> Span { + self.stream.lookup_span_with_affixes(self.id) } } impl<'lex> Token<'lex> for Never { type Rule = Never; - fn context(self) -> &'lex Context { - self.from_nothing_anything() - } - - fn spec(self) -> &'lex Spec { + fn id(self) -> Id { self.from_nothing_anything() } - fn lexeme(self) -> Lexeme { + fn stream(self) -> &'lex Stream<'lex> { self.from_nothing_anything() } diff --git a/ilex/src/token/stream.rs b/ilex/src/token/stream.rs index 9794280..8783664 100644 --- a/ilex/src/token/stream.rs +++ b/ilex/src/token/stream.rs @@ -1,15 +1,15 @@ -use std::array; use std::fmt; use std::iter; -use std::marker::PhantomData; use std::mem; +use std::num::NonZeroU32; +use std::slice; use crate::file::Context; use crate::file::File; -use crate::file::SpanId; +use crate::file::Span; use crate::report::Report; use crate::rt; -use crate::rt::Kind; +use crate::rule; use crate::rule::Rule; use crate::spec::Lexeme; use crate::spec::Spec; @@ -22,17 +22,21 @@ use crate::token; pub struct Stream<'ctx> { pub(crate) file: File<'ctx>, pub(crate) spec: &'ctx Spec, + pub(crate) toks: Vec, + pub(crate) meta_idx: Vec, + pub(crate) meta: Vec, } impl<'ctx> Stream<'ctx> { /// Returns a cursor over this stream. pub fn cursor(&self) -> Cursor { Cursor { - file: self.file, - spec: self.spec, - toks: &self.toks, + stream: self, + start: 0, + end: self.toks.len(), cursor: 0, + meta_cursor: 0, } } @@ -50,6 +54,171 @@ impl<'ctx> Stream<'ctx> { pub fn spec(&self) -> &'ctx Spec { self.spec } + + /// Returns the token with the given ID. + /// + /// # Panics + /// + /// Panics if this stream does not have a token with the given ID. + pub fn token_at(&self, id: token::Id) -> token::Any { + let meta_hint = self.meta_idx.binary_search(&id).unwrap_or(0); + self.token_at_hint(id, meta_hint).unwrap() + } + + /// Returns the last token pushed to this stream. 
+ pub(crate) fn last_token(&self) -> token::Any { + let mut cursor = self.cursor(); + cursor.cursor = cursor.end; + cursor.meta_cursor = self.meta_idx.len(); + loop { + cursor.step_backward(); + let tok = self.lookup_token(cursor.id()); + if tok.lexeme.is_aux() { + continue; + } + + return self.token_at_hint(cursor.id(), cursor.meta_cursor).unwrap(); + } + } + + pub(crate) fn token_at_hint( + &self, + id: token::Id, + meta_hint: usize, + ) -> Option { + let tok = &self.toks[id.idx()]; + let meta = self + .lookup_meta_hint(id, meta_hint) + .and_then(|m| m.kind.as_ref()); + + if [rt::PREFIX, rt::SUFFIX, rt::WHITESPACE, rt::UNEXPECTED] + .contains(&tok.lexeme) + { + return None; + } + + if tok.lexeme == Lexeme::eof().any() { + return Some(token::Eof { stream: self, id }.into()); + } + + return Some(match self.spec().rule(tok.lexeme) { + rule::Any::Comment(..) => return None, + rule::Any::Keyword(..) => token::Keyword { stream: self, id }.into(), + rule::Any::Ident(..) => token::Ident { stream: self, id }.into(), + + rule::Any::Bracket(..) => { + let Some(&rt::Kind::Offset { cursor, .. }) = meta else { + bug!("missing rt::Metadata::Offset on bracket token") + }; + let open = id; + let close = token::Id( + NonZeroU32::new(id.0.get().wrapping_add_signed(cursor)).unwrap(), + ); + + token::Bracket { + open, + close, + contents: Cursor { + stream: self, + start: open.idx() + 1, + end: close.idx(), + cursor: open.idx() + 1, + meta_cursor: meta_hint + 1, + }, + } + .into() + } + + crate::rule::Any::Quoted(..) => { + let Some(rt::Kind::Quoted(meta)) = meta else { + bug!("missing rt::Metadata::Quoted on quoted token") + }; + + token::Quoted { stream: self, id, meta }.into() + } + + crate::rule::Any::Digital(..) => { + let Some(rt::Kind::Digital(meta)) = meta else { + bug!("missing rt::Metadata::Digital on digital token") + }; + + token::Digital { stream: self, id, meta, idx: 0 }.into() + } + }); + } + + pub(crate) fn lookup_meta(&self, id: token::Id) -> Option<&rt::Metadata> { + let idx = self.meta_idx.binary_search(&id).ok()?; + Some(&self.meta[idx]) + } + + pub(crate) fn lookup_meta_hint( + &self, + id: token::Id, + hint: usize, + ) -> Option<&rt::Metadata> { + if self.meta_idx.get(hint) != Some(&id) { + return None; + } + + Some(&self.meta[hint]) + } + + pub(crate) fn lookup_token(&self, id: token::Id) -> &rt::Token { + &self.toks[id.idx()] + } + + pub(crate) fn lookup_span_no_affix(&self, id: token::Id) -> Span { + let start = self + .toks + .get(id.idx().wrapping_sub(1)) + .map(|t| t.end as usize) + .unwrap_or(0); + let end = self.lookup_token(id).end as usize; + self.file().span(start..end) + } + + pub(crate) fn lookup_prefix(&self, id: token::Id) -> Option { + let prev = id.prev()?; + if self.lookup_token(prev).lexeme != rt::PREFIX { + return None; + } + Some(self.lookup_span_no_affix(prev)) + } + + pub(crate) fn lookup_suffix(&self, id: token::Id) -> Option { + let next = id.next()?; + if next.idx() == self.toks.len() + || self.lookup_token(next).lexeme != rt::SUFFIX + { + return None; + } + Some(self.lookup_span_no_affix(next)) + } + + pub(crate) fn lookup_span_with_affixes(&self, id: token::Id) -> Span { + let span = self.lookup_span_no_affix(id); + + let mut start = span.start(); + if let Some(prefix) = self.lookup_prefix(id) { + start = prefix.start() + } + + let mut end = span.end(); + if let Some(suffix) = self.lookup_suffix(id) { + end = suffix.end(); + } + + self.file.span(start..end) + } + + pub(crate) fn last_meta(&self) -> Option<&rt::Metadata> { + self.meta.last() + } + + pub(crate) 
fn last_meta_mut(&mut self) -> Option<&mut rt::Metadata> { + self.meta.last_mut() + } } impl<'lex> IntoIterator for &'lex Stream<'_> { @@ -73,35 +242,56 @@ impl fmt::Debug for Stream<'_> { /// also be queried for more specific token kinds. #[derive(Copy, Clone)] pub struct Cursor<'lex> { - file: File<'lex>, - spec: &'lex Spec, - toks: &'lex [rt::Token], + stream: &'lex Stream<'lex>, + + // These are the range within `stream.toks` that we're allowed to yield. + start: usize, + end: usize, + + // This is the position of the cursor in `stream.toks`. cursor: usize, + + // This points to a value in `stream.meta_idx` whose `idx()` is greater than + // or equal to that of cursor; when `stream.toks[cursor]` is a token with + // metadata, this points to its metadata. When advancing, if + // + // ``` + // stream.meta_idx[meta_cursor].idx() == cursor + // ``` + // + // then we advance meta_cursor too. When backing up, we back up meta_cursor + // if + // + // ``` + // stream.meta_idx[meta_cursor - 1].idx() == cursor - 1 + // ``` + meta_cursor: usize, } impl<'lex> Cursor<'lex> { - fn end(&self) -> SpanId { - self.toks.last().unwrap().span + /// Returns the stream this cursor runs over. + pub fn stream(&self) -> &'lex Stream<'lex> { + self.stream } /// Returns the source code context this stream is associated with. pub fn context(&self) -> &'lex Context { - self.file.context() + self.stream.context() } /// Returns the file this stream was lexed from. pub fn file(&self) -> File<'lex> { - self.file + self.stream.file() } /// Returns the lexer spec this stream was lexed with. pub fn spec(&self) -> &'lex Spec { - self.spec + self.stream.spec() } /// Returns whether this cursor has yielded all of its tokens. pub fn is_empty(&self) -> bool { - self.cursor >= self.toks.len() + self.cursor >= self.end } /// Returns the next token under the cursor without consuming it. @@ -117,12 +307,7 @@ impl<'lex> Cursor<'lex> { /// Panics if this causes the internal cursor to underflow. pub fn back_up(&mut self, count: usize) { for _ in 0..count { - assert!(self.cursor > 0, "cursor underflowed"); - self.cursor -= 1; - - if let Kind::Close { offset_to_open, .. } = &self.toks[self.cursor].kind { - self.cursor -= *offset_to_open as usize; - } + assert!(self.step_backward(), "underflow attempting to back up cursor") } } @@ -131,7 +316,7 @@ impl<'lex> Cursor<'lex> { pub fn expect_finished(&self, report: &Report) { if let Some(next) = self.peek_any() { report - .builtins(self.spec) + .builtins(self.spec()) .expected([Lexeme::eof()], next, self.end()); } } @@ -218,26 +403,91 @@ impl<'lex> Cursor<'lex> { }) } - pub(crate) fn fake_token( - file: File<'lex>, - spec: &'lex Spec, - tok: &'lex rt::Token, - ) -> token::Any<'lex> { - Self { - file, - spec, - toks: array::from_ref(tok), - cursor: 0, + // pub(crate) fn fake_token( + // file: File<'lex>, + // spec: &'lex Spec, + // tok: &'lex rt::Token, + // ) -> token::Any<'lex> { + // Self { + // file, + // spec, + // toks: array::from_ref(tok), + // cursor: 0, + // } + // .next() + // .unwrap() + // } + + fn id(&self) -> token::Id { + token::Id(NonZeroU32::new(self.cursor as u32 + 1).unwrap()) + } + + fn step_forward(&mut self) -> bool { + if self.cursor >= self.end { + return false; } - .next() - .unwrap() + + // Step past an open token. This will result in the cursor pointing to + // one-past the end token. 
+ if let Some(&rt::Kind::Offset { cursor, meta }) = self.kind() { + self.cursor = self.cursor.wrapping_add_signed(cursor as isize); + self.meta_cursor = self.meta_cursor.wrapping_add_signed(meta as isize); + } + + if let Some(id) = self.stream.meta_idx.get(self.meta_cursor) { + if id.idx() == self.cursor { + self.meta_cursor += 1; + } + } + + self.cursor += 1; + true + } + + fn step_backward(&mut self) -> bool { + if self.cursor <= self.start { + return false; + } + + if let Some(id) = self.stream.meta_idx.get(self.meta_cursor.wrapping_sub(1)) + { + if id.idx() == self.cursor.wrapping_sub(1) { + self.meta_cursor -= 1; + } + } + + self.cursor -= 1; + + // Step back from a close token. This will result in the cursor pointing to + // the open token. + if let Some(&rt::Kind::Offset { cursor, meta }) = self.kind() { + self.cursor = self.cursor.wrapping_add_signed(cursor as isize); + self.meta_cursor = self.meta_cursor.wrapping_add_signed(meta as isize); + } + + true + } + + fn kind(&self) -> Option<&'lex rt::Kind> { + self + .stream + .lookup_meta_hint(self.id(), self.meta_cursor) + .and_then(|m| m.kind.as_ref()) + } + + fn end(&self) -> Span { + let end = self + .stream() + .lookup_token(token::Id(NonZeroU32::new(self.end as u32 + 1).unwrap())) + .end as usize; + self.file().span(end..end) } } impl fmt::Debug for Cursor<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut copy = *self; - copy.cursor = 0; + copy.cursor = copy.start; let mut list = f.debug_list(); for (i, tok) in copy.enumerate() { @@ -262,104 +512,41 @@ impl fmt::Debug for Cursor<'_> { impl<'lex> Iterator for Cursor<'lex> { type Item = token::Any<'lex>; fn next(&mut self) -> Option { - let tok = self.toks.get(self.cursor)?; - let next = match &tok.kind { - Kind::Eof => { - self.cursor += 1; - token::Any::Eof(token::Eof { - span: tok.span, - ctx: self.context(), - spec: self.spec, - }) - } - - Kind::Keyword => { - self.cursor += 1; - token::Any::Keyword(token::Keyword { - lexeme: tok.lexeme.cast(), - ctx: self.context(), - spec: self.spec, - span: tok.span, - _ph: PhantomData, - }) + loop { + if self.is_empty() { + return None; } - Kind::Open { offset_to_close } => { - if *offset_to_close == !0 { - // This was called from deep inside the lexer to generate a token - // name for a diagnostic, so we're just gonna give it a... - // stringifyable token. - - return Some(token::Any::Bracket(token::Bracket { - span: tok.span, - open: tok.span, - close: tok.span, - lexeme: tok.lexeme.cast(), - ctx: self.context(), - spec: self.spec, - contents: *self, - })); - } - - let open_idx = self.cursor; - let close_idx = open_idx + (*offset_to_close as usize); - self.cursor = close_idx + 1; - - let close = &self.toks[close_idx]; - let &Kind::Close { full_span, .. } = &close.kind else { - bug!("Kind::Open did not point to an Kind::Close"); - }; - - token::Any::Bracket(token::Bracket { - span: full_span, - open: tok.span, - close: close.span, - lexeme: tok.lexeme.cast(), - ctx: self.context(), - spec: self.spec, - contents: Cursor { - file: self.file, - spec: self.spec, - toks: &self.toks[open_idx + 1..close_idx], - cursor: 0, - }, - }) - } + let next = self.stream.token_at_hint(self.id(), self.meta_cursor); + self.step_forward(); - Kind::Close { .. } => { - bug!("stray closing delimiter {:?} in token stream", tok.span) + if next.is_some() { + return next; } + } + } +} - Kind::Ident { .. 
} => { - self.cursor += 1; - token::Any::Ident(token::Ident { - tok, - ctx: self.context(), - spec: self.spec, - }) - } +/// An iterator over the comment spans attached to a token. +pub struct Comments<'lex> { + pub(super) stream: &'lex Stream<'lex>, + pub(super) comments: slice::Iter<'lex, token::Id>, +} - Kind::Quoted { .. } => { - self.cursor += 1; - token::Any::Quoted(token::Quoted { - tok, - ctx: self.context(), - spec: self.spec, - }) - } +impl<'lex> Comments<'lex> { + /// Adapts this iterator to return just the text contents of each [`SpanId`]. + pub fn as_strings(self) -> impl Iterator + 'lex { + let ctx = self.stream.context(); + self.map(move |s| s.text(ctx)) + } +} - Kind::Digital { .. } => { - self.cursor += 1; - token::Any::Digital(token::Digital { - tok, - ctx: self.context(), - idx: 0, - spec: self.spec, - }) - } - }; +impl<'lex> Iterator for Comments<'lex> { + type Item = Span; - Some(next) + fn next(&mut self) -> Option { + let id = *self.comments.next()?; + Some(self.stream.lookup_span_no_affix(id)) } } @@ -436,7 +623,7 @@ pub mod switch { X: Impl<'lex, T>, { let Some(next) = cursor.next() else { - report.builtins(cursor.spec).expected( + report.builtins(cursor.spec()).expected( self.0.lexemes(0), Lexeme::eof(), cursor.end(), @@ -450,7 +637,7 @@ pub mod switch { } report - .builtins(cursor.spec) + .builtins(cursor.spec()) .expected(self.0.lexemes(0), next, next); None } From be46ffb6105194d6287975ff32c741b04e63029b Mon Sep 17 00:00:00 2001 From: Miguel Young de la Sota Date: Wed, 1 Jan 2025 15:07:24 -0800 Subject: [PATCH 5/9] ilex: Remove `SpanId` --- ilex/src/file/context.rs | 21 ---------- ilex/src/file/mod.rs | 43 --------------------- ilex/src/report/mod.rs | 2 - ilex/src/rt/emit2.rs | 83 ++++++++++++++++++++++------------------ ilex/src/rt/lexer.rs | 18 +-------- ilex/src/rt/mod.rs | 33 ++++++++++++++-- ilex/src/token/mod.rs | 14 +++---- 7 files changed, 82 insertions(+), 132 deletions(-) diff --git a/ilex/src/file/context.rs b/ilex/src/file/context.rs index 69d2baa..afef1b1 100644 --- a/ilex/src/file/context.rs +++ b/ilex/src/file/context.rs @@ -7,14 +7,11 @@ use camino::Utf8PathBuf; use crate::f; use crate::file::File; -use crate::file::SpanId; use crate::file::CTX_FOR_SPAN_DEBUG; use crate::report; use crate::report::Fatal; use crate::report::Report; -use super::Span; - /// A source context, which owns source code files. /// /// A `Context` contains the full text of all the loaded source files, which @@ -30,8 +27,6 @@ pub struct State { // TODO(mcyoung): Be smarter about this and use something something concurrent // vector? We don't need to have all this stuff behind a lock I think. files: Vec<(Utf8PathBuf, String)>, - - ranges: Vec, } unsafe impl Send for Context {} @@ -142,20 +137,4 @@ impl Context { pub fn file_count(&self) -> usize { self.state.read().unwrap().files.len() } - - /// Gets the byte range for the given span, if it isn't the synthetic span. - pub(crate) fn lookup_range(&self, span: SpanId) -> Span { - let state = self.state.read().unwrap(); - state.ranges[span.0 as usize] - } - - /// Creates a new synthetic span with the given contents. 
- pub(crate) fn new_span(&self, range: Span) -> SpanId { - let mut state = self.state.write().unwrap(); - assert!(state.ranges.len() <= (u32::MAX as usize), "ran out of spans"); - - let span = SpanId(state.ranges.len() as u32); - state.ranges.push(range); - span - } } diff --git a/ilex/src/file/mod.rs b/ilex/src/file/mod.rs index 736be62..f33fc98 100644 --- a/ilex/src/file/mod.rs +++ b/ilex/src/file/mod.rs @@ -109,30 +109,6 @@ pub struct Span { end: u32, } -/// An interned [`Span`]. -/// -/// Most tokens' spans will never be inspected after lexing, so it's better to -/// make them small for memory saving reasons. This abstraction allows the -/// library to optimize internal handling of spans over time. -/// -/// This type is just a numeric ID; in order to do anything with it, you'll -/// need to call one of the functions in [`Spanned`]. -#[derive(Copy, Clone)] -pub(crate) struct SpanId(u32); - -impl fmt::Debug for SpanId { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - CTX_FOR_SPAN_DEBUG.with(|ctx| { - let ctx = ctx.borrow(); - let Some(ctx) = &*ctx else { - return f.write_str(""); - }; - - fmt::Debug::fmt(&Spanned::span(&self, ctx), f) - }) - } -} - impl Span { /// Constructs a span from a file and a byte range within it. /// @@ -285,19 +261,6 @@ impl Span { best.expect("attempted to join zero spans") } - - /// Bakes this range into a span. - pub(crate) fn intern(self, ctx: &Context) -> SpanId { - ctx.new_span(self) - } - - /// Bakes this range into a span. - pub(crate) fn intern_nonempty(self, ctx: &Context) -> Option { - if self.is_empty() { - return None; - } - Some(self.intern(ctx)) - } } /// A syntax element which contains a span. @@ -339,12 +302,6 @@ pub trait Spanned { } } -impl Spanned for SpanId { - fn span(&self, ctx: &Context) -> Span { - ctx.lookup_range(*self) - } -} - // Spans are spanned by their own spans. impl Spanned for Span { fn span(&self, _ctx: &Context) -> Span { diff --git a/ilex/src/report/mod.rs b/ilex/src/report/mod.rs index a840cad..8c6d74b 100644 --- a/ilex/src/report/mod.rs +++ b/ilex/src/report/mod.rs @@ -17,8 +17,6 @@ use std::process; use std::sync::Arc; use crate::file::Context; -#[cfg(doc)] -use crate::file::SpanId; use crate::spec::Spec; mod builtin; diff --git a/ilex/src/rt/emit2.rs b/ilex/src/rt/emit2.rs index 3cc19aa..cb37764 100644 --- a/ilex/src/rt/emit2.rs +++ b/ilex/src/rt/emit2.rs @@ -8,7 +8,6 @@ use byteyarn::YarnBox; use crate::f; use crate::file::Context; use crate::file::Span; -use crate::file::Spanned; use crate::plural; use crate::report::Expected; use crate::rt; @@ -37,9 +36,8 @@ pub fn emit(lexer: &mut Lexer) { let start = lexer.cursor(); let end = start + match_.len; - let range = lexer.span(start..end); - let span = range.intern(ctx); - let text = range.text(ctx); + let span = lexer.span(start..end); + let text = span.text(ctx); let end = end + match_.extra; // Now we have to decide which of `candidates` is the best one, i.e., @@ -67,7 +65,7 @@ pub fn emit(lexer: &mut Lexer) { // choices; that is independent of which token we decide to create. let mut best = None; 'verify: for &c in &match_.candidates { - let [.., range, _] = find_affixes_partial(range, lexer.spec(), c, ctx); + let [.., range, _] = find_affixes_partial(span, lexer.spec(), c, ctx); // NOTE: We only need to find the first lexeme that is valid. If it's not // valid, we will diagnose that in the next stage. 
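The hunks above and below replace the interned `SpanId`s in `DigitBlocks`
with raw byte-offset pairs that get rehydrated against the file on demand,
which is what lets this patch delete `Context::new_span` and its locked
`ranges` table entirely. A minimal, self-contained sketch of the pattern
(the stripped-down `File` here is assumed for illustration; the real type
also carries its `Context` and file index, and patch 6 later formalizes the
offset pair as `Span2`):

#[derive(Copy, Clone, Default, PartialEq, Eq)]
struct Span2(u32, u32); // start and end byte offsets within a single file

#[derive(Copy, Clone)]
struct File<'ctx> {
  text: &'ctx str,
}

impl Span2 {
  // Rehydrates the compressed span against its file, yielding its text.
  fn get<'ctx>(self, file: File<'ctx>) -> &'ctx str {
    &file.text[self.0 as usize..self.1 as usize]
  }
}

fn main() {
  let file = File { text: "let x = 42;" };
  let digits = Span2(8, 10);
  assert_eq!(digits.get(file), "42");
}

Two `u32`s per span keep the token metadata compact, and dropping the
interning table also drops an `RwLock` read per span lookup.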
@@ -195,8 +193,8 @@ pub fn emit(lexer: &mut Lexer) { } let best = best.unwrap_or(match_.candidates[0]); - let [sign, prefix, range, suffix] = - find_affixes_partial(range, lexer.spec(), best, ctx); + let [sign_span, prefix, range, suffix] = + find_affixes_partial(span, lexer.spec(), best, ctx); let text = range.text(ctx); let mirrored = match lexer.spec().rule(best.lexeme) { @@ -339,28 +337,33 @@ pub fn emit(lexer: &mut Lexer) { lexer.add_token(rt::PREFIX, prefix.len(), None); lexer.add_token( best.lexeme, - sign.len() + range.len(), + sign_span.len() + range.len(), Some(rt::Kind::Digital(rt::Digital::default())), ); lexer.add_token(rt::SUFFIX, suffix.len(), None); - let sign_text = sign.text(ctx); - let sign = sign.intern_nonempty(ctx).map(|span| { - for (text, value) in &rule.mant.signs { - if text == sign_text { - return (*value, span); - } - } - bug!("could not find appropriate sign for Digital rule") + let sign_text = sign_span.text(ctx); + let sign = (!sign_text.is_empty()).then(|| { + let Some((_, value)) = + rule.mant.signs.iter().find(|(text, _)| text == sign_text) + else { + bug!("could not find appropriate sign for Digital rule"); + }; + + (*value, [sign_span.start() as u32, sign_span.end() as u32]) }); let mut chunks = vec![DigitBlocks { - prefix: prefix.intern_nonempty(ctx), + prefix: [0, 0], sign, blocks: Vec::new(), which_exp: !0, }]; + if !prefix.is_empty() { + chunks[0].prefix = [prefix.start() as u32, prefix.end() as u32]; + } + let mut offset = 0; let mut text = text; @@ -407,9 +410,10 @@ pub fn emit(lexer: &mut Lexer) { ); } - chunk - .blocks - .push(range.subspan(block_start..offset).intern(ctx)); + chunk.blocks.push([ + (range.start() + block_start) as u32, + (range.start() + offset) as u32, + ]); text = rest; offset += rule.point.len(); block_start = offset; @@ -427,14 +431,13 @@ pub fn emit(lexer: &mut Lexer) { ); } - chunk - .blocks - .push(range.subspan(block_start..offset).intern(ctx)); + chunk.blocks.push([ + (range.start() + block_start) as u32, + (range.start() + offset) as u32, + ]); - let prefix = - range.subspan(offset..offset + pre.len()).intern(ctx); + let prefix = range.subspan(offset..offset + pre.len()); text = rest; - offset += pre.len(); let sign = exp @@ -443,20 +446,27 @@ pub fn emit(lexer: &mut Lexer) { .filter(|(y, _)| rest.starts_with(y.as_str())) .max_by_key(|(y, _)| y.len()) .map(|(y, s)| { - let sign = - range.subspan(offset..offset + y.len()).intern(ctx); + let sign = [ + (range.start() + offset) as u32, + (range.start() + offset + y.len()) as u32, + ]; text = &text[y.len()..]; offset += y.len(); (*s, sign) }); chunks.push(DigitBlocks { - prefix: Some(prefix), + prefix: [0, 0], sign, blocks: Vec::new(), which_exp: i, }); + if !prefix.is_empty() { + chunks.last_mut().unwrap().prefix = + [prefix.start() as u32, prefix.end() as u32]; + } + digits = exp; block_start = offset; last_was_sep = false; @@ -480,7 +490,7 @@ pub fn emit(lexer: &mut Lexer) { .last_mut() .unwrap() .blocks - .push(range.subspan(block_start..).intern(ctx)); + .push([(range.start() + block_start) as u32, range.end() as u32]); let mant = chunks.remove(0); let Some(rt::Kind::Digital(meta)) = lexer @@ -510,10 +520,9 @@ pub fn emit(lexer: &mut Lexer) { let chunk_span = Span::union( chunk - .prefix + .prefix(lexer.file()) .into_iter() - .chain(chunk.blocks.iter().copied()) - .map(|s| s.span(ctx)), + .chain(chunk.blocks(lexer.file())), ); if (chunk.blocks.len() as u32) < digits.min_chunks { @@ -528,12 +537,12 @@ pub fn emit(lexer: &mut Lexer) { .at(chunk_span); } - for block in 
&chunk.blocks { - let range = block.span(ctx); + for block in chunk.blocks(lexer.file()) { let mut text = block.text(ctx); - if range.is_empty() && chunk.prefix.is_some() { - let prefix = chunk.prefix.unwrap(); + // FIXME: The is_some() here should not be necessary. + if range.is_empty() && chunk.prefix(lexer.file()).is_some() { + let prefix = chunk.prefix(lexer.file()).unwrap(); lexer .builtins() .expected( diff --git a/ilex/src/rt/lexer.rs b/ilex/src/rt/lexer.rs index aa97450..d697a9d 100644 --- a/ilex/src/rt/lexer.rs +++ b/ilex/src/rt/lexer.rs @@ -7,10 +7,8 @@ use byteyarn::Yarn; use regex_automata::hybrid::dfa::Cache; use crate::f; -use crate::file::Context; use crate::file::File; use crate::file::Span; -use crate::file::SpanId; use crate::report::Builtins; use crate::report::Report; use crate::rt; @@ -34,7 +32,6 @@ pub struct Lexer<'a, 'ctx> { closers: Vec, comments: Vec, - eof: SpanId, cache: Cache, } @@ -64,7 +61,6 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> { closers: Vec::new(), comments: Vec::new(), - eof: file.span(file.len()..file.len()).intern(file.context()), cache: Cache::new(&spec.dfa().engine), } } @@ -112,8 +108,8 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> { } /// Returns the EOF span. - pub fn eof(&self) -> SpanId { - self.eof + pub fn eof(&self) -> Span { + self.file().span(self.file().len()..self.file().len()) } /// Creates a new range in the current file. @@ -121,11 +117,6 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> { self.file().span(range) } - /// Creates a new range in the current file and bakes it. - pub fn intern(&self, range: impl RangeBounds) -> SpanId { - self.file().span(range).intern(self.ctx()) - } - // Returns the span of the token at the given index. pub fn lookup_span(&self, idx: usize) -> Span { let end = self.stream.toks[idx].end as usize; @@ -136,11 +127,6 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> { self.file().span(start..end) } - /// Creates a new span in the current file with the given range. 
- pub fn ctx(&self) -> &'ctx Context { - self.file().context() - } - pub fn cache(&mut self) -> &mut Cache { &mut self.cache } diff --git a/ilex/src/rt/mod.rs b/ilex/src/rt/mod.rs index 420f3d2..f390ba8 100644 --- a/ilex/src/rt/mod.rs +++ b/ilex/src/rt/mod.rs @@ -3,7 +3,7 @@ use std::cell::Cell; use crate::file::File; -use crate::file::SpanId; +use crate::file::Span; use crate::report::Fatal; use crate::report::Report; use crate::rule; @@ -106,12 +106,37 @@ pub struct Digital { #[derive(Clone, Default)] pub struct DigitBlocks { - pub prefix: Option, - pub sign: Option<(Sign, SpanId)>, - pub blocks: Vec, + pub prefix: [u32; 2], + pub sign: Option<(Sign, [u32; 2])>, + pub blocks: Vec<[u32; 2]>, pub which_exp: usize, } +impl DigitBlocks { + pub fn prefix(&self, file: File) -> Option { + if self.prefix == [0, 0] { + return None; + } + Some(file.span(self.prefix[0] as usize..self.prefix[1] as usize)) + } + + pub fn sign(&self, file: File) -> Option { + self + .sign + .map(|(_, [a, b])| file.span(a as usize..b as usize)) + } + + pub fn blocks<'a>( + &'a self, + file: File<'a>, + ) -> impl Iterator + 'a { + self + .blocks + .iter() + .map(move |&[a, b]| file.span(a as usize..b as usize)) + } +} + pub const WHITESPACE: Lexeme = Lexeme::new(-1); pub const UNEXPECTED: Lexeme = Lexeme::new(-2); pub const PREFIX: Lexeme = Lexeme::new(-3); diff --git a/ilex/src/token/mod.rs b/ilex/src/token/mod.rs index 91bd055..f456627 100644 --- a/ilex/src/token/mod.rs +++ b/ilex/src/token/mod.rs @@ -23,7 +23,6 @@ use num_traits::Bounded; use crate::f; use crate::file::Context; use crate::file::Span; -use crate::file::SpanId; use crate::file::Spanned; use crate::fp; use crate::report::Report; @@ -616,13 +615,13 @@ impl<'lex> Digital<'lex> { /// Returns the span corresponding to [`Digital::sign()`]. pub fn sign_span(self) -> Option { - self.rt_blocks().sign.map(|(_, sp)| sp.span(self.context())) + self.rt_blocks().sign(self.file(self.context())) } /// Returns the point-separated digit chunks of this digital literal. pub fn digit_blocks(self) -> impl Iterator + 'lex { let ctx = self.context(); - self.digit_slice().iter().map(|s| s.span(ctx)) + self.rt_blocks().blocks(self.file(ctx)) } /// Returns the exponents of this digital literal, if it any. @@ -641,7 +640,7 @@ impl<'lex> Digital<'lex> { /// Returns this token's prefix. pub fn prefix(self) -> Option { if self.idx > 0 { - return self.rt_blocks().prefix.map(|s| s.span(self.context())); + return self.rt_blocks().prefix(self.file(self.context())); } self.stream.lookup_prefix(self.id) @@ -819,10 +818,6 @@ impl<'lex> Digital<'lex> { } } - fn digit_slice(self) -> &'lex [SpanId] { - &self.rt_blocks().blocks - } - fn rt_blocks(&self) -> &'lex DigitBlocks { if self.idx == 0 { return &self.meta.digits; @@ -949,7 +944,8 @@ impl fmt::Debug for Digital<'_> { let mut f = f.debug_struct("Digital"); f.field("span", &self.span(self.context())) .field("radix", &self.radix()) - .field("digits", &self.digit_slice()); + // TODO: Get rid of this collect. 
+ .field("digits", &self.digit_blocks().collect::>()); if let Some(sign) = self.sign() { f.field("sign", &sign); From 2de3a92721d6e861c9bb060910884bbd692d6276 Mon Sep 17 00:00:00 2001 From: Miguel Young de la Sota Date: Wed, 1 Jan 2025 15:59:44 -0800 Subject: [PATCH 6/9] ilex: Make spans remember their file --- ilex/src/file/context.rs | 36 ++++---- ilex/src/file/mod.rs | 150 +++++++++++++++++++++------------- ilex/src/fp.rs | 23 +++--- ilex/src/report/builtin.rs | 62 +++++++------- ilex/src/report/diagnostic.rs | 24 ++++-- ilex/src/report/render.rs | 9 +- ilex/src/rt/emit2.rs | 90 ++++++++++---------- ilex/src/rt/lexer.rs | 8 +- ilex/src/rt/mod.rs | 30 +++---- ilex/src/testing/mod.rs | 7 +- ilex/src/testing/recognize.rs | 17 ++-- ilex/src/token/mod.rs | 142 +++++++++++++++----------------- ilex/src/token/stream.rs | 5 +- ilex/tests/json.rs | 6 +- 14 files changed, 321 insertions(+), 288 deletions(-) diff --git a/ilex/src/file/context.rs b/ilex/src/file/context.rs index afef1b1..97e1103 100644 --- a/ilex/src/file/context.rs +++ b/ilex/src/file/context.rs @@ -3,7 +3,6 @@ use std::sync::Arc; use std::sync::RwLock; use camino::Utf8Path; -use camino::Utf8PathBuf; use crate::f; use crate::file::File; @@ -24,9 +23,12 @@ pub struct Context { #[derive(Default)] pub struct State { + // Each file is laid out as the length of the text, followed by the text data, + // followed by the path. + // // TODO(mcyoung): Be smarter about this and use something something concurrent // vector? We don't need to have all this stuff behind a lock I think. - files: Vec<(Utf8PathBuf, String)>, + files: Vec<(usize, String)>, } unsafe impl Send for Context {} @@ -76,19 +78,21 @@ impl Context { } /// Adds a new file to this source context. - pub fn new_file( + pub fn new_file<'a>( &self, - name: impl Into, + path: impl Into<&'a Utf8Path>, text: impl Into, ) -> File { let mut text = text.into(); text.push(' '); // This space only exists to be somewhere for an EOF span // to point to in diagnostics; user code will never see // it. + let len = text.len(); + text.push_str(path.into().as_str()); let idx = { let mut state = self.state.write().unwrap(); - state.files.push((name.into(), text)); + state.files.push((len, text)); state.files.len() - 1 }; @@ -97,40 +101,40 @@ impl Context { /// Adds a new file to this source context by opening `name` and reading it /// from the file system. - pub fn open_file( + pub fn open_file<'a>( &self, - name: impl Into, + path: impl Into<&'a Utf8Path>, report: &Report, ) -> Result { - let name = name.into(); + let path = path.into(); - let bytes = match fs::read(&name) { + let bytes = match fs::read(path) { Ok(bytes) => bytes, Err(e) => { - report.error(f!("could not open input file `{name}`: {e}")); + report.error(f!("could not open input file `{path}`: {e}")); return report.fatal(); } }; let Ok(utf8) = String::from_utf8(bytes) else { - report.error(f!("input file `{name}` was not valid UTF-8")); + report.error(f!("input file `{path}` was not valid UTF-8")); return report.fatal(); }; - Ok(self.new_file(name, utf8)) + Ok(self.new_file(path, utf8)) } /// Gets the `idx`th file in this source context. pub fn file(&self, idx: usize) -> Option { let state = self.state.read().unwrap(); - let (path, text) = state.files.get(idx)?; - let (path, text) = unsafe { + let (len, text) = state.files.get(idx)?; + let text = unsafe { // SAFETY: The pointer to the file's text is immutable and pointer-stable, // so we can safely extend its lifetime here. 
- (&*(path.as_path() as *const Utf8Path), &*(text.as_str() as *const str)) + &*(text.as_str() as *const str) }; - Some(File { path, text, ctx: self, idx }) + Some(File { len: *len, text, ctx: self, idx }) } /// Gets the number of files currently tracked by this source context. diff --git a/ilex/src/file/mod.rs b/ilex/src/file/mod.rs index f33fc98..1d6d564 100644 --- a/ilex/src/file/mod.rs +++ b/ilex/src/file/mod.rs @@ -23,7 +23,7 @@ pub use context::Context; /// An input source file. #[derive(Copy, Clone)] pub struct File<'ctx> { - path: &'ctx Utf8Path, + len: usize, text: &'ctx str, ctx: &'ctx Context, idx: usize, @@ -32,7 +32,7 @@ pub struct File<'ctx> { impl<'ctx> File<'ctx> { /// Returns the name of this file, as a path. pub fn path(self) -> &'ctx Utf8Path { - self.path + self.text[self.len..].into() } /// Returns the textual contents of this file. This function takes a range, @@ -48,7 +48,7 @@ impl<'ctx> File<'ctx> { // // XXX: Apparently rustc forgets about other impls if we use // text[..x] here?? - let text = &self.text.get(..self.text.len() - 1).unwrap(); + let text = &self.text.get(..self.len - 1).unwrap(); &text[range] } @@ -59,7 +59,7 @@ impl<'ctx> File<'ctx> { } pub(crate) fn text_with_extra_space(self) -> &'ctx str { - self.text + &self.text[..self.len] } /// Returns the [`Context`] that owns this file. @@ -73,7 +73,7 @@ impl<'ctx> File<'ctx> { /// /// Panics if `start > end`, or if `end` is greater than the length of the /// file. - pub fn span(self, range: impl RangeBounds) -> Span { + pub fn span(self, range: impl RangeBounds) -> Span<'ctx> { Span::new(self, range) } @@ -103,13 +103,36 @@ impl PartialEq for File<'_> { /// so anything that implements [`Spanned`] is suitable for placing spanned data /// in diagnostics. #[derive(Copy, Clone)] -pub struct Span { - file: u32, +pub struct Span<'ctx> { + file: File<'ctx>, start: u32, end: u32, } -impl Span { +// A compressed version of a span that only remembers the start/end. +#[derive(Clone, Copy, Default, PartialEq, Eq)] +pub struct Span2(u32, u32); + +impl Span2 { + pub fn get(self, file: File) -> Span { + file.span(self.0 as usize..self.1 as usize) + } +} + +// A compressed version of a span that remembers the start, end, and file. +#[derive(Clone, Copy)] +pub struct Span3(u32, u32, u32); + +impl Span3 { + pub fn get(self, ctx: &Context) -> Span { + ctx + .file(self.0 as usize) + .unwrap() + .span(self.1 as usize..self.2 as usize) + } +} + +impl<'ctx> Span<'ctx> { /// Constructs a span from a file and a byte range within it. /// /// # Panics @@ -118,7 +141,7 @@ impl Span { /// file. #[track_caller] pub(crate) fn new + fmt::Debug>( - file: File, + file: File<'ctx>, range: impl RangeBounds, ) -> Span { let start = match range.start_bound() { @@ -140,7 +163,15 @@ impl Span { file.text.len(), ); - Span { file: file.idx() as u32, start, end } + Span { file, start, end } + } + + pub(crate) fn span2(self) -> Span2 { + Span2(self.start, self.end) + } + + pub(crate) fn span3(self) -> Span3 { + Span3(self.file.idx() as u32, self.start, self.end) } /// Gets the file for this span. @@ -149,8 +180,8 @@ impl Span { /// /// May panic if this span is not owned by `ctx` (or it may produce an /// unexpected result). - pub fn file(self, ctx: &Context) -> File { - ctx.file(self.file as usize).unwrap() + pub fn file(self) -> File<'ctx> { + self.file } /// Returns the start (inclusive) byte offset of this span. 
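The redefinition of `Span` above gives every span an inline `File<'ctx>`,
so the `&Context` parameter that used to be threaded through `file()`,
`text()`, and the `Spanned` trait (below) disappears. The shape this lands
on, as a minimal sketch with simplified types (assumed for illustration;
the real `File` also stores its `Context` and index, and `Span::new` keeps
its bounds checks):

#[derive(Copy, Clone)]
struct File<'ctx> {
  text: &'ctx str,
}

#[derive(Copy, Clone)]
struct Span<'ctx> {
  file: File<'ctx>,
  start: u32,
  end: u32,
}

impl<'ctx> Span<'ctx> {
  // With the file inline, no context parameter is needed to read the text.
  fn text(self) -> &'ctx str {
    &self.file.text[self.start as usize..self.end as usize]
  }
}

fn main() {
  let file = File { text: "fn main() {}" };
  let span = Span { file, start: 3, end: 7 };
  assert_eq!(span.text(), "main");
}

Since `File<'ctx>` is `Copy`, spans stay cheap to pass by value, and the
`'ctx` lifetime statically prevents a span from outliving the context that
owns its text.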
@@ -181,7 +212,7 @@ impl Span { pub fn subspan + fmt::Debug>( self, range: impl RangeBounds, - ) -> Span { + ) -> Self { let start = match range.start_bound() { Bound::Included(&x) => cast(x), Bound::Excluded(&x) => cast(x).saturating_add(1), @@ -212,7 +243,7 @@ impl Span { /// /// # Panics /// /// Panics if `at` is larger than the length of this range. - pub fn split_at(self, at: usize) -> (Span, Span) { + pub fn split_at(self, at: usize) -> (Self, Self) { (self.subspan(..at), self.subspan(at..)) } @@ -222,7 +253,7 @@ impl Span { /// # Panics /// /// Panics if `range` is smaller than `pre + suf`. - pub fn split_around(self, pre: usize, suf: usize) -> [Span; 3] { + pub fn split_around(self, pre: usize, suf: usize) -> [Self; 3] { let (pre, range) = self.split_at(pre); let (range, suf) = range.split_at(range.len() - suf); [pre, range, suf] @@ -234,8 +265,8 @@ impl Span { /// /// May panic if this range is not owned by `ctx` (or it may produce an /// unexpected result). - pub fn text(self, ctx: &Context) -> &str { - self.file(ctx).text(self.start as usize..self.end as usize) + pub fn text(self) -> &'ctx str { + self.file().text(self.start as usize..self.end as usize) } /// Joins together a collection of ranges. @@ -244,7 +275,7 @@ impl Span { /// /// May panic if not all spans are for the same file, or if the iterator /// is empty. - pub fn union(ranges: impl IntoIterator) -> Span { + pub fn union(ranges: impl IntoIterator) -> Self { let mut best = None; for range in ranges { @@ -267,56 +298,56 @@ impl Span { /// /// You should implement this type for any type which naturally has a single /// span that describes it. -pub trait Spanned { +pub trait Spanned<'ctx> { /// Returns the span in this syntax element. - fn span(&self, ctx: &Context) -> Span; + fn span(&self) -> Span<'ctx>; /// Forwards to [`SpanId::file()`]. - fn file<'ctx>(&self, ctx: &'ctx Context) -> File<'ctx> { - self.span(ctx).file(ctx) + fn file(&self) -> File<'ctx> { + self.span().file() } /// Forwards to [`Span::start()`]. - fn start(&self, ctx: &Context) -> usize { - self.span(ctx).start() + fn start(&self) -> usize { + self.span().start() } /// Forwards to [`Span::end()`]. - fn end(&self, ctx: &Context) -> usize { - self.span(ctx).end() + fn end(&self) -> usize { + self.span().end() } /// Forwards to [`Span::is_empty()`]. - fn is_empty(&self, ctx: &Context) -> bool { - self.span(ctx).is_empty() + fn is_empty(&self) -> bool { + self.span().is_empty() } /// Forwards to [`Span::len()`]. - fn len(&self, ctx: &Context) -> usize { - self.span(ctx).len() + fn len(&self) -> usize { + self.span().len() } /// Forwards to [`SpanId::text()`]. - fn text<'ctx>(&self, ctx: &'ctx Context) -> &'ctx str { - self.span(ctx).text(ctx) + fn text(&self) -> &'ctx str { + self.span().text() } } // Spans are spanned by their own spans. -impl Spanned for Span { - fn span(&self, _ctx: &Context) -> Span { +impl<'ctx> Spanned<'ctx> for Span<'ctx> { + fn span(&self) -> Span<'ctx> { *self } } -impl Spanned for &S { - fn span(&self, ctx: &Context) -> Span { - S::span(self, ctx) +impl<'ctx, S: Spanned<'ctx>> Spanned<'ctx> for &S { + fn span(&self) -> Span<'ctx> { + S::span(self) } } -impl Spanned for Never { - fn span(&self, _ctx: &Context) -> Span { +impl<'ctx> Spanned<'ctx> for Never { + fn span(&self) -> Span<'ctx> { self.from_nothing_anything() } } @@ -325,28 +356,33 @@ thread_local! 
{ static CTX_FOR_SPAN_DEBUG: RefCell> = RefCell::new(None); } -impl fmt::Debug for Span { +impl fmt::Debug for File<'_> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "File({})", self.path()) + } +} + +impl fmt::Debug for Span<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - CTX_FOR_SPAN_DEBUG.with(|ctx| { - if let Some(ctx) = &*ctx.borrow() { - let text = self.text(ctx); - write!(f, "`")?; - for c in text.chars() { - if ('\x20'..'\x7e').contains(&c) { - f.write_char(c)?; - } else if c < '\x20' { - write!(f, "{}", c.escape_debug())? - } else { - write!(f, "", c as u32)?; - } - } - write!(f, "` @ {}", self.file(ctx).path())?; + write!(f, "`")?; + for c in self.text().chars() { + if ('\x20'..'\x7e').contains(&c) { + f.write_char(c)?; + } else if c < '\x20' { + write!(f, "{}", c.escape_debug())? } else { - write!(f, "<#{}>", self.file)?; + write!(f, "", c as u32)?; } + } + write!(f, "` @ {}", self.file.path())?; + + write!(f, "[{}..{}]", Span::start(*self), Span::end(*self)) + } +} - write!(f, "[{}..{}]", Span::start(*self), Span::end(*self)) - }) +impl fmt::Display for Span<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.text()) } } diff --git a/ilex/src/fp.rs b/ilex/src/fp.rs index e29125b..07d70b7 100644 --- a/ilex/src/fp.rs +++ b/ilex/src/fp.rs @@ -521,7 +521,7 @@ impl Digital<'_> { let mut int_digits = 0i64; let mut frac_digits = 0i64; for (span, digits) in [(int, &mut int_digits), (frac, &mut frac_digits)] { - let Some(mut text) = span.map(|s| s.text(ctx)) else { + let Some(mut text) = span.map(|s| s.text()) else { continue; }; while let Some(c) = text.chars().next() { @@ -554,7 +554,7 @@ impl Digital<'_> { } if let Some(exp) = exp { - let mut text = exp.digit_blocks().next().unwrap().text(ctx); + let mut text = exp.digit_blocks().next().unwrap().text(); while let Some(c) = text.chars().next() { if let Some(suf) = text.strip_prefix(rule.separator.as_str()) { text = suf; @@ -599,7 +599,7 @@ impl Digital<'_> { tok.sign().is_none() || tok.sign().is_some_and(|s| { matches!( - (tok.sign_span().unwrap().text(ctx), s), + (tok.sign_span().unwrap().text(), s), ("+", Sign::Pos) | ("-", Sign::Neg) ) }) @@ -617,13 +617,12 @@ impl Digital<'_> { && has_ordinary_sign(ctx, &exp) })) && (rule.separator.is_empty() - || !self.text(ctx).contains(rule.separator.as_str())) + || !self.text().contains(rule.separator.as_str())) { - let text = self.text(ctx); + let text = self.text(); Fp::__parse( - &text[self.prefix().map(|s| s.text(ctx).len()).unwrap_or(0) - ..text.len() - - self.suffix().map(|s| s.text(ctx).len()).unwrap_or(0)], + &text[self.prefix().map(|s| s.text().len()).unwrap_or(0) + ..text.len() - self.suffix().map(|s| s.text().len()).unwrap_or(0)], ) } else { // Since the fast paths have failed us, we need to construct a suitable @@ -632,7 +631,7 @@ impl Digital<'_> { let buf = (|| { use std::fmt::Write; - let mut buf = String::with_capacity(self.text(ctx).len()); + let mut buf = String::with_capacity(self.text().len()); if self.is_negative() { buf.push('-'); } @@ -640,12 +639,12 @@ impl Digital<'_> { let _ = write!( buf, "{}", - u64::from_radix(int.unwrap().text(ctx), 10, &rule.separator)? + u64::from_radix(int.unwrap().text(), 10, &rule.separator)? 
); if let Some(frac) = frac { let sep = rule.separator.as_str(); - let mut frac = frac.text(ctx); + let mut frac = frac.text(); let mut lz = 0; loop { let start_len = frac.len(); @@ -688,7 +687,7 @@ impl Digital<'_> { _ => '+', }, u64::from_radix( - exp.digit_blocks().next().unwrap().text(ctx), + exp.digit_blocks().next().unwrap().text(), exp.radix(), &rule.separator )?, diff --git a/ilex/src/report/builtin.rs b/ilex/src/report/builtin.rs index 4ff5102..31c21d2 100644 --- a/ilex/src/report/builtin.rs +++ b/ilex/src/report/builtin.rs @@ -29,12 +29,11 @@ pub struct Builtins<'a> { impl Builtins<'_> { /// Generates an "unexpected" diagnostic. #[track_caller] - pub fn unexpected<'a, 'b>( + pub fn unexpected<'a, 'b, 's>( &self, - found: impl Into>, unexpected_in: impl Into>, - at: impl Spanned, + at: impl Spanned<'s>, ) -> Diagnostic { let found = found.into(); @@ -52,9 +51,12 @@ impl Builtins<'_> { } #[track_caller] - pub(crate) fn unexpected_token(&self, at: impl Spanned) -> Diagnostic { - let at = at.span(&self.report.ctx); - let found = at.text(&self.report.ctx); + pub(crate) fn unexpected_token<'s>( + &self, + at: impl Spanned<'s>, + ) -> Diagnostic { + let at = at.span(); + let found = at.text(); let diagnostic = self .report @@ -66,13 +68,13 @@ impl Builtins<'_> { } #[track_caller] - pub(crate) fn extra_chars<'a>( + pub(crate) fn extra_chars<'a, 's>( &self, unexpected_in: impl Into>, - at: impl Spanned, + at: impl Spanned<'s>, ) -> Diagnostic { - let at = at.span(&self.report.ctx); - let found = at.text(&self.report.ctx); + let at = at.span(); + let found = at.text(); let diagnostic = self .report @@ -83,7 +85,7 @@ impl Builtins<'_> { )) .at(at) .remark( - at.file(&self.report.ctx) + at.file() .span(at.start().saturating_sub(1)..at.start().saturating_add(1)), "maybe you meant to include a space here", ) @@ -95,12 +97,12 @@ impl Builtins<'_> { /// Generates an "expected one of these tokens but got something else" /// diagnostic. #[track_caller] - pub fn expected<'a, 'b, E: Into>>( + pub fn expected<'a, 'b, 's, E: Into>>( &self, expected: impl IntoIterator, found: impl Into>, - at: impl Spanned, + at: impl Spanned<'s>, ) -> Diagnostic { let expected = expected.into_iter().map(Into::into).collect::>(); let alts = disjunction_to_string(self.spec, &expected); @@ -121,12 +123,11 @@ impl Builtins<'_> { /// Generates an "unopened delimiter" diagnostic, for when a delimiter is /// not opened before expected. #[track_caller] - pub(crate) fn unopened<'a>( + pub(crate) fn unopened<'a, 's>( &self, - expected: &str, found: impl Into>, - at: impl Spanned, + at: impl Spanned<'s>, ) -> Diagnostic { let found = found.into(); @@ -142,13 +143,13 @@ impl Builtins<'_> { /// Generates an "unclosed delimiter" diagnostic, for when a delimiter is /// not closed before expected. #[track_caller] - pub(crate) fn unclosed<'a>( + pub(crate) fn unclosed<'a, 's1, 's2>( &self, - open: impl Spanned, + open: impl Spanned<'s1>, expected: &str, found: impl Into>, - at: impl Spanned, + at: impl Spanned<'s2>, ) -> Diagnostic { let found = found.into(); @@ -168,11 +169,10 @@ impl Builtins<'_> { /// Generates an "unclosed delimiter" diagnostic, for when a delimiter is /// not closed before expected. 
#[track_caller] - pub(crate) fn non_ascii_in_ident<'a>( + pub(crate) fn non_ascii_in_ident<'a, 's>( &self, - expected: impl Into>, - at: impl Spanned, + at: impl Spanned<'s>, ) -> Diagnostic { self .report @@ -185,11 +185,11 @@ impl Builtins<'_> { } #[track_caller] - pub(crate) fn ident_too_small( + pub(crate) fn ident_too_small<'s>( &self, min_len: usize, actual: usize, - at: impl Spanned, + at: impl Spanned<'s>, ) -> Diagnostic { let diagnostic = self .report @@ -210,27 +210,25 @@ impl Builtins<'_> { /// Generates an "invalid escape sequence" diagnostic. #[track_caller] - pub fn invalid_escape( + pub fn invalid_escape<'s>( &self, - at: impl Spanned, + at: impl Spanned<'s>, why: impl fmt::Display, ) -> Diagnostic { - let at = at.span(&self.report.ctx); - let seq = at.text(&self.report.ctx); + let at = at.span(); self .report - .error(f!("found an invalid escape sequence: `{seq}`")) + .error(f!("found an invalid escape sequence: `{at}`")) .saying(at, why) .reported_at(Location::caller()) } /// Generates a "numeric literal overflowed" diagnostic. #[track_caller] - pub fn literal_out_of_range<'a, N: fmt::Display>( + pub fn literal_out_of_range<'a, 's, N: fmt::Display>( &self, - what: impl Into>, - at: impl Spanned, + at: impl Spanned<'s>, span: &impl RangeBounds, min: &dyn fmt::Display, max: &dyn fmt::Display, diff --git a/ilex/src/report/diagnostic.rs b/ilex/src/report/diagnostic.rs index 9ec1153..41f9071 100644 --- a/ilex/src/report/diagnostic.rs +++ b/ilex/src/report/diagnostic.rs @@ -2,7 +2,7 @@ use std::fmt; use std::mem; use std::panic; -use crate::file::Span; +use crate::file; use crate::file::Spanned; use crate::report::Report; @@ -35,7 +35,7 @@ pub use annotate_snippets::AnnotationType as Kind; pub struct Info { pub kind: Kind, pub message: String, - pub snippets: Vec>, + pub snippets: Vec>, pub notes: Vec<(String, Kind)>, pub reported_at: Option<&'static panic::Location<'static>>, } @@ -70,24 +70,32 @@ impl Diagnostic { } /// Adds a new relevant snippet at the given location. - pub fn at(self, span: impl Spanned) -> Self { + pub fn at<'s>(self, span: impl Spanned<'s>) -> Self { self.saying(span, "") } /// Adds a new diagnostic location, with the given message attached to it. - pub fn saying(self, span: impl Spanned, message: impl fmt::Display) -> Self { + pub fn saying<'s>( + self, + span: impl Spanned<'s>, + message: impl fmt::Display, + ) -> Self { self.snippet(span, message, None) } /// Like `saying`, but the underline is as for a "note" rather than the /// overall diagnostic. 
diff --git a/ilex/src/report/diagnostic.rs b/ilex/src/report/diagnostic.rs
index 9ec1153..41f9071 100644
--- a/ilex/src/report/diagnostic.rs
+++ b/ilex/src/report/diagnostic.rs
@@ -2,7 +2,7 @@ use std::fmt;
 use std::mem;
 use std::panic;

-use crate::file::Span;
+use crate::file;
 use crate::file::Spanned;
 use crate::report::Report;

@@ -35,7 +35,7 @@ pub use annotate_snippets::AnnotationType as Kind;
 pub struct Info {
   pub kind: Kind,
   pub message: String,
-  pub snippets: Vec<Vec<(Span, String, Kind)>>,
+  pub snippets: Vec<Vec<(file::Span2, String, Kind)>>,
   pub notes: Vec<(String, Kind)>,
   pub reported_at: Option<&'static panic::Location<'static>>,
 }
@@ -70,24 +70,32 @@ impl Diagnostic {
   }

   /// Adds a new relevant snippet at the given location.
-  pub fn at(self, span: impl Spanned) -> Self {
+  pub fn at<'s>(self, span: impl Spanned<'s>) -> Self {
     self.saying(span, "")
   }

   /// Adds a new diagnostic location, with the given message attached to it.
-  pub fn saying(self, span: impl Spanned, message: impl fmt::Display) -> Self {
+  pub fn saying<'s>(
+    self,
+    span: impl Spanned<'s>,
+    message: impl fmt::Display,
+  ) -> Self {
     self.snippet(span, message, None)
   }

   /// Like `saying`, but the underline is as for a "note" rather than the
   /// overall diagnostic.
-  pub fn remark(self, span: impl Spanned, message: impl fmt::Display) -> Self {
+  pub fn remark<'s>(
+    self,
+    span: impl Spanned<'s>,
+    message: impl fmt::Display,
+  ) -> Self {
     self.snippet(span, message, Some(Kind::Help))
   }

-  fn snippet(
+  fn snippet<'s>(
     mut self,
-    span: impl Spanned,
+    span: impl Spanned<'s>,
     message: impl fmt::Display,
     kind: Option<Kind>,
   ) -> Self {
@@ -96,7 +104,7 @@ impl Diagnostic {
     }

     self.info.snippets.last_mut().unwrap().push((
-      span.span(&self.report.ctx),
+      span.span().span2(),
       message.to_string(),
       kind.unwrap_or(self.info.kind),
     ));
diff --git a/ilex/src/report/render.rs b/ilex/src/report/render.rs
index c942af9..e7e843a 100644
--- a/ilex/src/report/render.rs
+++ b/ilex/src/report/render.rs
@@ -134,8 +134,9 @@ pub fn render_fmt(
   let mut cur_file = None;
   let mut cur_slice = None::<Slice>;
   let mut has_eof = false;
-  for (range, text, kind) in snips {
-    let file = range.file(&report.ctx);
+  for (span, text, kind) in snips {
+    let span = span.get(&report.ctx);
+    let file = span.file();
     if cur_file != Some(file) {
       cur_file = Some(file);
       if let Some(mut slice) = cur_slice.take() {
@@ -155,8 +156,8 @@ pub fn render_fmt(
     }
     let slice = cur_slice.as_mut().unwrap();

-    let mut start = range.start();
-    let mut end = range.end();
+    let mut start = span.start();
+    let mut end = span.end();

     // Ensure that all ranges have length at least one, and try to get them
     // to point just after non-whitespace.
diff --git a/ilex/src/rt/emit2.rs b/ilex/src/rt/emit2.rs
index cb37764..4becb6f 100644
--- a/ilex/src/rt/emit2.rs
+++ b/ilex/src/rt/emit2.rs
@@ -8,6 +8,7 @@ use byteyarn::YarnBox;
 use crate::f;
 use crate::file::Context;
 use crate::file::Span;
+use crate::file::Span2;
 use crate::plural;
 use crate::report::Expected;
 use crate::rt;
@@ -37,7 +38,7 @@ pub fn emit(lexer: &mut Lexer) {
   let start = lexer.cursor();
   let end = start + match_.len;
   let span = lexer.span(start..end);
-  let text = span.text(ctx);
+  let text = span.text();
   let end = end + match_.extra;

   // Now we have to decide which of `candidates` is the best one, i.e.,
@@ -82,12 +83,12 @@ pub fn emit(lexer: &mut Lexer) {
           };

           let [_, name, _] = find_affixes(range, &ident_rule.affixes, ctx);
-          if name.text(ctx).chars().count() < ident_rule.min_len {
+          if name.text().chars().count() < ident_rule.min_len {
             continue 'verify;
           }

           if ident_rule.ascii_only {
-            for c in name.text(ctx).chars() {
+            for c in name.text().chars() {
               if !c.is_ascii()
                 && !ident_rule.extra_continues.contains(c)
                 && !ident_rule.extra_starts.contains(c)
@@ -195,7 +196,7 @@ pub fn emit(lexer: &mut Lexer) {
   let best = best.unwrap_or(match_.candidates[0]);
   let [sign_span, prefix, range, suffix] =
     find_affixes_partial(span, lexer.spec(), best, ctx);
-  let text = range.text(ctx);
+  let text = range.text();

   let mirrored = match lexer.spec().rule(best.lexeme) {
     Any::Bracket(bracket)
@@ -208,7 +209,7 @@ pub fn emit(lexer: &mut Lexer) {
           if !best.is_close { (open, close) } else { (close, open) };

         let [_, mid, _] = range.split_around(remove.0.len(), remove.1.len());
-        Some(yarn!("{}{}{}", replace.0, mid.text(ctx), replace.1))
+        Some(yarn!("{}{}{}", replace.0, mid.text(), replace.1))
       }
       BracketKind::CxxLike { ident_rule, open, close, ..
} => { let (remove, replace) = @@ -217,7 +218,7 @@ pub fn emit(lexer: &mut Lexer) { let [_, mid, _] = range.split_around(remove.0.len(), remove.1.len()); let [_, name, _] = find_affixes(mid, &ident_rule.affixes, ctx); - let text = name.text(ctx); + let text = name.text(); let count = text.chars().count(); if count < ident_rule.min_len { lexer @@ -235,7 +236,7 @@ pub fn emit(lexer: &mut Lexer) { } } - Some(yarn!("{}{}{}", replace.0, mid.text(ctx), replace.1)) + Some(yarn!("{}{}{}", replace.0, mid.text(), replace.1)) } }, _ => None, @@ -342,7 +343,7 @@ pub fn emit(lexer: &mut Lexer) { ); lexer.add_token(rt::SUFFIX, suffix.len(), None); - let sign_text = sign_span.text(ctx); + let sign_text = sign_span.text(); let sign = (!sign_text.is_empty()).then(|| { let Some((_, value)) = rule.mant.signs.iter().find(|(text, _)| text == sign_text) @@ -350,18 +351,18 @@ pub fn emit(lexer: &mut Lexer) { bug!("could not find appropriate sign for Digital rule"); }; - (*value, [sign_span.start() as u32, sign_span.end() as u32]) + (*value, sign_span.span2()) }); let mut chunks = vec![DigitBlocks { - prefix: [0, 0], + prefix: Span2::default(), sign, blocks: Vec::new(), which_exp: !0, }]; if !prefix.is_empty() { - chunks[0].prefix = [prefix.start() as u32, prefix.end() as u32]; + chunks[0].prefix = prefix.span2(); } let mut offset = 0; @@ -410,10 +411,9 @@ pub fn emit(lexer: &mut Lexer) { ); } - chunk.blocks.push([ - (range.start() + block_start) as u32, - (range.start() + offset) as u32, - ]); + chunk + .blocks + .push(range.subspan(block_start..offset).span2()); text = rest; offset += rule.point.len(); block_start = offset; @@ -431,10 +431,9 @@ pub fn emit(lexer: &mut Lexer) { ); } - chunk.blocks.push([ - (range.start() + block_start) as u32, - (range.start() + offset) as u32, - ]); + chunk + .blocks + .push(range.subspan(block_start..offset).span2()); let prefix = range.subspan(offset..offset + pre.len()); text = rest; @@ -446,25 +445,21 @@ pub fn emit(lexer: &mut Lexer) { .filter(|(y, _)| rest.starts_with(y.as_str())) .max_by_key(|(y, _)| y.len()) .map(|(y, s)| { - let sign = [ - (range.start() + offset) as u32, - (range.start() + offset + y.len()) as u32, - ]; + let sign = range.subspan(offset..offset + y.len()); text = &text[y.len()..]; offset += y.len(); - (*s, sign) + (*s, sign.span2()) }); chunks.push(DigitBlocks { - prefix: [0, 0], + prefix: Span2::default(), sign, blocks: Vec::new(), which_exp: i, }); if !prefix.is_empty() { - chunks.last_mut().unwrap().prefix = - [prefix.start() as u32, prefix.end() as u32]; + chunks.last_mut().unwrap().prefix = prefix.span2(); } digits = exp; @@ -490,7 +485,7 @@ pub fn emit(lexer: &mut Lexer) { .last_mut() .unwrap() .blocks - .push([(range.start() + block_start) as u32, range.end() as u32]); + .push(range.subspan(block_start..).span2()); let mant = chunks.remove(0); let Some(rt::Kind::Digital(meta)) = lexer @@ -538,7 +533,7 @@ pub fn emit(lexer: &mut Lexer) { } for block in chunk.blocks(lexer.file()) { - let mut text = block.text(ctx); + let mut text = block.text(); // FIXME: The is_some() here should not be necessary. 
            if range.is_empty() && chunk.prefix(lexer.file()).is_some() {
@@ -546,10 +541,7 @@ pub fn emit(lexer: &mut Lexer) {
              lexer
                .builtins()
                .expected(
-                  [Expected::Name(yarn!(
-                    "digits after `{}`",
-                    prefix.text(ctx),
-                  ))],
+                  [Expected::Name(yarn!("digits after `{}`", prefix.text(),))],
                  match lexer.text(range.end()..).chars().next() {
                    Some(c) => Expected::Literal(Yarn::from(c)),
                    None => Expected::Lexeme(Lexeme::eof().any()),
@@ -793,14 +785,14 @@ pub fn emit(lexer: &mut Lexer) {
 }

 /// Extracts the affixes from `text`.
-fn find_affixes_partial(
-  range: Span,
+fn find_affixes_partial<'a>(
+  range: Span<'a>,
  spec: &Spec,
  best: Lexeme2,
  ctx: &Context,
-) -> [Span; 4] {
-  let text = range.text(ctx);
-  let ep = range.file(ctx).span(0..0);
+) -> [Span<'a>; 4] {
+  let text = range.text();
+  let ep = range.file().span(0..0);
  match spec.rule(best.lexeme) {
    Any::Ident(rule) => {
      let [pre, range, suf] = find_affixes(range, &rule.affixes, ctx);
@@ -833,14 +825,22 @@ fn find_affixes_partial(
 }

 /// Extracts the affixes from `text`.
-fn find_affixes(range: Span, affixes: &Affixes, ctx: &Context) -> [Span; 3] {
+fn find_affixes<'a>(
+  range: Span<'a>,
+  affixes: &Affixes,
+  ctx: &Context,
+) -> [Span<'a>; 3] {
  let (prefix, range) = find_prefix(range, affixes, ctx);
  let (range, suffix) = find_suffix(range, affixes, ctx);
  [prefix, range, suffix]
 }

-fn find_prefix(range: Span, affixes: &Affixes, ctx: &Context) -> (Span, Span) {
-  let text = range.text(ctx);
+fn find_prefix<'a>(
+  range: Span<'a>,
+  affixes: &Affixes,
+  ctx: &Context,
+) -> (Span<'a>, Span<'a>) {
+  let text = range.text();
  let prefix = affixes
    .prefixes()
    .iter()
@@ -851,8 +851,12 @@ fn find_prefix(range: Span, affixes: &Affixes, ctx: &Context) -> (Span, Span) {
  range.split_at(prefix)
 }

-fn find_suffix(range: Span, affixes: &Affixes, ctx: &Context) -> (Span, Span) {
-  let text = range.text(ctx);
+fn find_suffix<'a>(
+  range: Span<'a>,
+  affixes: &Affixes,
+  ctx: &Context,
+) -> (Span<'a>, Span<'a>) {
+  let text = range.text();
  let suffix = affixes
    .suffixes()
    .iter()
diff --git a/ilex/src/rt/lexer.rs b/ilex/src/rt/lexer.rs
index d697a9d..165427c 100644
--- a/ilex/src/rt/lexer.rs
+++ b/ilex/src/rt/lexer.rs
@@ -108,17 +108,17 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> {
  }

  /// Returns the EOF span.
-  pub fn eof(&self) -> Span {
+  pub fn eof(&self) -> Span<'ctx> {
    self.file().span(self.file().len()..self.file().len())
  }

  /// Creates a new range in the current file.
-  pub fn span(&self, range: impl RangeBounds<usize>) -> Span {
+  pub fn span(&self, range: impl RangeBounds<usize>) -> Span<'ctx> {
    self.file().span(range)
  }

  // Returns the span of the token at the given index.
-  pub fn lookup_span(&self, idx: usize) -> Span {
+  pub fn lookup_span(&self, idx: usize) -> Span<'ctx> {
    let end = self.stream.toks[idx].end as usize;
    let start = self.stream.toks[..idx]
      .last()
@@ -188,7 +188,7 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> {
      self.builtins().extra_chars(
        self.spec().rule_name_or(
          close.lexeme.any(),
-          f!("{} ... {}", open_sp.text(self.file().context()), close.close),
+          f!("{} ... {}", open_sp, close.close),
        ),
        span,
      );
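Because a `Span<'ctx>` can reach its file and text by itself, it can also be formatted directly, which is what lets the `f!` invocation above interpolate `open_sp` without first fetching its text. A small sketch of the assumed behavior (`lexer` and the file contents are hypothetical):

    let span = lexer.span(0..5);  // Span<'ctx>; no Context argument needed.
    let text = span.text();       // Borrows from the context, not the lexer.
    let label = f!("unexpected `{span}`"); // Display is assumed to render the text.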
{}", open_sp, close.close), ), span, ); diff --git a/ilex/src/rt/mod.rs b/ilex/src/rt/mod.rs index f390ba8..80afcb6 100644 --- a/ilex/src/rt/mod.rs +++ b/ilex/src/rt/mod.rs @@ -4,6 +4,7 @@ use std::cell::Cell; use crate::file::File; use crate::file::Span; +use crate::file::Span2; use crate::report::Fatal; use crate::report::Report; use crate::rule; @@ -106,34 +107,29 @@ pub struct Digital { #[derive(Clone, Default)] pub struct DigitBlocks { - pub prefix: [u32; 2], - pub sign: Option<(Sign, [u32; 2])>, - pub blocks: Vec<[u32; 2]>, + pub prefix: Span2, + pub sign: Option<(Sign, Span2)>, + pub blocks: Vec, pub which_exp: usize, } impl DigitBlocks { - pub fn prefix(&self, file: File) -> Option { - if self.prefix == [0, 0] { + pub fn prefix<'ctx>(&self, file: File<'ctx>) -> Option> { + if self.prefix == Span2::default() { return None; } - Some(file.span(self.prefix[0] as usize..self.prefix[1] as usize)) + Some(self.prefix.get(file)) } - pub fn sign(&self, file: File) -> Option { - self - .sign - .map(|(_, [a, b])| file.span(a as usize..b as usize)) + pub fn sign<'ctx>(&self, file: File<'ctx>) -> Option> { + self.sign.map(|(_, s)| s.get(file)) } - pub fn blocks<'a>( + pub fn blocks<'a, 'ctx: 'a>( &'a self, - file: File<'a>, - ) -> impl Iterator + 'a { - self - .blocks - .iter() - .map(move |&[a, b]| file.span(a as usize..b as usize)) + file: File<'ctx>, + ) -> impl Iterator> + 'a { + self.blocks.iter().map(move |s| s.get(file)) } } diff --git a/ilex/src/testing/mod.rs b/ilex/src/testing/mod.rs index 1fda03e..591af62 100644 --- a/ilex/src/testing/mod.rs +++ b/ilex/src/testing/mod.rs @@ -333,12 +333,9 @@ impl Text { /// Returns whether this span recognizes a particular span. fn recognizes(&self, span: Span, ctx: &Context) -> bool { - !self - .text - .as_ref() - .is_some_and(|text| text != span.text(ctx)) + !self.text.as_ref().is_some_and(|text| text != span.text()) && !self.range.as_ref().is_some_and(|range| { - let r = span.span(ctx); + let r = span.span(); range != &(r.start()..r.end()) }) } diff --git a/ilex/src/testing/recognize.rs b/ilex/src/testing/recognize.rs index 94e4223..d212f81 100644 --- a/ilex/src/testing/recognize.rs +++ b/ilex/src/testing/recognize.rs @@ -64,7 +64,7 @@ impl Matcher { tok: token::Any, ctx: &Context, ) { - state.match_spans("token span", &self.span, Spanned::span(&tok, ctx)); + state.match_spans("token span", &self.span, Spanned::span(&tok)); zip_eq("comments", state, &self.comments, tok.comments(), |state, t, s| { state.match_spans("comment", t, s); @@ -267,20 +267,25 @@ impl<'a> MatchState<'a> { let _ = writeln!(self.errors, ": {msg}"); } - fn match_spans(&mut self, what: &str, text: &Text, span: impl Spanned) { - let span = span.span(self.ctx); + fn match_spans<'s>( + &mut self, + what: &str, + text: &Text, + span: impl Spanned<'s>, + ) { + let span = span.span(); if !text.recognizes(span, self.ctx) { self.error(f!("wrong {what}; want {:?}, got {:?}", text, span)); } } - fn match_options( + fn match_options<'s>( &mut self, what: &str, text: Option<&Text>, - span: Option, + span: Option>, ) { - let span = span.map(|s| s.span(self.ctx)); + let span = span.map(|s| s.span()); if text.is_none() && span.is_none() { return; } diff --git a/ilex/src/token/mod.rs b/ilex/src/token/mod.rs index f456627..c1072b9 100644 --- a/ilex/src/token/mod.rs +++ b/ilex/src/token/mod.rs @@ -66,7 +66,7 @@ impl Id { /// A token type. All types in [`ilex::token`][crate::token] implement this /// trait. 
diff --git a/ilex/src/testing/mod.rs b/ilex/src/testing/mod.rs
index 1fda03e..591af62 100644
--- a/ilex/src/testing/mod.rs
+++ b/ilex/src/testing/mod.rs
@@ -333,12 +333,9 @@ impl Text {

  /// Returns whether this span recognizes a particular span.
  fn recognizes(&self, span: Span, ctx: &Context) -> bool {
-    !self
-      .text
-      .as_ref()
-      .is_some_and(|text| text != span.text(ctx))
+    !self.text.as_ref().is_some_and(|text| text != span.text())
      && !self.range.as_ref().is_some_and(|range| {
-        let r = span.span(ctx);
+        let r = span.span();
        range != &(r.start()..r.end())
      })
  }
diff --git a/ilex/src/testing/recognize.rs b/ilex/src/testing/recognize.rs
index 94e4223..d212f81 100644
--- a/ilex/src/testing/recognize.rs
+++ b/ilex/src/testing/recognize.rs
@@ -64,7 +64,7 @@ impl Matcher {
    tok: token::Any,
    ctx: &Context,
  ) {
-    state.match_spans("token span", &self.span, Spanned::span(&tok, ctx));
+    state.match_spans("token span", &self.span, Spanned::span(&tok));

    zip_eq("comments", state, &self.comments, tok.comments(), |state, t, s| {
      state.match_spans("comment", t, s);
@@ -267,20 +267,25 @@ impl<'a> MatchState<'a> {
    let _ = writeln!(self.errors, ": {msg}");
  }

-  fn match_spans(&mut self, what: &str, text: &Text, span: impl Spanned) {
-    let span = span.span(self.ctx);
+  fn match_spans<'s>(
+    &mut self,
+    what: &str,
+    text: &Text,
+    span: impl Spanned<'s>,
+  ) {
+    let span = span.span();
    if !text.recognizes(span, self.ctx) {
      self.error(f!("wrong {what}; want {:?}, got {:?}", text, span));
    }
  }

-  fn match_options(
+  fn match_options<'s>(
    &mut self,
    what: &str,
    text: Option<&Text>,
-    span: Option<SpanId>,
+    span: Option<Span<'s>>,
  ) {
-    let span = span.map(|s| s.span(self.ctx));
+    let span = span.map(|s| s.span());
    if text.is_none() && span.is_none() {
      return;
    }
diff --git a/ilex/src/token/mod.rs b/ilex/src/token/mod.rs
index f456627..c1072b9 100644
--- a/ilex/src/token/mod.rs
+++ b/ilex/src/token/mod.rs
@@ -66,7 +66,7 @@ impl Id {

 /// A token type. All types in [`ilex::token`][crate::token] implement this
 /// trait.
 pub trait Token<'lex>:
-  Copy + Spanned + fmt::Debug + TryFrom<Any<'lex>> + Into<Any<'lex>>
+  Copy + Spanned<'lex> + fmt::Debug + TryFrom<Any<'lex>> + Into<Any<'lex>>
 {
  /// The token this rule was parsed from.
  type Rule: rule::Rule;
@@ -250,15 +250,15 @@ impl fmt::Debug for Any<'_> {
  }
 }

-impl Spanned for Any<'_> {
-  fn span(&self, ctx: &Context) -> Span {
+impl<'lex> Spanned<'lex> for Any<'lex> {
+  fn span(&self) -> Span<'lex> {
    match self {
-      Self::Eof(tok) => tok.span(ctx),
-      Self::Keyword(tok) => tok.span(ctx),
-      Self::Bracket(tok) => tok.span(ctx),
-      Self::Ident(tok) => tok.span(ctx),
-      Self::Quoted(tok) => tok.span(ctx),
-      Self::Digital(tok) => tok.span(ctx),
+      Self::Eof(tok) => tok.span(),
+      Self::Keyword(tok) => tok.span(),
+      Self::Bracket(tok) => tok.span(),
+      Self::Ident(tok) => tok.span(),
+      Self::Quoted(tok) => tok.span(),
+      Self::Digital(tok) => tok.span(),
    }
  }
 }
@@ -319,12 +319,12 @@ impl<'lex> TryFrom<Any<'lex>> for Eof<'lex> {

 impl fmt::Debug for Eof<'_> {
  fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-    write!(f, "Eof({:?})", self.span(self.context()))
+    write!(f, "Eof({:?})", self.span())
  }
 }

-impl Spanned for Eof<'_> {
-  fn span(&self, _: &Context) -> Span {
+impl<'lex> Spanned<'lex> for Eof<'lex> {
+  fn span(&self) -> Span<'lex> {
    self.stream.lookup_span_no_affix(self.id)
  }
 }
@@ -372,12 +372,12 @@ impl<'lex> TryFrom<Any<'lex>> for Keyword<'lex> {

 impl fmt::Debug for Keyword<'_> {
  fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-    write!(f, "Keyword({:?})", self.span(self.stream.context()))
+    write!(f, "Keyword({:?})", self.span())
  }
 }

-impl Spanned for Keyword<'_> {
-  fn span(&self, _: &Context) -> Span {
+impl<'lex> Spanned<'lex> for Keyword<'lex> {
+  fn span(&self) -> Span<'lex> {
    self.stream.lookup_span_no_affix(self.id)
  }
 }
@@ -396,17 +396,17 @@ pub struct Bracket<'lex> {

 impl<'lex> Bracket<'lex> {
  /// Returns this token's open delimiter.
-  pub fn open(self) -> Span {
+  pub fn open(self) -> Span<'lex> {
    self.contents.stream().lookup_span_no_affix(self.open)
  }

  /// Returns this token's close delimiter.
-  pub fn close(self) -> Span {
+  pub fn close(self) -> Span<'lex> {
    self.contents.stream().lookup_span_no_affix(self.close)
  }

  /// Returns this token's quote delimiters.
-  pub fn delimiters(self) -> [Span; 2] {
+  pub fn delimiters(self) -> [Span<'lex>; 2] {
    [self.open(), self.close()]
  }

@@ -466,8 +466,8 @@ impl fmt::Debug for Bracket<'_> {
  }
 }

-impl Spanned for Bracket<'_> {
-  fn span(&self, _: &Context) -> Span {
+impl<'lex> Spanned<'lex> for Bracket<'lex> {
+  fn span(&self) -> Span<'lex> {
    let [a, b] = self.delimiters();
    self.contents.stream().file().span(a.start()..b.end())
  }
@@ -482,32 +482,28 @@ pub struct Ident<'lex> {

 impl<'lex> Ident<'lex> {
  /// Returns this token's name span.
-  pub fn name(self) -> Span {
+  pub fn name(self) -> Span<'lex> {
    self.stream.lookup_span_no_affix(self.id)
  }

  /// Returns this token's prefix.
-  pub fn prefix(self) -> Option<Span> {
+  pub fn prefix(self) -> Option<Span<'lex>> {
    self.stream.lookup_prefix(self.id)
  }

  /// Checks whether this identifier has a particular prefix.
  pub fn has_prefix(&self, expected: &str) -> bool {
-    self
-      .prefix()
-      .is_some_and(|s| s.text(self.context()) == expected)
+    self.prefix().is_some_and(|s| s.text() == expected)
  }

  /// Returns this token's suffix.
-  pub fn suffix(&self) -> Option<Span> {
+  pub fn suffix(&self) -> Option<Span<'lex>> {
    self.stream.lookup_suffix(self.id)
  }

  /// Checks whether this identifier has a particular prefix.
  pub fn has_suffix(&self, expected: &str) -> bool {
-    self
-      .suffix()
-      .is_some_and(|s| s.text(self.context()) == expected)
+    self.suffix().is_some_and(|s| s.text() == expected)
  }
 }

@@ -548,8 +544,7 @@ impl fmt::Debug for Ident<'_> {
    }

    let mut f = f.debug_struct("Ident");
-    f.field("span", &self.span(self.context()))
-      .field("name", &self.name());
+    f.field("span", &self.span()).field("name", &self.name());

    if let Some(prefix) = self.prefix() {
      f.field("prefix", &prefix);
@@ -563,8 +558,8 @@ impl fmt::Debug for Ident<'_> {
  }
 }

-impl Spanned for Ident<'_> {
-  fn span(&self, _: &Context) -> Span {
+impl<'lex> Spanned<'lex> for Ident<'lex> {
+  fn span(&self) -> Span<'lex> {
    self.stream.lookup_span_with_affixes(self.id)
  }
 }
@@ -614,14 +609,13 @@ impl<'lex> Digital<'lex> {
  }

  /// Returns the span corresponding to [`Digital::sign()`].
-  pub fn sign_span(self) -> Option<Span> {
-    self.rt_blocks().sign(self.file(self.context()))
+  pub fn sign_span(self) -> Option<Span<'lex>> {
+    self.rt_blocks().sign(self.file())
  }

  /// Returns the point-separated digit chunks of this digital literal.
-  pub fn digit_blocks(self) -> impl Iterator<Item = Span> + 'lex {
-    let ctx = self.context();
-    self.rt_blocks().blocks(self.file(ctx))
+  pub fn digit_blocks(self) -> impl Iterator<Item = Span<'lex>> + 'lex {
+    self.rt_blocks().blocks(self.file())
  }

  /// Returns the exponents of this digital literal, if it any.
@@ -638,9 +632,9 @@ impl<'lex> Digital<'lex> {
  }

  /// Returns this token's prefix.
-  pub fn prefix(self) -> Option<Span> {
+  pub fn prefix(self) -> Option<Span<'lex>> {
    if self.idx > 0 {
-      return self.rt_blocks().prefix(self.file(self.context()));
+      return self.rt_blocks().prefix(self.file());
    }

    self.stream.lookup_prefix(self.id)
@@ -648,13 +642,11 @@ impl<'lex> Digital<'lex> {

  /// Checks whether this identifier has a particular prefix.
  pub fn has_prefix(&self, expected: &str) -> bool {
-    self
-      .prefix()
-      .is_some_and(|s| s.text(self.context()) == expected)
+    self.prefix().is_some_and(|s| s.text() == expected)
  }

  /// Returns this token's suffix.
-  pub fn suffix(&self) -> Option<Span> {
+  pub fn suffix(&self) -> Option<Span<'lex>> {
    if self.idx > 0 {
      // Exponent tokens never have a suffix.
      return None;
@@ -665,9 +657,7 @@ impl<'lex> Digital<'lex> {

  /// Checks whether this identifier has a particular prefix.
  pub fn has_suffix(&self, expected: &str) -> bool {
-    self
-      .suffix()
-      .is_some_and(|s| s.text(self.context()) == expected)
+    self.suffix().is_some_and(|s| s.text() == expected)
  }

  /// Parses this token as an integer.
@@ -716,7 +706,7 @@ impl<'lex> Digital<'lex> {
    self
      .digit_blocks()
      .map(|span| {
-        let text = span.text(self.context());
+        let text = span.text();
        let buf;
        let text = if !rule.separator.is_empty()
          && text.contains(&*rule.separator)
        {
@@ -942,7 +932,7 @@ impl<'lex> TryFrom<Any<'lex>> for Digital<'lex> {

 impl fmt::Debug for Digital<'_> {
  fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
    let mut f = f.debug_struct("Digital");
-    f.field("span", &self.span(self.context()))
+    f.field("span", &self.span())
      .field("radix", &self.radix())
      // TODO: Get rid of this collect.
      .field("digits", &self.digit_blocks().collect::<Vec<_>>());
@@ -967,8 +957,8 @@ impl fmt::Debug for Digital<'_> {
  }
 }

-impl Spanned for Digital<'_> {
-  fn span(&self, _: &Context) -> Span {
+impl<'lex> Spanned<'lex> for Digital<'lex> {
+  fn span(&self) -> Span<'lex> {
    self.stream.lookup_span_with_affixes(self.id)
  }
 }
@@ -983,17 +973,17 @@ pub struct Quoted<'lex> {

 impl<'lex> Quoted<'lex> {
  /// Returns this token's open delimiter.
-  pub fn open(self) -> Span {
+  pub fn open(self) -> Span<'lex> {
    self.delimiters()[0]
  }

  /// Returns this token's close delimiter.
-  pub fn close(self) -> Span {
+  pub fn close(self) -> Span<'lex> {
    self.delimiters()[1]
  }

  /// Returns this token's quote delimiters.
-  pub fn delimiters(self) -> [Span; 2] {
+  pub fn delimiters(self) -> [Span<'lex>; 2] {
    let span = self.stream.lookup_span_no_affix(self.id);
    [
      self
@@ -1016,7 +1006,7 @@ impl<'lex> Quoted<'lex> {
  /// It is up to the user of the library to decode these two content types into
  /// strings. [`Quoted::to_utf8()`] helps with the common case of doing this for
  /// UTF-8 strings.
-  pub fn raw_content(self) -> impl Iterator<Item = Content> + 'lex {
+  pub fn raw_content(self) -> impl Iterator<Item = Content<Span<'lex>>> + 'lex {
    let file = self.stream.file();
    let mut next = self.meta.marks[0];
    let mut is_escape = false;
@@ -1062,7 +1052,7 @@ impl<'lex> Quoted<'lex> {
  }

  /// Returns the unique single literal content of this token, if it is unique.
-  pub fn literal(self) -> Option<Span> {
+  pub fn literal(self) -> Option<Span<'lex>> {
    if self.meta.marks.len() > 2 {
      return None;
    }
@@ -1075,12 +1065,12 @@ impl<'lex> Quoted<'lex> {
  /// mapping function for escapes.
  pub fn to_utf8(
    self,
-    mut decode_esc: impl FnMut(Span, Option<Span>, &mut String),
+    mut decode_esc: impl FnMut(Span, Option<Span<'lex>>, &mut String),
  ) -> String {
    let total = self
      .raw_content()
      .map(|c| match c {
-        Content::Lit(sp) => sp.text(self.context()).len(),
+        Content::Lit(sp) => sp.text().len(),
        Content::Esc(..) => 1,
      })
      .sum();
@@ -1088,7 +1078,7 @@ impl<'lex> Quoted<'lex> {
    let mut buf = String::with_capacity(total);
    for chunk in self.raw_content() {
      match chunk {
-        Content::Lit(sp) => buf.push_str(sp.text(self.context())),
+        Content::Lit(sp) => buf.push_str(sp.text()),
        Content::Esc(sp, data) => decode_esc(sp, data, &mut buf),
      }
    }
@@ -1096,27 +1086,23 @@ impl<'lex> Quoted<'lex> {
  }

  /// Returns this token's prefix.
-  pub fn prefix(self) -> Option<Span> {
+  pub fn prefix(self) -> Option<Span<'lex>> {
    self.stream.lookup_prefix(self.id)
  }

  /// Checks whether this identifier has a particular prefix.
  pub fn has_prefix(&self, expected: &str) -> bool {
-    self
-      .prefix()
-      .is_some_and(|s| s.text(self.context()) == expected)
+    self.prefix().is_some_and(|s| s.text() == expected)
  }

  /// Returns this token's suffix.
-  pub fn suffix(&self) -> Option<Span> {
+  pub fn suffix(&self) -> Option<Span<'lex>> {
    self.stream.lookup_suffix(self.id)
  }

  /// Checks whether this identifier has a particular prefix.
  pub fn has_suffix(&self, expected: &str) -> bool {
-    self
-      .suffix()
-      .is_some_and(|s| s.text(self.context()) == expected)
+    self.suffix().is_some_and(|s| s.text() == expected)
  }
 }

@@ -1125,7 +1111,7 @@ impl<'lex> Quoted<'lex> {
 /// The "span type" is configurable; this type is used by multiple parts of
 /// the library.
 #[derive(Copy, Clone, Debug)]
-pub enum Content<Span = SpanId> {
+pub enum Content<Span> {
  /// A literal chunk, i.e. UTF-8 text directly from the source file.
  Lit(Span),

@@ -1184,7 +1170,7 @@ impl<'lex> TryFrom<Any<'lex>> for Quoted<'lex> {

 impl fmt::Debug for Quoted<'_> {
  fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
    let mut f = f.debug_struct("Quoted");
-    f.field("span", &self.span(self.context()))
+    f.field("span", &self.span())
      .field("delimiters", &self.delimiters())
      // TODO: get rid of this collect().
.field("content", &self.raw_content().collect::>()); @@ -1201,8 +1187,8 @@ impl fmt::Debug for Quoted<'_> { } } -impl Spanned for Quoted<'_> { - fn span(&self, _: &Context) -> Span { +impl<'lex> Spanned<'lex> for Quoted<'lex> { + fn span(&self) -> Span<'lex> { self.stream.lookup_span_with_affixes(self.id) } } @@ -1248,16 +1234,16 @@ impl<'lex> Any<'lex> { let ctx = self.context(); let (pre, suf, kind) = match self { Any::Eof(_) => return yarn!(""), - Any::Keyword(tok) => return yarn!("`{}`", tok.text(ctx)), + Any::Keyword(tok) => return yarn!("`{}`", tok.text()), Any::Bracket(d) => { - return yarn!("`{} ... {}`", d.open().text(ctx), d.close().text(ctx)); + return yarn!("`{} ... {}`", d.open().text(), d.close().text()); } Any::Quoted(tok) => { - let pre = tok.prefix().map(|s| s.text(ctx)).unwrap_or(""); - let suf = tok.suffix().map(|s| s.text(ctx)).unwrap_or(""); - let open = tok.open().text(ctx); - let close = tok.close().text(ctx); + let pre = tok.prefix().map(|s| s.text()).unwrap_or(""); + let suf = tok.suffix().map(|s| s.text()).unwrap_or(""); + let open = tok.open().text(); + let close = tok.close().text(); return yarn!("`{pre}{open}...{close}{suf}`"); } @@ -1265,7 +1251,7 @@ impl<'lex> Any<'lex> { Any::Digital(tok) => (tok.prefix(), tok.suffix(), "number"), }; - match (pre.map(|s| s.text(ctx)), suf.map(|s| s.text(ctx))) { + match (pre.map(|s| s.text()), suf.map(|s| s.text())) { (Some(pre), Some(suf)) => { yarn!("`{pre}`-prefixed, `{suf}`-suffixed {kind}") } diff --git a/ilex/src/token/stream.rs b/ilex/src/token/stream.rs index 8783664..8065a80 100644 --- a/ilex/src/token/stream.rs +++ b/ilex/src/token/stream.rs @@ -536,13 +536,12 @@ pub struct Comments<'lex> { impl<'lex> Comments<'lex> { /// Adapts this iterator to return just the text contents of each [`SpanId`]. 
diff --git a/ilex/src/token/stream.rs b/ilex/src/token/stream.rs
index 8783664..8065a80 100644
--- a/ilex/src/token/stream.rs
+++ b/ilex/src/token/stream.rs
@@ -536,13 +536,12 @@ pub struct Comments<'lex> {

 impl<'lex> Comments<'lex> {
  /// Adapts this iterator to return just the text contents of each [`SpanId`].
  pub fn as_strings(self) -> impl Iterator<Item = &'lex str> + 'lex {
-    let ctx = self.stream.context();
-    self.map(move |s| s.text(ctx))
+    self.map(Span::text)
  }
 }

 impl<'lex> Iterator for Comments<'lex> {
-  type Item = Span;
+  type Item = Span<'lex>;

  fn next(&mut self) -> Option<Self::Item> {
    let id = *self.comments.next()?;
diff --git a/ilex/tests/json.rs b/ilex/tests/json.rs
index 718b243..bf97419 100644
--- a/ilex/tests/json.rs
+++ b/ilex/tests/json.rs
@@ -249,7 +249,7 @@ fn parse0(
 ) -> Json {
  let quote2str = |str: token::Quoted| -> String {
    str.to_utf8(|key, data, buf| {
-      let char = match key.text(ctx) {
+      let char = match key.text() {
        "\\\"" => '\"',
        r"\\" => '\\',
        r"\/" => '/',
@@ -262,10 +262,10 @@ fn parse0(
        r"\u" => {
          let data = data.unwrap();
          let code =
-            u16::from_str_radix(data.text(ctx), 16).unwrap_or_else(|_| {
+            u16::from_str_radix(data.text(), 16).unwrap_or_else(|_| {
              report.builtins(json.spec()).expected(
                [Expected::Name("hex-encoded u16".into())],
-                data.text(ctx),
+                data.text(),
                data,
              );
              0

From b1ec359f73dd0304554b72bb3d7e197c332fe59a Mon Sep 17 00:00:00 2001
From: Miguel Young de la Sota
Date: Wed, 1 Jan 2025 16:04:15 -0800
Subject: [PATCH 7/9] ilex: Remove redundant `Context` passing

---
 ilex/src/fp.rs                |  8 +++----
 ilex/src/rt/emit2.rs          | 42 +++++++++++------------------------
 ilex/src/testing/mod.rs       | 11 ++++-----
 ilex/src/testing/recognize.rs | 25 ++++++---------------
 ilex/src/token/mod.rs         |  5 ++---
 ilex/tests/greedy.rs          |  2 +-
 ilex/tests/json.rs            | 15 +++++--------
 ilex/tests/llvm.rs            |  2 +-
 8 files changed, 36 insertions(+), 74 deletions(-)

diff --git a/ilex/src/fp.rs b/ilex/src/fp.rs
index 07d70b7..b138078 100644
--- a/ilex/src/fp.rs
+++ b/ilex/src/fp.rs
@@ -17,7 +17,6 @@ use rustc_apfloat::Round;
 use rustc_apfloat::Status;
 use rustc_apfloat::StatusAnd;

-use crate::file::Context;
 use crate::file::Spanned;
 use crate::report::Report;
 use crate::token::Digital;
@@ -464,7 +463,6 @@ impl Digital<'_> {
  #[track_caller]
  pub(crate) fn parse_fp(
    self,
-    ctx: &Context,
    report: &Report,
    exact: bool,
  ) -> Result<Fp, Fatal> {
@@ -595,7 +593,7 @@ impl Digital<'_> {
        value
      }
    } else {
-      fn has_ordinary_sign(ctx: &Context, tok: &Digital) -> bool {
+      fn has_ordinary_sign(tok: &Digital) -> bool {
        tok.sign().is_none()
          || tok.sign().is_some_and(|s| {
            matches!(
@@ -609,12 +607,12 @@ impl Digital<'_> {
      // underlying string. This is such a common format that we special case
      // it.
      if rule.point == "."
-        && has_ordinary_sign(ctx, &self)
+        && has_ordinary_sign(&self)
        && (exp.is_none()
          || exp.is_some_and(|exp| {
            exp.radix() == 10
              && (exp.has_prefix("e") || exp.has_prefix("E"))
-              && has_ordinary_sign(ctx, &exp)
+              && has_ordinary_sign(&exp)
          }))
        && (rule.separator.is_empty()
          || !self.text().contains(rule.separator.as_str()))
diff --git a/ilex/src/rt/emit2.rs b/ilex/src/rt/emit2.rs
index 4becb6f..8c2c21a 100644
--- a/ilex/src/rt/emit2.rs
+++ b/ilex/src/rt/emit2.rs
@@ -6,7 +6,6 @@ use byteyarn::Yarn;
 use byteyarn::YarnBox;

 use crate::f;
-use crate::file::Context;
 use crate::file::Span;
 use crate::file::Span2;
 use crate::plural;
@@ -27,8 +26,6 @@ use super::dfa::Lexeme2;
 use super::unicode::is_xid;

 pub fn emit(lexer: &mut Lexer) {
-  let ctx = lexer.file().context();
-
  // Start by searching for the longest matches using the DFA.
  let dfa = lexer.spec().dfa();
  let Some(mut match_) = dfa.search(lexer) else {
@@ -66,7 +63,7 @@ pub fn emit(lexer: &mut Lexer) {
  // choices; that is independent of which token we decide to create.
let mut best = None; 'verify: for &c in &match_.candidates { - let [.., range, _] = find_affixes_partial(span, lexer.spec(), c, ctx); + let [.., range, _] = find_affixes_partial(span, lexer.spec(), c); // NOTE: We only need to find the first lexeme that is valid. If it's not // valid, we will diagnose that in the next stage. @@ -82,7 +79,7 @@ pub fn emit(lexer: &mut Lexer) { range.split_around(close.0.len(), close.1.len()) }; - let [_, name, _] = find_affixes(range, &ident_rule.affixes, ctx); + let [_, name, _] = find_affixes(range, &ident_rule.affixes); if name.text().chars().count() < ident_rule.min_len { continue 'verify; } @@ -195,7 +192,7 @@ pub fn emit(lexer: &mut Lexer) { let best = best.unwrap_or(match_.candidates[0]); let [sign_span, prefix, range, suffix] = - find_affixes_partial(span, lexer.spec(), best, ctx); + find_affixes_partial(span, lexer.spec(), best); let text = range.text(); let mirrored = match lexer.spec().rule(best.lexeme) { @@ -216,7 +213,7 @@ pub fn emit(lexer: &mut Lexer) { if !best.is_close { (open, close) } else { (close, open) }; let [_, mid, _] = range.split_around(remove.0.len(), remove.1.len()); - let [_, name, _] = find_affixes(mid, &ident_rule.affixes, ctx); + let [_, name, _] = find_affixes(mid, &ident_rule.affixes); let text = name.text(); let count = text.chars().count(); @@ -789,13 +786,12 @@ fn find_affixes_partial<'a>( range: Span<'a>, spec: &Spec, best: Lexeme2, - ctx: &Context, ) -> [Span<'a>; 4] { let text = range.text(); let ep = range.file().span(0..0); match spec.rule(best.lexeme) { Any::Ident(rule) => { - let [pre, range, suf] = find_affixes(range, &rule.affixes, ctx); + let [pre, range, suf] = find_affixes(range, &rule.affixes); [ep, pre, range, suf] } Any::Digital(rule) => { @@ -809,15 +805,15 @@ fn find_affixes_partial<'a>( .unwrap_or(0); let (sign, range) = range.split_at(sign); - let [pre, range, suf] = find_affixes(range, &rule.affixes, ctx); + let [pre, range, suf] = find_affixes(range, &rule.affixes); [sign, pre, range, suf] } Any::Quoted(rule) if !best.is_close => { - let (pre, range) = find_prefix(range, &rule.affixes, ctx); + let (pre, range) = find_prefix(range, &rule.affixes); [ep, pre, range, ep] } Any::Quoted(rule) => { - let (range, suf) = find_suffix(range, &rule.affixes, ctx); + let (range, suf) = find_suffix(range, &rule.affixes); [ep, ep, range, suf] } _ => [ep, ep, range, ep], @@ -825,21 +821,13 @@ fn find_affixes_partial<'a>( } /// Extracts the affixes from `text`. 
-fn find_affixes<'a>(
-  range: Span<'a>,
-  affixes: &Affixes,
-  ctx: &Context,
-) -> [Span<'a>; 3] {
-  let (prefix, range) = find_prefix(range, affixes, ctx);
-  let (range, suffix) = find_suffix(range, affixes, ctx);
+fn find_affixes<'a>(range: Span<'a>, affixes: &Affixes) -> [Span<'a>; 3] {
+  let (prefix, range) = find_prefix(range, affixes);
+  let (range, suffix) = find_suffix(range, affixes);
  [prefix, range, suffix]
 }

-fn find_prefix<'a>(
-  range: Span<'a>,
-  affixes: &Affixes,
-  ctx: &Context,
-) -> (Span<'a>, Span<'a>) {
+fn find_prefix<'a>(range: Span<'a>, affixes: &Affixes) -> (Span<'a>, Span<'a>) {
  let text = range.text();
  let prefix = affixes
    .prefixes()
    .iter()
@@ -851,8 +839,7 @@ fn find_prefix<'a>(
  range.split_at(prefix)
 }

-fn find_suffix<'a>(
-  range: Span<'a>,
-  affixes: &Affixes,
-  ctx: &Context,
-) -> (Span<'a>, Span<'a>) {
+fn find_suffix<'a>(range: Span<'a>, affixes: &Affixes) -> (Span<'a>, Span<'a>) {
  let text = range.text();
  let suffix = affixes
    .suffixes()
    .iter()
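Dropping the `ctx` parameter leaves these helpers as pure string-splitting over a span. The core idea, as a standalone sketch (simplified to `&str`; the real functions pick the longest matching affix from the rule's lists and return subspans):

    fn split_affixes<'a>(
      text: &'a str,
      prefixes: &[&str],
      suffixes: &[&str],
    ) -> (&'a str, &'a str, &'a str) {
      // Longest prefix that matches, then longest suffix on the remainder.
      let pre = prefixes.iter().filter(|p| text.starts_with(**p))
        .map(|p| p.len()).max().unwrap_or(0);
      let suf = suffixes.iter().filter(|s| text[pre..].ends_with(**s))
        .map(|s| s.len()).max().unwrap_or(0);
      let end = text.len() - suf;
      (&text[..pre], &text[pre..end], &text[end..])
    }

For `"0xdeadu32"` with prefixes `["0x"]` and suffixes `["u32"]` this yields `("0x", "dead", "u32")`.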
diff --git a/ilex/src/testing/mod.rs b/ilex/src/testing/mod.rs
index 591af62..751f4e5 100644
--- a/ilex/src/testing/mod.rs
+++ b/ilex/src/testing/mod.rs
@@ -13,7 +13,6 @@ use std::fs;
 use std::ops::Range;
 use std::path::Path;

-use crate::file::Context;
 use crate::file::Span;
 use crate::file::Spanned;
 use crate::report::Report;
@@ -185,10 +184,9 @@ impl Matcher {
  #[track_caller]
  pub fn assert_matches<'lex>(
    &self,
-    ctx: &Context,
    that: impl IntoIterator<Item = token::Any<'lex>>,
  ) {
-    self.matches(ctx, that).unwrap()
+    self.matches(that).unwrap()
  }

  /// Sets an expectation for the overall span of the most recently added
@@ -226,7 +224,6 @@ impl Matcher {
  /// If matching fails, returns an error describing why.
  pub fn matches<'lex>(
    &self,
-    ctx: &Context,
    that: impl IntoIterator<Item = token::Any<'lex>>,
  ) -> Result<(), impl fmt::Debug> {
    struct DebugBy(String);
@@ -236,13 +233,13 @@ impl Matcher {
      }
    }

-    let mut state = recognize::MatchState::new(ctx);
+    let mut state = recognize::MatchState::new();
    recognize::zip_eq(
      "token streams",
      &mut state,
      &self.stream,
      that,
-      |state, ours, theirs| ours.recognizes(state, theirs, ctx),
+      |state, ours, theirs| ours.recognizes(state, theirs),
    );
    state.finish().map_err(DebugBy)
  }
@@ -332,7 +329,7 @@ impl Text {
  }

  /// Returns whether this span recognizes a particular span.
-  fn recognizes(&self, span: Span, ctx: &Context) -> bool {
+  fn recognizes(&self, span: Span) -> bool {
    !self.text.as_ref().is_some_and(|text| text != span.text())
      && !self.range.as_ref().is_some_and(|range| {
        let r = span.span();
diff --git a/ilex/src/testing/recognize.rs b/ilex/src/testing/recognize.rs
index d212f81..dd7d495 100644
--- a/ilex/src/testing/recognize.rs
+++ b/ilex/src/testing/recognize.rs
@@ -8,7 +8,6 @@ use std::fmt::DebugStruct;
 use std::fmt::Display;

 use crate::f;
-use crate::file::Context;
 use crate::file::Spanned;
 use crate::rule;
 use crate::spec::Lexeme;
@@ -58,12 +57,7 @@ pub struct DigitalMatcher {
 }

 impl Matcher {
-  pub fn recognizes(
-    &self,
-    state: &mut MatchState,
-    tok: token::Any,
-    ctx: &Context,
-  ) {
+  pub fn recognizes(&self, state: &mut MatchState, tok: token::Any) {
    state.match_spans("token span", &self.span, Spanned::span(&tok));

    zip_eq("comments", state, &self.comments, tok.comments(), |state, t, s| {
@@ -157,7 +151,7 @@ impl Matcher {
          state,
          tokens,
          tok.contents(),
-          |state, ours, theirs| ours.recognizes(state, theirs, ctx),
+          |state, ours, theirs| ours.recognizes(state, theirs),
        );
      }
      _ => state.error("mismatched token types"),
@@ -235,17 +229,15 @@ impl fmt::Debug for Matcher {
  }
 }

-pub struct MatchState<'a> {
-  ctx: &'a Context,
+pub struct MatchState {
  errors: String,
  stack: Vec<String>,
  error_count: usize,
 }

-impl<'a> MatchState<'a> {
-  pub fn new(ctx: &'a Context) -> Self {
+impl MatchState {
+  pub fn new() -> Self {
    Self {
-      ctx,
      errors: String::new(),
      stack: Vec::new(),
      error_count: 0,
@@ -274,7 +266,7 @@ impl MatchState {
    span: impl Spanned<'s>,
  ) {
    let span = span.span();
-    if !text.recognizes(span, self.ctx) {
+    if !text.recognizes(span) {
      self.error(f!("wrong {what}; want {:?}, got {:?}", text, span));
    }
  }
@@ -290,10 +282,7 @@ impl MatchState {
      return;
    }

-    if !text
-      .zip(span)
-      .is_some_and(|(t, s)| t.recognizes(s, self.ctx))
-    {
+    if !text.zip(span).is_some_and(|(t, s)| t.recognizes(s)) {
      self.error(f!("wrong {what}; want {:?}, got {:?}", text, span));
    }
  }
diff --git a/ilex/src/token/mod.rs b/ilex/src/token/mod.rs
index c1072b9..1644b00 100644
--- a/ilex/src/token/mod.rs
+++ b/ilex/src/token/mod.rs
@@ -758,7 +758,7 @@ impl<'lex> Digital<'lex> {
    range: impl RangeBounds<Fp>,
    report: &Report,
  ) -> Result<Fp, Fatal> {
-    let fp: Fp = self.parse_fp(self.context(), report, false)?;
+    let fp: Fp = self.parse_fp(report, false)?;

    if !fp.__is_finite() || !range.contains(&fp) {
      report.builtins(self.spec()).literal_out_of_range(
@@ -784,7 +784,7 @@ impl<'lex> Digital<'lex> {
    range: impl RangeBounds<Fp>,
    report: &Report,
  ) -> Result<Fp, Fatal> {
-    let fp: Fp = self.parse_fp(self.context(), report, true)?;
+    let fp: Fp = self.parse_fp(report, true)?;

    if !fp.__is_finite() || !range.contains(&fp) {
      report.builtins(self.spec()).literal_out_of_range(
@@ -1231,7 +1231,6 @@ impl<'lex> Any<'lex> {
      return name.to_box();
    }

-    let ctx = self.context();
    let (pre, suf, kind) = match self {
      Any::Eof(_) => return yarn!("<eof>"),
      Any::Keyword(tok) => return yarn!("`{}`", tok.text()),
diff --git a/ilex/tests/greedy.rs b/ilex/tests/greedy.rs
index 547d527..306b2f0 100644
--- a/ilex/tests/greedy.rs
+++ b/ilex/tests/greedy.rs
@@ -48,5 +48,5 @@ fn greedy() {
    .then2(array, ("[", "]"), Matcher::new().then1(ident, "xyz"))
    .then2(cpp_like, ("R\"cc(", ")cc\""), ["some c++)\" "])
    .eof()
-    .assert_matches(&ctx, &tokens);
+    .assert_matches(&tokens);
 }
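Test harnesses lose the `Context` argument too; the token stream is all that `assert_matches` needs now. A sketch of the resulting shape (lexeme handles and input are assumed, mirroring the `greedy` test above):

    Matcher::new()
      .then1(ident, "xyz")
      .eof()
      .assert_matches(&tokens);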
diff --git a/ilex/tests/json.rs b/ilex/tests/json.rs
index bf97419..567ea89 100644
--- a/ilex/tests/json.rs
+++ b/ilex/tests/json.rs
@@ -167,7 +167,7 @@ fn check_tokens() {
      ),
    )
    .eof()
-    .assert_matches(&ctx, &tokens);
+    .assert_matches(&tokens);
 }

 #[derive(Clone, Debug, PartialEq)]
@@ -236,17 +236,12 @@ fn parse(data: &str) -> Result<Json, Error> {
    .new_file("", data)
    .lex(json.spec(), &report)
    .map_err(|e| Error(e.to_string()))?;

-  let value = parse0(&ctx, &report, json, &mut stream.cursor());
+  let value = parse0(&report, json, &mut stream.cursor());

  report.fatal_or(value).map_err(|e| Error(e.to_string()))
 }

-fn parse0(
-  ctx: &ilex::Context,
-  report: &Report,
-  json: &JsonSpec,
-  cursor: &mut Cursor,
-) -> Json {
+fn parse0(report: &Report, json: &JsonSpec, cursor: &mut Cursor) -> Json {
  let quote2str = |str: token::Quoted| -> String {
    str.to_utf8(|key, data, buf| {
      let char = match key.text() {
@@ -293,7 +288,7 @@ fn parse0(report: &Report, json: &JsonSpec, cursor: &mut Cursor) -> Json {
      let mut trailing = None;
      let vec = array
        .contents()
-        .delimited(json.comma, |c| Some(parse0(ctx, report, json, c)))
+        .delimited(json.comma, |c| Some(parse0(report, json, c)))
        .map(|(e, c)| {
          trailing = c;
          e
        })
@@ -318,7 +313,7 @@ fn parse0(report: &Report, json: &JsonSpec, cursor: &mut Cursor) -> Json {
            .map(|q| quote2str(q))
            .unwrap_or("😢".into());
          c.take(json.colon, report);
-          let value = parse0(ctx, report, json, c);
+          let value = parse0(report, json, c);
          Some((key, value))
        })
        .map(|(e, c)| {
diff --git a/ilex/tests/llvm.rs b/ilex/tests/llvm.rs
index 46556be..34e2585 100644
--- a/ilex/tests/llvm.rs
+++ b/ilex/tests/llvm.rs
@@ -280,5 +280,5 @@ fn llvm() {
        .then1(llvm.void, "void"),
    )
    .eof()
-    .assert_matches(&ctx, &tokens)
+    .assert_matches(&tokens)
 }

From a98c907ae3ec788a5682539d81036e5ed908ae23 Mon Sep 17 00:00:00 2001
From: Miguel Young de la Sota
Date: Wed, 1 Jan 2025 16:13:10 -0800
Subject: [PATCH 8/9] chore: Fix warnings

---
 byteyarn/src/boxed.rs    | 2 +-
 byteyarn/src/reffed.rs   | 2 +-
 ilex/src/file/mod.rs     | 4 ++--
 ilex/src/ice.rs          | 4 ++--
 ilex/src/report/mod.rs   | 2 +-
 ilex/src/rt/dfa.rs       | 4 +---
 ilex/src/spec.rs         | 1 -
 ilex/src/testing/mod.rs  | 2 +-
 ilex/src/token/stream.rs | 6 +++---
 9 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/byteyarn/src/boxed.rs b/byteyarn/src/boxed.rs
index 82e3ae0..dcd2e1c 100644
--- a/byteyarn/src/boxed.rs
+++ b/byteyarn/src/boxed.rs
@@ -587,7 +587,7 @@ impl<'a> YarnBox<'a, [u8]> {
  }
 }

-impl<'a, T> YarnBox<'a, [T]>
+impl<T> YarnBox<'_, [T]>
 where
  [T]: crate::Buf,
 {
diff --git a/byteyarn/src/reffed.rs b/byteyarn/src/reffed.rs
index 86b7b71..87cc84c 100644
--- a/byteyarn/src/reffed.rs
+++ b/byteyarn/src/reffed.rs
@@ -293,7 +293,7 @@ impl<'a> YarnRef<'a, [u8]> {
  }
 }

-impl<'a> YarnRef<'a, str> {
+impl YarnRef<'_, str> {
  /// Converts this yarn into a string slice.
  pub fn as_str(&self) -> &str {
    self.as_slice()
diff --git a/ilex/src/file/mod.rs b/ilex/src/file/mod.rs
index 1d6d564..139e11a 100644
--- a/ilex/src/file/mod.rs
+++ b/ilex/src/file/mod.rs
@@ -143,7 +143,7 @@ impl<'ctx> Span<'ctx> {
  pub(crate) fn new<Idx: TryInto<usize> + fmt::Debug>(
    file: File<'ctx>,
    range: impl RangeBounds<Idx>,
-  ) -> Span {
+  ) -> Self {
    let start = match range.start_bound() {
      Bound::Included(&x) => cast(x),
      Bound::Excluded(&x) => cast(x).saturating_add(1),
@@ -353,7 +353,7 @@ impl<'ctx> Spanned<'ctx> for Never {
 }

 thread_local! {
-  static CTX_FOR_SPAN_DEBUG: RefCell<Option<Context>> = RefCell::new(None);
+  static CTX_FOR_SPAN_DEBUG: RefCell<Option<Context>> = const { RefCell::new(None) };
 }

 impl fmt::Debug for File<'_> {
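The `Span::new` context lines above show how the crate normalizes arbitrary `RangeBounds` into a concrete offset pair. As a standalone sketch of that normalization (the end-bound handling is assumed symmetric, since the hunk cuts off after the start bound):

    use std::ops::{Bound, RangeBounds};

    fn resolve(range: impl RangeBounds<usize>, len: usize) -> (usize, usize) {
      let start = match range.start_bound() {
        Bound::Included(&x) => x,
        Bound::Excluded(&x) => x.saturating_add(1),
        Bound::Unbounded => 0,
      };
      let end = match range.end_bound() {
        Bound::Included(&x) => x.saturating_add(1),
        Bound::Excluded(&x) => x,
        Bound::Unbounded => len,
      };
      (start, end)
    }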
diff --git a/ilex/src/ice.rs b/ilex/src/ice.rs
index c3d0a18..6b83e3b 100644
--- a/ilex/src/ice.rs
+++ b/ilex/src/ice.rs
@@ -8,7 +8,7 @@ use std::backtrace::BacktraceStatus;
 use std::io;
 use std::panic;
 use std::panic::AssertUnwindSafe;
-use std::panic::PanicInfo;
+use std::panic::PanicHookInfo;
 use std::panic::UnwindSafe;
 use std::sync::Mutex;
 use std::thread;
@@ -148,7 +148,7 @@ impl Ice {
  ///
  /// The results are "best effort". The Rust backtrace API is incomplete, so we
  /// make do with some... cleverness around parsing the backtrace itself.
-  pub fn generate(panic: &PanicInfo, options: Options) -> Self {
+  pub fn generate(panic: &PanicHookInfo, options: Options) -> Self {
    let msg = panic.payload();
    let msg = Option::or(
      msg.downcast_ref::<&str>().copied().map(str::to_string),
diff --git a/ilex/src/report/mod.rs b/ilex/src/report/mod.rs
index 8c6d74b..98a3f5c 100644
--- a/ilex/src/report/mod.rs
+++ b/ilex/src/report/mod.rs
@@ -68,7 +68,7 @@ impl Report {
  /// Returns a wrapper for accessing commonly-used, built-in message types.
  ///
  /// See [`Builtins`].
-  pub fn builtins<'a>(&'a self, spec: &'a Spec) -> Builtins {
+  pub fn builtins<'a>(&'a self, spec: &'a Spec) -> Builtins<'a> {
    Builtins { report: self, spec }
  }
diff --git a/ilex/src/rt/dfa.rs b/ilex/src/rt/dfa.rs
index 759ad1f..f6dfd06 100644
--- a/ilex/src/rt/dfa.rs
+++ b/ilex/src/rt/dfa.rs
@@ -95,9 +95,7 @@ impl Dfa {
      }
    }

-    let Some((last_match, state)) = last_match else {
-      return None;
-    };
+    let (last_match, state) = last_match?;
    let candidates = (0..dfa.match_len(lexer.cache(), state))
      .map(|i| {
        let id = dfa.match_pattern(lexer.cache(), state, i);
diff --git a/ilex/src/spec.rs b/ilex/src/spec.rs
index 0a9fe3b..9139843 100644
--- a/ilex/src/spec.rs
+++ b/ilex/src/spec.rs
@@ -4,7 +4,6 @@ use std::cmp::Ordering;
 use std::fmt;
 use std::fmt::Display;
 use std::hash::Hash;
-use std::i32;
 use std::marker::PhantomData;

 use byteyarn::yarn;
diff --git a/ilex/src/testing/mod.rs b/ilex/src/testing/mod.rs
index 751f4e5..8528d9b 100644
--- a/ilex/src/testing/mod.rs
+++ b/ilex/src/testing/mod.rs
@@ -330,7 +330,7 @@ impl Text {

  /// Returns whether this span recognizes a particular span.
  fn recognizes(&self, span: Span) -> bool {
-    !self.text.as_ref().is_some_and(|text| text != span.text())
+    self.text.as_ref().is_none_or(|text| text == span.text())
      && !self.range.as_ref().is_some_and(|range| {
        let r = span.span();
        range != &(r.start()..r.end())
      })
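The `is_none_or` rewrite in `Text::recognizes` is behavior-preserving; it only removes the double negation (and needs Rust 1.82+, comfortably under the 1.83 MSRV bumped earlier in this series). The equivalence, as a quick check:

    let matches_text = |want: Option<&str>, got: &str| -> bool {
      want.is_none_or(|t| t == got) // new form
      // == !want.is_some_and(|t| t != got), the old form
    };
    assert!(matches_text(None, "anything"));
    assert!(matches_text(Some("foo"), "foo"));
    assert!(!matches_text(Some("foo"), "bar"));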
diff --git a/ilex/src/token/stream.rs b/ilex/src/token/stream.rs
index 8065a80..504fac9 100644
--- a/ilex/src/token/stream.rs
+++ b/ilex/src/token/stream.rs
@@ -101,7 +101,7 @@ impl<'ctx> Stream<'ctx> {
      return Some(token::Eof { stream: self, id }.into());
    }

-    return Some(match self.spec().rule(tok.lexeme) {
+    Some(match self.spec().rule(tok.lexeme) {
      rule::Any::Comment(..) => return None,
      rule::Any::Keyword(..) => token::Keyword { stream: self, id }.into(),
      rule::Any::Ident(..) => token::Ident { stream: self, id }.into(),
@@ -144,7 +144,7 @@ impl<'ctx> Stream<'ctx> {
          token::Digital { stream: self, id, meta, idx: 0 }.into()
        }
-      });
+      })
  }

  pub(crate) fn lookup_meta(&self, id: token::Id) -> Option<&rt::Metadata> {
@@ -374,7 +374,7 @@ impl<'lex> Cursor<'lex> {
    &'a mut self,
    delim: Lexeme<rule::Keyword>,
    mut cb: impl FnMut(&mut Self) -> Option<T> + 'a,
-  ) -> impl Iterator<Item = (T, Option<token::Keyword<'lex>>)> + '_ {
+  ) -> impl Iterator<Item = (T, Option<token::Keyword<'lex>>)> + 'a {
    let mut sep = switch::switch().case(delim, |x, _| x);
    let mut done = false;
    let mut prev = self.cursor;

From 7e72b6dc18bcc7867c3e02e024e8ad0a29f567ab Mon Sep 17 00:00:00 2001
From: Miguel Young de la Sota
Date: Wed, 1 Jan 2025 16:53:33 -0800
Subject: [PATCH 9/9] ilex: Unbreak docs

---
 ilex/src/file/context.rs | 6 ++++--
 ilex/src/file/mod.rs     | 4 ++--
 ilex/src/report/mod.rs   | 7 +++++--
 ilex/src/rule.rs         | 2 +-
 ilex/src/token/mod.rs    | 2 +-
 ilex/src/token/stream.rs | 2 +-
 6 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/ilex/src/file/context.rs b/ilex/src/file/context.rs
index 97e1103..5d774e7 100644
--- a/ilex/src/file/context.rs
+++ b/ilex/src/file/context.rs
@@ -11,11 +11,13 @@ use crate::report;
 use crate::report::Fatal;
 use crate::report::Report;

+#[cfg(doc)]
+use crate::Span;
+
 /// A source context, which owns source code files.
 ///
 /// A `Context` contains the full text of all the loaded source files, which
-/// [`SpanId`]s ultimately refer to. Most [`SpanId`] operations need their
-/// corresponding `Context` available.
+/// [`Span`]s ultimately refer to.
 #[derive(Default)]
 pub struct Context {
  state: Arc<RwLock<State>>,
 }
diff --git a/ilex/src/file/mod.rs b/ilex/src/file/mod.rs
index 139e11a..10924e4 100644
--- a/ilex/src/file/mod.rs
+++ b/ilex/src/file/mod.rs
@@ -302,7 +302,7 @@ pub trait Spanned<'ctx> {
  /// Returns the span in this syntax element.
  fn span(&self) -> Span<'ctx>;

-  /// Forwards to [`SpanId::file()`].
+  /// Forwards to [`Span::file()`].
  fn file(&self) -> File<'ctx> {
    self.span().file()
  }
@@ -327,7 +327,7 @@ pub trait Spanned<'ctx> {
    self.span().len()
  }

-  /// Forwards to [`SpanId::text()`].
+  /// Forwards to [`Span::text()`].
  fn text(&self) -> &'ctx str {
    self.span().text()
  }
diff --git a/ilex/src/report/mod.rs b/ilex/src/report/mod.rs
index 98a3f5c..2c19d0e 100644
--- a/ilex/src/report/mod.rs
+++ b/ilex/src/report/mod.rs
@@ -2,7 +2,7 @@
 //!
 //! This module contains types for generating an *error report*: a collection of
 //! diagnostics that describe why an operation failed in detail. Diagnostics
-//! are basically fancy compiler errors: they use [`SpanId`]s to present faulty
+//! are basically fancy compiler errors: they use [`Span`]s to present faulty
 //! input in context.
 //!
 //! The [`Report`] type is a reference-counted list of diagnostics, which is
@@ -28,10 +28,13 @@ pub use builtin::Expected;
 pub use diagnostic::Diagnostic;
 use diagnostic::Kind;

+#[cfg(doc)]
+use crate::Span;
+
 /// A collection of errors can may built up over the course of an operation.
 ///
 /// To construct a report, see [`Context::new_report()`]. The context that
-/// constructs a report is the only one whose [`SpanId`]s should be passed into
+/// constructs a report is the only one whose [`Span`]s should be passed into
 /// it; doing otherwise will result in unspecified output (or probably a panic).
 pub struct Report {
  ctx: Context,
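The `#[cfg(doc)] use crate::Span;` lines are the standard trick for unbreaking intra-doc links after a type leaves a module's real imports: the path is brought into scope only when rustdoc builds, so normal compilation sees no unused import. In a hypothetical module it looks like:

    #[cfg(doc)]
    use crate::Span; // rustdoc-only; lets [`Span`] resolve below.

    /// Owns source files that [`Span`]s point back into.
    pub struct SourceMap { /* ... */ }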
diff --git a/ilex/src/rule.rs b/ilex/src/rule.rs
index 1c12304..cadd086 100644
--- a/ilex/src/rule.rs
+++ b/ilex/src/rule.rs
@@ -1060,7 +1060,7 @@ impl TryFrom<Any> for Digital {
 ///
 /// Comments do not generate tokens, unlike most rules. Instead, they are
 /// attached to the span of a token, and can be inspected through
-/// [`Span::comments()`][crate::Span::comments].
+/// [`Token::comments()`][crate::Token::comments].
 #[derive(Debug)]
 pub struct Comment {
  pub(crate) bracket: Bracket,
diff --git a/ilex/src/token/mod.rs b/ilex/src/token/mod.rs
index 1644b00..92c2742 100644
--- a/ilex/src/token/mod.rs
+++ b/ilex/src/token/mod.rs
@@ -1000,7 +1000,7 @@ impl<'lex> Quoted<'lex> {
  /// Returns the raw content of this token.
  ///
  /// There are two kinds of content: either a literal span of Unicode scalars
-  /// (represented as a [`SpanId`] pointing to those characters) or a single
+  /// (represented as a [`Span`] pointing to those characters) or a single
  /// escape, potentially with some side data.
  ///
  /// It is up to the user of the library to decode these two content types into
diff --git a/ilex/src/token/stream.rs b/ilex/src/token/stream.rs
index 504fac9..e25c7ac 100644
--- a/ilex/src/token/stream.rs
+++ b/ilex/src/token/stream.rs
@@ -534,7 +534,7 @@ pub struct Comments<'lex> {
 }

 impl<'lex> Comments<'lex> {
-  /// Adapts this iterator to return just the text contents of each [`SpanId`].
+  /// Adapts this iterator to return just the text contents of each [`Span`].
  pub fn as_strings(self) -> impl Iterator<Item = &'lex str> + 'lex {
    self.map(Span::text)
  }
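After this series, pulling the comments off a token is a one-liner with no `Context` in sight. Usage sketch (lexer setup omitted; `token` is assumed to be any lexed token):

    let comments: Vec<&str> = token.comments().as_strings().collect();
    for text in &comments {
      println!("attached comment: {text}");
    }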