1 change: 1 addition & 0 deletions ilex/Cargo.toml
@@ -26,3 +26,4 @@ regex-syntax = "0.8.2"
regex-automata = "0.4.3" # Bless Andrew for his patience.
rustc_apfloat = "0.2.0" # By eddyb's recommendation.
unicode-xid = "0.2.4"
bitvec = "1.0.1"
6 changes: 6 additions & 0 deletions ilex/src/rt/dfa.rs
@@ -165,6 +165,12 @@ fn compile_rule(rule: &Any) -> Rule {
let (pat, close) = match rule {
Any::Keyword(rule) => (lit(&rule.value), None),

Any::LineEnd(rule) if rule.cancel.is_empty() => (lit(&"\n".into()), None),

Any::LineEnd(rule) => {
(Hir::alternation(vec![lit(&rule.cancel), lit(&"\n".into())]), None)
}

Any::Comment(rule) => {
// We can just throw the bracket in, regardless of whether it's a line
// comment. Because of how the outer lexer loop works, we will run the DFA
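The two LineEnd arms above compile to either a bare newline literal or the alternation `cancel|\n`. A standalone sketch of that shape, calling regex-syntax 0.8's `Hir::literal` and `Hir::alternation` directly (ilex's `lit` helper is not reproduced here, so this is an approximation rather than the crate's exact code):

use regex_syntax::hir::Hir;

fn line_end_pattern(cancel: &str) -> Hir {
    if cancel.is_empty() {
        // No cancel configured: the rule is just the newline itself.
        return Hir::literal("\n".as_bytes());
    }
    // With a cancel, match either the cancel string or the newline, so
    // the DFA can report whichever of the two it actually saw.
    Hir::alternation(vec![
        Hir::literal(cancel.as_bytes()),
        Hir::literal("\n".as_bytes()),
    ])
}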
35 changes: 33 additions & 2 deletions ilex/src/rt/emit2.rs
@@ -258,9 +258,34 @@ pub fn emit(lexer: &mut Lexer) {
// Now we have to repeat the process from 'verify, but now we know what kind
// of token we're going to create.

match lexer.spec().rule(best.lexeme) {
let rule = lexer.spec().rule(best.lexeme);
if !matches!(rule, Any::Comment(..)) {
// Diagnose a \ that is not followed by only spaces and comments.
if let Some(cancel) = lexer.line_end_cancel.take() {
let cancel = cancel.get(lexer.file());
lexer
.report()
.builtins(lexer.spec())
.unexpected(cancel.text(), best.lexeme, cancel)
.note(f!(
"expected `{}` to be followed by a new line",
cancel.text()
));
}
}

match rule {
Any::Keyword(..) => lexer.add_token(best.lexeme, range.len(), None),

Any::LineEnd(..) if text == "\n" => {
lexer.add_token(best.lexeme, range.len(), None)
}
Any::LineEnd(..) => {
// The cancel is always inserted as whitespace.
lexer.add_token(rt::WHITESPACE, range.len(), None);
lexer.line_end_cancel = Some(range.span2())
}

Any::Bracket(..) => {
// Construct the closer.
lexer.push_closer(
Expand Down Expand Up @@ -306,6 +331,12 @@ pub fn emit(lexer: &mut Lexer) {
.unclosed(span, &close, Lexeme::eof(), lexer.eof());
}

// Crop off an ending \n so that it can get turned into whitespace or
// a line end token, as appropriate.
if close == "\n" && depth == 0 {
cursor -= 1;
}

lexer.add_token(best.lexeme, cursor - lexer.cursor(), None);
}

Expand Down Expand Up @@ -755,7 +786,7 @@ pub fn emit(lexer: &mut Lexer) {
}

let rest = lexer.text(lexer.cursor()..);
let prev = rest.chars().next_back();
let prev = lexer.text(..lexer.cursor()).chars().next_back();
if prev.is_some_and(is_xid) {
let xids = rest.find(|c| !is_xid(c)).unwrap_or(rest.len());
if xids > 0 {
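The cancel handling added earlier in this file treats a cancel as legal only when its line ends right after it, allowing trailing spaces and comments; anything else is diagnosed as unexpected. A simplified standalone model of that check, with plain types in place of ilex's lexer state and without the comment-skipping the real code gets from the outer loop:

/// Returns true if the text following a cancel consists of nothing but
/// horizontal whitespace up to the next newline (or end of input).
fn cancel_is_legal(after_cancel: &str) -> bool {
    after_cancel
        .chars()
        .find(|&c| c != ' ' && c != '\t')
        .map_or(true, |c| c == '\n')
}

fn main() {
    assert!(cancel_is_legal("   \nint x;")); // cancel, spaces, newline: fine
    assert!(!cancel_is_legal(" + 2\n"));     // cancel followed by code: diagnosed
}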
14 changes: 13 additions & 1 deletion ilex/src/rt/lexer.rs
@@ -3,12 +3,14 @@ use std::num::NonZeroU32;
use std::ops::Index;
use std::ops::RangeBounds;

use bitvec::vec::BitVec;
use byteyarn::Yarn;
use regex_automata::hybrid::dfa::Cache;

use crate::f;
use crate::file::File;
use crate::file::Span;
use crate::file::Span2;
use crate::report::Builtins;
use crate::report::Report;
use crate::rt;
@@ -31,6 +33,7 @@ pub struct Lexer<'a, 'ctx> {
cursor: usize,
closers: Vec<Closer>,
comments: Vec<token::Id>,
pub line_end_cancel: Option<Span2>,

cache: Cache,
}
@@ -55,11 +58,13 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> {
toks: Vec::new(),
meta_idx: Vec::new(),
meta: Vec::new(),
silent: BitVec::new(),
},

cursor: 0,
closers: Vec::new(),
comments: Vec::new(),
line_end_cancel: None,

cache: Cache::new(&spec.dfa().engine),
}
@@ -290,10 +295,17 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> {
}

pub fn skip_whitespace(&mut self) -> bool {
let have_line_end = self.spec().builder.line_end.is_some();
let len = self
.text(self.cursor()..)
.chars()
.take_while(|c| c.is_whitespace())
.take_while(|&c| {
if c == '\n' && have_line_end {
return self.line_end_cancel.take().is_some();
}

c.is_whitespace()
})
.map(char::len_utf8)
.sum();

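The new take_while closure changes what counts as skippable: when the spec defines a LineEnd rule, a newline is whitespace only if it consumes a pending cancel; otherwise it is left in the input so the DFA can turn it into a line-end token. A minimal model of that predicate, assuming the same take-once semantics as the `Option::take` in the hunk above:

fn is_skippable(c: char, have_line_end: bool, pending_cancel: &mut bool) -> bool {
    if c == '\n' && have_line_end {
        // Mirrors `self.line_end_cancel.take()`: the cancel is consumed
        // by the first newline it applies to; later newlines are tokens.
        return std::mem::take(pending_cancel);
    }
    c.is_whitespace()
}

fn main() {
    let mut pending = true;
    assert!(is_skippable('\n', true, &mut pending));  // cancel consumed
    assert!(!is_skippable('\n', true, &mut pending)); // next newline is a token
}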
63 changes: 63 additions & 0 deletions ilex/src/rule.rs
@@ -32,6 +32,7 @@ pub use crate::token::Sign;
#[allow(missing_docs)]
pub enum Any {
Keyword(Keyword),
LineEnd(LineEnd),
Bracket(Bracket),
Ident(Ident),
Quoted(Quoted),
@@ -44,6 +45,7 @@ impl Any {
pub(crate) fn debug_name(&self) -> &'static str {
match self {
Any::Keyword(_) => "Keyword",
Any::LineEnd(_) => "LineEnd",
Any::Bracket(_) => "Bracket",
Any::Ident(_) => "Ident",
Any::Digital(_) => "Digital",
@@ -141,6 +143,67 @@ impl TryFrom<Any> for Keyword {
}
}

/// A line ending.
///
/// Line ends are like [`Keyword`]s with the value `"\n"`, but with two
/// extra features:
///
/// 1. They can specify a "cancel" string for escaping a newline. This is
/// valuable for situations where a line end is syntactically meaningful, but
/// users need to break a line without it affecting lexing. For example, `\`
/// takes this role in C, since C uses a line-end token for `#define`s.
///
/// The cancel string, followed by whitespace and then a newline, will cause
/// that newline to become whitespace, rather than a token.
///
/// 2. They play nice with line comments. A line comment's ending newline will
/// be turned into a `LineEnd`, unless the comment was prefixed with the
/// cancel string.
#[derive(Default, Debug)]
pub struct LineEnd {
pub(crate) cancel: Yarn,
}

impl LineEnd {
/// Constructs a new line end rule with no cancel.
pub fn new() -> Self {
Self::default()
}

/// Constructs a new line end rule with the given cancel prefix.
pub fn cancellable(cancel: impl Into<Yarn>) -> Self {
Self { cancel: cancel.into() }
}
}

impl Rule for LineEnd {
type Token<'lex> = token::Keyword<'lex>;

fn try_from_ref(value: &Any) -> Result<&Self, WrongKind> {
match value {
Any::LineEnd(rule) => Ok(rule),
_ => Err(WrongKind { want: "LineEnd", got: value.debug_name() }),
}
}
}

impl From<LineEnd> for Any {
fn from(value: LineEnd) -> Self {
Any::LineEnd(value)
}
}

impl TryFrom<Any> for LineEnd {
type Error = WrongKind;

fn try_from(value: Any) -> Result<Self, Self::Error> {
match value {
Any::LineEnd(rule) => Ok(rule),
_ => Err(WrongKind { want: "LineEnd", got: value.debug_name() }),
}
}
}
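To make the cancel semantics concrete, here is a hedged usage sketch. Only `LineEnd::new` and `LineEnd::cancellable` come from this diff; the input shown in the comment is illustrative and the spec wiring is elided:

use ilex::rule::LineEnd;

fn main() {
    // With `LineEnd::cancellable("\\")`, a C-style continuation such as
    //
    //     #define MAX(a, b) \
    //         ((a) > (b) ? (a) : (b))
    //
    // lexes as one logical line: the `\` and the newline it escapes
    // become whitespace, and only the final newline produces a
    // line-end token.
    let _bare = LineEnd::new();
    let _c_style = LineEnd::cancellable("\\");
}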

/// A paired bracket, such as `(..)`.
///
/// Brackets are pairs of delimiters with tokens between them. They are used as
11 changes: 9 additions & 2 deletions ilex/src/spec.rs
@@ -15,6 +15,7 @@ use crate::report::Expected;
use crate::rt;
use crate::rule;
use crate::rule::Comment;
use crate::rule::LineEnd;
use crate::rule::Rule;

/// An ID for a lexeme that a [`Spec`][crate::Spec] can capture.
@@ -85,7 +86,7 @@ impl<R> fmt::Debug for Lexeme<R> {
/// This is a compiled, immutable object that describes how to lex a particular
/// language. The [`Spec::builder()`] function returns a builder for
pub struct Spec {
builder: SpecBuilder,
pub(crate) builder: SpecBuilder,
dfa: rt::Dfa,
}

@@ -139,6 +140,7 @@ impl Spec {
pub struct SpecBuilder {
pub(crate) rules: Vec<rule::Any>,
pub(crate) names: Vec<Yarn>,
pub(crate) line_end: Option<Lexeme<LineEnd>>,
}

impl SpecBuilder {
@@ -198,7 +200,11 @@ impl SpecBuilder {

self.names.push(name.into());
self.rules.push(rule.into());
Lexeme::new(self.rules.len() as i32 - 1)
let lex = Lexeme::new(self.rules.len() as i32 - 1);
if let rule::Any::LineEnd(_) = self.rules.last().unwrap() {
self.line_end = Some(lex.cast());
}
lex
}

#[doc(hidden)]
@@ -258,6 +264,7 @@ impl Lexeme<rule::Any> {

match spec.rule(self) {
rule::Any::Keyword(rule) => yarn!("`{}`", rule.value),
rule::Any::LineEnd(_) => "line ending".into(),
rule::Any::Bracket(d)
| rule::Any::Comment(Comment { bracket: d, .. }) => match &d.kind {
rule::BracketKind::Paired(open, close) => {
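One design note on the builder change above: every LineEnd rule pushed overwrites `line_end`, so the lexer's "does this spec have a line end?" check keys off the most recently added one. A stand-in sketch of that bookkeeping, with simplified types rather than ilex's real `Lexeme` and `rule::Any`:

enum Mini {
    LineEnd,
    Other,
}

struct Builder {
    rules: Vec<Mini>,
    line_end: Option<usize>, // index of the last LineEnd rule, if any
}

impl Builder {
    fn rule(&mut self, r: Mini) -> usize {
        self.rules.push(r);
        let lex = self.rules.len() - 1;
        // Mirrors the diff: a LineEnd rule records its lexeme here.
        if matches!(self.rules.last(), Some(Mini::LineEnd)) {
            self.line_end = Some(lex);
        }
        lex
    }
}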