diff --git a/Cargo.lock b/Cargo.lock index 73ab2d1..af36520 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,67 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "anyhow" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca" + [[package]] name = "avjason" version = "0.1.0" +dependencies = [ + "anyhow", + "avjason-macros", + "finl_unicode", +] + +[[package]] +name = "avjason-macros" +version = "0.1.0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "finl_unicode" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" + +[[package]] +name = "proc-macro2" +version = "1.0.78" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "syn" +version = "2.0.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" diff --git a/Cargo.toml b/Cargo.toml index 1fd0a5c..1c02b1b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,3 +1,4 @@ +workspace = { members = ["macros"] } [package] name = "avjason" version = "0.1.0" @@ -6,3 +7,8 @@ edition = "2021" # See more keys and 
their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +anyhow = "1.0.79" +finl_unicode = "1.2.0" + +[dependencies.avjason-macros] +path = "./macros" diff --git a/macros/Cargo.toml b/macros/Cargo.toml new file mode 100644 index 0000000..df1cf4c --- /dev/null +++ b/macros/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "avjason-macros" +version = "0.1.0" +edition = "2021" + +[lib] +proc-macro = true + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +proc-macro2 = "1.0.78" +quote = "1.0.35" +syn = { version = "2.0.48", features = ["full"] } diff --git a/macros/src/lib.rs b/macros/src/lib.rs new file mode 100644 index 0000000..ca8f86e --- /dev/null +++ b/macros/src/lib.rs @@ -0,0 +1,306 @@ +//! +//! Macros for the main crate. +//! + +#![feature(proc_macro_diagnostic, char_min)] + +mod spanned; +mod type_traversal; +mod unicode_category; +mod utils; +mod verbatim; + +use proc_macro::{Diagnostic, Level, Span, TokenStream as Tokens}; +use quote::ToTokens; +use spanned::{derive_spanned_for_enum, derive_spanned_for_struct}; +use syn::parse_macro_input; +use unicode_category::UnicodePatInput; +use utils::{get_item_attrs, ECMARef, JSON5Ref, ToRustdoc}; +use verbatim::VerbatimPat; + +/// +/// ## SpecRef +/// +/// Allows easy reference of the [**JSON5** specification](https://spec.json5.org/). +/// +/// This macro will add an additional section at the top of the Rustdoc +/// for the item attached, linking to the relevant section in the specification. +/// +/// ### Example +/// +/// ``` +/// use avjason_macros::SpecRef; +/// +/// // With custom title. +/// #[SpecRef("Number", "JSON5Number")] +/// struct Number; +/// +/// // Without custom title. 
+/// #[SpecRef("JSON5String")] +/// struct LitString; +/// ``` +/// +#[allow(non_snake_case)] +#[proc_macro_attribute] +pub fn SpecRef(params: Tokens, target: Tokens) -> Tokens { + let mut target: syn::Item = parse_macro_input!(target); + let params: JSON5Ref = parse_macro_input!(params); + let attrs = params.to_rustdoc(); + + let Some(original_attrs) = get_item_attrs(&mut target) else { + return syn::Error::new_spanned(target, "Cannot add spec ref to this item.") + .into_compile_error() + .into(); + }; + + // Prepend our new documentation to the start of + // the attribute macros. + *original_attrs = attrs + .into_iter() + .chain(original_attrs.iter().cloned()) + .collect(); + + target.into_token_stream().into() +} + +/// +/// ## ECMARef +/// +/// Allows easy reference of the [**ECMAScript 5.1** specification](https://262.ecma-international.org/5.1/#). +/// +/// This macro will add an additional section at the top of the Rustdoc +/// for the item attached, linking to the relevant section in the specification. +/// +/// ### Example +/// +/// ``` +/// use avjason_macros::ECMARef; +/// +/// // You must always include an acompanying URL. +/// #[ECMARef("NullLiteral", "https://262.ecma-international.org/5.1/#sec-7.8.1")] +/// struct LitNull; +/// ``` +/// +#[allow(non_snake_case)] +#[proc_macro_attribute] +pub fn ECMARef(params: Tokens, target: Tokens) -> Tokens { + let mut target: syn::Item = parse_macro_input!(target); + let params: ECMARef = parse_macro_input!(params); + let attrs = params.to_rustdoc(); + + let Some(original_attrs) = get_item_attrs(&mut target) else { + return syn::Error::new_spanned(target, "Cannot add spec ref to this item.") + .into_compile_error() + .into(); + }; + + // Prepend our new documentation to the start of + // the attribute macros. 
+ *original_attrs = attrs + .into_iter() + .chain(original_attrs.iter().cloned()) + .collect(); + + target.into_token_stream().into() +} + +/// +/// ## derive(Spanned) +/// +/// Derives the Spanned trait for both structs and enums. +/// +/// ### Terminal Tokens +/// ```ignore +/// use avjason_macros::Spanned; +/// use avjason::common::Span; +/// /// +/// /// (1) Named span field. +/// /// +/// /// ASCII digit '0'..='9'. +/// /// +/// #[derive(Spanned)] +/// struct Digit { +/// letter: char, +/// span: Span, +/// } +/// +/// /// +/// /// (2) Tuple struct. +/// /// +/// /// Literally `.` +/// /// +/// #[derive(Spanned)] +/// struct Dot(Span); +/// ``` +/// These are not composed of any smaller tokens. These *must* either: +/// 1. have a name `span: Span` field, or +/// 2. be a tuple struct with *only* a single Span field. +/// +/// *** +/// +/// ### Non-terminal Tokens +/// ```ignore +/// /// +/// /// (1.1) Named Struct +/// /// +/// /// A base-10 decimal number, +/// /// with optional integral part. +/// /// +/// #[derive(Spanned)] +/// struct Decimal { +/// integral: Many, +/// point: Dot, +/// mantissa: AtLeast<1, Digit> +/// } +/// +/// /// +/// /// (1.2) Tuple struct +/// /// +/// /// A base-10 integer. +/// /// +/// #[derive(Spanned)] +/// struct Integer(AtLeast<1, Digit>); +/// +/// /// +/// /// (2.1) Enum (union of tokens). +/// /// +/// /// A number: either an integer, or floating-point. +/// /// +/// #[derive(Spanned)] +/// enum Number { +/// Decimal(Decimal), +/// Integer(Integer), +/// } +/// +/// /// +/// /// (2.2) More complex enum. +/// /// +/// /// Either a base-10 integer, or hex integer. +/// /// +/// #[derive(Spanned)] +/// enum NumberOrHex { +/// Base10(AtLeast<1, Digit>), +/// Base16(v!("0x"), AtLeast<1, HexDigit>), +/// } +/// ``` +/// +/// These tokens derive their span from all of their child tokens. +/// They can be expressed either as: +/// +/// 1. Structs, either: +/// 1. Named, or +/// 2. Tuple. +/// 2. Enums: +/// 1. 
Union types, and even +/// 2. More complicated structures. +/// +#[proc_macro_derive(Spanned)] +pub fn spanned(target: Tokens) -> Tokens { + if let Ok(st) = syn::parse::(target.clone()) { + return derive_spanned_for_struct(&st); + } + + if let Ok(en) = syn::parse::(target.clone()) { + return derive_spanned_for_enum(&en); + } + + Diagnostic::spanned( + Span::call_site(), + Level::Error, + "Expected a struct or enum here.", + ) + .emit(); + + Default::default() +} + +/// +/// ## verbatim! +/// +/// Often shortend to `v!`, use *this* macro instead +/// of its struct helper friends `Verbatim<...>`, `CharPattern<...>`. +/// +/// ### Examples +/// ```ignore +/// use avjason_macros::verbatim as v; +/// +/// // (1) Single char match -> Verbatim<{char as &str}> +/// type Comma = v!(','); +/// +/// // (2) String match -> Verbatim<{&str}> +/// type NaN = v!("NaN"); +/// +/// // (3) Char range match -> CharPattern<{CharacterRange { +/// // start: start, +/// // end: end, // (modified to make the end exclusive) +/// // }}> +/// type Digit = v!('0'..='9'); +/// type NonZero = v!('1'..='9'); +/// ``` +/// +#[proc_macro] +pub fn verbatim(params: Tokens) -> Tokens { + let params: VerbatimPat = syn::parse_macro_input!(params); + let ty = params.into_type(); + ty.into_token_stream().into() +} + +/// +/// ## unicode! +/// +/// Allows you to match entire Unicode major, or minor groups +/// (but not both at the same time!) +/// +/// Use *this* macro instead of `MatchMajorCategory<...>` and `MatchMinorCategory<...>`. +/// +/// ### Examples +/// +/// ```ignore +/// use avjason_macros::unicode; +/// +/// /// +/// /// (1) Major category -> any character in the "Letters (L)" category. +/// /// +/// pub type ULetter = unicode!(L); +/// +/// /// +/// /// (2) Minor category -> any character in the "Math symbols (Sm)" category. +/// /// +/// pub type UMathSymbol = unicode!(Sm); +/// +/// /// +/// /// (3.1) Union of major categories -> any unicode character. 
+/// /// +/// pub type UAll = unicode!(C | L | M | N | P | S | Z); +/// +/// /// +/// /// (3.2) Union of minor categories -> +/// /// equivalent to major category "Letters (L)". +/// /// +/// pub type ULetterUnion = unicode!(Lu | Ll | Lt | Lm | Lo); +/// ``` +/// +/// ### Syntax +/// This macro accepts either: +/// 1. one-letter Unicode major categories (`C`, `L`, `M`, `N`, `P`, `S`, `Z`). +/// 2. two-letter Unicode minor categories: +/// * `C` -> `Cc`, `Cf`, `Cs`, `Co`, `Cn`; +/// * `L` -> `Lu`, `Ll`, `Lt`, `Lm`, `Lo`; +/// * `M` -> `Mm`, `Mc`, `Me`; +/// * `P` -> `Pc`, `Pd`, `Ps`, `Pe`, `Pi`, `Pf`, `Po`; +/// * `S` -> `Sm`, `Sc`, `Sk`, `So`; +/// * `Z` -> `Zs`, `Zl`, `Zp`; +/// 3. Unions of: +/// 1. Major categories (only) +/// 2. Minor categories (only) +/// +#[proc_macro] +pub fn unicode(params: Tokens) -> Tokens { + let params: UnicodePatInput = syn::parse_macro_input!(params); + + params + .into_type() + .map(ToTokens::into_token_stream) + .map(Into::into) + .unwrap_or_default() +} diff --git a/macros/src/spanned.rs b/macros/src/spanned.rs new file mode 100644 index 0000000..965d3a0 --- /dev/null +++ b/macros/src/spanned.rs @@ -0,0 +1,352 @@ +//! +//! Utilities for #[derive(Spanned)] +//! 
+ +use proc_macro::{Diagnostic, Level}; +use proc_macro2::Span; +use quote::quote; +use syn::{punctuated::Punctuated, spanned::Spanned}; + +use crate::type_traversal::{ + field_access, index, is_named_type, self_keyword, variant_path, Generic, ToMember, +}; + +mod paths { + use proc_macro2::Span; + use syn::punctuated::Punctuated; + + /// + /// Equivalent to: + /// + /// ```ignore + /// crate::common::Spanned::span + /// ``` + /// + pub fn span() -> syn::Expr { + syn::Expr::Path(syn::ExprPath { + attrs: Default::default(), + qself: Default::default(), + path: syn::Path { + leading_colon: Default::default(), + segments: Punctuated::from_iter( + ["crate", "common", "Spanned", "span"] + .into_iter() + .map(|s| syn::PathSegment { + ident: syn::Ident::new(s, Span::call_site()), + arguments: syn::PathArguments::None, + }), + ), + }, + }) + } + + /// + /// Equivalent to: + /// + /// ```ignore + /// crate::common::SpanIter::combine + /// ``` + /// + pub fn combine() -> syn::Expr { + syn::Expr::Path(syn::ExprPath { + attrs: Default::default(), + qself: Default::default(), + path: syn::Path { + leading_colon: Default::default(), + segments: Punctuated::from_iter( + ["crate", "common", "SpanIter", "combine"] + .into_iter() + .map(|s| syn::PathSegment { + ident: syn::Ident::new(s, Span::call_site()), + arguments: syn::PathArguments::None, + }), + ), + }, + }) + } +} + +/// +/// Equivalent to: +/// +/// ``` +/// crate::utils::Spanned::span(& $expr) +/// ``` +/// +fn span_of(expr: syn::Expr) -> syn::Expr { + let reference = syn::Expr::Reference(syn::ExprReference { + attrs: Default::default(), + and_token: Default::default(), + mutability: None, + expr: Box::new(expr), + }); + + syn::Expr::Call(syn::ExprCall { + attrs: Default::default(), + func: Box::new(paths::span()), + paren_token: Default::default(), + args: Punctuated::from_iter([reference]), + }) +} + +/// +/// Equivalent to: +/// +/// ``` +/// crate::utils::SpanIter::combine([crate::utils::Spanned::span(& $expr), .. 
]) +/// ``` +/// +fn spans_of(exprs: impl IntoIterator) -> syn::Expr { + let spans = exprs.into_iter().map(span_of); + + syn::Expr::Call(syn::ExprCall { + attrs: Default::default(), + func: Box::new(paths::combine()), + paren_token: Default::default(), + args: Punctuated::from_iter([syn::Expr::Array(syn::ExprArray { + attrs: Default::default(), + bracket_token: Default::default(), + elems: Punctuated::from_iter(spans), + })]), + }) +} + +pub fn spanned_for_struct(st: &syn::ItemStruct) -> Option { + let syn::ItemStruct { fields, .. } = st; + + let span_field = fields + .iter() + .enumerate() + .find(|(_, syn::Field { ty, .. })| is_named_type(ty, "Span").is_some()); + + // Case 1: this struct represents a terminal token. + // Use the included `Span` field. + if let Some((idx, span_field)) = span_field { + if matches!(fields, syn::Fields::Unnamed(_)) { + if fields.len() > 1 { + Diagnostic::spanned( + st.fields.span().unwrap(), + Level::Warning, + "Non single-field tuple with Span field.", + ) + .emit(); + + Diagnostic::spanned( + st.ident.span().unwrap(), + Level::Help, + "Make these fields named with a `span` field instead.", + ) + .emit() + } + + return Some(span_of(field_access(index(idx as u32)))); + } + + if matches!(fields, syn::Fields::Named(_)) { + // Unwrap ok since we're not a tuple-struct. + let ident = span_field.ident.clone().unwrap(); + + if ident != "span" { + Diagnostic::spanned( + ident.span().unwrap(), + Level::Warning, + "Named Span field should be called `span`.", + ) + .emit(); + + Diagnostic::spanned( + ident.span().unwrap(), + Level::Help, + "Rename this field to `span`.", + ) + .emit(); + } + + return Some(span_of(field_access(ident))); + } + } + + // Case 2: Product type => combine all span values of our fields, in order. + + match fields { + syn::Fields::Named(syn::FieldsNamed { named, .. 
}) => 'a: { + if named.is_empty() { + break 'a; + } + + return Some(spans_of( + named + .into_iter() + .cloned() + .filter_map(|f| f.ident) + .map(field_access), + )); + } + syn::Fields::Unnamed(syn::FieldsUnnamed { unnamed, .. }) => 'a: { + if unnamed.is_empty() { + break 'a; + } + + return Some(spans_of( + unnamed + .into_iter() + .cloned() + .enumerate() + .map(|(i, _)| index(i as u32)) + .map(field_access), + )); + } + syn::Fields::Unit => (), + } + + Diagnostic::spanned( + st.span().unwrap(), + Level::Error, + "Cannot derive `Spanned` for unit-like struct.", + ) + .emit(); + + None +} + +fn ident_pat(ident: syn::Ident) -> syn::Pat { + syn::Pat::Ident(syn::PatIdent { + attrs: Default::default(), + by_ref: None, + mutability: None, + ident, + subpat: None, + }) +} + +fn spanned_variant_arm(var: &syn::Variant) -> syn::Arm { + let syn::Variant { ident, fields, .. } = var; + let path = variant_path(ident); + + let (members, f_idents): (Vec<_>, Vec<_>) = fields + .iter() + .enumerate() + .map(|(i, f)| { + f.ident + .clone() + .map(|i| (i.clone().to_member(), i)) + .unwrap_or_else(|| { + ( + index(i as u32).to_member(), + syn::Ident::new(&format!("f{i}"), Span::call_site()), + ) + }) + }) + .unzip(); + + let pat = match fields { + syn::Fields::Named(_) => syn::Pat::Struct(syn::PatStruct { + attrs: Default::default(), + qself: Default::default(), + path, + brace_token: Default::default(), + fields: Punctuated::from_iter(members.into_iter().zip(f_idents.iter().cloned()).map( + |(member, ident)| syn::FieldPat { + attrs: Default::default(), + member, + colon_token: Default::default(), + pat: Box::new(ident_pat(ident)), + }, + )), + rest: Default::default(), + }), + syn::Fields::Unnamed(_) => syn::Pat::TupleStruct(syn::PatTupleStruct { + attrs: Default::default(), + qself: Default::default(), + path, + paren_token: Default::default(), + elems: Punctuated::from_iter(f_idents.iter().cloned().map(ident_pat)), + }), + syn::Fields::Unit => unreachable!(), + }; + + syn::Arm { + 
attrs: Default::default(), + pat, + guard: None, + fat_arrow_token: Default::default(), + body: Box::new(spans_of(f_idents.into_iter().map(|ident| { + syn::Expr::Path(syn::ExprPath { + attrs: Default::default(), + qself: Default::default(), + path: ident.into(), + }) + }))), + comma: Some(Default::default()), + } +} + +pub fn spanned_for_enum(en: &syn::ItemEnum) -> Option { + let vars = &en.variants; + + if vars.is_empty() { + Diagnostic::spanned( + en.span().unwrap(), + Level::Error, + "Cannot derive spanned for enum with no variants.", + ) + .emit(); + + return None; + } + + // Check if any variants are unit-like, if so give errors then terminate. + if vars + .iter() + .filter(|syn::Variant { fields, .. }| fields.is_empty()) + .map(|f| { + Diagnostic::spanned( + f.span().unwrap(), + Level::Error, + "Cannot derive spanned for enum with unit-like variants.", + ) + .emit() + }) + .next() + .is_some() + { + return None; + } + + Some(syn::Expr::Match(syn::ExprMatch { + attrs: Default::default(), + match_token: Default::default(), + expr: Box::new(self_keyword()), + brace_token: Default::default(), + arms: vars.iter().map(spanned_variant_arm).collect(), + })) +} + +fn derive_spanned(gen: &impl Generic, span_expr: Option) -> proc_macro::TokenStream { + let ident = gen.ident(); + let generics = gen.generics(); + let generic_letters = gen.generic_letters(); + + if let Some(span) = span_expr { + return quote! 
{ + impl #generics crate::common::Spanned for #ident #generic_letters { + fn span(&self) -> crate::common::Span { + #span + } + } + } + .into(); + } + + Default::default() +} + +pub fn derive_spanned_for_struct(st: &syn::ItemStruct) -> proc_macro::TokenStream { + let span_expr = spanned_for_struct(st); + derive_spanned(st, span_expr) +} + +pub fn derive_spanned_for_enum(en: &syn::ItemEnum) -> proc_macro::TokenStream { + let span_expr = spanned_for_enum(en); + derive_spanned(en, span_expr) +} diff --git a/macros/src/type_traversal.rs b/macros/src/type_traversal.rs new file mode 100644 index 0000000..4b13116 --- /dev/null +++ b/macros/src/type_traversal.rs @@ -0,0 +1,119 @@ +//! +//! Utilities that allow use to traverse `struct`s and `enum`s. +//! + +use proc_macro2::Span; +use quote::{quote, ToTokens}; +use syn::punctuated::Punctuated; + +/// +/// Checks to see if an identifier is in a path. +/// +pub fn in_path<'a>(path: &'a syn::Path, ident: &str) -> Option<&'a syn::PathSegment> { + path.segments + .iter() + .find(|syn::PathSegment { ident: id, .. }| id == ident) +} + +/// +/// Checks if a type has the ident inside its name. +/// +pub fn is_named_type<'a>(ty: &'a syn::Type, ident: &str) -> Option<&'a syn::PathSegment> { + match ty { + syn::Type::Path(syn::TypePath { path, .. 
}) => in_path(path, ident), + _ => None, + } +} + +pub fn self_keyword() -> syn::Expr { + syn::Expr::Path(syn::ExprPath { + attrs: Default::default(), + qself: Default::default(), + path: syn::Ident::new("self", Span::call_site()).into(), + }) +} + +pub trait ToMember { + fn to_member(self) -> syn::Member; +} + +impl ToMember for syn::Index { + fn to_member(self) -> syn::Member { + syn::Member::Unnamed(self) + } +} + +impl ToMember for syn::Ident { + fn to_member(self) -> syn::Member { + syn::Member::Named(self) + } +} + +pub fn index(index: u32) -> syn::Index { + syn::Index { + index, + span: Span::call_site(), + } +} + +pub fn field_access(m: impl ToMember) -> syn::Expr { + syn::Expr::Field(syn::ExprField { + attrs: Default::default(), + base: Box::new(self_keyword()), + dot_token: Default::default(), + member: m.to_member(), + }) +} + +pub trait Generic { + fn ident(&self) -> &syn::Ident; + + fn generics(&self) -> &syn::Generics; + + fn generic_letters(&self) -> proc_macro2::TokenStream { + let generics = self.generics(); + let letters = generics.params.iter().map(|param| match param { + syn::GenericParam::Lifetime(l) => l.lifetime.to_token_stream(), + syn::GenericParam::Type(ty) => ty.ident.to_token_stream(), + syn::GenericParam::Const(cons) => cons.ident.to_token_stream(), + }); + + quote! 
{ + <#(#letters),*> + } + } +} + +impl Generic for syn::ItemStruct { + fn generics(&self) -> &syn::Generics { + &self.generics + } + + fn ident(&self) -> &syn::Ident { + &self.ident + } +} + +impl Generic for syn::ItemEnum { + fn generics(&self) -> &syn::Generics { + &self.generics + } + + fn ident(&self) -> &syn::Ident { + &self.ident + } +} + +pub fn variant_path(var: &syn::Ident) -> syn::Path { + syn::Path { + leading_colon: Default::default(), + segments: Punctuated::from_iter( + [syn::Ident::new("Self", Span::call_site()), var.clone()] + .into_iter() + .map(|ident| syn::PathSegment { + ident, + arguments: syn::PathArguments::None, + }), + ), + } +} diff --git a/macros/src/unicode_category.rs b/macros/src/unicode_category.rs new file mode 100644 index 0000000..7f411ca --- /dev/null +++ b/macros/src/unicode_category.rs @@ -0,0 +1,204 @@ +//! +//! Utilities for the [crate::unicode] macro. +//! + +use std::iter::once; + +use proc_macro::{Diagnostic, Level}; +use proc_macro2::Span; +use syn::{ + parse::{Parse, ParseStream}, + punctuated::Punctuated, + spanned::Spanned, +}; + +/// +/// The input into the [crate::unicode] macro. +/// +pub struct UnicodePatInput { + categories: Vec, +} + +fn ident(st: &str) -> syn::Ident { + syn::Ident::new(st, Span::call_site()) +} + +pub enum UnicodeCategory { + Major(syn::Ident), + Minor(syn::Ident), +} + +impl UnicodePatInput { + /// + /// Converts this collection of unicode catgeories + /// into its appropriate matcher type (determined by the first category). + /// + pub fn into_type(self) -> Option { + let mut iter = self.categories.into_iter(); + let Some(first) = iter.next() else { + Diagnostic::new(Level::Error, "Expected unicode major/minor categories!").emit(); + return None; + }; + Some(first.into_matcher(iter)) + } +} + +impl UnicodeCategory { + /// + /// Attempt to parse a unicode major/minor category + /// from an identifier (only checks length). 
+ /// + fn parse(ident: syn::Ident) -> Result { + let st = ident.to_string(); + + match st.len() { + 0 => unreachable!(), + 1 => Ok(Self::Major(ident)), + 2 => Ok(Self::Minor(ident)), + _ => { + Diagnostic::spanned(ident.span().unwrap(), Level::Error, "Expected either a one-letter unicode major catgeory, or a two-letter unicode minor category.") + .emit(); + + Err(()) + } + } + } + + /// + /// Gets this category as an expression. + /// + fn into_expr(self) -> syn::Expr { + let (ty, cat) = match self { + Self::Major(ident) => ("MajorCategory", ident), + Self::Minor(ident) => ("MinorCategory", ident), + }; + + syn::Expr::Path(syn::ExprPath { + attrs: Default::default(), + qself: None, + path: syn::Path { + leading_colon: None, + segments: Punctuated::from_iter( + ["crate", "lexing", "utils", "unicode", ty, &cat.to_string()] + .iter() + .map(|s| syn::Ident::new(s, cat.span())) + .map(syn::PathSegment::from), + ), + }, + }) + } + + /// + /// Returns the type of this category type's matcher, along with supplying + /// itself and the other categories as const params. 
+ /// + pub fn into_matcher(self, others: impl IntoIterator) -> syn::Type { + let ty = match self { + Self::Major(_) => "MatchMajorCategory", + Self::Minor(_) => "MatchMinorCategory", + }; + + let array = syn::Expr::Array(syn::ExprArray { + attrs: Default::default(), + bracket_token: Default::default(), + elems: Punctuated::from_iter(once(self).chain(others).map(Self::into_expr)), + }); + + let static_ref = syn::Expr::Reference(syn::ExprReference { + attrs: Default::default(), + and_token: Default::default(), + mutability: None, + expr: Box::new(array), + }); + + let braced = syn::Expr::Block(syn::ExprBlock { + attrs: Default::default(), + label: None, + block: syn::Block { + brace_token: Default::default(), + stmts: vec![syn::Stmt::Expr(static_ref, None)], + }, + }); + + let generic_arg = syn::GenericArgument::Const(braced); + syn::Type::Path(syn::TypePath { + qself: None, + path: syn::Path { + leading_colon: None, + segments: Punctuated::from_iter( + ["crate", "lexing", "utils", "unicode"] + .into_iter() + .map(ident) + .map(syn::PathSegment::from) + .chain(once(syn::PathSegment { + ident: ident(ty), + arguments: syn::PathArguments::AngleBracketed( + syn::AngleBracketedGenericArguments { + colon2_token: None, + lt_token: Default::default(), + args: Punctuated::from_iter(once(generic_arg)), + gt_token: Default::default(), + }, + ), + })), + ), + }, + }) + } +} + +impl Parse for UnicodePatInput { + fn parse(input: ParseStream) -> syn::Result { + let pat = syn::Pat::parse_multi(input)?; + let cases: Vec<_> = match pat { + syn::Pat::Or(or) => or.cases.into_iter().collect(), + ident @ syn::Pat::Ident(_) => vec![ident], + pat => { + return Err(syn::Error::new_spanned( + pat, + "Expected either one or many (with |) unicode major/minor categories here.", + )) + } + }; + + let idents: Vec<_> = cases + .iter() + .map(|pat| match pat { + syn::Pat::Ident(syn::PatIdent { ident, .. 
}) => Some(ident), + pat => { + Diagnostic::spanned( + pat.span().unwrap(), + Level::Error, + "Expected either a unicode major or minor category here.", + ) + .emit(); + None + } + }) + .collect(); + + if idents.iter().any(Option::is_none) { + return Err(syn::Error::new( + Span::call_site(), + "An error occurred whilst parsing syntax.", + )); + } + + // ::unwrap() okay since !any(is_none) -> all(is_some) + let idents = idents.into_iter().map(Option::unwrap); + + let categories: Vec<_> = idents.cloned().map(UnicodeCategory::parse).collect(); + + if categories.iter().any(Result::is_err) { + return Err(syn::Error::new( + Span::call_site(), + "Invalid unicode major or minor category.", + )); + } + + // ::unwrap() okay since !any(is_err) -> all(is_ok) + Ok(Self { + categories: categories.into_iter().map(Result::unwrap).collect(), + }) + } +} diff --git a/macros/src/utils.rs b/macros/src/utils.rs new file mode 100644 index 0000000..78ef8d8 --- /dev/null +++ b/macros/src/utils.rs @@ -0,0 +1,192 @@ +use proc_macro2::Span; +use syn::{punctuated::Punctuated, Token}; + +/// +/// Creates lines of Rustdoc from &self. +/// +/// ### Example +/// ```ignore +/// use proc_macro2::Span; +/// +/// let boolean_lit = ECMARef { +/// name: syn::LitStr::new("BooleanLiteral", Span::call_site()), +/// href: syn::LitStr::new("https://262.ecma-international.org/5.1/#sec-7.8.2", Span::call_site()) +/// }; +/// +/// boolean_lit.to_rustdoc() +/// ``` +/// +/// would return: +/// +/// ```ignore +/// #[doc = "## BooleanLiteral"] +/// #[doc = "See the official [ECMAScript specification](https://262.ecma-international.org/5.1/#sec-7.8.2)."] +/// #[doc = "***"] +/// ``` +/// +pub trait ToRustdoc { + fn to_rustdoc(&self) -> impl IntoIterator; +} + +/// +/// Produces a line of rust doc. 
+/// +/// ### Example +/// +/// ```ignore +/// rustdoc_line("Ridicule!") +/// ``` +/// +/// will produce: +/// +/// ```ignore +/// #[doc = "Ridicule!"] +/// ``` +/// +fn rustdoc_line(st: impl ToString) -> syn::Attribute { + syn::Attribute { + pound_token: Default::default(), + style: syn::AttrStyle::Outer, + bracket_token: Default::default(), + meta: syn::Meta::NameValue(syn::MetaNameValue { + path: syn::Path { + leading_colon: Default::default(), + segments: Punctuated::from_iter([syn::PathSegment { + ident: syn::Ident::new("doc", Span::call_site()), + arguments: syn::PathArguments::None, + }]), + }, + eq_token: Default::default(), + value: syn::Expr::Lit(syn::ExprLit { + attrs: Default::default(), + lit: syn::Lit::Str(syn::LitStr::new(&st.to_string(), Span::call_site())), + }), + }), + } +} + +/// +/// Represents a reference to the ECMAScript specification. +/// +pub struct ECMARef { + name: syn::LitStr, + href: syn::LitStr, +} + +impl syn::parse::Parse for ECMARef { + fn parse(input: syn::parse::ParseStream) -> syn::Result { + let first: syn::LitStr = input.parse()?; + let _: Token![,] = input.parse()?; + let second: syn::LitStr = input.parse()?; + + Ok(Self { + name: first, + href: second, + }) + } +} + +impl ToRustdoc for ECMARef { + fn to_rustdoc(&self) -> impl IntoIterator { + let Self { name, href } = self; + let (name, href) = (name.value(), href.value()); + [ + format!("## {name}"), + format!("See more on the [ECMAScript specification]({href})."), + "***".to_string(), + ] + .into_iter() + .map(rustdoc_line) + } +} + +/// +/// Represents a reference to the JSON5 specification. 
+/// +/// ### Example +/// +/// ```ignore +/// #[JSON5Ref("Null", "JSON5Null")] // (a) +/// #[JSON5Ref("JSON5Identifier")] // (b) +/// ``` +/// +/// would yield: +/// +/// ```ignore +/// // (a) +/// JSON5Ref { +/// name: Some(syn::LitStr::new("Null", _)), +/// id: syn::LitStr::new("JSON5Null", _), +/// } +/// +/// // (b) +/// JSON5Ref { +/// name: None, +/// id: syn::LitStr::new("JSON5Identifier", _), +/// } +/// ``` +/// +pub struct JSON5Ref { + name: Option, + id: syn::LitStr, +} + +impl syn::parse::Parse for JSON5Ref { + fn parse(input: syn::parse::ParseStream) -> syn::Result { + let first: syn::LitStr = input.parse()?; + + if input.peek(Token![,]) { + let _: Token![,] = input.parse()?; + let second: syn::LitStr = input.parse()?; + + return Ok(Self { + name: Some(first), + id: second, + }); + } + + Ok(Self { + name: None, + id: first, + }) + } +} + +impl ToRustdoc for JSON5Ref { + fn to_rustdoc(&self) -> impl IntoIterator { + let Self { name, id } = self; + let (name, id) = (name.as_ref().map(|s| s.value()), id.value()); + [ + format!("## {}", name.as_ref().unwrap_or(&id)), + format!("See more on the [JSON5 specification](https://spec.json5.org/#prod-{id})."), + "***".to_string(), + ] + .into_iter() + .map(rustdoc_line) + } +} + +/// +/// Attempt to get the attribute macros for a [syn::Item]. +/// +pub fn get_item_attrs(item: &mut syn::Item) -> Option<&mut Vec> { + match item { + syn::Item::Const(syn::ItemConst { ref mut attrs, .. }) => Some(attrs), + syn::Item::Enum(syn::ItemEnum { ref mut attrs, .. }) => Some(attrs), + syn::Item::ExternCrate(syn::ItemExternCrate { ref mut attrs, .. }) => Some(attrs), + syn::Item::Fn(syn::ItemFn { ref mut attrs, .. }) => Some(attrs), + syn::Item::ForeignMod(syn::ItemForeignMod { ref mut attrs, .. }) => Some(attrs), + syn::Item::Impl(syn::ItemImpl { ref mut attrs, .. }) => Some(attrs), + syn::Item::Macro(syn::ItemMacro { ref mut attrs, .. }) => Some(attrs), + syn::Item::Mod(syn::ItemMod { ref mut attrs, .. 
}) => Some(attrs), + syn::Item::Static(syn::ItemStatic { ref mut attrs, .. }) => Some(attrs), + syn::Item::Struct(syn::ItemStruct { ref mut attrs, .. }) => Some(attrs), + syn::Item::Trait(syn::ItemTrait { ref mut attrs, .. }) => Some(attrs), + syn::Item::TraitAlias(syn::ItemTraitAlias { ref mut attrs, .. }) => Some(attrs), + syn::Item::Type(syn::ItemType { ref mut attrs, .. }) => Some(attrs), + syn::Item::Union(syn::ItemUnion { ref mut attrs, .. }) => Some(attrs), + syn::Item::Use(syn::ItemUse { ref mut attrs, .. }) => Some(attrs), + syn::Item::Verbatim(_) => None, + _ => None, + } +} diff --git a/macros/src/verbatim.rs b/macros/src/verbatim.rs new file mode 100644 index 0000000..6b580a1 --- /dev/null +++ b/macros/src/verbatim.rs @@ -0,0 +1,232 @@ +//! +//! Utilities for the `verbatim!` macro. +//! + +use std::ops::Deref; + +use proc_macro2::Span; +use syn::parse::{Parse, ParseStream}; + +use self::paths::generic_path; + +/// +/// Accepted patterns for `verbatim!`. +/// +pub enum VerbatimPat { + LitStr(syn::LitStr), + LitChar(syn::LitChar), + CharRange(char, char), +} + +mod paths { + use proc_macro2::Span; + use syn::punctuated::Punctuated; + + use crate::type_traversal::ToMember; + + /// + /// Makes an ident from a string, + /// with the Span resolving to Span::call_site() + /// + fn ident(st: &str) -> syn::Ident { + syn::Ident::new(st, Span::call_site()) + } + + /// + /// Generates a path with the last segment + /// having generic parameters. 
+ /// + /// Equivalent to: + /// + /// ```ignore + /// $path<$arg> + /// ``` + /// + pub fn generic_path(path: [&str; N], arg: syn::GenericArgument) -> syn::Type { + syn::Type::Path(syn::TypePath { + qself: None, + path: syn::Path { + leading_colon: None, + segments: Punctuated::from_iter( + path[..N - 1] + .iter() + .copied() + .map(ident) + .map(syn::PathSegment::from) + .chain([syn::PathSegment { + ident: ident(path[N - 1]), + arguments: syn::PathArguments::AngleBracketed( + syn::AngleBracketedGenericArguments { + colon2_token: Default::default(), + lt_token: Default::default(), + args: Punctuated::from_iter([arg]), + gt_token: Default::default(), + }, + ), + }]), + ), + }, + }) + } + + /// + /// Equivalent to: + /// + /// ```ignore + /// crate::lexing::CharacterRange { + /// start: $start, + /// end: $end, + /// } + /// ``` + /// + pub fn character_range(start: syn::Expr, end: syn::Expr) -> syn::Expr { + let path = syn::Path { + leading_colon: None, + segments: Punctuated::from_iter( + ["crate", "lexing", "CharacterRange"] + .map(ident) + .map(syn::PathSegment::from), + ), + }; + + syn::Expr::Struct(syn::ExprStruct { + attrs: Default::default(), + qself: Default::default(), + path, + brace_token: Default::default(), + fields: Punctuated::from_iter([("start", start), ("end", end)].map(|(f, expr)| { + syn::FieldValue { + attrs: Default::default(), + member: ident(f).to_member(), + colon_token: Some(Default::default()), + expr, + } + })), + dot2_token: None, + rest: None, + }) + } +} + +impl VerbatimPat { + /// + /// Build the AST for this pattern, + /// using helper structs in the main crate. 
+ /// + pub fn into_type(self) -> syn::Type { + match self { + VerbatimPat::LitStr(st) => paths::generic_path( + ["crate", "lexing", "Verbatim"], + syn::GenericArgument::Const(syn::Expr::Lit(syn::ExprLit { + attrs: Default::default(), + lit: syn::Lit::Str(st), + })), + ), + VerbatimPat::LitChar(ch) => paths::generic_path( + ["crate", "lexing", "Verbatim"], + syn::GenericArgument::Const(syn::Expr::Lit(syn::ExprLit { + attrs: Default::default(), + lit: syn::Lit::Str(syn::LitStr::new( + &ch.value().to_string(), + Span::call_site(), + )), + })), + ), + VerbatimPat::CharRange(start, end) => { + let bounds = [start, end].map(|c| syn::LitChar::new(c, Span::call_site())); + let [start, end] = bounds.map(|ch| { + syn::Expr::Lit(syn::ExprLit { + attrs: Default::default(), + lit: syn::Lit::Char(ch), + }) + }); + + let constructed = paths::character_range(start, end); + let braced = syn::Expr::Block(syn::ExprBlock { + attrs: Default::default(), + label: Default::default(), + block: syn::Block { + brace_token: Default::default(), + stmts: vec![syn::Stmt::Expr(constructed, None)], + }, + }); + + let const_param = syn::GenericArgument::Const(braced); + + generic_path(["crate", "lexing", "CharPattern"], const_param) + } + } + } +} + +/// +/// Is this expression a char literal? +/// +fn is_lit_char(expr: &impl Deref) -> bool { + let expr = expr.deref(); + matches!( + expr, + syn::Expr::Lit(syn::ExprLit { + lit: syn::Lit::Char(_), + .. + }) + ) +} + +/// +/// Gets the character value if this expression is a char literal. +/// +fn get_char(expr: &impl Deref) -> Option { + match expr.deref() { + syn::Expr::Lit(syn::ExprLit { + lit: syn::Lit::Char(litchar), + .. + }) => Some(litchar.value()), + _ => None, + } +} + +impl Parse for VerbatimPat { + fn parse(input: ParseStream) -> syn::Result { + let pat = syn::Pat::parse_single(input)?; + + // Nasty pattern matching, but that's the downside of nested enums. 
+ match pat { + syn::Pat::Lit(syn::ExprLit { + lit: lit @ (syn::Lit::Char(_) | syn::Lit::Str(_)), + .. + }) => match lit { + syn::Lit::Char(ch) => Ok(Self::LitChar(ch)), + syn::Lit::Str(st) => Ok(Self::LitStr(st)), + _ => unreachable!(), + }, + syn::Pat::Range(syn::PatRange { + start, end, limits, .. + }) if start.as_ref().map(is_lit_char).unwrap_or(true) + && end.as_ref().map(is_lit_char).unwrap_or(true) => + { + let c_start = start.as_ref().and_then(get_char).unwrap_or(char::MIN); + let c_end = end.as_ref().and_then(get_char).unwrap_or(char::MAX); + + let (c_start, c_end) = match limits { + syn::RangeLimits::HalfOpen(_) => (c_start, Some(c_end)), + syn::RangeLimits::Closed(_) => (c_start, char::from_u32(c_end as u32 + 1)), + }; + + if c_end.is_none() { + return Err(syn::Error::new_spanned( + end, + "This char literal cannot be used as an inclusive end.", + )); + } + + let (start, end) = (c_start, c_end.unwrap()); + Ok(Self::CharRange(start, end)) + } + _ => Err(syn::Error::new_spanned( + pat, + "Only string and char literals, and char ranges are accepted here", + )), + } + } +} diff --git a/src/common/file.rs b/src/common/file.rs new file mode 100644 index 0000000..be401b2 --- /dev/null +++ b/src/common/file.rs @@ -0,0 +1,216 @@ +//! +//! A source file. +//! + +use std::{fmt::Formatter, ops::Range, path::Path}; + +use super::{Loc, Source, Span, Spanned}; + +/// +/// Line and column information for +/// a particular location in source code. +/// +#[derive(Debug)] +pub struct LineColumn<'a> { + file: &'a str, + line: usize, + column: usize, +} + +/// +/// Converting to 1-based only for display. +/// +impl<'a> std::fmt::Display for LineColumn<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}:{}:{}", self.file, self.line + 1, self.column + 1) + } +} + +/// +/// Finds the starting character index of all +/// lines, using any [ECMAScript LineTerminatorSequence](https://262.ecma-international.org/5.1/#sec-7.3) +/// to delimit lines. 
+/// +fn line_starts(st: &[char]) -> Vec { + let mut v = vec![0]; + let mut i = 0; + + while i < st.len() { + let ch = st[i]; + + match ch { + '\u{000A}' => v.push(i + 1), // + '\u{2028}' => v.push(i + 1), // + '\u{2029}' => v.push(i + 1), // + '\u{000D}' => { + if matches!(st.get(i + 1), Some('\u{000A}')) { + v.push(i + 2); // + i += 1; + } else { + v.push(i + 1); // + } + } + _ => (), + } + + i += 1; + } + + if matches!(v.last(), Some(i) if *i >= st.len()) { + let _ = v.pop(); + } + + v +} + +/// +/// A real source file. +/// +/// Here, line-column information can be provided. +/// +#[derive(Debug, Clone)] +pub struct SourceFile { + path: String, + contents: String, + chars: Vec, + line_starts: Vec, +} + +impl SourceFile { + /// + /// TESTING ONLY + /// *** + /// Create a dumy file with a fake path. + /// + #[cfg(test)] + pub fn dummy_file(contents: &'static str) -> Self { + let path = "DUMMY.FILE".to_string(); + let contents = contents.to_string(); + + let chars = contents.chars().collect::>(); + let line_starts = line_starts(&chars); + + Self { + path, + contents, + chars, + line_starts, + } + } + + /// + /// Attempts to read source code from a given file path. + /// + pub fn read_from_file>(path: P) -> std::io::Result { + let path = path.as_ref().to_owned(); + let contents = std::fs::read_to_string(&path)?; + + let path = path.to_str().expect("Valid path as string").to_string(); + let chars = contents.chars().collect::>(); + let line_starts = line_starts(&chars); + + Ok(Self { + path, + contents, + chars, + line_starts, + }) + } + + /// + /// Return the (0-based) line and column information at a [Loc] in this file. + /// + fn line_col(&self, loc: Loc) -> Option<(usize, usize)> { + // Essentially, pair the start of the a line with the end of the next (or EOF), + // check if loc is in its range. If so, get the corresponding line and calculate the + // corresponding column. 
+ self.line_starts + .iter() + .copied() + .zip( + self.line_starts + .iter() + .copied() + .skip(1) + .chain([self.contents.len()]), + ) + .enumerate() + .filter(|&(_, (start_col, end_col))| (start_col <= loc.0 && loc.0 < end_col)) + .map(|(ln, (start_col, _))| (ln, loc.0 - start_col)) + .next() + } +} + +impl Source for SourceFile { + type Location<'a> = LineColumn<'a> + where Self: 'a; + + fn locate(&self, span: Span) -> Option> { + if self.in_bounds(&span) { + let (line, column) = self.line_col(span.start)?; + return Some(LineColumn { + file: &self.path, + line, + column, + }); + } + + None + } + + fn bounds(&self) -> Range { + Loc(0)..Loc(self.chars.len()) + } + + fn source_at(&self, span: impl Spanned) -> Option { + let span = span.span(); + if self.in_bounds(&span) { + return Some(self.chars[span.as_range()].iter().collect()); + } + + None + } + + fn characters(&self) -> &[char] { + &self.chars + } +} + +#[cfg(test)] +mod tests { + use crate::common::{file::LineColumn, Source}; + + use super::{super::ToSpan, line_starts, SourceFile}; + + #[test] + fn lines() { + assert!(matches!( + &line_starts(&"ba\nb\nc".chars().collect::>())[..], + &[0, 3, 5] + )); + + assert!(matches!( + &line_starts( + &"babs\r\nbaaa\r__\u{2028}asagsgas\u{2029}a\nc\n" + .chars() + .collect::>() + )[..], + &[0, 6, 11, 14, 23, 25,] + )) + } + + #[test] + fn line_col() { + let f = SourceFile::dummy_file("PEN\nPINEAPPLE\nAPPLE\nPEN"); + let ananas = (4..13).to_span(&f); + assert_eq!(f.source_at(ananas), Some("PINEAPPLE".to_string())); + assert!(matches!( + f.locate(ananas), + Some(LineColumn { + line: 1, + column: 0, + .. + }) + )); + } +} diff --git a/src/common/location.rs b/src/common/location.rs new file mode 100644 index 0000000..01538b4 --- /dev/null +++ b/src/common/location.rs @@ -0,0 +1,254 @@ +//! +//! Things that help trace errors and tokens: [Span] and [Loc]. +//! + +use std::ops::{Add, Bound, Range, RangeBounds}; + +/// +/// Represents the index of a character in source code. 
+/// +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct Loc(pub(crate) usize); + +impl From for Loc { + fn from(value: usize) -> Self { + Self(value) + } +} + +impl From for usize { + fn from(value: Loc) -> Self { + value.0 + } +} + +impl Add for Loc +where + usize: Add, +{ + type Output = Loc; + + fn add(self, rhs: A) -> Self::Output { + Self(self.0 + rhs) + } +} + +/// +/// Represents the location of a token in source code. +/// +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct Span { + /// + /// Start index: inclusive lower bound. + /// + pub(crate) start: Loc, + + /// + /// End index: exclusive upper bound. + /// + pub(crate) end: Loc, +} + +impl Span { + /// + /// Returns Some(subspan), given the relative indexes from the start of this span, + /// returning None if the end index is out of this span's bounds. + /// + pub fn subspan(&self, indexes: impl RangeBounds) -> Option { + let start = match indexes.start_bound() { + Bound::Included(included) => self.start + included, + Bound::Excluded(excluded) => self.start + (excluded + 1), + Bound::Unbounded => self.start, + }; + + let end = match indexes.end_bound() { + Bound::Included(included) => self.start + (included + 1), + Bound::Excluded(excluded) => self.start + excluded, + Bound::Unbounded => self.end, + }; + + if end > self.end { + // Not a subspan, since we overflow the end. + return None; + } + + Some(Self { start, end }) + } + + /// + /// Use this [Span] as a start, taking the range between this span's start, + /// and the end oi the last of the passed in iterator (including itself). + /// + pub fn combine(self, others: impl IntoIterator) -> Self { + let Self { start, end } = self; + + // Take the end bound of the last non-empty Span, + // if others is not empty, and use that instead. + let last = others.into_iter().filter(|s| !s.is_empty()).last(); + if let Some(Self { end, .. 
}) = last { + return Self { start, end }; + } + + Self { start, end } + } + + /// + /// Return the start and end bounds as a Rust [Range] + /// + pub fn as_range(&self) -> Range { + self.start.0..self.end.0 + } + + /// + /// Returns an empty span. + /// + pub fn empty() -> Self { + Self { + start: Loc(0), + end: Loc(0), + } + } + + /// + /// Is this [Span] empty (captures nothing). + /// + pub fn is_empty(&self) -> bool { + self.end.0 - self.start.0 == 0 + } +} + +/// +/// Utility trait for handling multiple spans. +/// +pub trait SpanIter: Sized + IntoIterator { + /// + /// Combine all of this iterator's spans, + /// resulting in a [Span] encompassing all + /// passed in [Span]s (assuming this iter is in ascending order). + /// + fn combine(self) -> Span; +} + +impl> SpanIter for Iter { + fn combine(self) -> Span { + let mut iter = self.into_iter(); + iter.next() + .map(|s| s.combine(iter)) + .unwrap_or(Span::empty()) + } +} + +impl RangeBounds for Span { + fn start_bound(&self) -> Bound<&usize> { + Bound::Included(&self.start.0) + } + + fn end_bound(&self) -> Bound<&usize> { + Bound::Excluded(&self.end.0) + } +} + +impl RangeBounds for Span { + fn start_bound(&self) -> Bound<&Loc> { + Bound::Included(&self.start) + } + + fn end_bound(&self) -> Bound<&Loc> { + Bound::Excluded(&self.end) + } +} + +impl From for Span { + /// + /// Span of a single character. + /// + fn from(start: Loc) -> Self { + Self { + start, + end: start + 1, + } + } +} + +/// +/// Returns the span attached to this +/// object. +/// +pub trait Spanned { + /// + /// Returns the span attached to this + /// object. 
+ /// + fn span(&self) -> Span; +} + +impl Spanned for Span { + fn span(&self) -> Span { + *self + } +} + +impl<'a, S: Spanned> Spanned for &'a S { + fn span(&self) -> Span { + (*self).span() + } +} + +#[cfg(test)] +mod tests { + use crate::common::source::{DummySource, Source, ToSpan}; + + use super::SpanIter; + + #[test] + fn subspan() { + let source = DummySource::new("testthing."); + let span = (0..9).to_span(&source); + + // Valid + assert_eq!(span.subspan(1..).map(|s| s.as_range()), Some(1..9)); + assert_eq!(span.subspan(1..2).map(|s| s.as_range()), Some(1..2)); + assert_eq!(span.subspan(..5).map(|s| s.as_range()), Some(0..5)); + assert_eq!(span.subspan(..).map(|s| s.as_range()), Some(0..9)); + + // Invalid + assert_eq!(span.subspan(..17).map(|s| s.as_range()), None); + assert_eq!(span.subspan(144..1343).map(|s| s.as_range()), None); + } + + #[test] + fn source_at() { + let source = DummySource::new("testthing."); + let span = (0..9).to_span(&source); + + assert_eq!( + span.subspan(..4).and_then(|s| source.source_at(s)), + Some("test".to_string()) + ); + + assert_eq!( + span.subspan(4..).and_then(|s| source.source_at(s)), + Some("thing".to_string()) + ); + + assert_eq!( + span.subspan(..4).and_then(|s| source.source_at(s)), + Some("test".to_string()) + ); + + assert_eq!(span.subspan(49..).and_then(|s| source.source_at(s)), None); + } + + #[test] + fn test_combine_span() { + let source = DummySource::new( + "agdshJAGDJHAVghVJAtesfsdagdsagdsaJGASDHJGAWDHJAGSDASGHJASGHASDGJSADBHJASDGVBJHtthing.", + ); + + let s1 = (0..13).to_span(&source); + let s2 = (13..23).to_span(&source); + let s3 = (23..26).to_span(&source); + + assert_eq!([s1, s2, s3].combine(), (0..26).to_span(&source)); + } +} diff --git a/src/common/mod.rs b/src/common/mod.rs new file mode 100644 index 0000000..07adee2 --- /dev/null +++ b/src/common/mod.rs @@ -0,0 +1,10 @@ +//! +//! Common utilities across lexing and syntax-parsing. +//! 
+ +pub mod file; +pub mod location; +pub mod source; + +pub use location::*; +pub use source::*; diff --git a/src/common/source.rs b/src/common/source.rs new file mode 100644 index 0000000..39a09f5 --- /dev/null +++ b/src/common/source.rs @@ -0,0 +1,153 @@ +//! +//! Sources of source code. +//! + +use std::ops::{Bound, Range, RangeBounds}; + +use super::{Loc, Span, Spanned}; +use crate::lexing::utils::SourceStream; + +#[cfg(test)] +pub use testing_only::DummySource; + +/// +/// Generic idea of source code: could be a file, +/// or a simple string. +/// +/// This trait aims to abstract the gathering of the source +/// text and focus on the Source -> Lexing -> Syntax -> AST +/// pipeline. +/// +pub trait Source { + /// + /// A friendly appropriate format to point + /// to a location of a token. + /// + /// This could be line-column information, or simply an index. + /// + type Location<'a> + where + Self: 'a; + + /// + /// Find the location of this span, + /// and put it into a friendly appropriate format. + /// + fn locate(&self, span: Span) -> Option>; + + /// + /// Returns the start and (exclusive) end index of this source. + /// + fn bounds(&self) -> Range; + + /// + /// Checks if a given [Span] is within bounds. + /// + fn in_bounds(&self, span: &Span) -> bool { + self.bounds().end >= span.end + } + + /// + /// Returns the source code at a given [Span], if within bounds. + /// + fn source_at(&self, span: impl Spanned) -> Option; + + /// + /// Get the characters in this [Source]. + /// + fn characters(&self) -> &[char]; + + /// + /// Crate a stream from this source. + /// + fn stream(&self) -> SourceStream + where + Self: Sized, + { + SourceStream::new(self) + } +} + +/// +/// Utility conversion into a [Span], given +/// boundary information from the origin [Source]. 
+/// +pub trait ToSpan { + fn to_span(self, source: &impl Source) -> Span; +} + +impl> ToSpan for R { + fn to_span(self, source: &impl Source) -> Span { + let Range { + start: start_bound, + end: end_bound, + } = source.bounds(); + + let start = match self.start_bound() { + Bound::Included(included) => Loc(*included), + Bound::Excluded(excluded) => Loc(*excluded + 1), + Bound::Unbounded => start_bound, + }; + + let end = match self.end_bound() { + Bound::Included(included) => Loc(included + 1), + Bound::Excluded(excluded) => Loc(*excluded), + Bound::Unbounded => end_bound, + }; + + Span { start, end } + } +} + +#[cfg(test)] +mod testing_only { + use std::ops::Range; + + use crate::common::{Loc, Span, Spanned}; + + use super::Source; + + /// + /// [Source] implementation for testing purposes only! + /// + pub struct DummySource { + text: String, + } + + impl DummySource { + pub fn new(text: impl ToString) -> Self { + let text = text.to_string(); + Self { text } + } + } + + impl Source for DummySource { + type Location<'a> = Range + where Self: 'a; + + fn locate(&self, span: Span) -> Option> { + if self.in_bounds(&span) { + return Some(span.start.0..span.end.0); + } + + None + } + + fn bounds(&self) -> Range { + Loc(0)..Loc(self.text.len()) + } + + fn source_at(&self, span: impl Spanned) -> Option { + let span = span.span(); + if self.in_bounds(&span) { + self.text.get(span.as_range()).map(ToString::to_string) + } else { + None + } + } + + fn characters(&self) -> &[char] { + unimplemented!() + } + } +} diff --git a/src/lexing/mod.rs b/src/lexing/mod.rs new file mode 100644 index 0000000..a3a14d3 --- /dev/null +++ b/src/lexing/mod.rs @@ -0,0 +1,14 @@ +//! +//! The process of lexing involves converting [char]s +//! from source code into lexical tokens according to +//! some [lexical grammar](https://en.wikipedia.org/wiki/Lexical_grammar). +//! 
+ +pub mod tokens; +pub mod utils; + +pub use utils::{ + stream::CharacterRange, + verbatim::{CharPattern, Verbatim}, + AtLeast, Exactly, Lex, LexError, LexResult, LexT, Many, Peek, SourceStream, +}; diff --git a/src/lexing/tokens/comment.rs b/src/lexing/tokens/comment.rs new file mode 100644 index 0000000..97cb627 --- /dev/null +++ b/src/lexing/tokens/comment.rs @@ -0,0 +1,172 @@ +//! +//! ## Comments +//! + +use avjason_macros::{verbatim as v, ECMARef, Spanned}; + +use crate::{ + common::{Source, Span, Spanned}, + lexing::{Lex, LexError, LexT, SourceStream}, +}; + +use super::line_terminator::LineTerminator; + +/// +/// ```js +/// // Comments +/// /* of either type. */ +/// ``` +/// +#[ECMARef("Comment", "https://262.ecma-international.org/5.1/#sec-7.4")] +#[derive(Debug, Spanned)] +pub enum Comment { + Single(SingleLineComment), + Multi(MultiLineComment), +} + +/// +/// ```js +/// // Single-line comment. +/// ``` +/// +#[ECMARef("SingleLineComment", "https://262.ecma-international.org/5.1/#sec-7.4")] +#[derive(Debug, Spanned)] +pub struct SingleLineComment { + span: Span, + + /// + /// Span of the contents of this comment + /// + inner: Span, +} + +/// +/// ```js +/// /* Multi-line comment. */ +/// ``` +/// +#[ECMARef("MultiLineComment", "https://262.ecma-international.org/5.1/#sec-7.4")] +#[derive(Debug, Spanned)] +pub struct MultiLineComment { + span: Span, + + /// + /// Span of the contents of this comment + /// + inner: Span, +} + +impl Comment { + pub fn inner(&self) -> Span { + match self { + Comment::Single(single) => single.inner, + Comment::Multi(multi) => multi.inner, + } + } +} + +impl LexT for Comment { + fn peek(input: &SourceStream) -> bool { + ::peek(input) || ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + // .unwrap_as_result() ok since Self::peek() -> exists either variant. 
+ Lex::lex(input) + .map(Self::Single) + .or(|| Lex::lex(input).map(Self::Multi)) + .unwrap_as_result() + } +} + +impl LexT for SingleLineComment { + fn peek(input: &SourceStream) -> bool { + input.upcoming("//") + } + + fn lex(input: &mut SourceStream) -> Result { + let double_slash = ::lex(input)?; + let contents = input + .take_until(::peek) + .map(|(span, _)| span) + .unwrap_or(Span::empty()); + + Ok(Self { + span: double_slash.span().combine([contents]), + inner: contents, + }) + } +} + +impl LexT for MultiLineComment { + fn peek(input: &SourceStream) -> bool { + input.upcoming("/*") + } + + fn lex(input: &mut SourceStream) -> Result { + let opening = ::lex(input)?; + let contents = input + .take_until(::peek) + .map(|(span, _)| span) + .unwrap_or(Span::empty()); + + Ok(Self { + span: opening.span().combine([contents]), + inner: contents, + }) + } +} + +#[cfg(test)] +mod tests { + use crate::{ + common::{file::SourceFile, Source}, + lexing::tokens::comment::Comment, + }; + + use super::{MultiLineComment, SingleLineComment}; + + #[test] + fn single_line_comment() { + { + let source = SourceFile::dummy_file("// An apple a day..."); + let input = &mut source.stream(); + let comment: SingleLineComment = input.lex().expect("Valid parse"); + + assert_eq!( + source.source_at(comment.inner), + Some(" An apple a day...".to_string()) + ); + } + } + + #[test] + fn multi_line_comment() { + { + let source = + SourceFile::dummy_file("/* An apple a day\n\r\u{2029}Keeps the doctor away! */"); + let input = &mut source.stream(); + let comment: MultiLineComment = input.lex().expect("Valid parse"); + + assert_eq!( + source.source_at(comment.inner), + Some(" An apple a day\n\r\u{2029}Keeps the doctor away! ".to_string()) + ); + } + } + + #[test] + fn comments() { + { + let source = + SourceFile::dummy_file("/* An apple a day\n\r\u{2029}Keeps the doctor away! 
*/"); + let input = &mut source.stream(); + let comment: Comment = input.lex().expect("Valid parse"); + + assert_eq!( + source.source_at(comment.inner()), + Some(" An apple a day\n\r\u{2029}Keeps the doctor away! ".to_string()) + ); + } + } +} diff --git a/src/lexing/tokens/escapes.rs b/src/lexing/tokens/escapes.rs new file mode 100644 index 0000000..4e25d5a --- /dev/null +++ b/src/lexing/tokens/escapes.rs @@ -0,0 +1,497 @@ +//! +//! ## Escape Codes +//! +//! Technically not tokens. +//! These are used between strings and identifiers. +//! + +use avjason_macros::{verbatim as v, ECMARef, Spanned}; + +use crate::{ + common::{Source, Span}, + lexing::{Exactly, Lex, LexError, LexT, SourceStream}, +}; + +use super::{ + line_terminator::is_line_terminator, + number::{HexDigit, MathematicalValue}, + string::CharacterValue, +}; + +/// +/// Any valid ECMAScript escape sequence: +/// +/// ```javascript +/// '\n' // Escaped character +/// '\y' // Non-escaped character +/// '\0' // Null character +/// '\x1A' // Hex code escape +/// '\u0A1B'// Unicode escape +/// ``` +/// +/// *** +/// +/// ### Note +/// Since the octal escape syntax is optional and not part of the main spec +/// (see [Section B.1.2](https://262.ecma-international.org/5.1/#sec-B.1.2)), +/// it is *not* supported. +/// +#[ECMARef("EscapeSequence", "https://262.ecma-international.org/5.1/#sec-7.8.4")] +#[derive(Debug, Spanned)] +pub enum EscapeSequence { + CharacterEscapeSequence(CharacterEscapeSequence), + Null(Null), + HexEscapeSequence(HexEscapeSequence), + UnicodeEscapeSequence(UnicodeEscapeSequence), +} + +/// +/// Single characters that have been escaped +/// with a `\`. +/// +#[ECMARef( + "CharacterEscapeSequence", + "https://262.ecma-international.org/5.1/#sec-7.8.4" +)] +#[derive(Debug, Spanned)] +pub enum CharacterEscapeSequence { + Single(SingleEscapeChar), + NonEscape(NonEscapeChar), +} + +/// +/// An escape character, like `\t` for `HORIZONTAL TAB`. 
+/// +#[ECMARef( + "SingleEscapeChar", + "https://262.ecma-international.org/5.1/#sec-7.8.4" +)] +#[derive(Debug, Spanned)] +pub struct SingleEscapeChar { + span: Span, + raw: char, +} + +/// +/// A character that's not an escape character, +/// and should be treated verbatim. +/// +#[ECMARef( + "NonEscapeChar", + "https://262.ecma-international.org/5.1/#sec-7.8.4" +)] +#[derive(Debug, Spanned)] +pub struct NonEscapeChar { + span: Span, + raw: char, +} + +/// +/// Represents a `NULL` character `U+0000` +/// +#[derive(Debug, Spanned)] +pub struct Null { + span: Span, +} + +#[ECMARef( + "HexEscapeSequence", + "https://262.ecma-international.org/5.1/#sec-7.8.4" +)] +#[derive(Debug, Spanned)] +pub struct HexEscapeSequence(v!('x'), Exactly<2, HexDigit>); + +#[ECMARef( + "UnicodeEscapeSequence", + "https://262.ecma-international.org/5.1/#sec-7.8.4" +)] +#[derive(Debug, Spanned, Clone)] +pub struct UnicodeEscapeSequence(v!('u'), Exactly<4, HexDigit>); + +// --- + +impl LexT for EscapeSequence { + fn peek(input: &SourceStream) -> bool { + ::peek(input) + || ::peek(input) + || ::peek(input) + || ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + // .unwrap_as_result() ok since one of these variants is upcoming. + input + .lex() + .map(Self::CharacterEscapeSequence) + .or(|| input.lex().map(Self::Null)) + .or(|| input.lex().map(Self::HexEscapeSequence)) + .or(|| input.lex().map(Self::UnicodeEscapeSequence)) + .unwrap_as_result() + } +} + +impl LexT for CharacterEscapeSequence { + fn peek(input: &SourceStream) -> bool { + ::peek(input) || ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + // .unwrap_as_result() ok since Self::peek() -> there is one variant ahead. 
+ Lex::lex(input) + .map(Self::Single) + .or(|| Lex::lex(input).map(Self::NonEscape)) + .unwrap_as_result() + } +} + +fn is_single_escape_char(ch: &char) -> bool { + matches!(ch, '\'' | '"' | '\\' | 'b' | 'f' | 'n' | 'r' | 't' | 'v') +} + +impl LexT for SingleEscapeChar { + fn peek(input: &SourceStream) -> bool { + input.upcoming(is_single_escape_char) + } + + fn lex(input: &mut SourceStream) -> Result { + // Unwrap ok since Self::peek() -> a character exists. + let (loc, raw) = input.take().unwrap(); + + Ok(Self { + span: Span::from(loc), + raw, + }) + } +} + +fn is_escape_char(ch: &char) -> bool { + is_single_escape_char(ch) || matches!(ch, '0'..='9' | 'x' | 'u') +} + +impl LexT for NonEscapeChar { + fn peek(input: &SourceStream) -> bool { + input.upcoming(|ch: &char| !(is_line_terminator(ch) || is_escape_char(ch))) + } + + fn lex(input: &mut SourceStream) -> Result { + // Unwrap ok since Self::peek() -> a character exists. + let (loc, raw) = input.take().unwrap(); + + Ok(Self { + span: Span::from(loc), + raw, + }) + } +} + +impl LexT for Null { + fn peek(input: &SourceStream) -> bool { + input.upcoming("0") && !matches!(input.peek_n(1), Some('0'..='9')) + } + + fn lex(input: &mut SourceStream) -> Result { + // .unwrap() ok since Self::peek() -> next character exists. 
+ let (loc, _) = input.take().unwrap(); + + Ok(Self { + span: Span::from(loc), + }) + } +} + +impl LexT for HexEscapeSequence { + fn peek(input: &SourceStream) -> bool { + ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + Ok(Self(LexT::lex(input)?, Lex::lex(input).unwrap_as_result()?)) + } +} + +impl LexT for UnicodeEscapeSequence { + fn peek(input: &SourceStream) -> bool { + ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + Ok(Self(LexT::lex(input)?, Lex::lex(input).unwrap_as_result()?)) + } +} + +// --- + +impl CharacterValue for EscapeSequence { + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + match self { + EscapeSequence::CharacterEscapeSequence(esc) => esc.cv(buf), + EscapeSequence::Null(null) => null.cv(buf), + EscapeSequence::HexEscapeSequence(hex) => hex.cv(buf), + EscapeSequence::UnicodeEscapeSequence(unicode) => unicode.cv(buf), + } + } +} + +impl CharacterValue for CharacterEscapeSequence { + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + match self { + CharacterEscapeSequence::Single(single) => single.cv(buf), + CharacterEscapeSequence::NonEscape(non_escape) => non_escape.cv(buf), + } + } +} + +impl CharacterValue for Null { + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + '\u{0000}'.encode_utf16(buf) + } +} + +impl CharacterValue for SingleEscapeChar { + /// + /// Compliant with [Table 4, Section 7.4](https://262.ecma-international.org/5.1/#sec-7.8.4) + /// of the ECMAScript spec. 
+ /// + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + match self.raw { + '\'' => '\u{0027}', // single quote + '"' => '\u{0022}', // double quote + '\\' => '\u{005C}', // backslash + 'b' => '\u{0008}', // backspace + 'f' => '\u{000C}', // form feed + 'n' => '\u{000A}', // line feed (new line) + 'r' => '\u{000D}', // carriage return + 't' => '\u{0009}', // horizontal tab + 'v' => '\u{000B}', // vertical tab + _ => unreachable!(), + } + .encode_utf16(buf) + } +} + +impl CharacterValue for NonEscapeChar { + /// + /// > The CV of NonEscapeCharacter :: SourceCharacter but not one of EscapeCharacter or + /// > LineTerminator is the SourceCharacter character itself. + /// + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + self.raw.encode_utf16(buf) + } +} + +impl CharacterValue for HexEscapeSequence { + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + buf[0] = self.1.mv() as u16; + &buf[0..1] + } +} + +impl CharacterValue for UnicodeEscapeSequence { + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + buf[0] = self.1.mv(); + &buf[0..1] + } +} + +#[cfg(test)] +mod tests { + use crate::{ + common::{file::SourceFile, Source}, + lexing::{ + tokens::escapes::{CharacterEscapeSequence, EscapeSequence, NonEscapeChar}, + Exactly, Lex, Verbatim, + }, + }; + + use super::{HexEscapeSequence, Null, SingleEscapeChar, UnicodeEscapeSequence}; + + #[test] + fn single_escape() { + let source = SourceFile::dummy_file("'\"\\bfnrtv"); + let input = &mut source.stream(); + let esc: Exactly<9, SingleEscapeChar> = input.lex().expect("Valid parse"); + assert!(matches!( + &*esc, + &[ + SingleEscapeChar { raw: '\'', .. }, + SingleEscapeChar { raw: '"', .. }, + SingleEscapeChar { raw: '\\', .. }, + SingleEscapeChar { raw: 'b', .. }, + SingleEscapeChar { raw: 'f', .. }, + SingleEscapeChar { raw: 'n', .. }, + SingleEscapeChar { raw: 'r', .. }, + SingleEscapeChar { raw: 't', .. }, + SingleEscapeChar { raw: 'v', .. 
}, + ] + )) + } + + #[test] + fn non_escape_char() { + let source = SourceFile::dummy_file("a!£%*&-=💩"); + let input = &mut source.stream(); + let esc: Exactly<9, NonEscapeChar> = input.lex().expect("Valid parse"); + assert!(matches!( + &*esc, + &[ + NonEscapeChar { raw: 'a', .. }, + NonEscapeChar { raw: '!', .. }, + NonEscapeChar { raw: '£', .. }, + NonEscapeChar { raw: '%', .. }, + NonEscapeChar { raw: '*', .. }, + NonEscapeChar { raw: '&', .. }, + NonEscapeChar { raw: '-', .. }, + NonEscapeChar { raw: '=', .. }, + NonEscapeChar { raw: '💩', .. }, + ] + )) + } + + #[test] + fn character_escape_sequence() { + let source = SourceFile::dummy_file("'\"\\bfnrtva!£%*&-=💩"); + let input = &mut source.stream(); + let esc: Exactly<18, CharacterEscapeSequence> = input.lex().expect("Valid parse"); + assert!(matches!( + &*esc, + &[ + CharacterEscapeSequence::Single(SingleEscapeChar { raw: '\'', .. }), + CharacterEscapeSequence::Single(SingleEscapeChar { raw: '"', .. }), + CharacterEscapeSequence::Single(SingleEscapeChar { raw: '\\', .. }), + CharacterEscapeSequence::Single(SingleEscapeChar { raw: 'b', .. }), + CharacterEscapeSequence::Single(SingleEscapeChar { raw: 'f', .. }), + CharacterEscapeSequence::Single(SingleEscapeChar { raw: 'n', .. }), + CharacterEscapeSequence::Single(SingleEscapeChar { raw: 'r', .. }), + CharacterEscapeSequence::Single(SingleEscapeChar { raw: 't', .. }), + CharacterEscapeSequence::Single(SingleEscapeChar { raw: 'v', .. }), + CharacterEscapeSequence::NonEscape(NonEscapeChar { raw: 'a', .. }), + CharacterEscapeSequence::NonEscape(NonEscapeChar { raw: '!', .. }), + CharacterEscapeSequence::NonEscape(NonEscapeChar { raw: '£', .. }), + CharacterEscapeSequence::NonEscape(NonEscapeChar { raw: '%', .. }), + CharacterEscapeSequence::NonEscape(NonEscapeChar { raw: '*', .. }), + CharacterEscapeSequence::NonEscape(NonEscapeChar { raw: '&', .. }), + CharacterEscapeSequence::NonEscape(NonEscapeChar { raw: '-', .. 
}), + CharacterEscapeSequence::NonEscape(NonEscapeChar { raw: '=', .. }), + CharacterEscapeSequence::NonEscape(NonEscapeChar { raw: '💩', .. }), + ] + )) + } + + #[test] + fn null_char() { + { + let source = SourceFile::dummy_file("0"); + let input = &mut source.stream(); + let _: Null = input.lex().expect("Valid parse"); + } + + { + let source = SourceFile::dummy_file("01"); + let input = &mut source.stream(); + let esc = Null::lex(input); + assert!(esc.is_nothing()) + } + } + + #[test] + fn hex_escape() { + let source = SourceFile::dummy_file("x20x26x25x3c"); + let input = &mut source.stream(); + let _: Exactly<4, HexEscapeSequence> = input.lex().expect("Valid parse"); + } + + #[test] + fn unicode_escape() { + let source = SourceFile::dummy_file("u0000u2AFCu6798u1623"); + let input = &mut source.stream(); + let _: Exactly<4, UnicodeEscapeSequence> = input.lex().expect("Valid parse"); + } + + #[test] + fn mixed() { + let source = + SourceFile::dummy_file("'\"\\bfnrtva!£%*&-=💩0x20x26x25x3cu0000u2AFCu6798u1623"); + let input = &mut source.stream(); + let esc: Exactly<27, EscapeSequence> = input.lex().expect("Valid parse"); + assert!(matches!( + &*esc, + &[ + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::Single( + SingleEscapeChar { raw: '\'', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::Single( + SingleEscapeChar { raw: '"', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::Single( + SingleEscapeChar { raw: '\\', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::Single( + SingleEscapeChar { raw: 'b', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::Single( + SingleEscapeChar { raw: 'f', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::Single( + SingleEscapeChar { raw: 'n', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::Single( + SingleEscapeChar { raw: 'r', .. 
} + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::Single( + SingleEscapeChar { raw: 't', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::Single( + SingleEscapeChar { raw: 'v', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::NonEscape( + NonEscapeChar { raw: 'a', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::NonEscape( + NonEscapeChar { raw: '!', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::NonEscape( + NonEscapeChar { raw: '£', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::NonEscape( + NonEscapeChar { raw: '%', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::NonEscape( + NonEscapeChar { raw: '*', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::NonEscape( + NonEscapeChar { raw: '&', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::NonEscape( + NonEscapeChar { raw: '-', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::NonEscape( + NonEscapeChar { raw: '=', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::NonEscape( + NonEscapeChar { raw: '💩', .. } + )), + EscapeSequence::Null(Null { .. }), + EscapeSequence::HexEscapeSequence(HexEscapeSequence(Verbatim::<"x"> { .. }, _)), + EscapeSequence::HexEscapeSequence(HexEscapeSequence(Verbatim::<"x"> { .. }, _)), + EscapeSequence::HexEscapeSequence(HexEscapeSequence(Verbatim::<"x"> { .. }, _)), + EscapeSequence::HexEscapeSequence(HexEscapeSequence(Verbatim::<"x"> { .. }, _)), + EscapeSequence::UnicodeEscapeSequence(UnicodeEscapeSequence( + Verbatim::<"u"> { .. }, + _ + )), + EscapeSequence::UnicodeEscapeSequence(UnicodeEscapeSequence( + Verbatim::<"u"> { .. }, + _ + )), + EscapeSequence::UnicodeEscapeSequence(UnicodeEscapeSequence( + Verbatim::<"u"> { .. 
}, + _ + )), + EscapeSequence::UnicodeEscapeSequence(UnicodeEscapeSequence( + Verbatim::<"u"> { .. }, + _ + )), + ] + )) + } +} diff --git a/src/lexing/tokens/identifier.rs b/src/lexing/tokens/identifier.rs new file mode 100644 index 0000000..7429850 --- /dev/null +++ b/src/lexing/tokens/identifier.rs @@ -0,0 +1,750 @@ +//! +//! ## Identifiers +//! + +use std::iter::once; + +use finl_unicode::categories::{CharacterCategories, MinorCategory}; + +use crate::{ + common::{Source, Spanned}, + lexing::{Lex, LexError, LexResult, LexT, Many, SourceStream}, + unicode as u, verbatim as v, ECMARef, Spanned, SpecRef, +}; + +use super::{ + escapes::UnicodeEscapeSequence, + string::{collect_cv_into_utf16, CharacterValue, StringValue}, +}; + +#[SpecRef("JSON5Identifier")] +#[derive(Debug, Spanned)] +pub struct Identifier(IdentifierName); + +/// +/// > Identifier Names are tokens that are interpreted +/// > according to the grammar given in the “Identifiers” section +/// > of chapter 5 of the Unicode standard, with some small modifications. +/// +#[ECMARef("IdentifierName", "https://262.ecma-international.org/5.1/#sec-7.6")] +#[derive(Debug, Spanned)] +pub struct IdentifierName(IdentifierStart, Many); + +/// +/// The first character in an identifier. +/// +#[ECMARef("IdentifierStart", "https://262.ecma-international.org/5.1/#sec-7.6")] +#[derive(Debug, Spanned, Clone)] +pub enum IdentifierStart { + Letter(UnicodeLetter), + Dollar(v!('$')), + Underscore(v!('_')), + Escape(v!('\\'), UnicodeEscapeSequence), +} + +/// +/// Any part of an identifier folowing the starting part. +/// +#[ECMARef("IdentifierPart", "https://262.ecma-international.org/5.1/#sec-7.6")] +#[derive(Debug, Spanned, Clone)] +pub enum IdentifierPart { + /// + /// This is not part of the ECMAScript spec, + /// but is necessary in order to get the context + /// correctly in the escaped character's validity checks. 
+ /// + Escape(v!('\\'), UnicodeEscapeSequence), + Start(IdentifierStart), + CombiningMark(UnicodeCombiningMark), + Digit(UnicodeDigit), + ConnectorPunctuation(UnicodeConnectorPunctuation), + + /// + /// Zero width non-joiner + /// + ZWNJ(v!('\u{200C}')), + + /// + /// Zero width joiner + /// + ZWJ(v!('\u{200D}')), +} + +/// +/// > any character in the Unicode categories “Uppercase letter (Lu)”, +/// > “Lowercase letter (Ll)”, “Titlecase letter (Lt)”, “Modifier letter (Lm)”, +/// > “Other letter (Lo)”, or “Letter number (Nl)” +/// +#[ECMARef("UnicodeLetter", "https://262.ecma-international.org/5.1/#sec-7.6")] +pub type UnicodeLetter = u!(Lu | Ll | Lt | Lm | Lo | Nl); + +/// +/// > any character in the Unicode categories “Non-spacing mark (Mn)” +/// > or “Combining spacing mark (Mc)” +/// +#[ECMARef( + "UnicodeCombiningMark", + "https://262.ecma-international.org/5.1/#sec-7.6" +)] +pub type UnicodeCombiningMark = u!(Mn | Mc); + +/// +/// > any character in the Unicode category “Decimal number (Nd)” +/// +#[ECMARef("UnicodeDigit", "https://262.ecma-international.org/5.1/#sec-7.6")] +pub type UnicodeDigit = u!(Nd); + +/// +/// any character in the Unicode category “Connector punctuation (Pc)” +/// +#[ECMARef( + "UnicodeConnectorPunctuation", + "https://262.ecma-international.org/5.1/#sec-7.6" +)] +pub type UnicodeConnectorPunctuation = u!(Pc); + +// --- + +/// +/// What characters does this identifier part accept? 
+/// +pub trait CharacterAcceptor { + fn accepts(ch: &char) -> bool; +} + +impl CharacterAcceptor for IdentifierStart { + fn accepts(ch: &char) -> bool { + use MinorCategory::*; + match ch { + c if matches!(c.get_minor_category(), Lu | Ll | Lt | Lm | Lo | Nl) => true, + '$' => true, + '_' => true, + _ => false, + } + } +} + +impl CharacterAcceptor for IdentifierPart { + fn accepts(ch: &char) -> bool { + use MinorCategory::*; + match ch { + c if IdentifierStart::accepts(c) => true, + c if matches!(c.get_minor_category(), Mn | Mc | Nd | Pc) => true, + '\u{200C}' => true, + '\u{200D}' => true, + _ => false, + } + } +} + +/// +/// Check to see if the unicode escape code's value +/// is still valid in the context of an identifier part. +/// +/// > A UnicodeEscapeSequence cannot be used to put a +/// > character into an IdentifierName that would otherwise be illegal. +/// +/// — [see more](https://262.ecma-international.org/5.1/#sec-7.6). +/// +pub fn check_unicode_escape( + backslash: v!('\\'), + escape: UnicodeEscapeSequence, + map: fn(v!('\\'), UnicodeEscapeSequence) -> T, +) -> LexResult { + let ch = escape.try_as_char(); + if !ch.map(|ch: char| T::accepts(&ch)).unwrap_or(false) { + return LexResult::Errant(LexError::new( + &backslash.span().combine([escape.span()]), + format!( + "Invalid escaped character in identifier: `{}` is not valid here.", + ch.unwrap() + ), + )); + } + + LexResult::Lexed(map(backslash, escape)) +} + +// --- + +impl LexT for Identifier { + fn peek(input: &SourceStream) -> bool { + ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + Ok(Self(::lex(input)?)) + } +} + +impl LexT for IdentifierName { + fn peek(input: &SourceStream) -> bool { + ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + let start = LexT::lex(input)?; + let after = Lex::lex(input).unwrap_as_result()?; + Ok(Self(start, after)) + } +} + +impl LexT for IdentifierStart { + fn peek(input: &SourceStream) -> bool { + ::peek(input) + || 
::peek(input) + || ::peek(input) + || ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + // .unwrap_as_reult() ok since Self::peek() -> one variant exists. + Lex::lex(input) + .map(Self::Letter) + .or(|| input.lex().map(Self::Dollar)) + .or(|| input.lex().map(Self::Underscore)) + .or(|| { + input.lex().and(|backslash: v!('\\')| { + input + .lex() + .expected_msg(input, "Expected a unicode escape sequence `\\uXXXX` here.") + .and(|escape: UnicodeEscapeSequence| { + check_unicode_escape(backslash, escape, Self::Escape) + }) + }) + }) + .unwrap_as_result() + } +} + +impl LexT for IdentifierPart { + fn peek(input: &SourceStream) -> bool { + ::peek(input) + || ::peek(input) + || ::peek(input) + || ::peek(input) + || ::peek(input) + || ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + // .unwrap_as_result() ok since Self::peek() -> exists one of the variants. + Lex::lex(input) + .and(|backslash: v!('\\')| { + input + .lex() + .expected_msg(input, "Expected a unicode escape sequence `\\uXXXX` here.") + .and(|escape: UnicodeEscapeSequence| { + check_unicode_escape(backslash, escape, Self::Escape) + }) + }) + .or(|| input.lex().map(Self::Start)) + .or(|| input.lex().map(Self::CombiningMark)) + .or(|| input.lex().map(Self::Digit)) + .or(|| input.lex().map(Self::ConnectorPunctuation)) + .or(|| input.lex().map(Self::ZWNJ)) + .or(|| input.lex().map(Self::ZWJ)) + .unwrap_as_result() + } +} + +// --- + +impl StringValue for Identifier { + fn sv(&self) -> Vec { + self.0.sv() + } +} + +impl StringValue for IdentifierName { + fn sv(&self) -> Vec { + let binding = IdentifierPart::Start(self.0.clone()); + let tmp: Vec<_> = once(&binding).chain(self.1.iter()).collect(); + collect_cv_into_utf16(tmp) + } +} + +// --- + +impl CharacterValue for IdentifierStart { + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + match self { + IdentifierStart::Letter(letter) => letter.cv(buf), + IdentifierStart::Dollar(_) => '$'.encode_utf16(buf), + 
IdentifierStart::Underscore(_) => '_'.encode_utf16(buf), + IdentifierStart::Escape(_, esc) => esc.cv(buf), + } + } +} + +impl CharacterValue for IdentifierPart { + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + match self { + IdentifierPart::Escape(_, escape) => escape.cv(buf), + IdentifierPart::Start(start) => start.cv(buf), + IdentifierPart::CombiningMark(cm) => cm.cv(buf), + IdentifierPart::Digit(digit) => digit.cv(buf), + IdentifierPart::ConnectorPunctuation(cp) => cp.cv(buf), + IdentifierPart::ZWNJ(_) => '\u{200C}'.encode_utf16(buf), + IdentifierPart::ZWJ(_) => '\u{200D}'.encode_utf16(buf), + } + } +} + +#[cfg(test)] +mod tests { + use crate::{ + common::{file::SourceFile, Source}, + lexing::LexResult, + }; + + use super::{Identifier, IdentifierPart, IdentifierStart}; + + fn test_identifier(st: &'static str) -> LexResult { + let source = SourceFile::dummy_file(st); + let input = &mut source.stream(); + input.lex() + } + + fn test_start(st: &'static str) -> LexResult { + let source = SourceFile::dummy_file(st); + let input = &mut source.stream(); + input.lex() + } + + fn test_middle(st: &'static str) -> LexResult { + let source = SourceFile::dummy_file(st); + let input = &mut source.stream(); + input.lex() + } + + #[test] + fn start() { + // Ll + test_identifier("a").expect("Valid parse!"); + test_identifier("ʘ").expect("Valid parse!"); + test_identifier("ξ").expect("Valid parse!"); + test_identifier("я").expect("Valid parse!"); + test_identifier("ᴓ").expect("Valid parse!"); + test_identifier("ⱅ").expect("Valid parse!"); + test_identifier("ꮇ").expect("Valid parse!"); + test_identifier("v").expect("Valid parse!"); + test_identifier("𐳭").expect("Valid parse!"); + test_identifier("𝐨").expect("Valid parse!"); + test_identifier("𝕘").expect("Valid parse!"); + test_identifier("𝛝").expect("Valid parse!"); + test_identifier("𞥃").expect("Valid parse!"); + + // Lm + test_identifier("ˑ").expect("Valid parse!"); + test_identifier("ˬ").expect("Valid 
parse!"); + test_identifier("ᶾ").expect("Valid parse!"); + test_identifier("〲").expect("Valid parse!"); + test_identifier("ꫝ").expect("Valid parse!"); + test_identifier("𖿡").expect("Valid parse!"); + + // Lo + test_identifier("ڧ").expect("Valid parse!"); + test_identifier("ݦ").expect("Valid parse!"); + test_identifier("ࠊ").expect("Valid parse!"); + test_identifier("ओ").expect("Valid parse!"); + test_identifier("ੴ").expect("Valid parse!"); + test_identifier("ࣅ").expect("Valid parse!"); + test_identifier("ഐ").expect("Valid parse!"); + test_identifier("ᆿ").expect("Valid parse!"); + test_identifier("ሥ").expect("Valid parse!"); + test_identifier("ᐚ").expect("Valid parse!"); + test_identifier("ᑺ").expect("Valid parse!"); + test_identifier("ᔐ").expect("Valid parse!"); + test_identifier("ᖲ").expect("Valid parse!"); + test_identifier("ᚙ").expect("Valid parse!"); + test_identifier("ᛦ").expect("Valid parse!"); + test_identifier("ᠩ").expect("Valid parse!"); + test_identifier("ᩐ").expect("Valid parse!"); + test_identifier("ᮯ").expect("Valid parse!"); + test_identifier("ⶦ").expect("Valid parse!"); + test_identifier("ツ").expect("Valid parse!"); + test_identifier("ㆈ").expect("Valid parse!"); + test_identifier("㐯").expect("Valid parse!"); + test_identifier("㔇").expect("Valid parse!"); + test_identifier("㠓").expect("Valid parse!"); + test_identifier("㨝").expect("Valid parse!"); + + // Lt + test_identifier("ᾫ").expect("Valid parse!"); + test_identifier("ᾝ").expect("Valid parse!"); + test_identifier("Dž").expect("Valid parse!"); + + // Lu + test_identifier("A").expect("Valid parse!"); + test_identifier("Ǡ").expect("Valid parse!"); + test_identifier("Έ").expect("Valid parse!"); + test_identifier("Щ").expect("Valid parse!"); + test_identifier("Ꮿ").expect("Valid parse!"); + test_identifier("Å").expect("Valid parse!"); + test_identifier("ℜ").expect("Valid parse!"); + test_identifier("Ᵽ").expect("Valid parse!"); + test_identifier("T").expect("Valid parse!"); + 
test_identifier("𐲱").expect("Valid parse!"); + test_identifier("𝓨").expect("Valid parse!"); + test_identifier("𝗨").expect("Valid parse!"); + test_identifier("𝝫").expect("Valid parse!"); + test_identifier("𞤞").expect("Valid parse!"); + + // Nl + test_identifier("Ⅲ").expect("Valid parse!"); + test_identifier("ↈ").expect("Valid parse!"); + test_identifier("𐅰").expect("Valid parse!"); + test_identifier("𒐒").expect("Valid parse!"); + test_identifier("𒐪").expect("Valid parse!"); + test_identifier("𒑚").expect("Valid parse!"); + test_identifier("𒑮").expect("Valid parse!"); + + test_identifier("_").expect("Valid parse!"); + test_identifier("$").expect("Valid parse!"); + test_identifier(r"\u0041").expect("Valid parse"); // `A` + + // Invalid Starting unicode escape code `@` + test_identifier(r"\u0040").unwrap_err(); + + // Middle-only characters + // Mn + assert!(!test_start("◌̣").is_lexed()); + assert!(!test_start("◌ַ").is_lexed()); + assert!(!test_start("◌ܶ").is_lexed()); + assert!(!test_start("◌ࣟ").is_lexed()); + assert!(!test_start("◌ై").is_lexed()); + assert!(!test_start("◌ླྀ").is_lexed()); + assert!(!test_start("◌ᬼ").is_lexed()); + assert!(!test_start("◌ⷻ").is_lexed()); + assert!(!test_start("◌ꦸ").is_lexed()); + assert!(!test_start("◌𝨰").is_lexed()); + assert!(!test_start("◌𝪩").is_lexed()); + assert!(!test_start("◌󠇬").is_lexed()); + + // Mc + assert!(!test_start("ா").is_lexed()); + assert!(!test_start("ௌ").is_lexed()); + assert!(!test_start("ෛ").is_lexed()); + assert!(!test_start("ြ").is_lexed()); + assert!(!test_start("ᬽ").is_lexed()); + assert!(!test_start("ꦾ").is_lexed()); + assert!(!test_start("𑍣").is_lexed()); + assert!(!test_start("𑲩").is_lexed()); + assert!(!test_start("𝅲").is_lexed()); + assert!(!test_start("𝅦").is_lexed()); + + // Nd + assert!(!test_start("1").is_lexed()); + assert!(!test_start("9").is_lexed()); + assert!(!test_start("٢").is_lexed()); + assert!(!test_start("٤").is_lexed()); + assert!(!test_start("৩").is_lexed()); + 
assert!(!test_start("੦").is_lexed()); + assert!(!test_start("௫").is_lexed()); + assert!(!test_start("൫").is_lexed()); + assert!(!test_start("໙").is_lexed()); + assert!(!test_start("႒").is_lexed()); + assert!(!test_start("᭑").is_lexed()); + assert!(!test_start("꧓").is_lexed()); + assert!(!test_start("꩘").is_lexed()); + assert!(!test_start("𝟯").is_lexed()); + assert!(!test_start("🯷").is_lexed()); + + // Pc + assert!(!test_start("‿").is_lexed()); + assert!(!test_start("⁀").is_lexed()); + assert!(!test_start("⁔").is_lexed()); + assert!(!test_start("︳").is_lexed()); + assert!(!test_start("︴").is_lexed()); + assert!(!test_start("﹍").is_lexed()); + assert!(!test_start("﹎").is_lexed()); + assert!(!test_start("﹏").is_lexed()); + assert!(!test_start("_").is_lexed()); + } + + #[test] + fn middle() { + // Ll + test_identifier("_a").expect("Valid parse!"); + test_identifier("_ʘ").expect("Valid parse!"); + test_identifier("_ξ").expect("Valid parse!"); + test_identifier("_я").expect("Valid parse!"); + test_identifier("_ᴓ").expect("Valid parse!"); + test_identifier("_ⱅ").expect("Valid parse!"); + test_identifier("_ꮇ").expect("Valid parse!"); + test_identifier("_v").expect("Valid parse!"); + test_identifier("_𐳭").expect("Valid parse!"); + test_identifier("_𝐨").expect("Valid parse!"); + test_identifier("_𝕘").expect("Valid parse!"); + test_identifier("_𝛝").expect("Valid parse!"); + test_identifier("_𞥃").expect("Valid parse!"); + + // Lm + test_identifier("_ˑ").expect("Valid parse!"); + test_identifier("_ˬ").expect("Valid parse!"); + test_identifier("_ᶾ").expect("Valid parse!"); + test_identifier("_〲").expect("Valid parse!"); + test_identifier("_ꫝ").expect("Valid parse!"); + test_identifier("_𖿡").expect("Valid parse!"); + + // Lo + test_identifier("_ڧ").expect("Valid parse!"); + test_identifier("_ݦ").expect("Valid parse!"); + test_identifier("_ࠊ").expect("Valid parse!"); + test_identifier("_ओ").expect("Valid parse!"); + test_identifier("_ੴ").expect("Valid parse!"); + 
test_identifier("_ࣅ").expect("Valid parse!"); + test_identifier("_ഐ").expect("Valid parse!"); + test_identifier("_ᆿ").expect("Valid parse!"); + test_identifier("_ሥ").expect("Valid parse!"); + test_identifier("_ᐚ").expect("Valid parse!"); + test_identifier("_ᑺ").expect("Valid parse!"); + test_identifier("_ᔐ").expect("Valid parse!"); + test_identifier("_ᖲ").expect("Valid parse!"); + test_identifier("_ᚙ").expect("Valid parse!"); + test_identifier("_ᛦ").expect("Valid parse!"); + test_identifier("_ᠩ").expect("Valid parse!"); + test_identifier("_ᩐ").expect("Valid parse!"); + test_identifier("_ᮯ").expect("Valid parse!"); + test_identifier("_ⶦ").expect("Valid parse!"); + test_identifier("_ツ").expect("Valid parse!"); + test_identifier("_ㆈ").expect("Valid parse!"); + test_identifier("_㐯").expect("Valid parse!"); + test_identifier("_㔇").expect("Valid parse!"); + test_identifier("_㠓").expect("Valid parse!"); + test_identifier("_㨝").expect("Valid parse!"); + + // Lt + test_identifier("_ᾫ").expect("Valid parse!"); + test_identifier("_ᾝ").expect("Valid parse!"); + test_identifier("_Dž").expect("Valid parse!"); + + // Lu + test_identifier("_A").expect("Valid parse!"); + test_identifier("_Ǡ").expect("Valid parse!"); + test_identifier("_Έ").expect("Valid parse!"); + test_identifier("_Щ").expect("Valid parse!"); + test_identifier("_Ꮿ").expect("Valid parse!"); + test_identifier("_Å").expect("Valid parse!"); + test_identifier("_ℜ").expect("Valid parse!"); + test_identifier("_Ᵽ").expect("Valid parse!"); + test_identifier("_T").expect("Valid parse!"); + test_identifier("_𐲱").expect("Valid parse!"); + test_identifier("_𝓨").expect("Valid parse!"); + test_identifier("_𝗨").expect("Valid parse!"); + test_identifier("_𝝫").expect("Valid parse!"); + test_identifier("_𞤞").expect("Valid parse!"); + + // Nl + test_identifier("_Ⅲ").expect("Valid parse!"); + test_identifier("_ↈ").expect("Valid parse!"); + test_identifier("_𐅰").expect("Valid parse!"); + test_identifier("_𒐒").expect("Valid parse!"); + 
test_identifier("_𒐪").expect("Valid parse!"); + test_identifier("_𒑚").expect("Valid parse!"); + test_identifier("_𒑮").expect("Valid parse!"); + + // Mn + test_identifier("_◌̣").expect("Valid parse!"); + test_identifier("_◌ַ").expect("Valid parse!"); + test_identifier("_◌ܶ").expect("Valid parse!"); + test_identifier("_◌ࣟ").expect("Valid parse!"); + test_identifier("_◌ై").expect("Valid parse!"); + test_identifier("_◌ླྀ").expect("Valid parse!"); + test_identifier("_◌ᬼ").expect("Valid parse!"); + test_identifier("_◌ⷻ").expect("Valid parse!"); + test_identifier("_◌ꦸ").expect("Valid parse!"); + test_identifier("_◌𝨰").expect("Valid parse!"); + test_identifier("_◌𝪩").expect("Valid parse!"); + test_identifier("_◌󠇬").expect("Valid parse!"); + + // Mc + test_identifier("_ா").expect("Valid parse!"); + test_identifier("_ௌ").expect("Valid parse!"); + test_identifier("_ෛ").expect("Valid parse!"); + test_identifier("_ြ").expect("Valid parse!"); + test_identifier("_ᬽ").expect("Valid parse!"); + test_identifier("_ꦾ").expect("Valid parse!"); + test_identifier("_𑍣").expect("Valid parse!"); + test_identifier("_𑲩").expect("Valid parse!"); + test_identifier("_𝅲").expect("Valid parse!"); + test_identifier("_𝅦").expect("Valid parse!"); + + // Nd + test_identifier("_1").expect("Valid parse!"); + test_identifier("_9").expect("Valid parse!"); + test_identifier("_٢").expect("Valid parse!"); + test_identifier("_٤").expect("Valid parse!"); + test_identifier("_৩").expect("Valid parse!"); + test_identifier("_੦").expect("Valid parse!"); + test_identifier("_௫").expect("Valid parse!"); + test_identifier("_൫").expect("Valid parse!"); + test_identifier("_໙").expect("Valid parse!"); + test_identifier("_႒").expect("Valid parse!"); + test_identifier("_᭑").expect("Valid parse!"); + test_identifier("_꧓").expect("Valid parse!"); + test_identifier("_꩘").expect("Valid parse!"); + test_identifier("_𝟯").expect("Valid parse!"); + test_identifier("_🯷").expect("Valid parse!"); + + // Pc + 
test_identifier("_‿").expect("Valid parse!"); + test_identifier("_⁀").expect("Valid parse!"); + test_identifier("_⁔").expect("Valid parse!"); + test_identifier("_︳").expect("Valid parse!"); + test_identifier("_︴").expect("Valid parse!"); + test_identifier("_﹍").expect("Valid parse!"); + test_identifier("_﹎").expect("Valid parse!"); + test_identifier("_﹏").expect("Valid parse!"); + test_identifier("__").expect("Valid parse!"); + + test_identifier("__").expect("Valid parse!"); + test_identifier("_$").expect("Valid parse!"); + test_identifier(r"_\u0041").expect("Valid parse"); // `A` + + test_identifier(r"_\u0040").unwrap_err(); + } + + #[test] + fn invalid() { + // Sm + assert!(!test_start(r"÷").is_lexed()); + assert!(!test_start(r"⅀").is_lexed()); + assert!(!test_start(r"∃").is_lexed()); + assert!(!test_start(r"∉").is_lexed()); + assert!(!test_start(r"∏").is_lexed()); + assert!(!test_start(r"∜").is_lexed()); + assert!(!test_start(r"⌠").is_lexed()); + assert!(!test_start(r"⌡").is_lexed()); + assert!(!test_start(r"⟜").is_lexed()); + assert!(!test_start(r"⨜").is_lexed()); + assert!(!test_start(r"⨷").is_lexed()); + assert!(!test_start(r"⪔").is_lexed()); + assert!(!test_start(r"𞻱").is_lexed()); + + assert!(!test_middle(r"÷").is_lexed()); + assert!(!test_middle(r"⅀").is_lexed()); + assert!(!test_middle(r"∃").is_lexed()); + assert!(!test_middle(r"∉").is_lexed()); + assert!(!test_middle(r"∏").is_lexed()); + assert!(!test_middle(r"∜").is_lexed()); + assert!(!test_middle(r"⌠").is_lexed()); + assert!(!test_middle(r"⌡").is_lexed()); + assert!(!test_middle(r"⟜").is_lexed()); + assert!(!test_middle(r"⨜").is_lexed()); + assert!(!test_middle(r"⨷").is_lexed()); + assert!(!test_middle(r"⪔").is_lexed()); + assert!(!test_middle(r"𞻱").is_lexed()); + } + + #[test] + fn escape_codes() { + // Valid Start tests + test_start(r"\u0061").expect("Valid parse!"); + test_start(r"\u0298").expect("Valid parse!"); + test_start(r"\u03be").expect("Valid parse!"); + test_start(r"\u044f").expect("Valid 
parse!"); + test_start(r"\u1d13").expect("Valid parse!"); + test_start(r"\u2c45").expect("Valid parse!"); + test_start(r"\uab87").expect("Valid parse!"); + test_start(r"\uff56").expect("Valid parse!"); + + test_start(r"\u02d1").expect("Valid parse!"); + test_start(r"\u02ec").expect("Valid parse!"); + test_start(r"\u1dbe").expect("Valid parse!"); + test_start(r"\u3032").expect("Valid parse!"); + test_start(r"\uaadd").expect("Valid parse!"); + test_start(r"\u06a7").expect("Valid parse!"); + test_start(r"\u0766").expect("Valid parse!"); + test_start(r"\u080a").expect("Valid parse!"); + test_start(r"\u0913").expect("Valid parse!"); + test_start(r"\u0a74").expect("Valid parse!"); + test_start(r"\u08c5").expect("Valid parse!"); + test_start(r"\u0d10").expect("Valid parse!"); + test_start(r"\u11bf").expect("Valid parse!"); + test_start(r"\u1225").expect("Valid parse!"); + test_start(r"\u141a").expect("Valid parse!"); + test_start(r"\u147a").expect("Valid parse!"); + test_start(r"\u1510").expect("Valid parse!"); + test_start(r"\u15b2").expect("Valid parse!"); + test_start(r"\u1699").expect("Valid parse!"); + test_start(r"\u16e6").expect("Valid parse!"); + test_start(r"\u1829").expect("Valid parse!"); + test_start(r"\u1a50").expect("Valid parse!"); + test_start(r"\u1baf").expect("Valid parse!"); + test_start(r"\u2da6").expect("Valid parse!"); + test_start(r"\u30c4").expect("Valid parse!"); + test_start(r"\u3188").expect("Valid parse!"); + test_start(r"\u342f").expect("Valid parse!"); + test_start(r"\u3507").expect("Valid parse!"); + test_start(r"\u3813").expect("Valid parse!"); + test_start(r"\u3a1d").expect("Valid parse!"); + test_start(r"\u1fab").expect("Valid parse!"); + test_start(r"\u1f9d").expect("Valid parse!"); + test_start(r"\u01c5").expect("Valid parse!"); + test_start(r"\u0041").expect("Valid parse!"); + test_start(r"\u01e0").expect("Valid parse!"); + test_start(r"\u0388").expect("Valid parse!"); + test_start(r"\u0429").expect("Valid parse!"); + 
test_start(r"\u13ef").expect("Valid parse!"); + test_start(r"\u212b").expect("Valid parse!"); + test_start(r"\u211c").expect("Valid parse!"); + test_start(r"\u2c63").expect("Valid parse!"); + test_start(r"\uff34").expect("Valid parse!"); + test_start(r"\u2162").expect("Valid parse!"); + test_start(r"\u2188").expect("Valid parse!"); + test_start(r"\u005f").expect("Valid parse!"); + test_start(r"\u0024").expect("Valid parse!"); + + // Invalid start character tests + assert!(!test_start(r"\u0031").is_lexed()); + assert!(!test_start(r"\u0039").is_lexed()); + assert!(!test_start(r"\u0662").is_lexed()); + assert!(!test_start(r"\u0664").is_lexed()); + assert!(!test_start(r"\u09e9").is_lexed()); + assert!(!test_start(r"\u0a66").is_lexed()); + assert!(!test_start(r"\u0beb").is_lexed()); + assert!(!test_start(r"\u0d6b").is_lexed()); + assert!(!test_start(r"\u0ed9").is_lexed()); + assert!(!test_start(r"\u1092").is_lexed()); + assert!(!test_start(r"\u1b51").is_lexed()); + assert!(!test_start(r"\ua9d3").is_lexed()); + assert!(!test_start(r"\uaa58").is_lexed()); + assert!(!test_start(r"\u203f").is_lexed()); + assert!(!test_start(r"\u2040").is_lexed()); + assert!(!test_start(r"\u2054").is_lexed()); + assert!(!test_start(r"\ufe33").is_lexed()); + assert!(!test_start(r"\ufe34").is_lexed()); + assert!(!test_start(r"\ufe4d").is_lexed()); + assert!(!test_start(r"\ufe4e").is_lexed()); + assert!(!test_start(r"\ufe4f").is_lexed()); + assert!(!test_start(r"\uff3f").is_lexed()); + + // Valid middle character tests + assert!(test_middle(r"\u0031").is_lexed()); + assert!(test_middle(r"\u0039").is_lexed()); + assert!(test_middle(r"\u0662").is_lexed()); + assert!(test_middle(r"\u0664").is_lexed()); + assert!(test_middle(r"\u09e9").is_lexed()); + assert!(test_middle(r"\u0a66").is_lexed()); + assert!(test_middle(r"\u0beb").is_lexed()); + assert!(test_middle(r"\u0d6b").is_lexed()); + assert!(test_middle(r"\u0ed9").is_lexed()); + assert!(test_middle(r"\u1092").is_lexed()); + 
assert!(test_middle(r"\u1b51").is_lexed()); + assert!(test_middle(r"\ua9d3").is_lexed()); + assert!(test_middle(r"\uaa58").is_lexed()); + assert!(test_middle(r"\u203f").is_lexed()); + assert!(test_middle(r"\u2040").is_lexed()); + assert!(test_middle(r"\u2054").is_lexed()); + assert!(test_middle(r"\ufe33").is_lexed()); + assert!(test_middle(r"\ufe34").is_lexed()); + assert!(test_middle(r"\ufe4d").is_lexed()); + assert!(test_middle(r"\ufe4e").is_lexed()); + assert!(test_middle(r"\ufe4f").is_lexed()); + assert!(test_middle(r"\uff3f").is_lexed()); + assert!(test_middle(r"\u005f").is_lexed()); + assert!(test_middle(r"\u0024").is_lexed()); + } +} diff --git a/src/lexing/tokens/line_terminator.rs b/src/lexing/tokens/line_terminator.rs new file mode 100644 index 0000000..e2d1ddf --- /dev/null +++ b/src/lexing/tokens/line_terminator.rs @@ -0,0 +1,114 @@ +//! +//! ## Line Terminators +//! +//! These signify the end of lines (although techincally [LineTerminatorSequence]s do!) +//! + +use avjason_macros::{verbatim as v, ECMARef, Spanned}; + +use crate::{ + common::Source, + lexing::{Lex, LexError, LexT, SourceStream}, +}; + +#[ECMARef("LineTerminator", "https://262.ecma-international.org/5.1/#sec-7.3")] +#[derive(Debug, Spanned)] +pub enum LineTerminator { + LF(v!('\n')), + CR(v!('\r')), + LS(v!('\u{2028}')), + PS(v!('\u{2029}')), +} + +#[ECMARef( + "LineTerminatorSequence", + "https://262.ecma-international.org/5.1/#sec-7.3" +)] +#[derive(Debug, Spanned)] +pub enum LineTerminatorSequence { + CRLF(v!("\r\n")), + LF(v!('\n')), + CR(v!('\r')), + LS(v!('\u{2028}')), + PS(v!('\u{2029}')), +} + +pub fn is_line_terminator(ch: &char) -> bool { + matches!(ch, '\n' | '\r' | '\u{2028}' | '\u{2029}') +} + +impl LexT for LineTerminator { + fn peek(input: &SourceStream) -> bool { + input.upcoming(is_line_terminator) + } + + fn lex(input: &mut SourceStream) -> Result { + // .unwrap_as_result() ok since we know there's at least one upcoming variant. 
+ Lex::lex(input) + .map(Self::LF) + .or(|| Lex::lex(input).map(Self::CR)) + .or(|| Lex::lex(input).map(Self::LS)) + .or(|| Lex::lex(input).map(Self::PS)) + .unwrap_as_result() + } +} + +impl LexT for LineTerminatorSequence { + fn peek(input: &SourceStream) -> bool { + input.upcoming(is_line_terminator) + } + + fn lex(input: &mut SourceStream) -> Result { + // .unwrap_as_result() ok since we know there's at least one upcoming variant. + Lex::lex(input) + .map(Self::CRLF) + .or(|| Lex::lex(input).map(Self::LF)) + .or(|| Lex::lex(input).map(Self::CR)) + .or(|| Lex::lex(input).map(Self::LS)) + .or(|| Lex::lex(input).map(Self::PS)) + .unwrap_as_result() + } +} + +#[cfg(test)] +mod tests { + use crate::{ + common::{file::SourceFile, Source}, + lexing::{tokens::line_terminator::LineTerminatorSequence, Exactly}, + }; + + use super::LineTerminator; + + #[test] + fn line_terminators() { + let source = SourceFile::dummy_file("\r\n\u{2028}\u{2029}"); + let input = &mut source.stream(); + let new_lines: Exactly<4, LineTerminator> = input.lex().expect("Valid parse"); + assert!(matches!( + &*new_lines, + &[ + LineTerminator::CR(_), + LineTerminator::LF(_), + LineTerminator::LS(_), + LineTerminator::PS(_) + ] + )); + } + + #[test] + fn line_terminator_sequences() { + let source = SourceFile::dummy_file("\r\r\n\n\u{2028}\u{2029}"); + let input = &mut source.stream(); + let new_lines: Exactly<5, LineTerminatorSequence> = input.lex().expect("Valid parse"); + assert!(matches!( + &*new_lines, + &[ + LineTerminatorSequence::CR(_), + LineTerminatorSequence::CRLF(_), + LineTerminatorSequence::LF(_), + LineTerminatorSequence::LS(_), + LineTerminatorSequence::PS(_) + ] + )); + } +} diff --git a/src/lexing/tokens/mod.rs b/src/lexing/tokens/mod.rs new file mode 100644 index 0000000..405d269 --- /dev/null +++ b/src/lexing/tokens/mod.rs @@ -0,0 +1,12 @@ +//! +//! Lexical tokens. +//! 
+ +pub mod comment; +pub mod line_terminator; +pub mod punctuator; +pub mod whitespace; +pub mod number; +pub mod escapes; +pub mod string; +pub mod identifier; diff --git a/src/lexing/tokens/number.rs b/src/lexing/tokens/number.rs new file mode 100644 index 0000000..84eb28a --- /dev/null +++ b/src/lexing/tokens/number.rs @@ -0,0 +1,148 @@ +//! +//! ## Number literals +//! +//! Number tokens like integers, hex integers, and decimals, +//! + +use std::ops::Add; + +use avjason_macros::{verbatim as v, ECMARef, Spanned}; + +use crate::{ + common::{Source, Span}, + lexing::{AtLeast, Exactly, LexError, LexT, SourceStream}, +}; + +#[ECMARef("DecimalDigit", "https://262.ecma-international.org/5.1/#sec-7.8.3")] +pub type DecimalDigit = v!('0'..='9'); + +#[ECMARef("HexDigit", "https://262.ecma-international.org/5.1/#sec-7.8.3")] +#[derive(Debug, Spanned, Clone)] +pub struct HexDigit { + span: Span, + raw: char, +} + +// TODO: Implement Lexical grammar for rest of Number. +// TODO: Implement syntactical grammar. +// TODO: Implement serde integration (+ fancy Spanned) + +// --- + +impl LexT for HexDigit { + fn peek(input: &SourceStream) -> bool { + input.upcoming(char::is_ascii_hexdigit) + } + + fn lex(input: &mut SourceStream) -> Result { + // .unwrap() ok since Self::peek() -> character exists. + let (loc, raw) = input.take().unwrap(); + Ok(Self { + span: Span::from(loc), + raw, + }) + } +} + +// --- + +/// +/// The numerical value of a literal. +/// +/// See the [ECMAScript spec](https://262.ecma-international.org/5.1/#sec-7.8.3). 
+/// +pub trait MathematicalValue { + type Value: Copy + Add; + const BASE: usize; + + fn mv(&self) -> Self::Value; +} + +impl MathematicalValue for DecimalDigit { + type Value = u8; + const BASE: usize = 10; + + fn mv(&self) -> Self::Value { + match self.raw() { + '0' => 0, + '1' => 1, + '2' => 2, + '3' => 3, + '4' => 4, + '5' => 5, + '6' => 6, + '7' => 7, + '8' => 8, + '9' => 9, + _ => unreachable!(), + } + } +} + +impl MathematicalValue for HexDigit { + type Value = u8; + const BASE: usize = 16; + + fn mv(&self) -> Self::Value { + match self.raw { + '0' => 0x0, + '1' => 0x1, + '2' => 0x2, + '3' => 0x3, + '4' => 0x4, + '5' => 0x5, + '6' => 0x6, + '7' => 0x7, + '8' => 0x8, + '9' => 0x9, + 'A' => 0xA, + 'B' => 0xB, + 'C' => 0xC, + 'D' => 0xD, + 'E' => 0xE, + 'F' => 0xF, + 'a' => 0xA, + 'b' => 0xB, + 'c' => 0xC, + 'd' => 0xD, + 'e' => 0xE, + 'f' => 0xF, + _ => unreachable!(), + } + } +} + +impl MathematicalValue for Exactly<2, HexDigit> { + type Value = u8; + const BASE: usize = 16; + + fn mv(&self) -> Self::Value { + self[0].mv() * Self::BASE as u8 + self[1].mv() + } +} + +impl MathematicalValue for Exactly<4, HexDigit> { + type Value = u16; + const BASE: usize = 16; + + fn mv(&self) -> Self::Value { + (self[0].mv() as u16) * (Self::BASE.pow(3) as u16) + + (self[1].mv() as u16) * (Self::BASE.pow(2) as u16) + + (self[2].mv() as u16) * (Self::BASE.pow(1) as u16) + + self[3].mv() as u16 + } +} + +impl MathematicalValue for AtLeast { + type Value = u64; + const BASE: usize = 16; + + fn mv(&self) -> Self::Value { + self.iter() + .map(MathematicalValue::mv) + .map(|mv| mv as u64) + .enumerate() + .map(|(i, v)| v * (Self::BASE.pow(i as u32) as u64)) + .sum() + } +} diff --git a/src/lexing/tokens/punctuator.rs b/src/lexing/tokens/punctuator.rs new file mode 100644 index 0000000..8cfc5dd --- /dev/null +++ b/src/lexing/tokens/punctuator.rs @@ -0,0 +1,108 @@ +//! +//! ## Punctuators +//! +//! Pieces of punctuation: `{}[]:,`. +//! 
+ +use avjason_macros::{verbatim as v, SpecRef}; + +use crate::{ + common::Source, + lexing::{LexError, LexT, SourceStream}, +}; + +/// +/// `{` +/// +pub type OpenBrace = v!('{'); + +/// +/// `}` +/// +pub type CloseBrace = v!('}'); + +/// +/// `[` +/// +pub type OpenBracket = v!('['); + +/// +/// `]` +/// +pub type CloseBracket = v!(']'); + +/// +/// `:` +/// +pub type Colon = v!(':'); + +/// +/// `,` +/// +pub type Comma = v!(','); + +/// +/// `{ } [ ] : ,` +/// +#[SpecRef("JSON5Punctuator")] +pub enum Punctuator { + OpenBrace(OpenBrace), + CloseBrace(CloseBrace), + OpenBracket(OpenBracket), + CloseBracket(CloseBracket), + Colon(Colon), + Comma(Comma), +} + +impl LexT for Punctuator { + fn peek(input: &SourceStream) -> bool { + ::peek(input) + || ::peek(input) + || ::peek(input) + || ::peek(input) + || ::peek(input) + || ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + // .unwrap_as_result() ok since Self::peek() -> one variant present. + input + .lex() + .map(Self::OpenBrace) + .or(|| input.lex().map(Self::CloseBrace)) + .or(|| input.lex().map(Self::OpenBracket)) + .or(|| input.lex().map(Self::CloseBracket)) + .or(|| input.lex().map(Self::Colon)) + .or(|| input.lex().map(Self::Comma)) + .unwrap_as_result() + } +} + +#[cfg(test)] +mod tests { + use crate::{ + common::{file::SourceFile, Source}, + lexing::Exactly, + }; + + use super::Punctuator; + + #[test] + fn mixed_test() { + let source = SourceFile::dummy_file("{}[]:,"); + let input = &mut source.stream(); + let puncts: Exactly<6, Punctuator> = input.lex().expect("Valid parse"); + + assert!(matches!( + &*puncts, + &[ + Punctuator::OpenBrace(_), + Punctuator::CloseBrace(_), + Punctuator::OpenBracket(_), + Punctuator::CloseBracket(_), + Punctuator::Colon(_), + Punctuator::Comma(_) + ] + )) + } +} diff --git a/src/lexing/tokens/string.rs b/src/lexing/tokens/string.rs new file mode 100644 index 0000000..d7b292f --- /dev/null +++ b/src/lexing/tokens/string.rs @@ -0,0 +1,355 @@ +//! +//! 
## String Literals +//! + +use avjason_macros::{verbatim as v, Spanned, SpecRef}; + +use crate::{ + common::{Source, Span}, + lexing::{LexError, LexResult, LexT, Many, SourceStream}, +}; + +use super::{ + escapes::EscapeSequence, + line_terminator::{is_line_terminator, LineTerminatorSequence}, +}; + +/// +/// String literals. +/// +#[SpecRef("JSON5String")] +#[derive(Debug, Spanned)] +pub enum LString { + Double(v!('"'), Many>, v!('"')), + Single(v!('\''), Many>, v!('\'')), +} + +/// +/// All possible parts of a string literal. +/// +#[derive(Debug, Spanned)] +pub enum StringPart { + Char(StringChar), + Escape(v!('\\'), EscapeSequence), + LineContinuation(v!('\\'), LineTerminatorSequence), + LS(v!('\u{2028}')), + PS(v!('\u{2029}')), +} + +/// +/// A non-escaped string character. +/// +/// This represents itself. +/// +#[derive(Debug, Spanned)] +pub struct StringChar { + span: Span, + raw: char, +} + +// --- + +impl LexT for LString { + fn peek(input: &SourceStream) -> bool { + ::peek(input) || ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + input + .lex() + .and(|opening| { + let contents = input.lex()?; + let closing = input.lex().expected_msg(input, "Expected closing `\"`")?; + LexResult::Lexed(Self::Double(opening, contents, closing)) + }) + .or(|| { + input.lex().and(|opening| { + let contents = input.lex()?; + let closing = input.lex().expected_msg(input, "Expected closing `'`")?; + LexResult::Lexed(Self::Single(opening, contents, closing)) + }) + }) + .unwrap_as_result() + } +} + +impl LexT for StringPart { + fn peek(input: &SourceStream) -> bool { + as LexT>::peek(input) + || ::peek(input) + || ::peek(input) + || ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + // Some real nastiness going on here: + // essentially, complex functional-like control flow + // for the \ character to check if either . 
+ + // .unwrap_as_result() ok since Self::peek() + input + .lex() + .map(Self::LS) + .or(|| input.lex().map(Self::PS)) + .or(|| input.lex().map(Self::Char)) + .or(|| { + input.lex().and(|backslash: v!('\\')| { + input + .lex() + .map(|esc| Self::Escape(backslash.clone(), esc)) + .or(|| { + LexResult::Lexed(Self::LineContinuation( + backslash, + input.lex().expected_msg( + input, + "Expected either an escape code here, or newline; got neither.", + )?, + )) + }) + }) + }) + .unwrap_as_result() + } +} + +impl LexT for StringChar { + fn peek(input: &SourceStream) -> bool { + !(input.upcoming(D) || input.upcoming(is_line_terminator) || input.upcoming("\\")) + && input.peek().is_some() + } + + fn lex(input: &mut SourceStream) -> Result { + // .unwrap() ok since Self::peek() -> next character exists. + let (loc, raw) = input.take().unwrap(); + + Ok(Self { + span: Span::from(loc), + raw, + }) + } +} + +// --- + +/// +/// The character value of a part of a string literal, which +/// dictates which character that part represents. +/// +/// See the [ECMAScript spec](https://262.ecma-international.org/5.1/#sec-7.8.4). +/// +pub trait CharacterValue { + /// + /// Encodes the utf-16 based character value into a + /// buffer, returning a slice of the bytes used. + /// + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16]; + + /// + /// Attempts to convert this utf-16 as a Rust char. + /// + fn try_as_char(&self) -> Option { + let buf = &mut [0u16; 2]; + + let mut a = char::decode_utf16(self.cv(buf).iter().copied()); + a.next().and_then(Result::ok) + } +} + +/// +/// The value a string literal represents. +/// +/// See the [ECMAScript spec](https://262.ecma-international.org/5.1/#sec-7.8.4). +/// +pub trait StringValue { + /// + /// Because this is ECMAScript, strings are utf-16 encoded + /// — this will be preserved at this stage. + /// + fn sv(&self) -> Vec; + + /// + /// Workaround for testing only. 
+ /// + #[cfg(test)] + fn to_rust_string_lossy(&self) -> String { + let utf16 = self.sv(); + String::from_utf16_lossy(&utf16) + } +} + +// --- + +impl CharacterValue for StringPart { + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + match self { + StringPart::Char(ch) => ch.cv(buf), + StringPart::Escape(_, esc) => esc.cv(buf), + StringPart::LineContinuation(_, _) => &buf[0..0], // Skip. + StringPart::LS(_) => '\u{2028}'.encode_utf16(buf), + StringPart::PS(_) => '\u{2029}'.encode_utf16(buf), + } + } +} + +impl CharacterValue for StringChar { + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + self.raw.encode_utf16(buf) + } +} + +// --- + +impl StringValue for LString { + fn sv(&self) -> Vec { + match self { + LString::Double(_, contents, _) => contents.sv(), + LString::Single(_, contents, _) => contents.sv(), + } + } +} + +/// +/// Collect character values as a UTF-16 string. +/// +pub fn collect_cv_into_utf16<'a, CV: CharacterValue + 'a>( + iter: impl IntoIterator + 'a, +) -> Vec { + let iter: Vec<_> = iter.into_iter().collect(); + // Complete guesswork about the initial capacity: + // I'm assuming that we're not going to get too many multi-u16 chars. 
+ let mut string = Vec::with_capacity(iter.len() * 5 / 4); + + let buf = &mut [0; 2]; + for part in iter { + string.extend(part.cv(buf)) + } + + string +} + +impl StringValue for Many> { + fn sv(&self) -> Vec { + collect_cv_into_utf16(self.iter()) + } +} +// --- + +#[cfg(test)] +mod tests { + use crate::{ + common::{file::SourceFile, Source}, + lexing::{tokens::string::StringValue, LexResult}, + }; + + use super::LString; + + fn test_string(st: &'static str) -> LexResult { + let source = SourceFile::dummy_file(st); + let input = &mut source.stream(); + input.lex() + } + + #[test] + fn normal_use_case() { + assert_eq!( + test_string(r"'AvdanOS is a community-led open-source project that attempts to implement Avdan\'s \'AvdanOS\' concept as a Wayland compositor.'") + .unwrap().to_rust_string_lossy(), + "AvdanOS is a community-led open-source project that attempts to implement Avdan\'s \'AvdanOS\' concept as a Wayland compositor." + ); + } + + #[test] + fn empty_string() { + assert_eq!(test_string("''").unwrap().to_rust_string_lossy(), ""); + assert_eq!(test_string("\"\"").unwrap().to_rust_string_lossy(), ""); + } + + #[test] + fn escapes() { + let lit = test_string( + r"'\'\\\b\f\n\r\t\v\a\!\£\%\*\&\-\=\💩\0\x20\x26\x25\x3c\u0000\u2AFC\u6798\u1623'", + ) + .expect("Valid parse"); + + assert_eq!( + lit.sv(), + // Answer from JavaScript (Chrome's V8). + vec![ + 39, 92, 8, 12, 10, 13, 9, 11, 97, 33, 163, 37, 42, 38, 45, 61, 55357, 56489, 0, 32, + 38, 37, 60, 0, 11004, 26520, 5667 + ] + ) + } + + #[test] + fn unbalanced_quotes() { + test_string(r"'Think this is unbalanced -- have you seen capitalism?").unwrap_err(); + test_string(r"'They don\'t let dogs in prison, Grommit! 
They\'ll put you down!\'") + .unwrap_err(); + test_string("\"Nothing is more appealing right now than a cup of coffee").unwrap_err(); + test_string("\"Have you heard about the album 'Nervermind'?\\\"").unwrap_err(); + } + + #[test] + fn invalid_escapes() { + test_string(r"'\x2'").unwrap_err(); + test_string(r"'\xSS'").unwrap_err(); + test_string(r"'\uSFAA'").unwrap_err(); + test_string(r"'\u2AA'").unwrap_err(); + + // It turns out that this form of escape is, in fact, octal. + // This is not mentioned in the official ECMAScript spec, + // but is in the optional extensions: Section B.1.2 (https://262.ecma-international.org/5.1/#sec-B.1.2). + + // For example, Node (V8) supports this, but Bun (JavaScriptCore) does not. + // As it is not mentioned whether to comply with optional extensions, + // this crate will not be implementing octal syntax. + test_string(r"'\1'").unwrap_err(); + } + + /// + /// Random series of u16's interpreted as + /// string literals, with the utf-16 value + /// compared to V8's answer.
+ /// + #[test] + #[allow(text_direction_codepoint_in_literal)] + fn fuzzing() { + assert_eq!( + test_string(r"'䂞ᤴ쭜ؚ洲綏뤒힓蔫黮뱏꽻ꜵ킩악\x19젏◣愜ꏟ醟㾊䑥뷜筵읩ꡓむ髇阏⍉딴퓼됪璮轫ʢ톽觻䀫ꮳ玐耠綈亄宅坍♳ꯑ\uDBCD㇀甚渭￐㛓魴矮︄跕鹞㉋᪽ꎓ鸩먾汕䱏쏀㘓씩㕟챬ᆀ瓅㫱భd瀒峊ツꮫ뀥靺㉏ꙓⷳᨾ짽ꑙΥפ肜혃ᐜ恴婁⛫╴䰛⾁\x9A䚠댂䜡ૢ¦ꊠ⧽랸儔根햩쫹輤Ȫ䜭ﺆᬒ偠⊽Ṑ敇봅¨팔檵\uDBB9Գ౓ถ啼摚㿓껠͛躏湜㵬褤쵐㽴䒦迼\uD933ᛳ뵁楻뤣璻㰒\uDB11疲ᆐ腻抐즲ଉ灮鷋䝡밶ꛃ\uDF4Br⯝ଆ㷍工좭澏挣\uDC83◘语开劊椢䀐럵갿懼嗵⊫ꑬ縭郁얱仁༅ⷬ垉₍荌ﵙ䭿⦤牐詌撸উ崙\uDE8E荓畨ꯔᇤ垯蠐⏧쨁▏賈⇜欁ꓕ⍎讷∥㫲画鴶醎迚崴쭹짲교뎈噍⽚\uDFB8냅㥤射'") + .expect("Valid parse").sv(), + vec![ + 16542, 6452, 52060, 1562, 27954, 32143, 47378, 55187, 34091, 40686, 48207, 44923, 42805, 53417, 50501, 25, 51215, 9699, 62034, 24860, 41951, 37279, 16266, 57468, 17509, 57866, 48604, 31605, 51049, 43091, 12416, 39623, 38415, 9033, 58372, 46388, 54524, 46122, 29870, 36715, 674, 53693, 35323, 16427, 43955, 29584, 32800, 32136, 20100, 64004, 60979, 22349, 9843, 58280, 43985, 56269, 12736, 29978, 28205, 65488, 14035, 39796, 60797, 30702, 65028, 36309, 40542, 12875, 6845, 41875, 40489, 61197, 47678, 27733, 19535, 50112, 13843, 50473, 13663, 52332, 4480, 29893, 15089, 3117, 100, 28690, 23754, 65410, 43947, 45093, 38778, 12879, 42579, 61708, 11763, 6718, 51709, 42073, 933, 1508, 32924, 54787, 5148, 24692, 23105, 9963, 9588, 19483, 12161, 154, 18080, 45826, 18209, 2786, 65508, 41632, 10749, 47032, 20756, 26681, 54697, 51961, 57412, 36644, 554, 18221, 65158, 6930, 20576, 62494, 8893, 7760, 25927, 48389, 168, 58805, 54036, 27317, 56249, 1331, 3155, 3606, 21884, 25690, 16339, 44768, 859, 36495, 28252, 15724, 35108, 52560, 16244, 61134, 17574, 36860, 55603, 5875, 48449, 27003, 47395, 29883, 15378, 56081, 61909, 30130, 4496, 33147, 61001, 25232, 51634, 2825, 28782, 58861, 40395, 18273, 48182, 42691, 57163, 114, 11229, 2822, 15821, 24037, 60822, 51373, 28559, 25379, 62890, 56451, 9688, 35821, 59961, 24320, 21130, 26914, 16400, 47093, 44095, 25084, 22005, 8875, 42092, 32301, 37057, 50609, 20161, 3845, 11756, 22409, 8333, 33612, 64857, 19327, 10660, 29264, 35404, 25784, 2441, 23833, 58025, 58894, 56974, 33619, 61599, 30056, 43988, 
4580, 22447, 34832, 9191, 51713, 9615, 36040, 8668, 27393, 42197, 9038, 35767, 8741, 15090, 64163, 40246, 37262, 36826, 23860, 52089, 51698, 44368, 45960, 22093, 12122, 57272, 45253, 14692, 23556 + ] + ); + + assert_eq!( + test_string(r"'秚놰ꚋ⾏<給齌걿괔鍺江ﬧ䭑钣ᆲ茊琳株໶杴칽\uDCB1渾⭮ⶕ墢啐渍홦䳹艘紕혺镨쾋冻喢喚䣳㙤봽级邒ថ\uD9B8ោ䋀껄䦐椴⎨譴꽲沺᷆롥ᗐ赙쿰⸲᪘꿲鏸帠梯튋궳 ㌦땭ӂ咶鞝卓硄뷬䫾ୢ蘪ク㉃᯲ຮ೚⃊̽詁ꓔ㴺뮢׳Թ尀塠鶈퟾뷊娈鶍х㍣铽렑轨ߵꧮ㒉콳$ꖃ붟섈⟃ᰫ턖\uDAABLꄅ\uDEE2鰔程륡㩜旎ᢛ᜴휫澜䬁쾘྾퍂畐囃꺴ነ泴얽㤢瀊Ⱃྡྷ뙷輇ዉаᅦ㠮㚾졲揿䠭஍磡༛논렺鵠篩㣴셑拨튮ꈌἛ隸눙埊㙺겓셀꠱♌\uDD7E䂼귘檚홗誚͔ꦣ锴ߓ\uDB03匷䏄膟鿕僥粡塕ꎟ宗彲댙䈹⟚ད軵픣㇅燺盰籞睻䋫얨♶኶車\uDBD1젔䖬⬓࣌㺓ྂ꤯⽊᫖ᚋ焹￲甃ꇢ뛉芀ฑ訾蔾\uD96C捈㮙཯㜄'") + .expect("Valid parse").sv(), + vec![31194, 45488, 62680, 42635, 12175, 65308, 32102, 40780, 44159, 44308, 37754, 27743, 64295, 19281, 38051, 65452, 33546, 29747, 26666, 3830, 26484, 52861, 56497, 28222, 11118, 11669, 22690, 21840, 28173, 54886, 19705, 58159, 33368, 32021, 54842, 59331, 62622, 38248, 53131, 20923, 21922, 21914, 18675, 13924, 48445, 61070, 32423, 37010, 6032, 55736, 6084, 17088, 44740, 18832, 26932, 9128, 35700, 61865, 44914, 27834, 7622, 47205, 5584, 36185, 53232, 11826, 6808, 45042, 37880, 24096, 26799, 53899, 44467, 12288, 13094, 46445, 1218, 21686, 38813, 21331, 30788, 59530, 61378, 58739, 48620, 19198, 2914, 34346, 12463, 12867, 7154, 3758, 3290, 8394, 829, 35393, 42196, 15674, 48034, 1523, 1337, 23552, 22624, 40328, 55294, 48586, 23048, 40333, 63659, 1093, 13155, 38141, 47121, 36712, 2037, 43502, 13449, 53107, 36, 42371, 48543, 49416, 10179, 7211, 53526, 55979, 65324, 41221, 57058, 39956, 31243, 47457, 14940, 26062, 6299, 5940, 55083, 28572, 19201, 53144, 4030, 54082, 30032, 22211, 44724, 4752, 27892, 59533, 50621, 14626, 28682, 11283, 4002, 46711, 36615, 4809, 1072, 65479, 14382, 14014, 51314, 25599, 18477, 2957, 30945, 3867, 45436, 47162, 40288, 31721, 14580, 49489, 25320, 53934, 41484, 7963, 38584, 45593, 22474, 13946, 44179, 57572, 49472, 43057, 63050, 9804, 62475, 56702, 16572, 44504, 27290, 57604, 54871, 35482, 852, 43427, 38196, 2003, 56067, 21303, 17348, 33183, 40917, 20709, 31905, 22613, 41887, 23447, 
24434, 45849, 60174, 16953, 10202, 3921, 36597, 60241, 54563, 12741, 29178, 30448, 31838, 30587, 17131, 58558, 50600, 9846, 4790, 63746, 56273, 51220, 17836, 11027, 2252, 16019, 3970, 43311, 12106, 59874, 6870, 5771, 28985, 57731, 65522, 29955, 41442, 57924, 46793, 33408, 3601, 35390, 34110, 55660, 25416, 15257, 3951, 14084 + ] + ); + + assert_eq!( + test_string(r"'ᐇ➢ᷙ榃훳휆ꅦ欥㒎ᩀஒ䧓㼿\uDBBE䍷ख़ꔬ쳩呍ꑼ᧡譶䮿뽕ꙴ뢪촗㲪袹쟓Ὴ棅捈批쟹砛▟즣㎜펒巵ꚓ꜐Ꞝ톘ᅿ㣓䐩籮晤饳堓䋤੡㇪ᾚ厤秲猪絓ꡨ俛붷継㤕᠍䌖\uDEDA砇₈㴹牙뛞ꃤ˕ඟ蚍醚픦먜ἺṴ茫뚯﹤唟풰섙碁젋졂∽赞摖隆걑쩒柀瞛擧获㺟染ሏ᧻사汋迪셨㸹嵂䤬闄䏇䒘㎓뻑ꭊ圹衁끇ᮓ빪耔怮⺇䳐묃䅻׫磼脉ò᷾姰佄鶕붬ጛ갲祔奔㖔⣪℄蝱靦ꆯꜮ궻씍ﹶ쑿鞾轪⠱胼螓멣栟跦沭⊾夏尃먗㲳瀆ퟎ콆攂喉ㄻ嶳鸹䉭뾐铥䤰漘뉦ᅭ䐨钞薑涐⹾쏾䏔蝶フ⯐ຒ藧㒴緽듹⇒㛎明黩㛳氓梛辽\uD850㣞\uDD33鼚暤梅㧊Ҩᩰ圄찅甦信矆誐ոꖚ冐䞭㹳㹆鰑'") + .expect("Valid parse").sv(), + vec![59730, 5127, 10146, 7641, 59098, 27011, 55027, 55046, 41318, 27429, 13454, 6720, 2962, 63467, 18899, 16191, 56254, 17271, 2393, 42284, 52457, 21581, 42108, 6625, 63025, 35702, 59558, 19391, 48981, 42612, 47274, 52503, 15530, 35001, 51155, 8138, 26821, 25416, 25209, 51193, 59246, 30747, 9631, 51619, 13212, 54162, 24053, 42643, 42768, 42908, 53656, 4479, 14547, 17449, 31854, 26212, 39283, 22547, 17124, 2657, 60174, 12778, 8090, 21412, 31218, 29482, 32083, 63500, 43112, 20443, 48567, 32153, 14613, 6157, 17174, 58187, 57050, 30727, 8328, 15673, 29273, 46814, 41188, 725, 3487, 34445, 37274, 54566, 47644, 7994, 7796, 33579, 46767, 65124, 21791, 54448, 49433, 30849, 51211, 51266, 8765, 63507, 36190, 25686, 38534, 44113, 51794, 26560, 30619, 58556, 25831, 33719, 16031, 61982, 26579, 4623, 6651, 57722, 49324, 27723, 36842, 49512, 15929, 23874, 58228, 18732, 38340, 17351, 17560, 13203, 48849, 43850, 61644, 22329, 34881, 45127, 7059, 48746, 32788, 24622, 11911, 19664, 47875, 16763, 1515, 30972, 33033, 242, 7678, 23024, 20292, 40341, 48556, 4891, 44082, 31060, 22868, 13716, 10474, 8452, 34673, 60035, 38758, 41391, 62827, 42798, 44475, 63743, 58786, 50445, 62643, 65142, 50303, 38846, 36714, 10289, 58929, 33020, 34707, 47715, 26655, 36326, 27821, 62815, 8894, 22799, 23555, 47639, 60265, 15539, 28678, 
58432, 55246, 53062, 25858, 60646, 21897, 60958, 12603, 23987, 40505, 17005, 49040, 38117, 18736, 28440, 45670, 4461, 17448, 38046, 34193, 28048, 11902, 50174, 17364, 34678, 12501, 11216, 3730, 34279, 13492, 32253, 46329, 8658, 60048, 14030, 26126, 40681, 14067, 27667, 61088, 26779, 36797, 55376, 14558, 56627, 40730, 60610, 26276, 26757, 14794, 1192, 6768, 22276, 52229, 29990, 20449, 30662, 35472, 1400, 42394, 20880, 18349, 15987, 15942, 39953 + ] + ); + + assert_eq!( + test_string(r"'祐䇛珈䣏둫䠽㩅⏇ᗊꥷ⛙寎杅똦儣桴糎絪㋢雳쑢㡟ⓘ譏笜穘ᎏ난ᡂᣕ䯹嗔楗鏯⼺㌨떟ሎỬ⵹䪋౿⸬ה\uDCAE釉萳阪櫒洈宀駅뻍슘ᴘ錱⎝ᓛ堼䲃㖭鸜鸍\uDABC掰カ픸⑘佫䔻樟嗌軓\x83喋瀛䙳峦튬酥㫑䔶␱씿芩鵙䗲衜賈\uDF4B㋋颡쩾敯侥㰟ᱍሇ笿뭦ۑ؄ࡾ갆쌨嬓ꑌ⼮犥䏧擌臤ꋪ꡿↊됰㏞讐റᲱ篤ⴓꚹ菙䪦엇⡗袦嵻嶬捡쭇䙑婫⏱韲흛⌠ꊜ볲緙덕㣔鍸暐䄧뭝鳴ᙇ莯覧⑩쿿벹⦠紈ۃ戎쥔븗ꍏ桝\uDCD8໗坊圻賈꧙볰ꁣ칏ᄩ\uDC7F삃爞겕虵䡏έ홰⸱焸왵⒗㚪좵Ϗ훱熞䗳၇ェ죳ਙ\uDE73\uD8E8汃ᚠ鏮恹⺐죾ﮒ툔퐬ઇﵔ촶⊄෈᬴늮\uDE3F䣓攙蘿儠ූആ䞔ꄶ亏㘝迬'") + .expect("Valid parse").sv(), + vec![31056, 57578, 16859, 29640, 18639, 58428, 46187, 18493, 14917, 9159, 5578, 43383, 9945, 23502, 26437, 46630, 20771, 26740, 31950, 32106, 59267, 13026, 38643, 59963, 58248, 50274, 14431, 9432, 35663, 31516, 31320, 5007, 45212, 6210, 6357, 63623, 19449, 60661, 21972, 26967, 37871, 12090, 58152, 13096, 46495, 62016, 4622, 7916, 11641, 19083, 3199, 11820, 1492, 56494, 37321, 33843, 38442, 27346, 27912, 23424, 39365, 48845, 49816, 7448, 37681, 9117, 5339, 22588, 19587, 58498, 13741, 40476, 40461, 55996, 25520, 62609, 65398, 54584, 9304, 20331, 57805, 17723, 27167, 61366, 57712, 57423, 21964, 36563, 131, 21899, 28699, 18035, 23782, 53932, 37221, 61779, 15057, 17718, 9265, 50495, 33449, 40281, 59399, 17906, 59923, 34908, 36040, 61662, 57163, 13003, 39073, 51838, 25967, 20389, 15391, 7245, 60883, 4615, 31551, 47974, 1745, 1540, 2174, 44038, 49960, 58601, 23315, 42060, 12078, 29349, 17383, 25804, 33252, 41706, 43135, 8586, 46128, 13278, 35728, 3377, 57603, 7345, 31716, 11539, 42681, 33753, 19110, 50631, 10327, 34982, 23931, 60456, 23980, 25441, 52039, 63224, 18001, 23147, 9201, 38898, 55131, 8992, 58355, 41628, 48370, 32217, 45909, 14548, 37752, 26256, 16679, 
57404, 47965, 40180, 5703, 33711, 35239, 9321, 53247, 48313, 10656, 32008, 1731, 25102, 51540, 48663, 41807, 26717, 56536, 3799, 22346, 22331, 63747, 43481, 48368, 41059, 52815, 58274, 4393, 56447, 49283, 29214, 44181, 34421, 18511, 8051, 62021, 54896, 11825, 28984, 50805, 9367, 13994, 51381, 975, 55025, 57379, 29086, 17907, 4167, 12455, 51443, 2585, 56947, 55528, 27715, 5792, 59015, 37870, 24697, 11920, 51454, 64402, 53780, 54316, 2695, 64852, 52534, 8836, 3528, 6964, 45742, 56895, 18643, 62500, 25881, 63760, 20768, 3542, 3334, 18324, 41270, 20111, 13853, 61106, 36844 + ] + ); + + // This one broke clippy, because text changes directions halfway through, + // but we don't care about that! + assert_eq!( + test_string(r"'䊄㨍䕇㉆鹹䤑謲虉喙帺⫮૚謤㵳骼뜜ᳪﱞ䀅ߢ兾ỷ煡鼱뚹ꕖ䜻\uDC9F終蚔㏼뫨軗쯰붰줓城鱃膫⌶틧ﲔ醛㹣䳵踠圆귚ᇟ赒ᡘ浚預鿹ᘓฑ圲肋ꕬ჆㓘륳쌫텮厬攞ᕇ䮽ꢗ牴쫚굣篁ж怏娈뭑싒樞ጡ矡鸉퉱㾼러⁩⨥ቭ桅做휚࠿멞㓧\uDA80䷸㠻ご砕紭䞏玆䪗ৰﰸ斺㯈璏ﶔꃙ剧뇗ވ㥋༣咨喘벷긳닅厒ᆻ唣퓽뾖跴퉈ㄳ⟵⼚셅쒱輎ᾟ笴㗸䩽\uDF42吓ꅘ軟᢭褶欲෗᪸蠬騻ꥼ籎䋾âꙷನ䮲蹼㗘㞑\uDD45ꦪ쮝乳頇ᘜ智⊴Ꟙ䍹♀뷿짿ꍵᲜ촾㉂냰騞Ҥ晲駀牵揄䤸䆳၅뿐䧨箧곟὚㫽揖쬨繥쨉딭㶲쁉㝓Հ濘⑙࢟鸀兊\uD881ꪨ줢ꁳ㎥哕ヅ䳓רּ㴤됈֕倬詗ʿ깗憭㼟᜗꭛욙⎅⑴ឮ窗'") + .expect("Valid parse").sv(), + vec![ 17028, 14861, 17735, 12870, 40569, 18705, 35634, 34377, 21913, 24122, 10990, 2778, 35620, 61404, 15731, 63376, 39612, 46876, 7402, 64606, 16389, 2018, 20862, 7927, 29025, 40753, 46777, 42326, 18235, 56479, 32066, 61027, 34452, 13308, 47848, 36567, 52208, 48560, 51475, 22478, 40003, 33195, 57595, 9014, 53991, 64660, 37275, 15971, 61616, 19701, 36384, 22278, 57883, 44506, 4575, 36178, 58627, 6232, 27994, 38928, 40953, 5651, 3601, 22322, 32907, 57580, 42348, 4294, 13528, 47475, 61491, 49963, 53614, 21420, 25886, 5447, 19389, 43159, 29300, 51930, 44387, 31681, 1078, 24591, 23048, 47953, 49874, 60619, 27166, 4897, 30689, 40457, 53873, 16316, 61713, 47084, 8297, 10789, 61059, 4717, 26693, 20570, 55066, 2111, 47710, 13543, 55936, 57772, 19960, 57799, 14395, 12372, 30741, 32045, 18319, 29574, 19095, 58510, 2544, 64568, 57946, 26042, 15304, 58185, 29839, 64916, 41177, 21095, 45527, 1928, 14667, 3875, 21672, 63164, 21912, 48311, 44595, 
45765, 21394, 4539, 21795, 54525, 49046, 36340, 53832, 12595, 10229, 12058, 49477, 50353, 36622, 8095, 31540, 13816, 19069, 57154, 21523, 41304, 36575, 60038, 6317, 35126, 27442, 3543, 6840, 34860, 39483, 58493, 43388, 31822, 17150, 226, 42615, 3240, 19378, 36476, 13784, 14225, 56645, 43434, 52125, 20083, 38919, 5660, 26234, 8884, 42968, 17273, 9792, 48639, 51711, 41845, 7324, 52542, 12866, 45296, 39454, 1188, 26226, 39360, 29301, 64141, 18744, 57736, 16819, 4165, 62390, 59943, 49104, 18920, 31655, 44255, 8026, 15101, 25558, 52008, 32357, 51721, 46381, 15794, 49225, 14163, 1344, 28632, 9305, 2207, 40448, 60855, 61830, 20810, 55425, 43688, 57670, 51490, 41075, 13221, 21717, 12485, 19667, 64328, 15652, 46088, 1429, 20524, 35415, 703, 44631, 25005, 16159, 5911, 43867, 50841, 9093, 9332, 6062, 31383] + ); + } +} diff --git a/src/lexing/tokens/whitespace.rs b/src/lexing/tokens/whitespace.rs new file mode 100644 index 0000000..fc94b70 --- /dev/null +++ b/src/lexing/tokens/whitespace.rs @@ -0,0 +1,65 @@ +//! +//! ## Whitespace +//! +//! Empty space that doesn't contribute syntactically. +//! + +use avjason_macros::{ECMARef, Spanned}; +use finl_unicode::categories::{CharacterCategories, MinorCategory}; + +use crate::{ + common::{Source, Span}, + lexing::{LexError, LexT, SourceStream}, +}; + +/// +/// Whitespace characters. +/// +#[derive(Debug, Spanned)] +#[ECMARef("WhiteSpace", "https://262.ecma-international.org/5.1/#sec-7.2")] +pub struct WhiteSpace { + span: Span, +} + +/// +/// Is this character whitespace? +/// +/// Compliant with [Table 2, Section 7.2](https://262.ecma-international.org/5.1/#sec-7.2) of the ECMAScript specification. 
+/// +fn is_whitespace(ch: &char) -> bool { + use MinorCategory::Zs; + + match ch { + '\u{0009}' | '\u{000B}' | '\u{000C}' | '\u{0020}' | '\u{00A0}' | '\u{FEFF}' => true, + c if matches!(c.get_minor_category(), Zs) => true, + _ => false, + } +} + +impl LexT for WhiteSpace { + fn peek(input: &SourceStream) -> bool { + input.upcoming(is_whitespace) + } + + fn lex(input: &mut SourceStream) -> Result { + // Since Self::peek() -> there's at least one character. + let (span, _) = input.take_while(is_whitespace).unwrap(); + Ok(Self { span }) + } +} + +#[cfg(test)] +mod tests { + use crate::common::{file::SourceFile, Source}; + + use super::WhiteSpace; + + #[test] + fn lex_whitespace() { + let ws = "\t\t \t\t\u{000B}\u{000C}"; + let source = SourceFile::dummy_file(ws); + let input = &mut source.stream(); + let whitespace: WhiteSpace = input.lex().expect("Valid parse"); + assert_eq!(source.source_at(whitespace), Some(ws.to_string())) + } +} diff --git a/src/lexing/utils/lex_impls.rs b/src/lexing/utils/lex_impls.rs new file mode 100644 index 0000000..0cc3ce4 --- /dev/null +++ b/src/lexing/utils/lex_impls.rs @@ -0,0 +1,152 @@ +//! +//! Utility implementations for [Lex]. +//! + +use std::{ + marker::PhantomData, + ops::{Deref, DerefMut}, +}; + +use crate::common::{Source, Span, SpanIter, Spanned}; + +use super::{Lex, LexResult, LexT, Peek, SourceStream}; + +/// +/// Many (possibly one or zero) of `L`-tokens. +/// +pub type Many = Vec; + +impl Lex for Many { + fn peek(_: &SourceStream) -> Peek { + Peek::Possible(PhantomData::) + } + + fn lex(input: &mut SourceStream) -> LexResult { + let mut v = vec![]; + + loop { + let res = ::lex(input); + match res { + LexResult::Lexed(lexed) => v.push(lexed), + LexResult::Errant(errant) => return LexResult::Errant(errant), + LexResult::Nothing => break, + } + } + + LexResult::Lexed(v) + } +} + +impl Spanned for Many { + fn span(&self) -> Span { + SpanIter::combine(self.iter().map(S::span)) + } +} + +/// +/// At least `N` lots of `L`-tokens. 
+/// +#[derive(Debug, Clone)] +pub struct AtLeast(Vec); + +impl Lex for AtLeast { + fn peek(input: &SourceStream) -> Peek { + if N == 0 { + return Peek::Possible(PhantomData::); + } + + ::peek(input).map() + } + + fn lex(input: &mut SourceStream) -> LexResult { + let many: Many = Lex::lex(input)?; + + if many.len() < N { + return LexResult::Errant(input.error(format!( + "Expected at least {N} {} tokens: got {}.", + std::any::type_name::(), + many.len(), + ))); + } + + LexResult::Lexed(Self(many)) + } +} + +impl Spanned for AtLeast { + fn span(&self) -> Span { + SpanIter::combine(self.iter().map(S::span)) + } +} + +impl Deref for AtLeast { + type Target = Vec; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for AtLeast { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +/// +/// Exactly `N` lots of `L`-tokens: no more, no less. +/// +#[derive(Debug, Clone)] +pub struct Exactly([L; N]) +where + [(); N]: Sized; + +impl Lex for Exactly +where + [(); N]: Sized, +{ + fn peek(input: &SourceStream) -> Peek { + if N == 0 { + return Peek::Possible(PhantomData::); + } + + ::peek(input).map() + } + + fn lex(input: &mut SourceStream) -> LexResult { + let many: Many = Lex::lex(input)?; + + if many.len() != N { + return LexResult::Errant(input.error(format!( + "Expected {N} {} tokens: got {}.", + std::any::type_name::(), + many.len() + ))); + } + + // SAFETY: Just checked the length, so unwrap okay. 
let many: [L; N] = unsafe { many.try_into().unwrap_unchecked() }; + + LexResult::Lexed(Self(many)) + } +} + +impl Spanned for Exactly { + fn span(&self) -> Span { + SpanIter::combine(self.iter().map(S::span)) + } +} + +impl Deref for Exactly { + type Target = [L; N]; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for Exactly { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} diff --git a/src/lexing/utils/mod.rs b/src/lexing/utils/mod.rs new file mode 100644 index 0000000..53c55dc --- /dev/null +++ b/src/lexing/utils/mod.rs @@ -0,0 +1,86 @@ +//! +//! Utilities for lexing. +//! + +pub mod lex_impls; +pub mod peek; +pub mod result; +pub mod stream; +pub mod verbatim; +pub mod unicode; + +use std::marker::PhantomData; + +use crate::common::Source; + +pub use self::{ + lex_impls::{AtLeast, Exactly, Many}, + peek::Peek, + result::{LexError, LexResult}, + stream::SourceStream, +}; + +/// +/// For internal use. +/// +#[doc(hidden)] +pub trait LexT: Sized { + /// + /// Checks to see if this token is possibly upcoming. + /// + fn peek(input: &SourceStream) -> bool; + +  /// + /// Given that the token is potentially present, + /// start lexing. + /// + /// This function has guaranteed side-effects on the input [SourceStream] (advancing it). + /// + fn lex(input: &mut SourceStream) -> Result; +} + +/// +/// Operations on lexical tokens: +/// * Lexing, +/// * Peeking +/// +pub trait Lex: Sized { + /// + /// Checks if this token is potentially present, + /// which can then be lexed further. + /// + fn peek(input: &SourceStream) -> Peek; + +  /// + /// Returns a [LexResult] with either: + /// * a valid token [LexResult::Lexed], + /// * [LexResult::Nothing] (token not present), + /// * or [LexResult::Errant] (spanned error). + /// + fn lex(input: &mut SourceStream) -> LexResult; +} + +/// +/// The public-facing implementation.
+/// +impl Lex for L { + #[inline] + fn peek(input: &SourceStream) -> Peek { + // Forward to internal impl, then make proper [Peek] + // enum variant. + match ::peek(input) { + true => Peek::Possible(PhantomData::), + false => Peek::Absent, + } + } + + /// + /// Returns a [LexResult] with either: + /// * a valid token [LexResult::Lexed], + /// * [LexResult::Nothing] (token not present), + /// * or [LexResult::Errant] (spanned error). + /// + fn lex(input: &mut SourceStream) -> LexResult { + ::peek(input).then_lex(input) + } +} diff --git a/src/lexing/utils/peek.rs b/src/lexing/utils/peek.rs new file mode 100644 index 0000000..23c81ee --- /dev/null +++ b/src/lexing/utils/peek.rs @@ -0,0 +1,38 @@ +//! +//! Peeking for lexical tokens. +//! + +use std::marker::PhantomData; + +use crate::common::Source; + +use super::{LexResult, LexT, SourceStream}; + +/// +/// Result of a peek, either: +/// * Possibly present, +/// * or not. +/// +pub enum Peek { + Possible(PhantomData), + Absent, +} + +impl Peek { + pub fn then_lex(self, input: &mut SourceStream) -> LexResult { + match self { + Peek::Possible(_) => match LexT::lex(input) { + Ok(lexed) => LexResult::Lexed(lexed), + Err(errant) => LexResult::Errant(errant), + }, + Peek::Absent => LexResult::Nothing, + } + } + + pub fn map(self) -> Peek { + match self { + Peek::Possible(_) => Peek::Possible(PhantomData::), + Peek::Absent => Peek::Absent, + } + } +} diff --git a/src/lexing/utils/result.rs b/src/lexing/utils/result.rs new file mode 100644 index 0000000..063808a --- /dev/null +++ b/src/lexing/utils/result.rs @@ -0,0 +1,239 @@ +use std::{ + any::type_name, + convert::Infallible, + fmt::Debug, + ops::{ControlFlow, FromResidual, Try}, +}; + +use avjason_macros::Spanned; + +use crate::common::{Source, Span, Spanned}; + +use super::SourceStream; + +#[derive(Debug, Spanned)] +pub struct LexError { + span: Span, + message: String, +} + +impl LexError { + pub fn new(span: &impl Spanned, message: impl ToString) -> Self { + Self { + 
span: span.span(), + message: message.to_string(), + } + } +} + +impl<'a, S: Source> SourceStream<'a, S> { + /// + /// Make a new error at the stream's current location. + /// + pub fn error(&self, msg: impl ToString) -> LexError { + LexError::new(self, msg) + } +} + +/// +/// The rust of attempting parse token `L` +/// from a [SourceStream]. +/// +#[derive(Debug)] +pub enum LexResult { + /// + /// Valid `L` token. + /// + Lexed(L), + + /// + /// An attempt was made to parse an `L` token, + /// but the input did not fully abide by `L`'s lexical grammar. + /// + Errant(LexError), + + /// + /// The token `L` was not found, + /// so the parsing was skipped. + /// + Nothing, +} + +impl LexResult { + /// + /// Allegory of [Result::expect] + /// + pub fn expect(self, msg: impl ToString) -> L { + match self { + LexResult::Lexed(lexed) => lexed, + LexResult::Errant(errant) => panic!("{}: {errant:?}", msg.to_string()), + LexResult::Nothing => panic!("{}: on LexResult::Nothing", msg.to_string()), + } + } + + /// + /// Allegory of [Result::unwrap] + /// + pub fn unwrap(self) -> L { + match self { + LexResult::Lexed(lexed) => lexed, + LexResult::Errant(errant) => { + panic!("called `LexResult::unwrap()` on an `Errant` value: {errant:?}") + } + LexResult::Nothing => panic!("called `LexResult::unwrap()` on a `Nothing` value"), + } + } + + /// + /// Allegory of [Result::unwrap_err] + /// + pub fn unwrap_err(self) -> LexError + where + L: Debug, + { + match self { + LexResult::Lexed(lexed) => { + panic!("called `LexResult::unwrap()` on an `Lexed` value: {lexed:?}") + } + LexResult::Errant(errant) => errant, + LexResult::Nothing => panic!("called `LexResult::unwrap_err()` on a `Nothing` value"), + } + } + + /// + /// Turn this into a normal Rust [Result], + /// panicking if this is a [LexResult::Nothing]. 
+ /// + pub fn unwrap_as_result(self) -> Result { + match self { + LexResult::Lexed(lexed) => Ok(lexed), + LexResult::Errant(errant) => Err(errant), + LexResult::Nothing => panic!("Called `LexResult::into_result()` on a Nothing value."), + } + } + + /// + /// Is this [LexResult::Errant]? + /// + pub fn is_errant(&self) -> bool { + matches!(self, Self::Errant(_)) + } + + /// + /// Is this [LexResult::Lexed]? + /// + pub fn is_lexed(&self) -> bool { + matches!(self, Self::Lexed(_)) + } + + /// + /// Is this [LexResult::Nothing]? + /// + pub fn is_nothing(&self) -> bool { + matches!(self, Self::Nothing) + } + + /// + /// Allegory of [Result::map]. + /// + /// If this is [LexResult::Lexed], the mapper function will be called, + /// and then its return type will be re-wrapped. + /// + pub fn map T>(self, mapper: F) -> LexResult { + match self { + LexResult::Lexed(lexed) => LexResult::Lexed(mapper(lexed)), + LexResult::Errant(errant) => LexResult::Errant(errant), + LexResult::Nothing => LexResult::Nothing, + } + } + + /// + /// If this is [LexResult::Nothing], execute the `or` function instead, + /// and return its result. + /// + /// This allows for chaining of results, which may be useful + /// in lexing enums with different variants. + /// + pub fn or Self>(self, or: F) -> Self { + match self { + s @ LexResult::Lexed(_) => s, + s @ LexResult::Errant(_) => s, + LexResult::Nothing => or(), + } + } + + /// + /// Allegory of [Result::and_then]. + /// + /// If this is [LexResult::Lexed], the mapper function will be called, + /// and its return value is returned. + /// + pub fn and LexResult>(self, mapper: F) -> LexResult { + match self { + LexResult::Lexed(lexed) => mapper(lexed), + LexResult::Errant(errant) => LexResult::Errant(errant), + LexResult::Nothing => LexResult::Nothing, + } + } + + /// + /// Require this potential token to be present, not [LexResult::Nothing] or [LexResult::Errant]. 
+ /// + /// If this is [LexResult::Nothing], make this into a [LexResult::Errant] + /// with the message "expected a {$`L`} token". + /// + pub fn expected(self, input: &SourceStream) -> Self { + match self { + s @ LexResult::Lexed(_) => s, + s @ LexResult::Errant(_) => s, + LexResult::Nothing => LexResult::Errant(LexError { + span: input.span(), + message: format!("Expected a {} token here.", type_name::()), + }), + } + } + + /// + /// Require this potential token to be present, not [LexResult::Nothing] or [LexResult::Errant]. + /// + /// If this is [LexResult::Nothing], display the custom message. + /// + pub fn expected_msg(self, input: &SourceStream, msg: impl ToString) -> Self { + match self { + s @ LexResult::Lexed(_) => s, + s @ LexResult::Errant(_) => s, + LexResult::Nothing => LexResult::Errant(LexError { + span: input.span(), + message: msg.to_string(), + }), + } + } +} + +impl Try for LexResult { + type Output = L; + + type Residual = LexResult; + + fn from_output(output: Self::Output) -> Self { + Self::Lexed(output) + } + + fn branch(self) -> ControlFlow { + match self { + LexResult::Lexed(lexed) => ControlFlow::Continue(lexed), + LexResult::Errant(errant) => ControlFlow::Break(LexResult::Errant(errant)), + LexResult::Nothing => ControlFlow::Break(LexResult::Nothing), + } + } +} + +impl FromResidual for LexResult { + fn from_residual(residual: ::Residual) -> Self { + match residual { + LexResult::Lexed(_) => unreachable!(), + LexResult::Errant(errant) => LexResult::Errant(errant), + LexResult::Nothing => LexResult::Nothing, + } + } +} diff --git a/src/lexing/utils/stream.rs b/src/lexing/utils/stream.rs new file mode 100644 index 0000000..419168d --- /dev/null +++ b/src/lexing/utils/stream.rs @@ -0,0 +1,194 @@ +use std::marker::ConstParamTy; + +use crate::common::{Loc, Source, Span, Spanned, ToSpan}; + +use super::{Lex, LexResult}; + +/// +/// Things that [SourceStream] can +/// check are coming up. 
+/// +pub trait Lookahead { + fn upcoming(self, input: &SourceStream) -> bool; +} + +impl<'a> Lookahead for &'a str { + fn upcoming(self, input: &SourceStream) -> bool { + let chars = self.chars().collect::>(); + input + .source + .characters() + .get(input.index..(input.index + chars.len())) + .map(|st| st == chars) + .unwrap_or(false) + } +} + +impl bool> Lookahead for F { + fn upcoming(self, input: &SourceStream) -> bool { + input.peek().map(self).unwrap_or(false) + } +} + +/// +/// A const-friendly implementation of [std::ops::Range]\<char>. +/// +/// This works with the [crate::verbatim] macro to support +/// the range syntax: `v!('0'..='9')`. +/// +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] +pub struct CharacterRange { + /// + /// Inclusive start. + /// + pub start: char, + + /// + /// Exclusive end. + /// + pub end: char, +} + +impl ConstParamTy for CharacterRange {} + +impl<'a> Lookahead for &'a CharacterRange { + fn upcoming(self, input: &SourceStream) -> bool { + input + .source + .characters() + .get(input.index) + .map(|ch| (self.start..self.end).contains(ch)) + .unwrap_or(false) + } +} + +/// +/// Represents a stream of input characters, +/// which can attempted to be lexed into tokens. +/// +#[derive(Debug, Clone)] +pub struct SourceStream<'a, S: Source> { + index: usize, + source: &'a S, +} + +impl<'a, S: Source> SourceStream<'a, S> { + /// + /// Create a new stream from a source. + /// + pub fn new(source: &'a S) -> Self { + Self { index: 0, source } + } + + /// + /// Returns the source where this [SourceStream] + /// came from. + /// + pub fn source(&self) -> &S { + self.source + } + + /// + /// Take the next character in this [SourceStream]. 
+ /// + pub fn take(&mut self) -> Option<(Loc, char)> { + let start = self.index; + + if let Some(ch) = self.source.characters().get(self.index) { + self.index += 1; + return Some((Loc(start), *ch)); + } + + None + } + + /// + /// Take characters in this [SourceStream] whilst they + /// satisfy some predicate. + /// + pub fn take_while(&mut self, pred: impl Fn(&char) -> bool) -> Option<(Span, Vec)> { + let start = self.index; + let mut chars = vec![]; + while let Some(ch) = self.source.characters().get(self.index) { + if !pred(ch) { + break; + } + + chars.push(*ch); + self.index += 1; + } + + if chars.is_empty() { + return None; + } + + Some(((start..self.index).to_span(self.source), chars)) + } + + /// + /// Take characters in this [SourceStream] until + /// the precdicate return true. + /// + pub fn take_until(&mut self, pred: impl Fn(&Self) -> bool) -> Option<(Span, Vec)> { + let start = self.index; + let mut chars = vec![]; + while let Some(ch) = self.source.characters().get(self.index) { + if pred(self) { + break; + } + + chars.push(*ch); + self.index += 1; + } + + if chars.is_empty() { + return None; + } + + Some(((start..self.index).to_span(self.source), chars)) + } + + /// + /// Attempt to lex for token `L`. + /// + pub fn lex(&mut self) -> LexResult { + Lex::lex(self) + } + + /// + /// Checks if a lookahead pattern is next in the stream. + /// + pub fn upcoming(&self, lookahead: L) -> bool { + lookahead.upcoming(self) + } + + /// + /// Peeks at the next upcoming character. + /// + pub fn peek(&self) -> Option<&char> { + self.source.characters().get(self.index) + } + + /// + /// Peeks at the (0-based) n-th next upcoming character. + /// + pub fn peek_n(&self, n: usize) -> Option<&char> { + self.source.characters().get(self.index + n) + } + + /// + /// Returns the unlexed portion of this stream — what's next. + /// + pub fn left(&self) -> Option { + self.source + .characters() + .get(self.index..) 
+ .map(|s| s.iter().collect()) + } +} + +impl<'a, S: Source> Spanned for SourceStream<'a, S> { + fn span(&self) -> Span { + (self.index..=self.index).to_span(self.source) + } +} diff --git a/src/lexing/utils/unicode.rs b/src/lexing/utils/unicode.rs new file mode 100644 index 0000000..b12278e --- /dev/null +++ b/src/lexing/utils/unicode.rs @@ -0,0 +1,274 @@ +//! +//! Allows for capturing different unicode groups. +//! +//! This is a stupid hack because at the moment, +//! ConstParamTy is not auto-implemented, +//! so [finl_unicode::categories::MinorCategory] doesn't implement it; +//! meaning we must do it nastily. + +use std::marker::ConstParamTy; + +use avjason_macros::Spanned; +use finl_unicode::categories::CharacterCategories; + +use crate::{ + common::{Source, Span}, + lexing::tokens::string::CharacterValue, +}; + +use super::{LexError, LexT, SourceStream}; + +/// +/// Looks for a character in any of the +/// unicode major categories supplied as a const parameter. +/// +/// *** +/// +/// **Do not use me directly, use [crate::unicode] instead!** +/// +#[derive(Debug, Spanned, Clone)] +pub struct MatchMajorCategory { + span: Span, + raw: char, +} + +/// +/// Looks for a character in any of the +/// unicode minor categories supplied as a const parameter. 
+/// +/// *** +/// +/// **Do not use me directly, use [crate::unicode] instead!** +/// +#[derive(Debug, Spanned, Clone)] +pub struct MatchMinorCategory { + span: Span, + raw: char, +} + +// --- + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum MajorCategory { + /// Letter + L, + /// Mark + M, + /// Number + N, + /// Punctuation + P, + /// Symbol + S, + /// Separator + Z, + /// Other character + C, +} + +#[doc(hidden)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum MinorCategory { + /// Uppercase letter + Lu, + /// Lowercase letter + Ll, + /// Titlecase letter + Lt, + /// Modifier letter + Lm, + /// Other letter + Lo, + /// Non-spacing mark + Mn, + /// Spacing mark + Mc, + /// Enclosing mark + Me, + /// Decimal number + Nd, + /// Letterlike number + Nl, + /// Other number + No, + /// Connector punctuation + Pc, + /// Dash punctuation + Pd, + /// Opening punctuation + Ps, + /// Closing punctuation + Pe, + /// Initial punctuation + Pi, + /// Final punctuation + Pf, + /// Other punctuation + Po, + /// Math symbol + Sm, + /// Modifier symbol + Sk, + /// Currency symbol + Sc, + /// Other symbol + So, + /// Space separator + Zs, + /// Line separator + Zl, + /// Paragraph separator + Zp, + /// Control character + Cc, + /// Format character + Cf, + /// Private use character + Co, + /// Unassigned character + Cn, +} + +// --- + +impl LexT for MatchMajorCategory { + fn peek(input: &SourceStream) -> bool { + input + .peek() + .map(|ch| { + let cat = ch.get_major_category(); + C.iter().any(|major| &cat == major) + }) + .unwrap_or(false) + } + + fn lex(input: &mut SourceStream) -> Result { + // .unwrap() ok since Self::peek() -> next character exists. 
+ let (loc, raw) = input.take().unwrap(); + Ok(Self { + span: Span::from(loc), + raw, + }) + } +} + +impl LexT for MatchMinorCategory { + fn peek(input: &SourceStream) -> bool { + input + .peek() + .map(|ch| { + let cat = ch.get_minor_category(); + C.iter().any(|major| &cat == major) + }) + .unwrap_or(false) + } + + fn lex(input: &mut SourceStream) -> Result { + // .unwrap() ok since Self::peek() -> next character exists. + let (loc, raw) = input.take().unwrap(); + Ok(Self { + span: Span::from(loc), + raw, + }) + } +} + +// --- + +impl CharacterValue for MatchMajorCategory { + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + self.raw.encode_utf16(buf) + } +} + +impl CharacterValue for MatchMinorCategory { + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + self.raw.encode_utf16(buf) + } +} + +// --- + +impl ConstParamTy for MajorCategory {} +impl ConstParamTy for MinorCategory {} + +impl From for finl_unicode::categories::MajorCategory { + fn from(value: MajorCategory) -> Self { + match value { + MajorCategory::L => Self::L, + MajorCategory::M => Self::M, + MajorCategory::N => Self::N, + MajorCategory::P => Self::P, + MajorCategory::S => Self::S, + MajorCategory::Z => Self::Z, + MajorCategory::C => Self::C, + } + } +} + +impl From for finl_unicode::categories::MinorCategory { + fn from(value: MinorCategory) -> Self { + match value { + MinorCategory::Lu => Self::Lu, + MinorCategory::Ll => Self::Ll, + MinorCategory::Lt => Self::Lt, + MinorCategory::Lm => Self::Lm, + MinorCategory::Lo => Self::Lo, + MinorCategory::Mn => Self::Mn, + MinorCategory::Mc => Self::Mc, + MinorCategory::Me => Self::Me, + MinorCategory::Nd => Self::Nd, + MinorCategory::Nl => Self::Nl, + MinorCategory::No => Self::No, + MinorCategory::Pc => Self::Pc, + MinorCategory::Pd => Self::Pd, + MinorCategory::Ps => Self::Ps, + MinorCategory::Pe => Self::Pe, + MinorCategory::Pi => Self::Pi, + MinorCategory::Pf => Self::Pf, + MinorCategory::Po => Self::Po, + 
MinorCategory::Sm => Self::Sm, + MinorCategory::Sk => Self::Sk, + MinorCategory::Sc => Self::Sc, + MinorCategory::So => Self::So, + MinorCategory::Zs => Self::Zs, + MinorCategory::Zl => Self::Zl, + MinorCategory::Zp => Self::Zp, + MinorCategory::Cc => Self::Cc, + MinorCategory::Cf => Self::Cf, + MinorCategory::Co => Self::Co, + MinorCategory::Cn => Self::Cn, + } + } +} + +impl PartialEq for finl_unicode::categories::MajorCategory { + fn eq(&self, other: &MajorCategory) -> bool { + Self::from(*other).eq(self) + } +} + +impl PartialEq for finl_unicode::categories::MinorCategory { + fn eq(&self, other: &MinorCategory) -> bool { + Self::from(*other).eq(self) + } +} + +#[cfg(test)] +mod tests { + use avjason_macros::unicode; + + use crate::{ + common::{file::SourceFile, Source}, + lexing::Many, + }; + + type Letter = unicode!(Lu | Ll); + + #[test] + fn test_lex() { + let source = SourceFile::dummy_file("Apples"); + let input = &mut source.stream(); + let _: Many = input.lex().expect("Valid parse"); + } +} diff --git a/src/lexing/utils/verbatim.rs b/src/lexing/utils/verbatim.rs new file mode 100644 index 0000000..da68431 --- /dev/null +++ b/src/lexing/utils/verbatim.rs @@ -0,0 +1,139 @@ +//! +//! Pattern matching helpers. +//! + +use avjason_macros::Spanned; + +use crate::common::{Source, Span, SpanIter}; + +use crate::lexing::{CharacterRange, LexError, LexT, SourceStream}; + +/// +/// Looks for a particular string in input. 
+/// +/// *** +/// +/// **Do not use me directly, use [crate::verbatim] instead!** +/// +#[derive(Debug, Spanned, Clone)] +pub struct Verbatim { + span: Span, +} + +impl Verbatim { + fn char_length() -> usize { + A.chars().count() + } +} + +impl LexT for Verbatim { + fn peek(input: &SourceStream) -> bool { + input.upcoming(A) + } + + fn lex(input: &mut SourceStream) -> Result { + let mut locs = vec![]; + + for _ in 0..Self::char_length() { + let (loc, _) = input.take().unwrap(); + locs.push(Span::from(loc)); + } + + Ok(Self { + // If A == "", then an empty Span is returned. + span: locs.into_iter().combine(), + }) + } +} + +/// +/// Matches a character with a given range. +/// +/// *** +/// +/// **Do not use me directly, use [crate::verbatim] instead!** +/// +#[derive(Debug, Spanned)] +pub struct CharPattern { + raw: char, + span: Span, +} + +impl CharPattern { + pub fn raw(&self) -> &char { + &self.raw + } +} + +impl LexT for CharPattern { + fn peek(input: &SourceStream) -> bool { + input.upcoming(&R) + } + + fn lex(input: &mut SourceStream) -> Result { + let (loc, raw) = input.take().unwrap(); + Ok(Self { + raw, + span: Span::from(loc), + }) + } +} + +#[cfg(test)] +mod tests { + use avjason_macros::verbatim as v; + + use crate::{ + common::{file::SourceFile, Source}, + lexing::{ + utils::{stream::CharacterRange, Many}, + CharPattern, + }, + }; + + use super::Verbatim; + + #[test] + fn verbatim() { + let source = SourceFile::dummy_file(",."); + let input = &mut source.stream(); + let _: Verbatim<","> = input.lex().expect("Valid parse"); + } + + #[test] + fn ranged() { + const DIGIT: CharacterRange = CharacterRange { + start: '0', + end: ':', + }; + + let source = SourceFile::dummy_file("126439012363421890"); + let input = &mut source.stream(); + let _: Many> = input.lex().expect("Valid parse"); + } + + #[test] + fn verbatim_macro_test() { + type Comma = v!(','); + type DoubleColon = v!("::"); + type Digit = v!('0'..='9'); + + { + let source = 
SourceFile::dummy_file(","); + let input = &mut source.stream(); + let _: Comma = input.lex().expect("Valid parse"); + } + + { + let source = SourceFile::dummy_file("::"); + let input = &mut source.stream(); + let _: DoubleColon = input.lex().expect("Valid parse"); + } + + { + let source = SourceFile::dummy_file("126439012363421890"); + let input = &mut source.stream(); + let _: Many = input.lex().expect("Valid parse"); + } + } +} diff --git a/src/lib.rs b/src/lib.rs index aee46cb..f83ba51 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,68 @@ //! //! ## AvJason //! > A child of the [AvdanOS](https://github.com/Avdan-OS) project. -//! +//! //! A parser for [JSON5](https://json5.org/). -//! \ No newline at end of file +//! +//! ## Why? +//! This crate provides a very important function: traceability. +//! ### Traceability +//! This allows for line-column data to be preserved so that further +//! processing can benefit from spanned errors, which tell the end +//! user *where* the error happened. +//! 
+ +// This will have to be removed to solve #5: +#![allow(incomplete_features)] +#![feature(adt_const_params, try_trait_v2)] + +pub mod common; +pub mod lexing; + +pub(crate) use avjason_macros::*; + +mod macro_test { + use std::marker::PhantomData; + + use super::{ECMARef, Spanned, SpecRef}; + + use crate::common::Span; + + #[SpecRef("Identifier", "JSON5Identifier")] + #[allow(unused)] + struct Identifier; + + #[SpecRef("JSON5Null")] + #[allow(unused)] + struct Null; + + #[ECMARef("BooleanLiteral", "https://262.ecma-international.org/5.1/#sec-7.8.2")] + #[allow(unused)] + struct LitBool; + + #[derive(Spanned)] + struct True(Span); + + #[derive(Spanned)] + struct False { + span: Span, + ghost: PhantomData, + } + + #[derive(Spanned)] + struct Is { + span: Span, + } + + #[derive(Spanned)] + struct IsTrue(Is, True); + + #[derive(Spanned)] + #[allow(unused)] + enum Boolean { + True(True), + False(False), + Both(True, False), + Complex { truthy: True, falsey: False }, + } +}