diff --git a/Cargo.lock b/Cargo.lock index d34974e3..1a691416 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -388,6 +388,14 @@ dependencies = [ "serde_json", ] +[[package]] +name = "doodle-pdf" +version = "0.1.0" +dependencies = [ + "doodle", + "doodle-formats", +] + [[package]] name = "doodle_gencode" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index dc36b8c2..0be0d22f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = [".", "generated/", "doodle-formats/"] +members = [".", "generated/", "doodle-formats/", "experiments/doodle-pdf"] [package] name = "doodle" diff --git a/Daedalus.md b/Daedalus.md new file mode 100644 index 00000000..121ec194 --- /dev/null +++ b/Daedalus.md @@ -0,0 +1,266 @@ +# Daedalus Notes + +## 3.1 (Primitive Parsers) + +* `^` is achievable through `compute` +* `Fail ` requires a refactor we have discussed but never implemented + * This could be a modification of `Fail` itself, or a wrapper using `StyleHint` +* We don't have a notion of character classes for the parser `$[c]`, but we could emulate this with `ByteSet` constants +* The closest thing to `Match ` is `is_bytes`, which stores an N-Tuple ([issue #260](https://github.com/yeslogic/doodle/issues/260) discusses this) + +## 3.2 (Sequential Composition) + +The syntax `{ P1; ...; Pn }` can be emulated with `LetFormat`/`MonadSeq`. 
+ +### Example Translations + +```ddl +def Add2 = + block + let x = BEUInt64 + let y = BEUInt64 + ^ x + y + +def StatementSemi = + block + $$ = Statement + $[';'] +``` + +```rust +let add2 = + chain( + base.u64be(), "x", + chain( + base.u64be(), + "y", + compute(add(var("x"), var("y"))) + ) + ); + +let statement_semi = + chain( + statement, // defined elsewhere, + "ret", + monad_seq( + is_byte(b';'), + compute(var("ret")) + ) + ); +``` + +## 3.3 (Parallel Composition) + +There is currently no way of performing *unbiased* composition in `doodle`; all parallel compositions +are first-come-first-served and will bias towards the first non-error result. + +`<|` is therefore supported, while `|` is not. + +## 3.4 (Repetition) + +* Kleene Star - `Many P` is just `Repeat`, while `Many (1..) P` is `Repeat1` +* Kleene Star with State - At least some cases of `many (x = s) P` can be emulated with `Map(Repeat, LeftFold)`, while `AccumUntil` might be usable in other cases; there may be cases where neither is applicable, in which case a more bespoke `Format` may be required. +* `map (k,v in c) P` and `for (x = s; k,v in c) P` could be emulated with `ForEach`, at least in certain instances. + +## 3.5 (Branching) + +* `case-of` (and, by extension, `if-then-else`) appear one-to-one with `Match` + +## 3.6 (Coercions) + +* There are no format-level types per-se, but various `Expr`s like `AsU64` and family accomplish much-the-same, albeit in a closed class rather than with a constructive syntax. + * `as?` (dynamic safe coercion) is closest to what we have, since `AsU{8,16,32,64}`/`AsChar` are runtime-checked; we currently have nothing of the sort of `as` (static safe) or `as!` (static lossy) coercion + +## 5.1 (Bitdata) + +Through helpers like `bit_fields_u8` and so forth, which can be defined as-needed, we have a plausible +analogue to the `bitdata` declarations in Daedalus. + +However, the current implementation of `BitFieldKind` is somewhat restrictive, in the following ways, compared to `bitdata`: + +* It does not support type-coercions (e.g. u8 packed in a u16) +* It does not support fixed-bits checking other than all-zero + +These are features that could be added with various caveats, if necessary. 
+ +### Examples + +```ddl +bitdata Packed where + x: uint 8 + 0xFFFF + y: uint 8 + +bitdata Uni where + value = { get: Packed } + null = 0 +``` + +```rust +use doodle::helper::BitFieldKind::*; + +let packed = bit_fields_u32([ // <- this is not yet defined + BitsField { field_name: "x", bit_width: 8 }, + Reserved { bit_width: 16, check_zero: false }, + BitsField { field_name: "y", bit_width: 8 }, +]); + +let uni = union_nondet([ + ("null", is_bytes(&[0; 4])), + ("value", packed), +]); +``` + +This is more of a parse-level directive than a data-type declaration, however, +as while in Daedalus the two are implicitly specified with the same declaration, +in `doodle` the data-type is a synthetic implication of the parse declaration, +and cannot be used in coercions; for that, we would need a separate declaration +of a dependent `u32 -?-> Packed` computation that could then be fed in arbitrary +arguments to interpret as `Packed`. + +```rust +use doodle::helper::BitFieldKind::*; +let as_packed = module.define_format_args( + "Packed-Coerce", + [(Label::Borrowed("raw"), ValueType::Base(BaseType::U32))], + cast_u32_bit_fields( // <- also not defined, but furthermore has no archetype + var("raw"), + [ + BitsField { field_name: "x", bit_width: 8 }, + Reserved { bit_width: 16, check_zero: false }, + BitsField { field_name: "y", bit_width: 8 }, + ] + ) +); +``` + +## 5.2 (Automatic ADT Synthesis) + +We have no first-class types in the specification language of `doodle`, and all +types are implied through synthesis over the declared formats and expressions. +As a result, type-ascriptions are syntactically unavailable. 
+ +Even currently, we can still at least ensure that two parsers have mutually +compatible types, by defining a declaration-check marker-format that we run +through type-checking but discard afterwards: + +```rust +let point = module.define_format("types.point", record([("x", base.u8()), ("y", base.u8())])); +let point_x = module.define_format("types.point_x", record([("x", base.u8()), ("y", is_byte(0))])); + +let __type_proof = module.define_format( + "__TYPE_PROOF", + monad_seq( + union([point.call(), point_x.call()]), + /* we can sequence other type-compatibility assertion-formats here as well */ + Format::EMPTY, + ) +); +``` + +Because every format needs a reified type for the module to be usable, but these +type-ascriptions need not be a bijection, there would be no implicitly-defined +alias `type PointX = Point` as would be synthesized by the corresponding +Daedalus declarations; instead, whichever type-name is preferred would win, and +both formats would receive verbatim-identical type-ascriptions. + +While tagged unions in general are supportable, the example given for tagged +unions cannot be constructed in `doodle` because of a lack of support for +auto-recursive and mutually-recursive format-constructs. Implementing these is +not *a priori* impossible, but would require a noticeable investment of effort +into a design to support this, which would most notably require a +termination-rule for otherwise infinitely-recursive type-checking. + +## 6 (Lookahead and Stream Manipulation) + +The concept of a `Stream` is not first-class within the `Format` model of `doodle`, +though there are various combinators that interact with it. 
+ +* `GetStream` does not obviously have a one-to-one equivalent in `doodle`, though constructs that use it may be replicable in other ways +* `Drop n s` does not properly exist as a first-class construction but can be emulated with some degree of ingenuity +* `Take n s` itself is not quite analogous to anything in `doodle` +* `SetStream` does not properly exist as a first-class construction but can be emulated with some degree of ingenuity +* `Chunk n P` is equivalent to `Slice` +* `Lookahead P` is equivalent to `Peek` +* `WithStream` can be emulated using `DecodeBytes` up to a certain degree, where if the parser itself is responsible for determining where the stream ends, there may be issues in capturing the stream into a suitable buffer. + +### Example + +```ddl +block + let base = GetStream + Many block + let offset = Word64 + let here = GetStream + SetStream (Drop offset base) + $$ = ParseObject + SetStream here +``` + +```rust +chain( + Format::Pos, + "base", + repeat( + chain(base.u64be(), "offset", + with_relative_offset(Some(var("base")), var("offset"), parse_object) + ) + ) +) +``` + +## 7 (Eager vs. Lazy) + +There is currently no support, at any layer, for parse-level laziness in the +`doodle` processing model; there is some value-level laziness involving +constructed sequences, but that is more of a representation-level optimization +than a feature of the processing model, and aside from performance concerns, +nothing would change if it were eliminated. + +Multiple paths cannot be explored in parallel, both for unbounded/indefinite-length +repetitions as well as for more explicit alternations over N branches. There are currently only two places where nondeterministic unions are used: + +* At the top-level, for alternating between distinct data-formats (e.g. png, OpenType, gzip); and secondly, to allow fallback to uninterpreted bytes during speculative +parsing of zlib-compressed UTF-8 in an `iTXt` PNG-chunk. 
+ +The latter usage is more of a band-aid against unwanted parse-failure, and due +to the limitations of the error-propagation model whereby high-confidence +partial-parses of a given top-format are nevertheless rejected altogether when +even a single, possibly trivial component encounters uncaught parse-failure. +Aside from mitigation within the operational model that would allow +determinations such as 'correct format, malformed data', there are ways to get +around this locally without adjusting the model, by using a separate construct +from `UnionNondet` to avoid coupling one form of speculative parsing to the only +version of something like that in the current implementation of `doodle`: + +```rust +fn try_with_fallback(f0: Format, f1: Format) -> Format { + TryWithFallback(Box::new(f0), Box::new(f1)) +} + +/* .. */ + +let zlib_utf8txt_or_raw = + try_with_fallback( + fmt_variant("valid", zlib_utf8text), + fmt_variant("invalid", repeat(base.u8())), + ); +``` + +This can be used in a broader sense, as a more generic 'permit local failure gracefully' +construct: + +```rust +fn permit_fail(f: Format, dummy: Expr) -> Format { + TryWithFallback(Box::new(f), Box::new(compute(dummy))) +} + +// vvv Usage Patterns vvv +fn opt_success(f: Format) -> Format { + permit_fail(fmt_some(f), expr_none()) +} + +fn try_or_invalid(f: Format) -> Format { + permit_fail(fmt_variant("valid", f), Expr::Variant("invalid".into(), Expr::UNIT)) +} +``` diff --git a/doodle-formats/src/format/png.rs b/doodle-formats/src/format/png.rs index 80ba7e8b..c578b375 100644 --- a/doodle-formats/src/format/png.rs +++ b/doodle-formats/src/format/png.rs @@ -466,7 +466,7 @@ pub fn main( module.define_format( "png.main", record_auto([ - ("__signature", is_bytes(PNG_SIGNATURE)), + ("signature", byte_seq(PNG_SIGNATURE)), ("ihdr", ihdr.call()), ("chunks", repeat(png_chunk.call_args(vec![var("ihdr")]))), ( diff --git a/experiments/doodle-pdf/Cargo.toml b/experiments/doodle-pdf/Cargo.toml new file mode 100644 
index 00000000..87e700b0 --- /dev/null +++ b/experiments/doodle-pdf/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "doodle-pdf" +version = "0.1.0" +edition = "2024" + +[dependencies] +doodle = { path = "../../" } +doodle-formats = { path = "../../doodle-formats/" } diff --git a/experiments/doodle-pdf/src/main.rs b/experiments/doodle-pdf/src/main.rs new file mode 100644 index 00000000..701b2a50 --- /dev/null +++ b/experiments/doodle-pdf/src/main.rs @@ -0,0 +1,5 @@ +mod pdf; + +fn main() { + println!("Hello, world!"); +} diff --git a/experiments/doodle-pdf/src/pdf.rs b/experiments/doodle-pdf/src/pdf.rs new file mode 100644 index 00000000..4f9bf934 --- /dev/null +++ b/experiments/doodle-pdf/src/pdf.rs @@ -0,0 +1,357 @@ +use std::ops::Range; + +use doodle::{ + BaseType, Expr, Format, FormatModule, FormatRef, IntoLabel, Label, Pattern, ValueType, + helper::*, prelude::ByteSet, +}; +use doodle_formats::format::base::BaseModule; + +fn hex_val() -> Format { + const HEX09: Range = b'0'..b'9'; + const HEX_MAJ: Range = b'A'..b'F'; + const HEX_MIN: Range = b'a'..b'f'; + + union([ + map(byte_in(HEX09), lambda("b", sub(var("b"), Expr::U8(b'0')))), + map( + byte_in(HEX_MIN), + lambda("b", sub(var("b"), Expr::U8(b'a' - 10))), + ), + map( + byte_in(HEX_MAJ), + lambda("b", sub(var("b"), Expr::U8(b'A' - 10))), + ), + ]) +} + +fn record_daedalus>( + fields: impl IntoIterator, +) -> Format { + let mut uniq_ix: Option = None; + let mut fields_ext = Vec::new(); + for (_ix, (label, format)) in fields.into_iter().enumerate() { + let field = match label.as_ref() { + _nil if _nil.is_empty() || _nil.starts_with("__") => (None, format), + _tmp if _tmp.starts_with("_") || _tmp.starts_with("@") => { + (Some((Label::from(_tmp.to_owned()), true)), format) + } + uniq if uniq.starts_with("$$") => match uniq_ix.replace(_ix) { + None => { + // patch all previous fields to be temporary + for (fld_info, _) in fields_ext.iter_mut() { + if let Some((_, is_persist)) = fld_info { + *is_persist = false; + } + 
} + (Some((uniq.replace("$", "x").into(), true)), format) + } + Some(old) => { + unreachable!("cannot have more than one '$$' field: {} != {}", old, _ix) + } + }, + other => (Some((other.to_owned().into(), uniq_ix.is_none())), format), + }; + fields_ext.push(field) + } + record_ext(fields_ext, true) +} + +fn hex_num_u32(len: usize) -> Format { + let raw = repeat_count(Expr::U32(len as u32), hex_val()); + map( + raw, + lambda( + "raw", + left_fold( + lambda_tuple(["acc", "b"], add(var("b"), mul(var("acc"), Expr::U32(16)))), + Expr::U32(0), + ValueType::Base(BaseType::U32), + var("raw"), + ), + ), + ) +} + +fn interpret(assocs: impl IntoIterator) -> Format { + union(assocs.into_iter().map(|(f, e)| monad_seq(f, compute(e)))) +} + +/// Greedily matches the given format, skipping any bytes that don't match. +fn skip_to(_f: Format) -> Format { + // FIXME - we need a combinator for this to avoid infinite recursion while constructing the Format + // Format::UnionNondet(vec![ + // f.clone(), + // monad_seq(Format::ANY_BYTE, skip_to(f)), + // ]) + todo!() +} + +/// Parse `f` and ensure that there is no trailing content +fn only(f: Format) -> Format { + chain(f, "val", monad_seq(Format::EndOfInput, compute(var("val")))) +} + +/// Runs a parser and discards its value, returning `Format::EMPTY` +fn void(f: Format) -> Format { + monad_seq(f, Format::EMPTY) +} + +/// Matches a string (as bytes) exactly, and returns it as a Seq(u8) +fn match_str(bytes: &[u8]) -> Format { + byte_seq(bytes) +} + +const ASCII_LF: u8 = b'\n'; +const ASCII_CR: u8 = b'\r'; + +const ASCII_NUL: u8 = b'\0'; +const ASCII_TAB: u8 = b'\t'; +const ASCII_FF: u8 = b'\x0c'; +const ASCII_SPACE: u8 = b' '; + +const __WS_MASK: u64 = 1u64 | 1 << ASCII_TAB | 1 << ASCII_FF | 1 << ASCII_SPACE; +const SIMPLE_WS: ByteSet = ByteSet::from_bits([__WS_MASK, 0, 0, 0]); + +const DIGIT: ByteSet = ByteSet::from_bits([0b11_1111_1111 << b'0', 0, 0, 0]); + +pub fn main(module: &mut FormatModule, base: &BaseModule) -> FormatRef { + { 
+ let _simple_ws = ByteSet::from([b'\0', b'\t', 0x0C, b' ']); + debug_assert_eq!(_simple_ws, SIMPLE_WS); + + let _digit = ByteSet::from(b'0'..=b'9'); + debug_assert_eq!(_digit, DIGIT); + } + + let inline_char = Format::Byte(ByteSet::from([ASCII_LF, ASCII_CR]).complement()); + + let simple_eol = module.define_format( + "pdf.util.simple_eol", + Format::UnionNondet(vec![ + void(is_bytes(&[ASCII_CR, ASCII_LF])), + void(is_byte(ASCII_LF)), + ]), + ); + + let eol = module.define_format( + "pdf.util.eol", + Format::UnionNondet(vec![simple_eol.call(), void(is_byte(ASCII_CR))]), + ); + + let then_eol = + |format: Format| chain(format, "ret", monad_seq(eol.call(), compute(var("ret")))); + + let comment = module.define_format( + "pdf.util.comment", + monad_seq(is_byte(b'%'), monad_seq(repeat(inline_char), eol.call())), + ); + + let any_ws = module.define_format( + "pdf.util.any_ws", + Format::UnionNondet(vec![ + void(byte_in(SIMPLE_WS)), // $simpleWS + comment.call(), // Comment + eol.call(), // EOL + ]), + ); + + let token = |f: Format| { + chain( + f, + "tok", + monad_seq(repeat(any_ws.call()), compute(var("tok"))), + ) + }; + + let kw = |b: &[u8]| token(match_str(b)); + + let between = |open: &[u8], close: &[u8], f: Format| { + monad_seq( + kw(open), + chain(f, "val", monad_seq(kw(close), compute(var("val")))), + ) + }; + + let natural = module.define_format("pdf.util.natural", repeat1(Format::Byte(DIGIT))); + + let unsigned_lead_digits = chain( + natural.call(), + "n", + Format::UnionNondet(vec![ + chain( + cons(is_byte(b'.'), repeat(Format::Byte(DIGIT))), + "frac", + compute(append(var("n"), var("frac"))), + ), + compute(var("n")), + ]), + ); + + // TODO - add support to allow de-stringification of numbers + let unsigned_number = module.define_format( + "pdf.util.unsigned_number", + Format::UnionNondet(vec![ + unsigned_lead_digits, + cons(is_byte(b'.'), repeat1(Format::Byte(DIGIT))), + ]), + ); + + // TODO: add support to allow de-stringification of numbers + let number 
= module.define_format( + "pdf.util.number", + union([ + monad_seq( + is_byte(b'-'), + map( + unsigned_number.call(), + lambda( + "digits", + concat(Expr::Seq(vec![ + Expr::Seq(vec![as_char(Expr::U8(b'-'))]), + var("digits"), + ])), + ), + ), + ), + monad_seq(optional(is_byte(b'+')), unsigned_number.call()), + ]), + ); + + let v_null = void(kw(b"null")); + let v_bool = interpret([ + (kw(b"true"), Expr::Bool(true)), + (kw(b"false"), Expr::Bool(false)), + ]); + let v_ref = record_auto([ + ("obj", token(natural.call())), + ("gen", token(natural.call())), + ("__kw", kw(b"R")), + ]); + let name_char = { + let ok_raw = ByteSet::from(*b"\0\t\n\x0C\r ()<>[]{}/%#").complement(); + let name_esc = monad_seq( + is_byte(b'#'), + where_nonzero::(map(hex_num_u32(2), lambda("x32", as_u8(var("x32"))))), + ); + union([Format::Byte(ok_raw), name_esc]) + }; + let v_name = module.define_format( + "pdf.value.name", + token(cons(is_byte(b'/'), repeat(name_char))), + ); + let v_string = { + let string_in_parens = module.define_format_args( + "pdf.value.string_in_parens", + vec![(Label::Borrowed("lim"), ValueType::Base(BaseType::U64))], + if_then_else( + expr_eq(var("lim"), Expr::U64(0)), + Format::Fail, // "string nesting limit exceeded" + snoc(cons(match_str(b"("), todo!()), match_str(b")")), // TODO: support mutual recursion + ), + ); + + let string_esc = todo!(); + + let string_chunk = module.define_format_args( + "pdf.value.string_chunk", + vec![(Label::Borrowed("lim"), ValueType::Base(BaseType::U64))], + Format::UnionNondet(vec![ + string_in_parens.call_args(vec![var("lim")]), + string_esc, + repeat1(byte_in(ByteSet::from(*b"\\()").complement())), + ]), + ); + + let string_chars = module.define_format_args( + "pdf.value.string_chars", + vec![(Label::Borrowed("lim"), ValueType::Base(BaseType::U64))], + map( + repeat(string_chunk.call_args(vec![var("lim")])), + lambda("xs", flat_map(lambda("x", var("x")), var("xs"))), + ), + ); + + record_daedalus([ + ("__open", is_byte(b'(')), + 
("$$", string_chars.call_args(vec![Expr::U64(16)])), + ("__close", is_byte(b')')), + ("__skipWS", repeat(any_ws.call())), + ]) + }; + let v_hex_string = { todo!() }; + let pdf_value = module.define_format( + "pdf.value", + union_nondet([ + ("null", v_null), + ("bool", v_bool), + ("ref", v_ref), + ("name", v_name.call()), + ("string", v_string), + ("string", v_hex_string), + ]), + ); + let pdf_value_type = module.get_format_type(pdf_value.get_level()).clone(); + let stream: FormatRef = todo!("implement Stream (val : Value)"); + let top_decl_def = module.define_format_args( + "pdf.top_decl", + vec![(Label::Borrowed("val"), pdf_value_type)], + record([ + ("stream", stream.call_args(vec![var("val")])), + ("value", compute(var("val"))), + ]), + ); + let obj_decl = module.define_format( + "pdf.object", + record_auto([ + ("id", token(natural.call())), + ("gen", token(natural.call())), + ("__kw_obj", kw(b"obj")), + ("_val", pdf_value.call()), + ("obj", top_decl_def.call_args(vec![var("_val")])), + ("__kw_end", kw(b"endobj")), + ]), + ); + let cross_ref_section: FormatRef = todo!(); + let pdf_dict = { + let dict = module.define_format_args( + "pdf.util.dict", + vec![(Label::Borrowed("lim"), ValueType::Base(BaseType::U64))], + between( + b"<<", + b">>", + repeat(record([ + ("key", v_name.call()), + ("value", pdf_value.call_args(vec![var("lim")])), + ])), + ), + ); + + module.define_format("pdf.util.pdf-dict", dict.call_args(vec![Expr::U64(32)])) + }; + + let trailer = module.define_format("pdf.trailer", monad_seq(kw(b"trailer"), pdf_dict.call())); + + let pdf_chunk = module.define_format( + "pdf.chunk", + record_auto([ + ("objects", repeat(obj_decl.call())), + ("xref", cross_ref_section.call()), + ("trailer", trailer.call()), + ("__start_xref", then_eol(match_str(b"startxref"))), + ("declared_start_xref", then_eol(natural.call())), + ("__kw_eof", kw(b"%%EOF")), + ]), + ); + + module.define_format( + "pdf.main", + only(record_auto([ + ( + "version", + 
skip_to(monad_seq(is_bytes(b"%PDF-"), number.call())), + ), + ("chunks", repeat(pdf_chunk.call())), + ("__rem", repeat(any_ws.call())), + ])), + ) +} diff --git a/generated/gencode.rs b/generated/gencode.rs index 4e22e13c..8b4ce61e 100644 --- a/generated/gencode.rs +++ b/generated/gencode.rs @@ -4242,9 +4242,10 @@ tag: (u8, u8, u8, u8), crc: u32 } -/// expected size: 184 +/// expected size: 208 #[derive(Debug, Clone)] pub struct png_main { +signature: Vec, ihdr: png_ihdr, chunks: Vec, idat: zlib_main, @@ -4951,72 +4952,72 @@ PResult::Ok(mpeg4_main { atoms }) } fn Decoder_png_main<>(_input: &mut Parser<'_>) -> Result { -{ -let field0 = ((|| { +let signature = { +let _seq0 = { let b = _input.read_byte()?; -PResult::Ok(if b == 137 { +if b == 137 { b } else { return Err(ParseError::ExcludedBranch(8253205784254894771u64)); -}) -})())?; -let field1 = ((|| { +} +}; +let _seq1 = { let b = _input.read_byte()?; -PResult::Ok(if b == 80 { +if b == 80 { b } else { return Err(ParseError::ExcludedBranch(1225514472166157741u64)); -}) -})())?; -let field2 = ((|| { +} +}; +let _seq2 = { let b = _input.read_byte()?; -PResult::Ok(if b == 78 { +if b == 78 { b } else { return Err(ParseError::ExcludedBranch(1224415506115142500u64)); -}) -})())?; -let field3 = ((|| { +} +}; +let _seq3 = { let b = _input.read_byte()?; -PResult::Ok(if b == 71 { +if b == 71 { b } else { return Err(ParseError::ExcludedBranch(16859485491091215361u64)); -}) -})())?; -let field4 = ((|| { +} +}; +let _seq4 = { let b = _input.read_byte()?; -PResult::Ok(if b == 13 { +if b == 13 { b } else { return Err(ParseError::ExcludedBranch(14898840355839773829u64)); -}) -})())?; -let field5 = ((|| { +} +}; +let _seq5 = { let b = _input.read_byte()?; -PResult::Ok(if b == 10 { +if b == 10 { b } else { return Err(ParseError::ExcludedBranch(9453951600195794313u64)); -}) -})())?; -let field6 = ((|| { +} +}; +let _seq6 = { let b = _input.read_byte()?; -PResult::Ok(if b == 26 { +if b == 26 { b } else { return 
Err(ParseError::ExcludedBranch(10036157788440812915u64)); -}) -})())?; -let field7 = ((|| { +} +}; +let _seq7 = { let b = _input.read_byte()?; -PResult::Ok(if b == 10 { +if b == 10 { b } else { return Err(ParseError::ExcludedBranch(6349531732377484771u64)); -}) -})())?; -(field0, field1, field2, field3, field4, field5, field6, field7) +} +}; +vec![_seq0, _seq1, _seq2, _seq3, _seq4, _seq5, _seq6, _seq7] }; let ihdr = (Decoder_png_ihdr(_input))?; let chunks = { @@ -5162,7 +5163,7 @@ break accum }; let iend = (Decoder_png_iend(_input))?; -PResult::Ok(png_main { ihdr, chunks, idat, more_chunks, iend }) +PResult::Ok(png_main { signature, ihdr, chunks, idat, more_chunks, iend }) } fn Decoder_riff_main<>(_input: &mut Parser<'_>) -> Result { diff --git a/src/codegen/mod.rs b/src/codegen/mod.rs index 4fd2a409..96857b78 100644 --- a/src/codegen/mod.rs +++ b/src/codegen/mod.rs @@ -313,6 +313,14 @@ impl CodeGen { "TypedDecoder::Tuple expected to have type RustType::AnonTuple(..) (or UNIT if empty), found {other:?}" ), } + TypedDecoder::Sequence(gt, elts) => match gt { + GenType::Inline(RustType::Atom(AtomType::Comp(CompType::Vec(_t)))) => { + let as_array = _t.prefer_array(elts.len()); + let elements = elts.iter().map(|elt| self.translate(elt.get_dec())).collect(); + CaseLogic::Sequential(SequentialLogic::AccumSeq { as_array, elements }) + }, + other => unreachable!("TypedDecoder::Sequence expected to have type CompType::Vec(..), found {other:?}"), + }, TypedDecoder::Repeat0While(_gt, tree_continue, single) => CaseLogic::Repeat( RepeatLogic::Repeat0ContinueOnMatch( @@ -839,7 +847,11 @@ fn embed_expr(expr: >Expr, info: ExprInfo) -> RustExpr { ) ) } - + TypedExpr::Append(_, seq0, seq1) => { + let lhs = embed_expr(seq0, info); + let rhs = embed_expr(seq1, info); + RustExpr::FunctionCall(Box::new(RustExpr::local("seq_append")), vec![lhs, rhs]) + } TypedExpr::SubSeq(_, seq, ix, len) => { let start_expr = embed_expr_dft(ix); let bind_ix = RustStmt::assign( @@ -2751,7 +2763,12 @@ 
enum RepeatLogic { /// Fused logic for a left-fold that is updated on each repeat, and contributes to the condition for termination /// /// Lambda order: termination-predicate, then update-function - AccumUntil(GenLambda, GenLambda, Typed, Typed>>), + AccumUntil( + GenLambda, + GenLambda, + Typed, + Typed>>, + ), } pub(crate) type Typed = (T, GenType); @@ -3134,7 +3151,10 @@ where loop_body.push(RustStmt::assign("elem", elt_expr)); loop_body.push(RustStmt::Expr(RustExpr::local("seq").call_method_with( "push", - [RustExpr::owned(RustExpr::local("elem"), elt_type.to_rust_type())], + [RustExpr::owned( + RustExpr::local("elem"), + elt_type.to_rust_type(), + )], ))); let new_acc = update.apply_pair( RustExpr::local("acc"), @@ -3161,6 +3181,10 @@ enum SequentialLogic { constructor: Option, elements: Vec>, }, + AccumSeq { + as_array: bool, + elements: Vec>, + }, } impl ToAst for SequentialLogic @@ -3171,6 +3195,27 @@ where fn to_ast(&self, ctxt: ProdCtxt<'_>) -> GenBlock { match self { + // REVIEW - in certain cases, we may be able to use fixed-sized arrays instead of vec, but that might complicate matters... 
+ SequentialLogic::AccumSeq { as_array, elements } => { + if elements.is_empty() { + return GenBlock::simple_expr(RustExpr::VEC_NIL); + } + let mut stmts = Vec::new(); + let mut terms = Vec::new(); + + for (ix, cl) in elements.iter().enumerate() { + const LAB_PREFIX: &str = "_seq"; + let lab = Label::Owned(format!("{LAB_PREFIX}{ix}")); + stmts.push(GenStmt::BindOnce(lab.clone(), cl.to_ast(ctxt))); + terms.push(RustExpr::local(lab)); + } + let ret = Some(GenExpr::Embed(if *as_array { + RustExpr::ArrayLit(terms) + } else { + RustExpr::Macro(RustMacro::Vec(VecExpr::List(terms))) + })); + GenBlock { stmts, ret } + } SequentialLogic::AccumTuple { constructor, elements, @@ -4141,6 +4186,24 @@ impl<'a> Elaborator<'a> { let gt = self.get_gt_from_index(index); TypedFormat::Tuple(gt, t_elts) } + Format::Sequence(formats) => { + let index = self.get_and_increment_index(); + self.increment_index(); + let t_formats = match &formats[..] { + [] => unreachable!("empty list has no unambiguous type"), + [v] => vec![self.elaborate_format(v, dyn_scope)], + formats => { + let mut t_formats = Vec::with_capacity(formats.len()); + for t in formats { + let t_format = self.elaborate_format(t, dyn_scope); + t_formats.push(t_format); + } + t_formats + } + }; + let gt = self.get_gt_from_index(index); + TypedFormat::Sequence(gt, t_formats) + } Format::Repeat(inner) => { let index = self.get_and_increment_index(); let t_inner = self.elaborate_format(inner, dyn_scope); @@ -4484,7 +4547,14 @@ impl<'a> Elaborator<'a> { let gt = self.get_gt_from_index(index); TypedExpr::Seq(gt, t_elts) } + Expr::Append(lhs, rhs) => { + let t_lhs = self.elaborate_expr(lhs); + let t_rhs = self.elaborate_expr(rhs); + self.increment_index(); + let gt = self.get_gt_from_index(index); + TypedExpr::Append(gt, Box::new(t_lhs), Box::new(t_rhs)) + } Expr::RecordProj(e, fld) => { self.codegen.name_gen.ctxt.push_atom(NameAtom::DeadEnd); let t_e = self.elaborate_expr(e); @@ -5044,7 +5114,10 @@ mod tests { "{}", lambda 
.apply_pair( - RustExpr::CloneOf(Box::new(RustExpr::local("acc"))), + RustExpr::Owned(OwnedRustExpr { + expr: Box::new(RustExpr::local("acc")), + kind: OwnedKind::Cloned + }), RustExpr::local("seq"), ExprInfo::default() ) diff --git a/src/codegen/rust_ast/mod.rs b/src/codegen/rust_ast/mod.rs index e0c76efb..f8192c50 100644 --- a/src/codegen/rust_ast/mod.rs +++ b/src/codegen/rust_ast/mod.rs @@ -685,6 +685,15 @@ impl RustType { } } + /// Returns `true` if seq-formats ([`Format::Sequence`]) of type `Seq()` should prefer to use + /// fixed-size arrays (`[T; N]`) over vectors (`Vec`) during construction. An additional parameter, + /// the length of the sequence (`len`), is passed in to guide the decision, as simple types can be + /// preferable as vectors depending more on the length of the sequence than anything else. + pub(crate) fn prefer_array(&self, _n: usize) -> bool { + // REVIEW - currently, we would need to orchestrate the correct decision at multiple layers, which would take a lot of work + false + } + /// Returns `true` if `self` is a known-`Copy` `RustType`. /// /// # Note @@ -1336,8 +1345,8 @@ impl ToFragment for StructExpr { #[derive(Debug, Clone)] pub(crate) struct OwnedRustExpr { - expr: Box, - kind: OwnedKind, + pub expr: Box, + pub kind: OwnedKind, } #[derive(Debug, Clone)] @@ -1399,6 +1408,17 @@ pub(crate) enum RustExpr { #[derive(Debug, Clone)] pub(crate) enum RustMacro { Matches(Box, Vec), + Vec(VecExpr), +} + +#[derive(Debug, Clone)] +pub(crate) enum VecExpr { + Nil, + #[expect(dead_code)] + Single(Box), + #[expect(dead_code)] + Repeat(Box, Box), + List(Vec), } impl RustExpr { @@ -1410,6 +1430,8 @@ impl RustExpr { pub const FALSE: Self = Self::PrimitiveLit(RustPrimLit::Boolean(false)); + pub const VEC_NIL: Self = RustExpr::Macro(RustMacro::Vec(VecExpr::Nil)); + /// Returns `Some(varname)` if `self` is a simple entity-reference to identifier `varname`, and /// `None` otherwise. 
pub fn as_local(&self) -> Option<&Label> { @@ -1490,12 +1512,15 @@ impl RustExpr { Self::Owned(OwnedRustExpr { expr, .. }) => match &*expr { Self::FieldAccess(..) => Self::Borrow(expr), _ => *expr, - } + }, other => Self::Borrow(Box::new(other)), } } - pub fn field(self, name: Name) -> Self where Name: Into