Skip to content

Commit 17e575e

Browse files
authored
Merge pull request #904 from Ninja3047/serialize-cdata
Implement serializing CDATA
2 parents 655691c + c526ff7 commit 17e575e

File tree

8 files changed

+354
-91
lines changed

8 files changed

+354
-91
lines changed

Changelog.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,17 @@
1616

1717
### New Features
1818

19+
- [#353]: Add ability to serialize textual content as CDATA sections in `Serializer`.
20+
Wherever a text node may be created, one or more CDATA sections can be produced instead.
21+
See the new [`Serializer::text_format()`] method.
22+
1923
### Bug Fixes
2024

2125
### Misc Changes
2226

27+
[#353]: https://github.com/tafia/quick-xml/issues/353
28+
[`Serializer::text_format()`]: https://docs.rs/quick-xml/0.38.4/quick_xml/se/struct.Serializer.html#method.text_format
29+
2330

2431
## 0.38.3 -- 2025-08-24
2532

src/escape.rs

Lines changed: 33 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
33
use memchr::{memchr, memchr2_iter, memchr3};
44
use std::borrow::Cow;
5+
use std::fmt::{self, Write};
56
use std::num::ParseIntError;
67
use std::ops::Range;
78

@@ -147,54 +148,55 @@ pub fn minimal_escape<'a>(raw: impl Into<Cow<'a, str>>) -> Cow<'a, str> {
147148
_escape(raw, |ch| matches!(ch, b'<' | b'&'))
148149
}
149150

151+
/// Writes `value[from..to]` to `writer` unchanged, followed by the XML escape
/// sequence for the single byte located at `value[to]`.
///
/// The byte at index `to` must be one of `<`, `>`, `'`, `&`, `"`, `\t`, `\n`,
/// `\r` or a space; these are the only characters that have a replacement here.
///
/// # Errors
///
/// Returns the error reported by `writer`, if any write fails.
///
/// # Panics
///
/// Panics if `from..to` is not a valid range of `value` lying on char
/// boundaries, if `to` is out of bounds, or if the byte at `to` is not one of
/// the characters listed above.
pub(crate) fn escape_char<W>(writer: &mut W, value: &str, from: usize, to: usize) -> fmt::Result
where
    W: fmt::Write,
{
    // Copy the part that does not need escaping as is
    writer.write_str(&value[from..to])?;

    let replacement = match value.as_bytes()[to] {
        b'<' => "&lt;",
        b'>' => "&gt;",
        b'\'' => "&apos;",
        b'&' => "&amp;",
        b'"' => "&quot;",

        // This set of escapes handles characters that should be escaped
        // in elements of xs:lists, because those characters work as
        // delimiters of list elements
        b'\t' => "&#9;",
        b'\n' => "&#10;",
        b'\r' => "&#13;",
        b' ' => "&#32;",
        _ => unreachable!(
            "Only '<', '>', '\\'', '&', '\"', '\\t', '\\r', '\\n', and ' ' are escaped"
        ),
    };
    writer.write_str(replacement)
}
174+
150175
/// Escapes an `&str` and replaces a subset of xml special characters (`<`, `>`,
151176
/// `&`, `'`, `"`) with their corresponding xml escaped value.
152-
pub(crate) fn _escape<'a, F: Fn(u8) -> bool>(
153-
raw: impl Into<Cow<'a, str>>,
154-
escape_chars: F,
155-
) -> Cow<'a, str> {
177+
fn _escape<'a, F: Fn(u8) -> bool>(raw: impl Into<Cow<'a, str>>, escape_chars: F) -> Cow<'a, str> {
156178
let raw = raw.into();
157179
let bytes = raw.as_bytes();
158180
let mut escaped = None;
159181
let mut iter = bytes.iter();
160182
let mut pos = 0;
161183
while let Some(i) = iter.position(|&b| escape_chars(b)) {
162184
if escaped.is_none() {
163-
escaped = Some(Vec::with_capacity(raw.len()));
185+
escaped = Some(String::with_capacity(raw.len()));
164186
}
165187
let escaped = escaped.as_mut().expect("initialized");
166188
let new_pos = pos + i;
167-
escaped.extend_from_slice(&bytes[pos..new_pos]);
168-
match bytes[new_pos] {
169-
b'<' => escaped.extend_from_slice(b"&lt;"),
170-
b'>' => escaped.extend_from_slice(b"&gt;"),
171-
b'\'' => escaped.extend_from_slice(b"&apos;"),
172-
b'&' => escaped.extend_from_slice(b"&amp;"),
173-
b'"' => escaped.extend_from_slice(b"&quot;"),
174-
175-
// This set of escapes handles characters that should be escaped
176-
// in elements of xs:lists, because those characters works as
177-
// delimiters of list elements
178-
b'\t' => escaped.extend_from_slice(b"&#9;"),
179-
b'\n' => escaped.extend_from_slice(b"&#10;"),
180-
b'\r' => escaped.extend_from_slice(b"&#13;"),
181-
b' ' => escaped.extend_from_slice(b"&#32;"),
182-
_ => unreachable!(
183-
"Only '<', '>','\', '&', '\"', '\\t', '\\r', '\\n', and ' ' are escaped"
184-
),
185-
}
189+
// SAFETY: It should fail only on OOM
190+
escape_char(escaped, &raw, pos, new_pos).unwrap();
186191
pos = new_pos + 1;
187192
}
188193

189194
if let Some(mut escaped) = escaped {
190-
if let Some(raw) = bytes.get(pos..) {
191-
escaped.extend_from_slice(raw);
195+
if let Some(raw) = raw.get(pos..) {
196+
// SAFETY: It should fail only on OOM
197+
escaped.write_str(raw).unwrap();
192198
}
193-
// SAFETY: we operate on UTF-8 input and search for an one byte chars only,
194-
// so all slices that was put to the `escaped` is a valid UTF-8 encoded strings
195-
// TODO: Can be replaced with `unsafe { String::from_utf8_unchecked() }`
196-
// if unsafe code will be allowed
197-
Cow::Owned(String::from_utf8(escaped).unwrap())
199+
Cow::Owned(escaped)
198200
} else {
199201
raw
200202
}

src/events/mod.rs

Lines changed: 7 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ use crate::escape::{
5353
partial_escape, EscapeError,
5454
};
5555
use crate::name::{LocalName, QName};
56-
use crate::utils::{name_len, trim_xml_end, trim_xml_start, write_cow_string, Bytes};
56+
use crate::utils::{self, name_len, trim_xml_end, trim_xml_start, write_cow_string};
5757
use attributes::{AttrError, Attribute, Attributes};
5858

5959
/// Opening tag data (`Event::Start`), with optional attributes: `<name attr="value">`.
@@ -783,8 +783,7 @@ impl<'a> BytesCData<'a> {
783783
#[inline]
784784
pub fn escaped(content: &'a str) -> CDataIterator<'a> {
785785
CDataIterator {
786-
unprocessed: content.as_bytes(),
787-
finished: false,
786+
inner: utils::CDataIterator::new(content),
788787
}
789788
}
790789

@@ -984,41 +983,18 @@ impl<'a> arbitrary::Arbitrary<'a> for BytesCData<'a> {
984983
/// Iterator over `CDATA` sections in a string.
985984
///
986985
/// This iterator is created by the [`BytesCData::escaped`] method.
987-
#[derive(Clone)]
986+
#[derive(Debug, Clone)]
988987
pub struct CDataIterator<'a> {
989-
/// The unprocessed data which should be emitted as `BytesCData` events.
990-
/// At each iteration, the processed data is cut from this slice.
991-
unprocessed: &'a [u8],
992-
finished: bool,
993-
}
994-
995-
impl<'a> Debug for CDataIterator<'a> {
996-
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
997-
f.debug_struct("CDataIterator")
998-
.field("unprocessed", &Bytes(self.unprocessed))
999-
.field("finished", &self.finished)
1000-
.finish()
1001-
}
988+
inner: utils::CDataIterator<'a>,
1002989
}
1003990

1004991
impl<'a> Iterator for CDataIterator<'a> {
1005992
type Item = BytesCData<'a>;
1006993

1007994
fn next(&mut self) -> Option<BytesCData<'a>> {
1008-
if self.finished {
1009-
return None;
1010-
}
1011-
1012-
for gt in memchr::memchr_iter(b'>', self.unprocessed) {
1013-
if self.unprocessed[..gt].ends_with(b"]]") {
1014-
let (slice, rest) = self.unprocessed.split_at(gt);
1015-
self.unprocessed = rest;
1016-
return Some(BytesCData::wrap(slice, Decoder::utf8()));
1017-
}
1018-
}
1019-
1020-
self.finished = true;
1021-
Some(BytesCData::wrap(self.unprocessed, Decoder::utf8()))
995+
self.inner
996+
.next()
997+
.map(|slice| BytesCData::wrap(slice.as_bytes(), Decoder::utf8()))
1022998
}
1023999
}
10241000

src/se/content.rs

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
use crate::de::TEXT_KEY;
44
use crate::se::element::{ElementSerializer, Struct, Tuple};
55
use crate::se::simple_type::{QuoteTarget, SimpleTypeSerializer};
6-
use crate::se::{Indent, QuoteLevel, SeError, WriteResult, XmlName};
6+
use crate::se::{Indent, QuoteLevel, SeError, TextFormat, WriteResult, XmlName};
77
use serde::ser::{
88
Impossible, Serialize, SerializeSeq, SerializeTuple, SerializeTupleStruct, Serializer,
99
};
@@ -71,6 +71,8 @@ pub struct ContentSerializer<'w, 'i, W: Write> {
7171
/// If `true`, then current indent will be written before writing the content,
7272
/// but only if content is not empty. This flag is reset after writing indent.
7373
pub write_indent: bool,
74+
/// Defines how text content should be serialized (as escaped text or CDATA)
75+
pub text_format: TextFormat,
7476
/// If `true`, then primitive types that serializes to a text content without
7577
/// surrounding tag will be allowed, otherwise the [`SeError::Unsupported`]
7678
/// will be returned.
@@ -88,10 +90,12 @@ impl<'w, 'i, W: Write> ContentSerializer<'w, 'i, W> {
8890
/// Turns this serializer into serializer of a text content
8991
#[inline]
9092
pub fn into_simple_type_serializer_impl(self) -> SimpleTypeSerializer<&'w mut W> {
91-
//TODO: Customization point: choose between CDATA and Text representation
9293
SimpleTypeSerializer {
9394
writer: self.writer,
94-
target: QuoteTarget::Text,
95+
target: match self.text_format {
96+
TextFormat::Text => QuoteTarget::Text,
97+
TextFormat::CData => QuoteTarget::CData,
98+
},
9599
level: self.level,
96100
}
97101
}
@@ -119,6 +123,7 @@ impl<'w, 'i, W: Write> ContentSerializer<'w, 'i, W> {
119123
level: self.level,
120124
indent: self.indent.borrow(),
121125
write_indent: self.write_indent,
126+
text_format: self.text_format,
122127
allow_primitive,
123128
expand_empty_elements: self.expand_empty_elements,
124129
}
@@ -600,6 +605,7 @@ pub(super) mod tests {
600605
level: QuoteLevel::Full,
601606
indent: Indent::None,
602607
write_indent: false,
608+
text_format: TextFormat::Text,
603609
allow_primitive: true,
604610
expand_empty_elements: false,
605611
};
@@ -623,6 +629,7 @@ pub(super) mod tests {
623629
level: QuoteLevel::Full,
624630
indent: Indent::None,
625631
write_indent: false,
632+
text_format: TextFormat::Text,
626633
allow_primitive: true,
627634
expand_empty_elements: false,
628635
};
@@ -1070,6 +1077,7 @@ pub(super) mod tests {
10701077
level: QuoteLevel::Full,
10711078
indent: Indent::Owned(Indentation::new(b' ', 2)),
10721079
write_indent: false,
1080+
text_format: TextFormat::Text,
10731081
allow_primitive: true,
10741082
expand_empty_elements: false,
10751083
};
@@ -1093,6 +1101,7 @@ pub(super) mod tests {
10931101
level: QuoteLevel::Full,
10941102
indent: Indent::Owned(Indentation::new(b' ', 2)),
10951103
write_indent: false,
1104+
text_format: TextFormat::Text,
10961105
allow_primitive: true,
10971106
expand_empty_elements: false,
10981107
};

src/se/element.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,7 @@ impl<'w, 'k, W: Write> Struct<'w, 'k, W> {
443443
indent: self.ser.ser.indent.borrow(),
444444
// If previous field does not require indent, do not write it
445445
write_indent: self.write_indent,
446+
text_format: self.ser.ser.text_format,
446447
allow_primitive: true,
447448
expand_empty_elements: self.ser.ser.expand_empty_elements,
448449
};
@@ -596,7 +597,7 @@ impl<'w, 'k, W: Write> SerializeMap for Map<'w, 'k, W> {
596597
mod tests {
597598
use super::*;
598599
use crate::se::content::tests::*;
599-
use crate::se::{Indent, QuoteLevel};
600+
use crate::se::{Indent, QuoteLevel, TextFormat};
600601
use crate::utils::Bytes;
601602
use serde::Serialize;
602603
use std::collections::BTreeMap;
@@ -635,6 +636,7 @@ mod tests {
635636
level: QuoteLevel::Full,
636637
indent: Indent::None,
637638
write_indent: false,
639+
text_format: TextFormat::Text,
638640
allow_primitive: true,
639641
expand_empty_elements: false,
640642
},
@@ -661,6 +663,7 @@ mod tests {
661663
level: QuoteLevel::Full,
662664
indent: Indent::None,
663665
write_indent: false,
666+
text_format: TextFormat::Text,
664667
allow_primitive: true,
665668
expand_empty_elements: false,
666669
},
@@ -1356,6 +1359,7 @@ mod tests {
13561359
level: QuoteLevel::Full,
13571360
indent: Indent::Owned(Indentation::new(b' ', 2)),
13581361
write_indent: false,
1362+
text_format: TextFormat::Text,
13591363
allow_primitive: true,
13601364
expand_empty_elements: false,
13611365
},
@@ -1382,6 +1386,7 @@ mod tests {
13821386
level: QuoteLevel::Full,
13831387
indent: Indent::Owned(Indentation::new(b' ', 2)),
13841388
write_indent: false,
1389+
text_format: TextFormat::Text,
13851390
allow_primitive: true,
13861391
expand_empty_elements: false,
13871392
},
@@ -2099,6 +2104,7 @@ mod tests {
20992104
level: QuoteLevel::Full,
21002105
indent: Indent::None,
21012106
write_indent: false,
2107+
text_format: TextFormat::Text,
21022108
allow_primitive: true,
21032109
expand_empty_elements: true,
21042110
},

src/se/mod.rs

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,16 @@ where
320320

321321
////////////////////////////////////////////////////////////////////////////////////////////////////
322322

323+
/// Defines how text content is written by the serializer: as escaped text or as CDATA
324+
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
325+
#[non_exhaustive]
326+
pub enum TextFormat {
327+
/// Serialize as regular text content with escaping. Serializers are created with this format
328+
Text,
329+
/// Serialize as CDATA section(s) instead of escaped text
330+
CData,
331+
}
332+
323333
/// Defines which characters would be escaped in [`Text`] events and attribute
324334
/// values.
325335
///
@@ -557,6 +567,7 @@ impl<'w, 'r, W: Write> Serializer<'w, 'r, W> {
557567
level: QuoteLevel::Partial,
558568
indent: Indent::None,
559569
write_indent: false,
570+
text_format: TextFormat::Text,
560571
allow_primitive: true,
561572
expand_empty_elements: false,
562573
},
@@ -623,6 +634,7 @@ impl<'w, 'r, W: Write> Serializer<'w, 'r, W> {
623634
level: QuoteLevel::Partial,
624635
indent: Indent::None,
625636
write_indent: false,
637+
text_format: TextFormat::Text,
626638
allow_primitive: true,
627639
expand_empty_elements: false,
628640
},
@@ -663,6 +675,40 @@ impl<'w, 'r, W: Write> Serializer<'w, 'r, W> {
663675
self
664676
}
665677

678+
/// Sets the format used for serializing text content and returns `self` so calls can be chained.
679+
///
680+
/// - [`TextFormat::Text`]: Regular XML escaping (default)
681+
/// - [`TextFormat::CData`]: CDATA sections instead of escaped text
682+
///
683+
/// # Examples
684+
///
685+
/// ```
686+
/// # use pretty_assertions::assert_eq;
687+
/// # use serde::Serialize;
688+
/// # use quick_xml::se::{Serializer, TextFormat};
689+
///
690+
/// #[derive(Debug, PartialEq, Serialize)]
691+
/// struct Document {
692+
/// #[serde(rename = "$text")]
693+
/// content: String,
694+
/// }
695+
///
696+
/// let mut buffer = String::new();
697+
/// let mut ser = Serializer::with_root(&mut buffer, Some("doc")).unwrap();
698+
/// ser.text_format(TextFormat::CData);
699+
///
700+
/// let data = Document {
701+
/// content: "Content with <markup> & entities".to_string(),
702+
/// };
703+
///
704+
/// data.serialize(ser).unwrap();
705+
/// assert_eq!(buffer, "<doc><![CDATA[Content with <markup> & entities]]></doc>");
706+
/// ```
707+
pub fn text_format(&mut self, format: TextFormat) -> &mut Self {
708+
self.ser.text_format = format;
709+
self
710+
}
711+
666712
/// Configure indent for a serializer
667713
pub fn indent(&mut self, indent_char: char, indent_size: usize) -> &mut Self {
668714
self.ser.indent = Indent::Owned(Indentation::new(indent_char as u8, indent_size));

0 commit comments

Comments
 (0)