Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enforce unescape functions to operate over UTF-8 and fix serde bugs #421

Merged
merged 8 commits into from
Jul 15, 2022
14 changes: 14 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@
- [#363]: Do not generate empty `Event::Text` events
- [#412]: Fix using incorrect encoding if `read_to_end` family of methods or `read_text`
method not found a corresponding end tag and reader has non-UTF-8 encoding
- [#421]: Fix incorrect order of unescape and decode operations for serde deserializer:
decoding should be first, unescape is the second
- [#421]: Fixed unknown bug in serde deserialization of externally tagged enums
when an enum variant represented as a `Text` event (i.e. `<xml>tag</xml>`)
and a document encoding is not an UTF-8

### Misc Changes

Expand Down Expand Up @@ -109,6 +114,7 @@
|`read_event_unbuffered` |`read_event`
|`read_to_end_unbuffered` |`read_to_end`
- [#412]: Change `read_to_end*` and `read_text_into` to accept `QName` instead of `AsRef<[u8]>`

- [#415]: Changed custom entity unescaping API to accept closures rather than a mapping of entity to
replacement text. This avoids needing to allocate a map and provides the user with more flexibility.
- [#415]: Renamed many functions following the pattern `unescape_and_decode*` to `decode_and_unescape*`
Expand All @@ -118,9 +124,16 @@
`BytesText::unescape()`, `BytesText::unescaped_with()` renamed to `BytesText::unescape_with()`,
`Attribute::escaped_value()` renamed to `Attribute::escape_value()`, and `Attribute::escaped_value_with()`
renamed to `Attribute::escape_value_with()` for consistency across the API.

- [#416]: `BytesStart::to_borrowed` renamed to `BytesStart::borrow`, the same method
added to all events

- [#421]: `decode_and_unescape*` methods now does one less allocation if unescaping is not required
- [#421]: Removed ability to deserialize byte arrays from serde deserializer.
XML is not able to store binary data directly, you should always use some encoding
scheme, for example, HEX or Base64
- [#421]: All unescaping functions now accepts and returns strings instead of byte slices

### New Tests

- [#9]: Added tests for incorrect nested tags in input
Expand Down Expand Up @@ -148,6 +161,7 @@
[#415]: https://github.com/tafia/quick-xml/pull/415
[#416]: https://github.com/tafia/quick-xml/pull/416
[#418]: https://github.com/tafia/quick-xml/pull/418
[#421]: https://github.com/tafia/quick-xml/pull/421

## 0.23.0 -- 2022-05-08

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ loop {
_ => (),
}
},
Ok(Event::Text(e)) => txt.push(e.unescape_and_decode(&reader).unwrap()),
Ok(Event::Text(e)) => txt.push(e.unescape_and_decode(&reader).unwrap().into_owned()),
Ok(Event::Eof) => break, // exits the loop when reaching end of file
Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
_ => (), // There are several other `Event`s we do not consider here
Expand Down
4 changes: 2 additions & 2 deletions benches/macrobenches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@ fn parse_document(doc: &[u8]) -> XmlResult<()> {
match r.read_event()? {
Event::Start(e) | Event::Empty(e) => {
for attr in e.attributes() {
criterion::black_box(attr?.unescape_value()?);
criterion::black_box(attr?.decode_and_unescape_value(&r)?);
}
}
Event::Text(e) => {
criterion::black_box(e.unescape()?);
criterion::black_box(e.decode_and_unescape(&r)?);
}
Event::CData(e) => {
criterion::black_box(e.into_inner());
Expand Down
20 changes: 10 additions & 10 deletions benches/microbenches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ use quick_xml::Reader;
static SAMPLE: &[u8] = include_bytes!("../tests/documents/sample_rss.xml");
static PLAYERS: &[u8] = include_bytes!("../tests/documents/players.xml");

static LOREM_IPSUM_TEXT: &[u8] =
b"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt
static LOREM_IPSUM_TEXT: &str =
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt
ut labore et dolore magna aliqua. Hac habitasse platea dictumst vestibulum rhoncus est pellentesque.
Risus ultricies tristique nulla aliquet enim tortor at. Fermentum odio eu feugiat pretium nibh ipsum.
Volutpat sed cras ornare arcu dui. Scelerisque fermentum dui faucibus in ornare quam. Arcu cursus
Expand Down Expand Up @@ -176,7 +176,7 @@ fn one_event(c: &mut Criterion) {
.check_comments(false)
.trim_text(true);
match r.read_event_into(&mut buf) {
Ok(Event::Comment(ref e)) => nbtxt += e.unescape().unwrap().len(),
Ok(Event::Comment(e)) => nbtxt += e.decode_and_unescape(&r).unwrap().len(),
something_else => panic!("Did not expect {:?}", something_else),
};

Expand Down Expand Up @@ -293,7 +293,7 @@ fn escaping(c: &mut Criterion) {

group.bench_function("no_chars_to_escape_long", |b| {
b.iter(|| {
criterion::black_box(escape(LOREM_IPSUM_TEXT));
criterion::black_box(escape(LOREM_IPSUM_TEXT.as_bytes()));
})
});

Expand Down Expand Up @@ -345,31 +345,31 @@ fn unescaping(c: &mut Criterion) {

group.bench_function("no_chars_to_unescape_short", |b| {
b.iter(|| {
criterion::black_box(unescape(b"just a bit of text")).unwrap();
criterion::black_box(unescape("just a bit of text")).unwrap();
})
});

group.bench_function("char_reference", |b| {
b.iter(|| {
let text = b"prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;";
let text = "prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;";
criterion::black_box(unescape(text)).unwrap();
let text = b"&#38;&#60;";
let text = "&#38;&#60;";
criterion::black_box(unescape(text)).unwrap();
})
});

group.bench_function("entity_reference", |b| {
b.iter(|| {
let text = b"age &gt; 72 &amp;&amp; age &lt; 21";
let text = "age &gt; 72 &amp;&amp; age &lt; 21";
criterion::black_box(unescape(text)).unwrap();
let text = b"&quot;what&apos;s that?&quot;";
let text = "&quot;what&apos;s that?&quot;";
criterion::black_box(unescape(text)).unwrap();
})
});

group.bench_function("mixed", |b| {
let text =
b"Lorem ipsum dolor sit amet, &amp;consectetur adipiscing elit, sed do eiusmod tempor incididunt
"Lorem ipsum dolor sit amet, &amp;consectetur adipiscing elit, sed do eiusmod tempor incididunt
ut labore et dolore magna aliqua. Hac habitasse platea dictumst vestibulum rhoncus est pellentesque.
Risus ultricies &quot;tristique nulla aliquet enim tortor&quot; at. Fermentum odio eu feugiat pretium
nibh ipsum. Volutpat sed cras ornare arcu dui. Scelerisque fermentum dui faucibus in ornare quam. Arcu
Expand Down
5 changes: 3 additions & 2 deletions examples/custom_entities.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
reader.trim_text(true);

let mut buf = Vec::new();
let mut custom_entities: HashMap<Vec<u8>, String> = HashMap::new();
let mut custom_entities: HashMap<String, String> = HashMap::new();
let entity_re = Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#)?;

loop {
match reader.read_event_into(&mut buf) {
Ok(Event::DocType(ref e)) => {
for cap in entity_re.captures_iter(&e) {
custom_entities.insert(
cap[1].to_vec(),
reader.decoder().decode(&cap[1])?.into_owned(),
reader.decoder().decode(&cap[2])?.into_owned(),
);
}
Expand All @@ -51,6 +51,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
custom_entities.get(ent).map(|s| s.as_str())
})
.unwrap()
.into_owned()
})
.collect::<Vec<_>>();
println!("attributes values: {:?}", attributes);
Expand Down
36 changes: 21 additions & 15 deletions src/de/escape.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
//! Serde `Deserializer` module

use crate::de::deserialize_bool;
use crate::{errors::serialize::DeError, errors::Error, escape::unescape, reader::Decoder};
use crate::errors::serialize::DeError;
use crate::escape::unescape;
use crate::reader::Decoder;
use serde::de::{DeserializeSeed, EnumAccess, VariantAccess, Visitor};
use serde::{self, forward_to_deserialize_any, serde_if_integer128};
use std::borrow::Cow;
Expand Down Expand Up @@ -30,13 +32,6 @@ impl<'a> EscapedDeserializer<'a> {
escaped,
}
}
fn unescaped(&self) -> Result<Cow<[u8]>, DeError> {
if self.escaped {
unescape(&self.escaped_value).map_err(|e| DeError::InvalidXml(Error::EscapeError(e)))
} else {
Ok(Cow::Borrowed(&self.escaped_value))
}
}
}

macro_rules! deserialize_num {
Expand Down Expand Up @@ -66,20 +61,31 @@ impl<'de, 'a> serde::Deserializer<'de> for EscapedDeserializer<'a> {
where
V: Visitor<'de>,
{
let unescaped = self.unescaped()?;
let value = self.decoder.decode(&unescaped)?;

visitor.visit_str(&value)
let decoded = self.decoder.decode(&self.escaped_value)?;
if self.escaped {
match unescape(&decoded)? {
Cow::Borrowed(s) => visitor.visit_str(s),
Cow::Owned(s) => visitor.visit_string(s),
}
} else {
match decoded {
Cow::Borrowed(s) => visitor.visit_str(s),
Cow::Owned(s) => visitor.visit_string(s),
}
}
}

fn deserialize_bytes<V>(self, visitor: V) -> Result<V::Value, Self::Error>
/// Returns [`DeError::Unsupported`]
fn deserialize_bytes<V>(self, _visitor: V) -> Result<V::Value, Self::Error>
where
V: Visitor<'de>,
{
let v = self.unescaped()?;
visitor.visit_bytes(&v)
Err(DeError::Unsupported(
"binary data content is not supported by XML format",
))
}

/// Forwards deserialization to the [`Self::deserialize_bytes`]
fn deserialize_byte_buf<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where
V: Visitor<'de>,
Expand Down
21 changes: 4 additions & 17 deletions src/de/map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,10 @@ use crate::{
de::escape::EscapedDeserializer,
de::seq::{not_in, TagFilter},
de::simple_type::SimpleTypeDeserializer,
de::{deserialize_bool, DeEvent, Deserializer, XmlRead, INNER_VALUE, UNFLATTEN_PREFIX},
de::{str2bool, DeEvent, Deserializer, XmlRead, INNER_VALUE, UNFLATTEN_PREFIX},
errors::serialize::DeError,
events::attributes::IterState,
events::{BytesCData, BytesStart},
reader::Decoder,
events::BytesStart,
};
use serde::de::{self, DeserializeSeed, IntoDeserializer, SeqAccess, Visitor};
use serde::serde_if_integer128;
Expand Down Expand Up @@ -479,15 +478,9 @@ where
{
/// Returns a text event, used inside [`deserialize_primitives!()`]
#[inline]
fn next_text(&mut self, unescape: bool) -> Result<BytesCData<'de>, DeError> {
fn next_text(&mut self, unescape: bool) -> Result<Cow<'de, str>, DeError> {
self.map.de.next_text_impl(unescape, self.allow_start)
}

/// Returns a decoder, used inside [`deserialize_primitives!()`]
#[inline]
fn decoder(&self) -> Decoder {
self.map.de.reader.decoder()
}
}

impl<'de, 'a, 'm, R> de::Deserializer<'de> for MapValueDeserializer<'de, 'a, 'm, R>
Expand Down Expand Up @@ -649,15 +642,9 @@ where
{
/// Returns a text event, used inside [`deserialize_primitives!()`]
#[inline]
fn next_text(&mut self, unescape: bool) -> Result<BytesCData<'de>, DeError> {
fn next_text(&mut self, unescape: bool) -> Result<Cow<'de, str>, DeError> {
self.map.de.next_text_impl(unescape, true)
}

/// Returns a decoder, used inside [`deserialize_primitives!()`]
#[inline]
fn decoder(&self) -> Decoder {
self.map.de.reader.decoder()
}
}

impl<'de, 'a, 'm, R> de::Deserializer<'de> for SeqValueDeserializer<'de, 'a, 'm, R>
Expand Down
46 changes: 17 additions & 29 deletions src/de/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,7 @@ macro_rules! deserialize_type {
{
// No need to unescape because valid integer representations cannot be escaped
let text = self.next_text(false)?;
let string = text.decode(self.decoder())?;
visitor.$visit(string.parse()?)
visitor.$visit(text.parse()?)
}
};
}
Expand Down Expand Up @@ -152,7 +151,7 @@ macro_rules! deserialize_primitives {
// No need to unescape because valid boolean representations cannot be escaped
let text = self.next_text(false)?;

deserialize_bool(text.as_ref(), self.decoder(), visitor)
str2bool(&text, visitor)
}

/// Representation of owned strings the same as [non-owned](#method.deserialize_str).
Expand All @@ -176,30 +175,26 @@ macro_rules! deserialize_primitives {
V: Visitor<'de>,
{
let text = self.next_text(true)?;
let string = text.decode(self.decoder())?;
match string {
match text {
Cow::Borrowed(string) => visitor.visit_borrowed_str(string),
Cow::Owned(string) => visitor.visit_string(string),
}
}

fn deserialize_bytes<V>($($mut)? self, visitor: V) -> Result<V::Value, DeError>
/// Returns [`DeError::Unsupported`]
fn deserialize_bytes<V>(self, _visitor: V) -> Result<V::Value, DeError>
where
V: Visitor<'de>,
{
// No need to unescape because bytes gives access to the raw XML input
let text = self.next_text(false)?;
visitor.visit_bytes(&text)
Err(DeError::Unsupported("binary data content is not supported by XML format"))
}

fn deserialize_byte_buf<V>($($mut)? self, visitor: V) -> Result<V::Value, DeError>
/// Forwards deserialization to the [`deserialize_bytes`](#method.deserialize_bytes).
fn deserialize_byte_buf<V>(self, visitor: V) -> Result<V::Value, DeError>
where
V: Visitor<'de>,
{
// No need to unescape because bytes gives access to the raw XML input
let text = self.next_text(false)?;
let value = text.into_inner().into_owned();
visitor.visit_byte_buf(value)
self.deserialize_bytes(visitor)
}

/// Identifiers represented as [strings](#method.deserialize_str).
Expand Down Expand Up @@ -576,7 +571,7 @@ where
}

#[inline]
fn next_text(&mut self, unescape: bool) -> Result<BytesCData<'de>, DeError> {
fn next_text(&mut self, unescape: bool) -> Result<Cow<'de, str>, DeError> {
self.next_text_impl(unescape, true)
}

Expand Down Expand Up @@ -616,25 +611,24 @@ where
&mut self,
unescape: bool,
allow_start: bool,
) -> Result<BytesCData<'de>, DeError> {
) -> Result<Cow<'de, str>, DeError> {
let decoder = self.reader.decoder();
match self.next()? {
DeEvent::Text(e) if unescape => e.unescape_as_cdata().map_err(Into::into),
DeEvent::Text(e) => Ok(BytesCData::new(e.into_inner())),
DeEvent::CData(e) => Ok(e),
DeEvent::Text(e) => Ok(e.decode(decoder, unescape)?),
DeEvent::CData(e) => Ok(e.decode(decoder)?),
DeEvent::Start(e) if allow_start => {
// allow one nested level
let inner = self.next()?;
let t = match inner {
DeEvent::Text(t) if unescape => t.unescape_as_cdata()?,
DeEvent::Text(t) => BytesCData::new(t.into_inner()),
DeEvent::CData(t) => t,
DeEvent::Text(t) => t.decode(decoder, unescape)?,
DeEvent::CData(t) => t.decode(decoder)?,
DeEvent::Start(s) => {
return Err(DeError::UnexpectedStart(s.name().as_ref().to_owned()))
}
// We can get End event in case of `<tag></tag>` or `<tag/>` input
// Return empty text in that case
DeEvent::End(end) if end.name() == e.name() => {
return Ok(BytesCData::new(&[] as &[u8]));
return Ok("".into());
}
DeEvent::End(end) => {
return Err(DeError::UnexpectedEnd(end.name().as_ref().to_owned()))
Expand All @@ -650,12 +644,6 @@ where
}
}

/// Returns a decoder, used inside `deserialize_primitives!()`
#[inline]
fn decoder(&self) -> Decoder {
self.reader.decoder()
}

/// Drops all events until event with [name](BytesEnd::name()) `name` won't be
/// dropped. This method should be called after [`Self::next()`]
#[cfg(feature = "overlapped-lists")]
Expand Down
Loading