tafia · dralley · Jul 15, 2022 · Jul 15, 2022 · Jul 11, 2022 · Jul 14, 2022
diff --git a/Changelog.md b/Changelog.md
@@ -42,6 +42,11 @@
 - [#363]: Do not generate empty `Event::Text` events
 - [#412]: Fix using incorrect encoding if `read_to_end` family of methods or `read_text`
   method not found a corresponding end tag and reader has non-UTF-8 encoding
+- [#421]: Fix incorrect order of unescape and decode operations for serde deserializer:
+  decoding should be first, unescape is the second
+- [#421]: Fixed unknown bug in serde deserialization of externally tagged enums
+  when an enum variant represented as a `Text` event (i.e. `<xml>tag</xml>`)
+  and a document encoding is not an UTF-8
 
 ### Misc Changes
 
@@ -109,6 +114,7 @@
   |`read_event_unbuffered`  |`read_event`
   |`read_to_end_unbuffered` |`read_to_end`
 - [#412]: Change `read_to_end*` and `read_text_into` to accept `QName` instead of `AsRef<[u8]>`
+
 - [#415]: Changed custom entity unescaping API to accept closures rather than a mapping of entity to
   replacement text. This avoids needing to allocate a map and provides the user with more flexibility.
 - [#415]: Renamed many functions following the pattern `unescape_and_decode*` to `decode_and_unescape*`
@@ -118,9 +124,16 @@
   `BytesText::unescape()`, `BytesText::unescaped_with()` renamed to `BytesText::unescape_with()`,
   `Attribute::escaped_value()` renamed to `Attribute::escape_value()`, and `Attribute::escaped_value_with()`
   renamed to `Attribute::escape_value_with()` for consistency across the API.
+
 - [#416]: `BytesStart::to_borrowed` renamed to `BytesStart::borrow`, the same method
   added to all events
 
+- [#421]: `decode_and_unescape*` methods now does one less allocation if unescaping is not required
+- [#421]: Removed ability to deserialize byte arrays from serde deserializer.
+  XML is not able to store binary data directly, you should always use some encoding
+  scheme, for example, HEX or Base64
+- [#421]: All unescaping functions now accepts and returns strings instead of byte slices
+
 ### New Tests
 
 - [#9]: Added tests for incorrect nested tags in input
@@ -148,6 +161,7 @@
 [#415]: https://github.com/tafia/quick-xml/pull/415
 [#416]: https://github.com/tafia/quick-xml/pull/416
 [#418]: https://github.com/tafia/quick-xml/pull/418
+[#421]: https://github.com/tafia/quick-xml/pull/421
 
 ## 0.23.0 -- 2022-05-08
 

diff --git a/README.md b/README.md
@@ -51,7 +51,7 @@ loop {
                 _ => (),
             }
         },
-        Ok(Event::Text(e)) => txt.push(e.unescape_and_decode(&reader).unwrap()),
+        Ok(Event::Text(e)) => txt.push(e.unescape_and_decode(&reader).unwrap().into_owned()),
         Ok(Event::Eof) => break, // exits the loop when reaching end of file
         Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
         _ => (), // There are several other `Event`s we do not consider here

diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs
@@ -24,11 +24,11 @@ fn parse_document(doc: &[u8]) -> XmlResult<()> {
         match r.read_event()? {
             Event::Start(e) | Event::Empty(e) => {
                 for attr in e.attributes() {
-                    criterion::black_box(attr?.unescape_value()?);
+                    criterion::black_box(attr?.decode_and_unescape_value(&r)?);
                 }
             }
             Event::Text(e) => {
-                criterion::black_box(e.unescape()?);
+                criterion::black_box(e.decode_and_unescape(&r)?);
             }
             Event::CData(e) => {
                 criterion::black_box(e.into_inner());

diff --git a/benches/microbenches.rs b/benches/microbenches.rs
@@ -8,8 +8,8 @@ use quick_xml::Reader;
 static SAMPLE: &[u8] = include_bytes!("../tests/documents/sample_rss.xml");
 static PLAYERS: &[u8] = include_bytes!("../tests/documents/players.xml");
 
-static LOREM_IPSUM_TEXT: &[u8] =
-b"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt
+static LOREM_IPSUM_TEXT: &str =
+"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt
 ut labore et dolore magna aliqua. Hac habitasse platea dictumst vestibulum rhoncus est pellentesque.
 Risus ultricies tristique nulla aliquet enim tortor at. Fermentum odio eu feugiat pretium nibh ipsum.
 Volutpat sed cras ornare arcu dui. Scelerisque fermentum dui faucibus in ornare quam. Arcu cursus
@@ -176,7 +176,7 @@ fn one_event(c: &mut Criterion) {
                 .check_comments(false)
                 .trim_text(true);
             match r.read_event_into(&mut buf) {
-                Ok(Event::Comment(ref e)) => nbtxt += e.unescape().unwrap().len(),
+                Ok(Event::Comment(e)) => nbtxt += e.decode_and_unescape(&r).unwrap().len(),
                 something_else => panic!("Did not expect {:?}", something_else),
             };
 
@@ -293,7 +293,7 @@ fn escaping(c: &mut Criterion) {
 
     group.bench_function("no_chars_to_escape_long", |b| {
         b.iter(|| {
-            criterion::black_box(escape(LOREM_IPSUM_TEXT));
+            criterion::black_box(escape(LOREM_IPSUM_TEXT.as_bytes()));
         })
     });
 
@@ -345,31 +345,31 @@ fn unescaping(c: &mut Criterion) {
 
     group.bench_function("no_chars_to_unescape_short", |b| {
         b.iter(|| {
-            criterion::black_box(unescape(b"just a bit of text")).unwrap();
+            criterion::black_box(unescape("just a bit of text")).unwrap();
         })
     });
 
     group.bench_function("char_reference", |b| {
         b.iter(|| {
-            let text = b"prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;";
+            let text = "prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;";
             criterion::black_box(unescape(text)).unwrap();
-            let text = b"&#38;&#60;";
+            let text = "&#38;&#60;";
             criterion::black_box(unescape(text)).unwrap();
         })
     });
 
     group.bench_function("entity_reference", |b| {
         b.iter(|| {
-            let text = b"age &gt; 72 &amp;&amp; age &lt; 21";
+            let text = "age &gt; 72 &amp;&amp; age &lt; 21";
             criterion::black_box(unescape(text)).unwrap();
-            let text = b"&quot;what&apos;s that?&quot;";
+            let text = "&quot;what&apos;s that?&quot;";
             criterion::black_box(unescape(text)).unwrap();
         })
     });
 
     group.bench_function("mixed", |b| {
         let text =
-b"Lorem ipsum dolor sit amet, &amp;consectetur adipiscing elit, sed do eiusmod tempor incididunt
+"Lorem ipsum dolor sit amet, &amp;consectetur adipiscing elit, sed do eiusmod tempor incididunt
 ut labore et dolore magna aliqua. Hac habitasse platea dictumst vestibulum rhoncus est pellentesque.
 Risus ultricies &quot;tristique nulla aliquet enim tortor&quot; at. Fermentum odio eu feugiat pretium
 nibh ipsum. Volutpat sed cras ornare arcu dui. Scelerisque fermentum dui faucibus in ornare quam. Arcu

diff --git a/examples/custom_entities.rs b/examples/custom_entities.rs
@@ -28,15 +28,15 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     reader.trim_text(true);
 
     let mut buf = Vec::new();
-    let mut custom_entities: HashMap<Vec<u8>, String> = HashMap::new();
+    let mut custom_entities: HashMap<String, String> = HashMap::new();
     let entity_re = Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#)?;
 
     loop {
         match reader.read_event_into(&mut buf) {
             Ok(Event::DocType(ref e)) => {
                 for cap in entity_re.captures_iter(&e) {
                     custom_entities.insert(
-                        cap[1].to_vec(),
+                        reader.decoder().decode(&cap[1])?.into_owned(),
                         reader.decoder().decode(&cap[2])?.into_owned(),
                     );
                 }
@@ -51,6 +51,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
                                     custom_entities.get(ent).map(|s| s.as_str())
                                 })
                                 .unwrap()
+                                .into_owned()
                         })
                         .collect::<Vec<_>>();
                     println!("attributes values: {:?}", attributes);

diff --git a/src/de/escape.rs b/src/de/escape.rs
@@ -1,7 +1,9 @@
 //! Serde `Deserializer` module
 
 use crate::de::deserialize_bool;
-use crate::{errors::serialize::DeError, errors::Error, escape::unescape, reader::Decoder};
+use crate::errors::serialize::DeError;
+use crate::escape::unescape;
+use crate::reader::Decoder;
 use serde::de::{DeserializeSeed, EnumAccess, VariantAccess, Visitor};
 use serde::{self, forward_to_deserialize_any, serde_if_integer128};
 use std::borrow::Cow;
@@ -30,13 +32,6 @@ impl<'a> EscapedDeserializer<'a> {
             escaped,
         }
     }
-    fn unescaped(&self) -> Result<Cow<[u8]>, DeError> {
-        if self.escaped {
-            unescape(&self.escaped_value).map_err(|e| DeError::InvalidXml(Error::EscapeError(e)))
-        } else {
-            Ok(Cow::Borrowed(&self.escaped_value))
-        }
-    }
 }
 
 macro_rules! deserialize_num {
@@ -66,20 +61,31 @@ impl<'de, 'a> serde::Deserializer<'de> for EscapedDeserializer<'a> {
     where
         V: Visitor<'de>,
     {
-        let unescaped = self.unescaped()?;
-        let value = self.decoder.decode(&unescaped)?;
-
-        visitor.visit_str(&value)
+        let decoded = self.decoder.decode(&self.escaped_value)?;
+        if self.escaped {
+            match unescape(&decoded)? {
+                Cow::Borrowed(s) => visitor.visit_str(s),
+                Cow::Owned(s) => visitor.visit_string(s),
+            }
+        } else {
+            match decoded {
+                Cow::Borrowed(s) => visitor.visit_str(s),
+                Cow::Owned(s) => visitor.visit_string(s),
+            }
+        }
     }
 
-    fn deserialize_bytes<V>(self, visitor: V) -> Result<V::Value, Self::Error>
+    /// Returns [`DeError::Unsupported`]
+    fn deserialize_bytes<V>(self, _visitor: V) -> Result<V::Value, Self::Error>
     where
         V: Visitor<'de>,
     {
-        let v = self.unescaped()?;
-        visitor.visit_bytes(&v)
+        Err(DeError::Unsupported(
+            "binary data content is not supported by XML format",
+        ))
     }
 
+    /// Forwards deserialization to the [`Self::deserialize_bytes`]
     fn deserialize_byte_buf<V>(self, visitor: V) -> Result<V::Value, Self::Error>
     where
         V: Visitor<'de>,

diff --git a/src/de/map.rs b/src/de/map.rs
@@ -4,11 +4,10 @@ use crate::{
     de::escape::EscapedDeserializer,
     de::seq::{not_in, TagFilter},
     de::simple_type::SimpleTypeDeserializer,
-    de::{deserialize_bool, DeEvent, Deserializer, XmlRead, INNER_VALUE, UNFLATTEN_PREFIX},
+    de::{str2bool, DeEvent, Deserializer, XmlRead, INNER_VALUE, UNFLATTEN_PREFIX},
     errors::serialize::DeError,
     events::attributes::IterState,
-    events::{BytesCData, BytesStart},
-    reader::Decoder,
+    events::BytesStart,
 };
 use serde::de::{self, DeserializeSeed, IntoDeserializer, SeqAccess, Visitor};
 use serde::serde_if_integer128;
@@ -479,15 +478,9 @@ where
 {
     /// Returns a text event, used inside [`deserialize_primitives!()`]
     #[inline]
-    fn next_text(&mut self, unescape: bool) -> Result<BytesCData<'de>, DeError> {
+    fn next_text(&mut self, unescape: bool) -> Result<Cow<'de, str>, DeError> {
         self.map.de.next_text_impl(unescape, self.allow_start)
     }
-
-    /// Returns a decoder, used inside [`deserialize_primitives!()`]
-    #[inline]
-    fn decoder(&self) -> Decoder {
-        self.map.de.reader.decoder()
-    }
 }
 
 impl<'de, 'a, 'm, R> de::Deserializer<'de> for MapValueDeserializer<'de, 'a, 'm, R>
@@ -649,15 +642,9 @@ where
 {
     /// Returns a text event, used inside [`deserialize_primitives!()`]
     #[inline]
-    fn next_text(&mut self, unescape: bool) -> Result<BytesCData<'de>, DeError> {
+    fn next_text(&mut self, unescape: bool) -> Result<Cow<'de, str>, DeError> {
         self.map.de.next_text_impl(unescape, true)
     }
-
-    /// Returns a decoder, used inside [`deserialize_primitives!()`]
-    #[inline]
-    fn decoder(&self) -> Decoder {
-        self.map.de.reader.decoder()
-    }
 }
 
 impl<'de, 'a, 'm, R> de::Deserializer<'de> for SeqValueDeserializer<'de, 'a, 'm, R>

diff --git a/src/de/mod.rs b/src/de/mod.rs
@@ -117,8 +117,7 @@ macro_rules! deserialize_type {
         {
             // No need to unescape because valid integer representations cannot be escaped
             let text = self.next_text(false)?;
-            let string = text.decode(self.decoder())?;
-            visitor.$visit(string.parse()?)
+            visitor.$visit(text.parse()?)
         }
     };
 }
@@ -152,7 +151,7 @@ macro_rules! deserialize_primitives {
             // No need to unescape because valid boolean representations cannot be escaped
             let text = self.next_text(false)?;
 
-            deserialize_bool(text.as_ref(), self.decoder(), visitor)
+            str2bool(&text, visitor)
         }
 
         /// Representation of owned strings the same as [non-owned](#method.deserialize_str).
@@ -176,30 +175,26 @@ macro_rules! deserialize_primitives {
             V: Visitor<'de>,
         {
             let text = self.next_text(true)?;
-            let string = text.decode(self.decoder())?;
-            match string {
+            match text {
                 Cow::Borrowed(string) => visitor.visit_borrowed_str(string),
                 Cow::Owned(string) => visitor.visit_string(string),
             }
         }
 
-        fn deserialize_bytes<V>($($mut)? self, visitor: V) -> Result<V::Value, DeError>
+        /// Returns [`DeError::Unsupported`]
+        fn deserialize_bytes<V>(self, _visitor: V) -> Result<V::Value, DeError>
         where
             V: Visitor<'de>,
         {
-            // No need to unescape because bytes gives access to the raw XML input
-            let text = self.next_text(false)?;
-            visitor.visit_bytes(&text)
+            Err(DeError::Unsupported("binary data content is not supported by XML format"))
         }
 
-        fn deserialize_byte_buf<V>($($mut)? self, visitor: V) -> Result<V::Value, DeError>
+        /// Forwards deserialization to the [`deserialize_bytes`](#method.deserialize_bytes).
+        fn deserialize_byte_buf<V>(self, visitor: V) -> Result<V::Value, DeError>
         where
             V: Visitor<'de>,
         {
-            // No need to unescape because bytes gives access to the raw XML input
-            let text = self.next_text(false)?;
-            let value = text.into_inner().into_owned();
-            visitor.visit_byte_buf(value)
+            self.deserialize_bytes(visitor)
         }
 
         /// Identifiers represented as [strings](#method.deserialize_str).
@@ -576,7 +571,7 @@ where
     }
 
     #[inline]
-    fn next_text(&mut self, unescape: bool) -> Result<BytesCData<'de>, DeError> {
+    fn next_text(&mut self, unescape: bool) -> Result<Cow<'de, str>, DeError> {
         self.next_text_impl(unescape, true)
     }
 
@@ -616,25 +611,24 @@ where
         &mut self,
         unescape: bool,
         allow_start: bool,
-    ) -> Result<BytesCData<'de>, DeError> {
+    ) -> Result<Cow<'de, str>, DeError> {
+        let decoder = self.reader.decoder();
         match self.next()? {
-            DeEvent::Text(e) if unescape => e.unescape_as_cdata().map_err(Into::into),
-            DeEvent::Text(e) => Ok(BytesCData::new(e.into_inner())),
-            DeEvent::CData(e) => Ok(e),
+            DeEvent::Text(e) => Ok(e.decode(decoder, unescape)?),
+            DeEvent::CData(e) => Ok(e.decode(decoder)?),
             DeEvent::Start(e) if allow_start => {
                 // allow one nested level
                 let inner = self.next()?;
                 let t = match inner {
-                    DeEvent::Text(t) if unescape => t.unescape_as_cdata()?,
-                    DeEvent::Text(t) => BytesCData::new(t.into_inner()),
-                    DeEvent::CData(t) => t,
+                    DeEvent::Text(t) => t.decode(decoder, unescape)?,
+                    DeEvent::CData(t) => t.decode(decoder)?,
                     DeEvent::Start(s) => {
                         return Err(DeError::UnexpectedStart(s.name().as_ref().to_owned()))
                     }
                     // We can get End event in case of `<tag></tag>` or `<tag/>` input
                     // Return empty text in that case
                     DeEvent::End(end) if end.name() == e.name() => {
-                        return Ok(BytesCData::new(&[] as &[u8]));
+                        return Ok("".into());
                     }
                     DeEvent::End(end) => {
                         return Err(DeError::UnexpectedEnd(end.name().as_ref().to_owned()))
@@ -650,12 +644,6 @@ where
         }
     }
 
-    /// Returns a decoder, used inside `deserialize_primitives!()`
-    #[inline]
-    fn decoder(&self) -> Decoder {
-        self.reader.decoder()
-    }
-
     /// Drops all events until event with [name](BytesEnd::name()) `name` won't be
     /// dropped. This method should be called after [`Self::next()`]
     #[cfg(feature = "overlapped-lists")]