Skip to content

Commit ddacb31

Browse files
committed
added EntityResolver to Deserializer
1 parent 2e9123a commit ddacb31

File tree

5 files changed

+71
-13
lines changed

5 files changed

+71
-13
lines changed

src/de/mod.rs

+48-10
Original file line numberDiff line numberDiff line change
@@ -1842,7 +1842,8 @@ use crate::{
18421842
errors::Error,
18431843
events::{BytesCData, BytesEnd, BytesStart, BytesText, Event},
18441844
name::QName,
1845-
reader::Reader,
1845+
reader::Reader,
1846+
resolver::{EntityResolver, DefaultEntityResolver},
18461847
};
18471848
use serde::de::{self, Deserialize, DeserializeOwned, DeserializeSeed, SeqAccess, Visitor};
18481849
use std::borrow::Cow;
@@ -1954,26 +1955,34 @@ impl<'a> PayloadEvent<'a> {
19541955
}
19551956
}
19561957

1958+
19571959
/// An intermediate reader that consumes [`PayloadEvent`]s and produces final [`DeEvent`]s.
19581960
/// [`PayloadEvent::Text`] events that are followed by any event except
19591961
/// [`PayloadEvent::Text`] or [`PayloadEvent::CData`], are trimmed from the end.
1960-
struct XmlReader<'i, R: XmlRead<'i>> {
1962+
struct XmlReader<'i, R: XmlRead<'i>, E: EntityResolver<'i> = DefaultEntityResolver> {
19611963
/// A source of low-level XML events
19621964
reader: R,
19631965
/// Intermediate event, that could be returned by the next call to `next()`.
19641966
/// If that is the `Text` event, then its leading spaces are already trimmed, but
19651967
/// its trailing spaces are not. Before the event is returned, trimming of
19661968
/// the trailing spaces could be necessary
19671969
lookahead: Result<PayloadEvent<'i>, DeError>,
1970+
1971+
entity_resolver: E
19681972
}
19691973

1970-
impl<'i, R: XmlRead<'i>> XmlReader<'i, R> {
1971-
fn new(mut reader: R) -> Self {
1974+
impl<'i, R: XmlRead<'i>, E: EntityResolver<'i>> XmlReader<'i, R, E> {
1975+
fn new(reader: R) -> Self
1976+
where E: Default {
1977+
Self::with_resolver(reader, E::default())
1978+
}
1979+
1980+
fn with_resolver(mut reader: R, entity_resolver: E) -> Self {
19721981
// Lookahead by one event immediately, so we do not need to check in the
19731982
// loop if we need lookahead or not
19741983
let lookahead = reader.next();
19751984

1976-
Self { reader, lookahead }
1985+
Self { reader, lookahead, entity_resolver }
19771986
}
19781987

19791988
/// Read next event and put it in lookahead, return the current lookahead
@@ -2029,7 +2038,7 @@ impl<'i, R: XmlRead<'i>> XmlReader<'i, R> {
20292038
if self.need_trim_end() {
20302039
e.inplace_trim_end();
20312040
}
2032-
Ok(e.unescape()?)
2041+
Ok(e.unescape_with(|ent: &str| self.entity_resolver.resolve_entity(ent))?)
20332042
}
20342043
PayloadEvent::CData(e) => Ok(e.decode()?),
20352044

@@ -2167,12 +2176,12 @@ where
21672176
////////////////////////////////////////////////////////////////////////////////////////////////////
21682177

21692178
/// A structure that deserializes XML into Rust values.
2170-
pub struct Deserializer<'de, R>
2179+
pub struct Deserializer<'de, R, S: EntityResolver<'de> = DefaultEntityResolver>
21712180
where
21722181
R: XmlRead<'de>,
21732182
{
21742183
/// An XML reader that streams events into this deserializer
2175-
reader: XmlReader<'de, R>,
2184+
reader: XmlReader<'de, R, S>,
21762185

21772186
/// When deserializing sequences sometimes we have to skip unwanted events.
21782187
/// Those events should be stored and then replayed. This is a replay buffer,
@@ -2557,17 +2566,46 @@ where
25572566
/// instead, because it will borrow instead of copy. If you have `&[u8]` which
25582567
/// is known to represent UTF-8, you can decode it first before using [`from_str`].
25592568
pub fn from_reader(reader: R) -> Self {
2569+
Self::with_resolver(reader, DefaultEntityResolver)
2570+
}
2571+
}
2572+
2573+
2574+
impl<'de, R, E: EntityResolver<'de>> Deserializer<'de, IoReader<R>, E>
2575+
where
2576+
R: BufRead,
2577+
{
2578+
/// Create new deserializer that resolves entities in text with the given `entity_resolver` and copies data from the specified reader
2579+
/// into internal buffer. If you already have a string use [`Self::from_str`]
2580+
/// instead, because it will borrow instead of copy. If you have `&[u8]` which
2581+
/// is known to represent UTF-8, you can decode it first before using [`from_str`].
2582+
pub fn with_resolver(reader: R, entity_resolver: E) -> Self {
25602583
let mut reader = Reader::from_reader(reader);
25612584
reader.expand_empty_elements(true).check_end_names(true);
25622585

2563-
Self::new(IoReader {
2586+
let io_reader = IoReader {
25642587
reader,
25652588
start_trimmer: StartTrimmer::default(),
25662589
buf: Vec::new(),
2567-
})
2590+
};
2591+
2592+
Self {
2593+
reader: XmlReader::with_resolver(io_reader, entity_resolver),
2594+
2595+
#[cfg(feature = "overlapped-lists")]
2596+
read: VecDeque::new(),
2597+
#[cfg(feature = "overlapped-lists")]
2598+
write: VecDeque::new(),
2599+
#[cfg(feature = "overlapped-lists")]
2600+
limit: None,
2601+
2602+
#[cfg(not(feature = "overlapped-lists"))]
2603+
peek: None,
2604+
}
25682605
}
25692606
}
25702607

2608+
25712609
impl<'de, 'a, R> de::Deserializer<'de> for &'a mut Deserializer<'de, R>
25722610
where
25732611
R: XmlRead<'de>,

src/escapei.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -159,11 +159,11 @@ pub fn unescape(raw: &str) -> Result<Cow<str>, EscapeError> {
159159
/// [HTML5 escapes]: https://dev.w3.org/html5/html-author/charref
160160
pub fn unescape_with<'input, 'entity, F>(
161161
raw: &'input str,
162-
resolve_entity: F,
162+
mut resolve_entity: F,
163163
) -> Result<Cow<'input, str>, EscapeError>
164164
where
165165
// the lifetime of the output comes from a capture or is `'static`
166-
F: Fn(&str) -> Option<&'entity str>,
166+
F: FnMut(&str) -> Option<&'entity str>,
167167
{
168168
let bytes = raw.as_bytes();
169169
let mut unescaped = None;

src/events/mod.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -740,7 +740,7 @@ impl<'a> BytesText<'a> {
740740
/// non-UTF-8 encoding.
741741
pub fn unescape_with<'entity>(
742742
&self,
743-
resolve_entity: impl Fn(&str) -> Option<&'entity str>,
743+
resolve_entity: impl FnMut(&str) -> Option<&'entity str>,
744744
) -> Result<Cow<'a, str>> {
745745
let decoded = match &self.content {
746746
Cow::Borrowed(bytes) => self.decoder.decode(bytes)?,

src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
#[cfg(feature = "serialize")]
5252
pub mod de;
5353
pub mod encoding;
54+
pub mod resolver;
5455
mod errors;
5556
mod escapei;
5657
pub mod escape {

src/resolver.rs

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
//! Entity resolver module
2+
//!
3+
4+
/// Used to resolve unknown entities while parsing
5+
pub trait EntityResolver<'entity_out> {
6+
/// Called when an entity needs to be resolved.
7+
/// Returns `None` if a suitable value cannot be found.
8+
fn resolve_entity(&mut self, entity: &str) -> Option<&'entity_out str>;
9+
}
10+
11+
/// An EntityResolver that always returns None.
12+
#[derive(Default, Copy, Clone)]
13+
pub struct DefaultEntityResolver;
14+
15+
impl<'entity_out> EntityResolver<'entity_out> for DefaultEntityResolver{
16+
fn resolve_entity(&mut self, _: &str) -> Option<&'entity_out str> {
17+
None
18+
}
19+
}

0 commit comments

Comments
 (0)