Skip to content

Commit bf03e44

Browse files
Merge pull request #90 from Kijewski/pr-faster-skip-till
parser: use `memchr` to speed-up `skip_till()`
2 parents 732d6b4 + 7b99783 commit bf03e44

File tree

5 files changed

+153
-16
lines changed

5 files changed

+153
-16
lines changed

rinja_parser/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ edition = "2021"
1414
rust-version = "1.71"
1515

1616
[dependencies]
17+
memchr = "2"
1718
nom = { version = "7", default-features = false, features = ["alloc"] }
1819

1920
[dev-dependencies]

rinja_parser/src/lib.rs

+12-11
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ use nom::{error_position, AsChar, InputTakeAtPosition};
2020

2121
pub mod expr;
2222
pub use expr::{Expr, Filter};
23+
mod memchr_splitter;
2324
pub mod node;
2425
pub use node::Node;
2526

@@ -362,22 +363,22 @@ fn ws<'a, O>(
362363

363364
/// Skips input until `end` is found, but does not consume it.
364365
/// Returns the tuple that would be returned when parsing `end`.
365-
fn skip_till<'a, O>(
366+
fn skip_till<'a, 'b, O>(
367+
candidate_finder: impl crate::memchr_splitter::Splitter,
366368
end: impl FnMut(&'a str) -> ParseResult<'a, O>,
367369
) -> impl FnMut(&'a str) -> ParseResult<'a, (&'a str, O)> {
368-
enum Next<O> {
369-
IsEnd(O),
370-
NotEnd,
371-
}
372-
let mut next = alt((map(end, Next::IsEnd), map(anychar, |_| Next::NotEnd)));
370+
let mut next = alt((map(end, Some), map(anychar, |_| None)));
373371
move |start: &'a str| {
374372
let mut i = start;
375373
loop {
376-
let (j, is_end) = next(i)?;
377-
match is_end {
378-
Next::IsEnd(lookahead) => return Ok((i, (j, lookahead))),
379-
Next::NotEnd => i = j,
380-
}
374+
i = match candidate_finder.split(i) {
375+
Some((_, j)) => j,
376+
None => return Err(nom::Err::Error(ErrorContext::new("`end` not found`", i))),
377+
};
378+
i = match next(i)? {
379+
(j, Some(lookahead)) => return Ok((i, (j, lookahead))),
380+
(j, None) => j,
381+
};
381382
}
382383
}
383384
}

rinja_parser/src/memchr_splitter.rs

+101
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
pub(crate) trait Splitter: Copy {
2+
/// If any of the needles was found in the haystack, then split the haystack at the first hit.
3+
///
4+
/// Since only the first byte of a needle is inspected, be aware that there can be
5+
/// false-positives. Always check that the second string of the output actually starts with
6+
/// the expected needle.
7+
fn split<'a>(&self, haystack: &'a str) -> Option<(&'a str, &'a str)>;
8+
}
9+
10+
impl<T: Splitter + ?Sized> Splitter for &T {
11+
#[inline]
12+
fn split<'a>(&self, haystack: &'a str) -> Option<(&'a str, &'a str)> {
13+
T::split(self, haystack)
14+
}
15+
}
16+
17+
// define and implement a string splitter using memchr
18+
macro_rules! new_memchr_type {
19+
($struct:ident $split_unchecked:ident $memchr:ident $($field:ident)*) => {
20+
#[derive(Debug, Clone, Copy)]
21+
pub(crate) struct $struct {
22+
$($field: u8,)*
23+
}
24+
25+
impl $struct {
26+
#[track_caller]
27+
pub(crate) fn new($($field: &str),*) -> Self {
28+
Self {
29+
$($field: $field.as_bytes()[0],)*
30+
}
31+
}
32+
33+
#[inline]
34+
pub(crate) fn split<'a>(&self, haystack: &'a str) -> Option<(&'a str, &'a str)> {
35+
// SAFETY: During the construction of `self` we used strings as inputs, and the first
36+
// byte of a string is never a UTF-8 continuation byte, so each hit is at a char boundary.
37+
unsafe { $split_unchecked($(self.$field,)* haystack) }
38+
}
39+
}
40+
41+
impl Splitter for $struct {
42+
#[inline]
43+
fn split<'a>(&self, haystack: &'a str) -> Option<(&'a str, &'a str)> {
44+
self.split(haystack)
45+
}
46+
}
47+
48+
/// SAFETY: caller has to ensure that the needle is at a char boundary
49+
pub(crate) unsafe fn $split_unchecked(
50+
$($field: u8,)*
51+
haystack: &str,
52+
) -> Option<(&str, &str)> {
53+
let idx = memchr::$memchr($($field,)* haystack.as_bytes())?;
54+
// SAFETY: The caller ensures that the needles are at char boundary.
55+
// The found index `< haystack.len()`.
56+
Some((haystack.get_unchecked(..idx), haystack.get_unchecked(idx..)))
57+
}
58+
};
59+
}
60+
61+
new_memchr_type!(Splitter1 split1_unchecked memchr a);
62+
new_memchr_type!(Splitter2 split2_unchecked memchr2 a b);
63+
new_memchr_type!(Splitter3 split3_unchecked memchr3 a b c);
64+
65+
#[test]
66+
fn candidate_finder() {
67+
assert_eq!(
68+
Splitter1::new("test").split("abctefg"),
69+
Some(("abc", "tefg")),
70+
);
71+
assert_eq!(Splitter1::new("xyz").split("abctefg"), None);
72+
73+
assert_eq!(
74+
Splitter2::new("xyz", "foo").split("abctefg"),
75+
Some(("abcte", "fg")),
76+
);
77+
assert_eq!(Splitter2::new("oof", "xyz").split("abctefg"), None);
78+
79+
assert_eq!(
80+
Splitter3::new("oof", "apples", "xyz").split("abctefg"),
81+
Some(("", "abctefg")),
82+
);
83+
assert_eq!(
84+
Splitter3::new("oof", "peaches", "xyz").split("abctefg"),
85+
None
86+
);
87+
88+
assert_eq!(
89+
Splitter3::new("test", "test", "test").split("abctefg"),
90+
Some(("abc", "tefg")),
91+
);
92+
93+
assert_eq!(
94+
Splitter3::new("🧚‍♀️Life", "😀Laugh", "😻Love")
95+
.split("sed diam nonumy eirmod tempor 🧚‍♀️Life ut labore et dolore magna aliquyam"),
96+
Some((
97+
"sed diam nonumy eirmod tempor ",
98+
"🧚‍♀️Life ut labore et dolore magna aliquyam"
99+
)),
100+
);
101+
}

rinja_parser/src/node.rs

+12-4
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ use nom::error_position;
99
use nom::multi::{many0, many1, separated_list0};
1010
use nom::sequence::{delimited, pair, preceded, tuple};
1111

12+
use crate::memchr_splitter::{Splitter1, Splitter2, Splitter3};
1213
use crate::{
1314
filter, identifier, is_ws, keyword, not_ws, skip_till, str_lit, ws, ErrorContext, Expr, Filter,
1415
ParseResult, State, Target, WithSpan,
@@ -755,14 +756,20 @@ pub struct Lit<'a> {
755756
impl<'a> Lit<'a> {
756757
fn parse(i: &'a str, s: &State<'_>) -> ParseResult<'a, WithSpan<'a, Self>> {
757758
let start = i;
759+
let (i, _) = not(eof)(i)?;
760+
761+
let candidate_finder = Splitter3::new(
762+
s.syntax.block_start,
763+
s.syntax.comment_start,
764+
s.syntax.expr_start,
765+
);
758766
let p_start = alt((
759767
tag(s.syntax.block_start),
760768
tag(s.syntax.comment_start),
761769
tag(s.syntax.expr_start),
762770
));
763771

764-
let (i, _) = not(eof)(i)?;
765-
let (i, content) = opt(recognize(skip_till(p_start)))(i)?;
772+
let (i, content) = opt(recognize(skip_till(candidate_finder, p_start)))(i)?;
766773
let (i, content) = match content {
767774
Some("") => {
768775
// {block,comment,expr}_start follows immediately.
@@ -810,7 +817,7 @@ impl<'a> Raw<'a> {
810817
cut(tuple((
811818
opt(Whitespace::parse),
812819
|i| s.tag_block_end(i),
813-
consumed(skip_till(endraw)),
820+
consumed(skip_till(Splitter1::new(s.syntax.block_start), endraw)),
814821
))),
815822
));
816823

@@ -989,7 +996,8 @@ impl<'a> Comment<'a> {
989996
let mut depth = 0usize;
990997
loop {
991998
let start = i;
992-
let (_, tag) = opt(skip_till(|i| tag(i, s)))(i)?;
999+
let splitter = Splitter2::new(s.syntax.comment_start, s.syntax.comment_end);
1000+
let (_, tag) = opt(skip_till(splitter, |i| tag(i, s)))(i)?;
9931001
let Some((j, tag)) = tag else {
9941002
return Err(
9951003
ErrorContext::unclosed("comment", s.syntax.comment_end, start).into(),

rinja_parser/src/tests.rs

+27-1
Original file line numberDiff line numberDiff line change
@@ -371,10 +371,36 @@ fn change_delimiters_parse_filter() {
371371
expr_end: "=}",
372372
..Syntax::default()
373373
};
374-
375374
Ast::from_str("{= strvar|e =}", None, &syntax).unwrap();
376375
}
377376

377+
#[test]
378+
fn unicode_delimiters_in_syntax() {
379+
let syntax = Syntax {
380+
expr_start: "🖎", // U+1F58E == b"\xf0\x9f\x96\x8e"
381+
expr_end: "✍", // U+270D == b"\xe2\x9c\x8d"
382+
..Syntax::default()
383+
};
384+
assert_eq!(
385+
Ast::from_str("Here comes the expression: 🖎 e ✍.", None, &syntax)
386+
.unwrap()
387+
.nodes(),
388+
[
389+
Node::Lit(WithSpan::no_span(Lit {
390+
lws: "",
391+
val: "Here comes the expression:",
392+
rws: " ",
393+
})),
394+
Node::Expr(Ws(None, None), WithSpan::no_span(Expr::Var("e")),),
395+
Node::Lit(WithSpan::no_span(Lit {
396+
lws: "",
397+
val: ".",
398+
rws: "",
399+
})),
400+
],
401+
);
402+
}
403+
378404
#[test]
379405
fn test_precedence() {
380406
let syntax = Syntax::default();

0 commit comments

Comments
 (0)