Skip to content

Commit 5755258

Browse files
wangrunji0408 authored and MingjiHan99 committed
perf(expr): further optimize performance (#744)
* optimize to string array
  Signed-off-by: Runji Wang <wangrunji0408@163.com>
* optimize bitvec
  Signed-off-by: Runji Wang <wangrunji0408@163.com>
* avoid zip_eq for performance
  Signed-off-by: Runji Wang <wangrunji0408@163.com>
* array: add `is_null` and `get_raw`
  Signed-off-by: Runji Wang <wangrunji0408@163.com>
* add bench for array filter
  Signed-off-by: Runji Wang <wangrunji0408@163.com>
* optimize filter -30%
  Signed-off-by: Runji Wang <wangrunji0408@163.com>
* optimize filter from bool array
  Signed-off-by: Runji Wang <wangrunji0408@163.com>
* clear null data
  Signed-off-by: Runji Wang <wangrunji0408@163.com>
* fix cardinality error
  Signed-off-by: Runji Wang <wangrunji0408@163.com>
* remove array iterator
  Signed-off-by: Runji Wang <wangrunji0408@163.com>
* introduce non-null iterator
  Signed-off-by: Runji Wang <wangrunji0408@163.com>
* optimize bitmap &&
  Signed-off-by: Runji Wang <wangrunji0408@163.com>
* optimize BitVec operations
  Signed-off-by: Runji Wang <wangrunji0408@163.com>
* fix clippy and test
  Signed-off-by: Runji Wang <wangrunji0408@163.com>

Signed-off-by: Runji Wang <wangrunji0408@163.com>
Signed-off-by: MingjiHan <mjhan@bu.edu>
1 parent a839ca0 commit 5755258

20 files changed

+287
-251
lines changed

benches/array.rs

+35-7
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,17 @@ fn ops(c: &mut Criterion) {
3939
}
4040

4141
for_all_size(c, "and(bool,bool)", |b, &size| {
42-
let a1: ArrayImpl = (0..size).map(|i| i % 2 == 0).collect::<BoolArray>().into();
43-
let a2: ArrayImpl = a1.clone();
42+
let a1: ArrayImpl = make_bool_array(size);
43+
let a2: ArrayImpl = make_bool_array(size);
4444
b.iter(|| a1.and(&a2));
4545
});
46+
for_all_size(c, "or(bool,bool)", |b, &size| {
47+
let a1: ArrayImpl = make_bool_array(size);
48+
let a2: ArrayImpl = make_bool_array(size);
49+
b.iter(|| a1.or(&a2));
50+
});
4651
for_all_size(c, "not(bool)", |b, &size| {
47-
let a1: ArrayImpl = (0..size).map(|i| i % 2 == 0).collect::<BoolArray>().into();
52+
let a1: ArrayImpl = make_bool_array(size);
4853
b.iter(|| a1.not());
4954
});
5055
}
@@ -85,9 +90,24 @@ fn cast(c: &mut Criterion) {
8590
let a1 = make_f64_array(size);
8691
b.iter(|| a1.cast(&DataTypeKind::Decimal(None, None)))
8792
});
88-
for_all_size(c, "cast(i32->string)", |b, &size| {
93+
for ty in ["i32", "f64", "decimal"] {
94+
for_all_size(c, format!("cast({ty}->string)"), |b, &size| {
95+
let a1 = match ty {
96+
"i32" => make_i32_array(size),
97+
"f64" => make_f64_array(size),
98+
"decimal" => make_decimal_array(size),
99+
_ => unreachable!(),
100+
};
101+
b.iter(|| a1.cast(&DataTypeKind::String))
102+
});
103+
}
104+
}
105+
106+
/// Criterion benchmark for filtering an i32 array.
///
/// For each size supplied by `for_all_size`, builds an i32 array and a bool array, then
/// measures `a1.filter(..)` with the bool array's `true_array()` as the visibility argument.
fn filter(c: &mut Criterion) {
107+
for_all_size(c, "filter(i32)", |b, &size| {
89108
let a1 = make_i32_array(size);
90-
b.iter(|| a1.cast(&DataTypeKind::String))
109+
// `make_bool_array` returns an `ArrayImpl`; unwrap the `Bool` variant to reach `true_array()`.
let ArrayImpl::Bool(a2) = make_bool_array(size) else { unreachable!() };
110+
b.iter(|| a1.filter(a2.true_array()))
91111
});
92112
}
93113

@@ -155,6 +175,14 @@ fn function(c: &mut Criterion) {
155175
}
156176
}
157177

178+
/// Builds a boolean `ArrayImpl` with `size` elements for the benchmarks.
///
/// The mask comes from `make_valid_bitmap(size)`. Where the mask bit is set the value is
/// `i % 2 == 0`; where it is clear the value is `false`. The same mask is passed to
/// `BoolArray::from_data` together with the values.
fn make_bool_array(size: usize) -> ArrayImpl {
179+
let mask = make_valid_bitmap(size);
180+
// NOTE(review): `size as i32` wraps for sizes above `i32::MAX`; the sizes listed in
// `for_all_size` are small, so this is presumably fine for benchmarks — confirm.
let iter = (0..size as i32)
181+
.zip(mask.clone())
182+
.map(|(i, v)| if v { i % 2 == 0 } else { false });
183+
BoolArray::from_data(iter, mask).into()
184+
}
185+
158186
fn make_i32_array(size: usize) -> ArrayImpl {
159187
let mask = make_valid_bitmap(size);
160188
let iter = (0..size as i32)
@@ -199,11 +227,11 @@ fn for_all_size(
199227
) {
200228
let mut group = c.benchmark_group(name);
201229
group.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic));
202-
for size in [1, 16, 256, 4096, 65536] {
230+
for size in [1, 16, 256, 4096] {
203231
group.bench_with_input(BenchmarkId::from_parameter(size), &size, &mut f);
204232
}
205233
group.finish();
206234
}
207235

208-
criterion_group!(benches, function, ops, agg, cast);
236+
criterion_group!(benches, function, ops, agg, cast, filter);
209237
criterion_main!(benches);

src/array/data_chunk.rs

+6-7
Original file line numberDiff line numberDiff line change
@@ -88,15 +88,14 @@ impl DataChunk {
8888
}
8989

9090
/// Filter elements and create a new chunk.
///
/// Every array is filtered with the same `visibility` slice. The resulting cardinality is the
/// length of the first filtered array, or the number of `true` entries in `visibility` when
/// the chunk has no arrays.
91-
pub fn filter(&self, visibility: impl Iterator<Item = bool> + Clone) -> Self {
92-
let arrays = self
93-
.arrays
94-
.iter()
95-
.map(|a| a.filter(visibility.clone()))
96-
.collect();
91+
pub fn filter(&self, visibility: &[bool]) -> Self {
92+
let arrays: Arc<[ArrayImpl]> = self.arrays.iter().map(|a| a.filter(visibility)).collect();
9793
DataChunk {
94+
// Take the row count from the filtered data itself; only count the flags when there
// is no array to ask.
cardinality: match arrays.first() {
95+
Some(a) => a.len(),
96+
None => visibility.iter().filter(|b| **b).count(),
97+
},
9898
arrays,
99-
cardinality: visibility.filter(|b| *b).count(),
10099
}
101100
}
102101

src/array/data_chunk_builder.rs

+2-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
use std::iter::IntoIterator;
22

3-
use itertools::Itertools;
4-
53
use super::{ArrayBuilderImpl, DataChunk};
64
use crate::types::{ConvertError, DataType, DataValue};
75

@@ -37,7 +35,7 @@ impl DataChunkBuilder {
3735
pub fn push_row(&mut self, row: impl IntoIterator<Item = DataValue>) -> Option<DataChunk> {
3836
self.array_builders
3937
.iter_mut()
40-
.zip_eq(row)
38+
.zip(row)
4139
.for_each(|(builder, v)| builder.push(&v));
4240
self.size += 1;
4341
if self.size == self.capacity {
@@ -60,7 +58,7 @@ impl DataChunkBuilder {
6058
&mut self,
6159
row: impl IntoIterator<Item = &'a str>,
6260
) -> Result<Option<DataChunk>, ConvertError> {
63-
for (builder, r) in self.array_builders.iter_mut().zip_eq(row) {
61+
for (builder, r) in self.array_builders.iter_mut().zip(row) {
6462
builder.push_str(r)?
6563
}
6664

src/array/iterator.rs

-93
This file was deleted.

src/array/mod.rs

+32-29
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
use std::convert::TryFrom;
44
use std::fmt::Debug;
5-
use std::iter::TrustedLen;
65
use std::ops::{Bound, RangeBounds};
76
use std::sync::Arc;
87

@@ -17,14 +16,12 @@ use crate::types::{
1716

1817
mod data_chunk;
1918
mod data_chunk_builder;
20-
mod iterator;
2119
pub mod ops;
2220
mod primitive_array;
2321
mod utf8_array;
2422

2523
pub use self::data_chunk::*;
2624
pub use self::data_chunk_builder::*;
27-
pub use self::iterator::ArrayIter;
2825
pub use self::primitive_array::*;
2926
pub use self::utf8_array::*;
3027

@@ -101,50 +98,56 @@ pub trait Array: Sized + Send + Sync + 'static {
10198
/// Type of element in the array.
10299
type Item: ToOwned + ?Sized;
103100

104-
type RawIter<'a>: Iterator<Item = &'a Self::Item> + TrustedLen;
101+
/// Returns true if the value at `idx` is null.
102+
fn is_null(&self, idx: usize) -> bool;
105103

106-
/// Retrieve a reference to value.
107-
fn get(&self, idx: usize) -> Option<&Self::Item>;
108-
109-
fn get_unchecked(&self, idx: usize) -> &Self::Item;
104+
/// Returns the raw value at `idx` regardless of null.
105+
fn get_raw(&self, idx: usize) -> &Self::Item;
110106

111107
/// Number of items of array.
112108
fn len(&self) -> usize;
113109

110+
/// Retrieve a reference to value.
///
/// Default implementation: returns `None` when `is_null(idx)` is true, otherwise wraps
/// `get_raw(idx)` in `Some`.
111+
fn get(&self, idx: usize) -> Option<&Self::Item> {
112+
if self.is_null(idx) {
113+
None
114+
} else {
115+
Some(self.get_raw(idx))
116+
}
117+
}
118+
119+
/// Filter the elements and return a new array.
///
/// NOTE(review): presumably keeps the elements whose flag in `p` is `true`, as the removed
/// `ArrayExt::filter` did — confirm against the implementations.
fn filter(&self, p: &[bool]) -> Self;
120+
114121
/// Get iterator of current array.
///
/// Yields `self.get(i)` for every index in `0..self.len()`.
115-
fn iter(&self) -> ArrayIter<'_, Self> {
116-
ArrayIter::new(self)
122+
fn iter(&self) -> impl DoubleEndedIterator<Item = Option<&Self::Item>> {
123+
(0..self.len()).map(|i| self.get(i))
124+
}
125+
126+
/// Get iterator over the raw values.
///
/// Yields `self.get_raw(i)` for every index in `0..self.len()` without consulting `is_null`.
127+
fn raw_iter(&self) -> impl DoubleEndedIterator<Item = &Self::Item> {
128+
(0..self.len()).map(|i| self.get_raw(i))
129+
}
130+
131+
/// Get iterator over the non-null values.
///
/// Skips every index for which `is_null` returns true and yields `get_raw` for the rest.
132+
fn nonnull_iter(&self) -> impl DoubleEndedIterator<Item = &Self::Item> {
133+
(0..self.len())
134+
.filter(|i| !self.is_null(*i))
135+
.map(|i| self.get_raw(i))
117136
}
118137

119138
/// Check if `Array` is empty.
120139
fn is_empty(&self) -> bool {
121140
self.len() == 0
122141
}
123-
124-
fn raw_iter(&self) -> Self::RawIter<'_>;
125142
}
126143

127144
/// An extension trait for [`Array`].
128145
pub trait ArrayExt: Array {
129-
/// Filter the elements and return a new array.
130-
fn filter(&self, visibility: impl Iterator<Item = bool>) -> Self;
131-
132146
/// Return a slice of self for the provided range.
133147
fn slice(&self, range: impl RangeBounds<usize>) -> Self;
134148
}
135149

136150
impl<A: Array> ArrayExt for A {
137-
/// Filter the elements and return a new array.
138-
fn filter(&self, visibility: impl Iterator<Item = bool>) -> Self {
139-
let mut builder = Self::Builder::with_capacity(self.len());
140-
for (a, visible) in self.iter().zip(visibility) {
141-
if visible {
142-
builder.push(a);
143-
}
144-
}
145-
builder.finish()
146-
}
147-
148151
/// Return a slice of self for the provided range.
149152
fn slice(&self, range: impl RangeBounds<usize>) -> Self {
150153
let len = self.len();
@@ -547,11 +550,11 @@ macro_rules! impl_array {
547550
}
548551

549552
/// Filter the elements and return a new array.
///
/// Dispatches on the variant, filters the inner array with `visibility`, and wraps the
/// result back in the same variant.
550-
pub fn filter(&self, visibility: impl Iterator<Item = bool>) -> Self {
553+
pub fn filter(&self, visibility: &[bool]) -> Self {
551554
match self {
552-
Self::Null(a) => Self::Null(a.filter(visibility).into()),
555+
// NOTE(review): `visibility` is already `&[bool]`, so the extra `&` here and in the
// repeated arm below looks redundant (clippy `needless_borrow`) — confirm.
Self::Null(a) => Self::Null(a.filter(&visibility).into()),
553556
$(
554-
Self::$Abc(a) => Self::$Abc(a.filter(visibility).into()),
557+
Self::$Abc(a) => Self::$Abc(a.filter(&visibility).into()),
555558
)*
556559
}
557560
}

0 commit comments

Comments
 (0)