From 529722118b2d2d63d4c2437e3f6407ef05736efc Mon Sep 17 00:00:00 2001 From: Jesse Szwedko Date: Fri, 7 Jul 2023 13:34:41 -0700 Subject: [PATCH 1/2] chore(deps): Swap out bloom crate for bloomy Signed-off-by: Jesse Szwedko --- Cargo.lock | 22 +++++------- Cargo.toml | 4 +-- LICENSE-3rdparty.csv | 2 +- .../tag_cardinality_limit/config.rs | 4 +-- .../tag_cardinality_limit/tag_value_set.rs | 34 ++++++++----------- src/transforms/tag_cardinality_limit/tests.rs | 4 +-- 6 files changed, 30 insertions(+), 40 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5f4c8eb541bc5..d48b0d907f777 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1384,15 +1384,9 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" dependencies = [ - "bit-vec 0.6.3", + "bit-vec", ] -[[package]] -name = "bit-vec" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02b4ff8b16e6076c3e14220b39fbc1fabb6737522281a388998046859400895f" - [[package]] name = "bit-vec" version = "0.6.3" @@ -1475,12 +1469,12 @@ dependencies = [ ] [[package]] -name = "bloom" -version = "0.3.2" +name = "bloomy" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d00ac8e5056d6d65376a3c1aa5c7c34850d6949ace17f0266953a254eb3d6fe8" +checksum = "489d2af57852b78a86478273ac6a1ef912061b6af3a439694c49f309f6ea3bdd" dependencies = [ - "bit-vec 0.4.4", + "siphasher", ] [[package]] @@ -2926,7 +2920,7 @@ dependencies = [ "once_cell", "proc-macro2 1.0.63", "quote 1.0.29", - "syn 2.0.10", + "syn 2.0.23", ] [[package]] @@ -6402,7 +6396,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6eb95b2e36b92d3e0536be87eaf7accb17db39f5a44452759b43f1328e82dc9" dependencies = [ "async-trait", - "bit-vec 0.6.3", + "bit-vec", "bytes 1.4.0", "chrono", "crc", @@ -9221,7 +9215,7 @@ dependencies = [ "azure_storage", "azure_storage_blobs", "base64 0.21.2", - "bloom", + "bloomy", "bollard", "bytes 1.4.0", "bytesize", diff --git a/Cargo.toml b/Cargo.toml index b2bf3eccbe315..e842e7a218127 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -239,7 +239,7 @@ async-compression = { version = "0.4.0", default-features = false, features = [" apache-avro = { version = "0.14.0", default-features = false, optional = true } axum = { version = "0.6.18", default-features = false } base64 = { version = "0.21.2", default-features = false, optional = true } -bloom = { version = "0.3.2", default-features = false, optional = true } +bloomy = { version = "1.2.0", default-features = false, optional = true } bollard = { version = "0.14.0", default-features = false, features = ["ssl", "chrono"], optional = true } bytes = { version = "1.4.0", default-features = false, features = ["serde"] } bytesize = { version = "1.2.0", default-features = false } @@ -593,7 +593,7 @@ transforms-reduce = [] transforms-remap = [] transforms-route = [] transforms-sample = [] -transforms-tag_cardinality_limit = ["dep:bloom", "dep:hashbrown"] +transforms-tag_cardinality_limit = ["dep:bloomy", "dep:hashbrown"] transforms-throttle = ["dep:governor"] # Sinks diff --git a/LICENSE-3rdparty.csv b/LICENSE-3rdparty.csv index dcd241ccc2c5b..e9da5a9d59349 100644 --- a/LICENSE-3rdparty.csv +++ b/LICENSE-3rdparty.csv @@ -87,7 +87,7 @@ bitvec,https://github.com/bitvecto-rs/bitvec,MIT,The bitvec Authors block-buffer,https://github.com/RustCrypto/utils,MIT OR Apache-2.0,RustCrypto Developers block-padding,https://github.com/RustCrypto/utils,MIT OR Apache-2.0,RustCrypto Developers blocking,https://github.com/smol-rs/blocking,Apache-2.0 OR MIT,Stjepan Glavina -bloom,https://github.com/nicklan/bloom-rs,GPL-2.0,Nick Lanham +bloomy,https://docs.rs/bloomy/,MIT,"Aleksandr Bezobchuk , Alexis Sellier " bollard,https://github.com/fussybeaver/bollard,Apache-2.0,Bollard contributors borsh,https://github.com/near/borsh-rs,MIT OR Apache-2.0,Near Inc borsh-derive,https://github.com/nearprotocol/borsh,Apache-2.0,Near Inc diff --git a/src/transforms/tag_cardinality_limit/config.rs b/src/transforms/tag_cardinality_limit/config.rs index e3dbe992e0fcc..4ea1833fb216d 100644 --- a/src/transforms/tag_cardinality_limit/config.rs +++ b/src/transforms/tag_cardinality_limit/config.rs @@ -18,7 +18,7 @@ use vector_core::config::LogNamespace; pub struct TagCardinalityLimitConfig { /// How many distinct values to accept for any given key. #[serde(default = "default_value_limit")] - pub value_limit: u32, + pub value_limit: usize, #[configurable(derived)] #[serde(default = "default_limit_exceeded_action")] @@ -81,7 +81,7 @@ const fn default_limit_exceeded_action() -> LimitExceededAction { LimitExceededAction::DropTag } -const fn default_value_limit() -> u32 { +const fn default_value_limit() -> usize { 500 } diff --git a/src/transforms/tag_cardinality_limit/tag_value_set.rs b/src/transforms/tag_cardinality_limit/tag_value_set.rs index 599a021df110b..e12454945e788 100644 --- a/src/transforms/tag_cardinality_limit/tag_value_set.rs +++ b/src/transforms/tag_cardinality_limit/tag_value_set.rs @@ -1,6 +1,6 @@ use crate::event::metric::TagValueSet; use crate::transforms::tag_cardinality_limit::config::Mode; -use bloom::{BloomFilter, ASMS}; +use bloomy::BloomFilter; use std::collections::HashSet; use std::fmt; @@ -8,12 +8,11 @@ use std::fmt; #[derive(Debug)] pub struct AcceptedTagValueSet { storage: TagValueSetStorage, - num_elements: usize, } enum TagValueSetStorage { Set(HashSet), - Bloom(BloomFilter), + Bloom(BloomFilter), } impl fmt::Debug for TagValueSetStorage { @@ -26,19 +25,15 @@ impl fmt::Debug for TagValueSetStorage { } impl AcceptedTagValueSet { - pub fn new(value_limit: u32, mode: &Mode) -> Self { + pub fn new(value_limit: usize, mode: &Mode) -> Self { let storage = match &mode { Mode::Exact => TagValueSetStorage::Set(HashSet::with_capacity(value_limit as usize)), Mode::Probabilistic(config) => { let num_bits = config.cache_size_per_key / 8; // Convert bytes to bits - let num_hashes = bloom::optimal_num_hashes(num_bits, value_limit); - TagValueSetStorage::Bloom(BloomFilter::with_size(num_bits, num_hashes)) + TagValueSetStorage::Bloom(BloomFilter::with_size(num_bits)) } }; - Self { - storage, - num_elements: 0, - } + Self { storage } } pub fn contains(&self, value: &TagValueSet) -> bool { @@ -48,18 +43,19 @@ impl AcceptedTagValueSet { } } - pub const fn len(&self) -> usize { - self.num_elements + pub fn len(&self) -> usize { + match &self.storage { + TagValueSetStorage::Set(set) => set.len(), + TagValueSetStorage::Bloom(bloom) => bloom.count(), + } } - pub fn insert(&mut self, value: TagValueSet) -> bool { - let inserted = match &mut self.storage { - TagValueSetStorage::Set(set) => set.insert(value), + pub fn insert(&mut self, value: TagValueSet) { + match &mut self.storage { + TagValueSetStorage::Set(set) => { + set.insert(value); + } TagValueSetStorage::Bloom(bloom) => bloom.insert(&value), }; - if inserted { - self.num_elements += 1 - } - inserted } } diff --git a/src/transforms/tag_cardinality_limit/tests.rs b/src/transforms/tag_cardinality_limit/tests.rs index 5753d0176dd3b..7e386a6403ab8 100644 --- a/src/transforms/tag_cardinality_limit/tests.rs +++ b/src/transforms/tag_cardinality_limit/tests.rs @@ -35,7 +35,7 @@ fn make_metric(tags: MetricTags) -> Event { } const fn make_transform_hashset( - value_limit: u32, + value_limit: usize, limit_exceeded_action: LimitExceededAction, ) -> TagCardinalityLimitConfig { TagCardinalityLimitConfig { @@ -46,7 +46,7 @@ const fn make_transform_hashset( } const fn make_transform_bloom( - value_limit: u32, + value_limit: usize, limit_exceeded_action: LimitExceededAction, ) -> TagCardinalityLimitConfig { TagCardinalityLimitConfig { From 99e1514f97df828dc28a6ce53cb92522d382fe1c Mon Sep 17 00:00:00 2001 From: Jesse Szwedko Date: Fri, 7 Jul 2023 13:46:40 -0700 Subject: [PATCH 2/2] clippy Signed-off-by: Jesse Szwedko --- src/transforms/tag_cardinality_limit/mod.rs | 6 +++--- src/transforms/tag_cardinality_limit/tag_value_set.rs | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transforms/tag_cardinality_limit/mod.rs b/src/transforms/tag_cardinality_limit/mod.rs index 42caf15304cc0..14b18b457a6c0 100644 --- a/src/transforms/tag_cardinality_limit/mod.rs +++ b/src/transforms/tag_cardinality_limit/mod.rs @@ -55,11 +55,11 @@ impl TagCardinalityLimit { } // Tag value not yet part of the accepted set. - if tag_value_set.len() < self.config.value_limit as usize { + if tag_value_set.len() < self.config.value_limit { // accept the new value tag_value_set.insert(value.clone()); - if tag_value_set.len() == self.config.value_limit as usize { + if tag_value_set.len() == self.config.value_limit { emit!(TagCardinalityValueLimitReached { key }); } @@ -76,7 +76,7 @@ impl TagCardinalityLimit { self.accepted_tags .get(key) .map(|value_set| { - !value_set.contains(value) && value_set.len() >= self.config.value_limit as usize + !value_set.contains(value) && value_set.len() >= self.config.value_limit }) .unwrap_or(false) } diff --git a/src/transforms/tag_cardinality_limit/tag_value_set.rs b/src/transforms/tag_cardinality_limit/tag_value_set.rs index e12454945e788..0296fa0cf0c68 100644 --- a/src/transforms/tag_cardinality_limit/tag_value_set.rs +++ b/src/transforms/tag_cardinality_limit/tag_value_set.rs @@ -27,7 +27,7 @@ impl fmt::Debug for TagValueSetStorage { impl AcceptedTagValueSet { pub fn new(value_limit: usize, mode: &Mode) -> Self { let storage = match &mode { - Mode::Exact => TagValueSetStorage::Set(HashSet::with_capacity(value_limit as usize)), + Mode::Exact => TagValueSetStorage::Set(HashSet::with_capacity(value_limit)), Mode::Probabilistic(config) => { let num_bits = config.cache_size_per_key / 8; // Convert bytes to bits TagValueSetStorage::Bloom(BloomFilter::with_size(num_bits)) @@ -39,7 +39,7 @@ impl AcceptedTagValueSet { pub fn contains(&self, value: &TagValueSet) -> bool { match &self.storage { TagValueSetStorage::Set(set) => set.contains(value), - TagValueSetStorage::Bloom(bloom) => bloom.contains(&value), + TagValueSetStorage::Bloom(bloom) => bloom.contains(value), } }