Skip to content

Commit 35ef707

Browse files
Attribute filtering during export (#5320)
#3632
1 parent fba9e1e commit 35ef707

File tree

10 files changed

+119
-75
lines changed

10 files changed

+119
-75
lines changed

components/collator/src/provider.rs

+4
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ fn data_ce_to_primary(data_ce: u64, c: char) -> u32 {
119119
"collator/data@1",
120120
// TODO(#3867): Use script fallback
121121
fallback_by = "language",
122+
attributes_domain = "collator",
122123
))]
123124
#[derive(Debug, PartialEq, Clone)]
124125
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_collator::provider))]
@@ -233,6 +234,7 @@ impl<'data> CollationDataV1<'data> {
233234
CollationDiacriticsV1Marker,
234235
"collator/dia@1",
235236
fallback_by = "language",
237+
attributes_domain = "collator",
236238
))]
237239
#[derive(Debug, PartialEq, Clone)]
238240
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_collator::provider))]
@@ -275,6 +277,7 @@ pub struct CollationJamoV1<'data> {
275277
CollationReorderingV1Marker,
276278
"collator/reord@1",
277279
fallback_by = "language",
280+
attributes_domain = "collator",
278281
))]
279282
#[derive(Debug, PartialEq, Clone)]
280283
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_collator::provider))]
@@ -363,6 +366,7 @@ impl<'data> CollationReorderingV1<'data> {
363366
CollationMetadataV1Marker,
364367
"collator/meta@1",
365368
fallback_by = "language",
369+
attributes_domain = "collator",
366370
))]
367371
#[derive(Debug, PartialEq, Clone, Copy)]
368372
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_collator::provider))]

components/experimental/src/dimension/provider/units.rs

+5-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,11 @@ use icu_plurals::PluralCategory;
1313
use icu_provider::prelude::*;
1414
use zerovec::ZeroMap;
1515

16-
#[icu_provider::data_struct(UnitsDisplayNameV1Marker = "units/displaynames@1")]
16+
#[icu_provider::data_struct(marker(
17+
UnitsDisplayNameV1Marker,
18+
"units/displaynames@1",
19+
attributes_domain = "units"
20+
))]
1721
#[derive(Clone, PartialEq, Debug)]
1822
#[cfg_attr(
1923
feature = "datagen",

components/segmenter/src/provider/lstm.rs

+5-1
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,11 @@ impl databake::BakeSize for LstmDataFloat32<'_> {
357357
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
358358
/// to be stable, their Rust representation might not be. Use with caution.
359359
/// </div>
360-
#[icu_provider::data_struct(LstmForWordLineAutoV1Marker = "segmenter/lstm/wl_auto@1")]
360+
#[icu_provider::data_struct(marker(
361+
LstmForWordLineAutoV1Marker,
362+
"segmenter/lstm/wl_auto@1",
363+
attributes_domain = "segmenter"
364+
))]
361365
#[derive(Debug, PartialEq, Clone)]
362366
#[cfg_attr(
363367
feature = "datagen",

components/segmenter/src/provider/mod.rs

+10-2
Original file line numberDiff line numberDiff line change
@@ -124,8 +124,16 @@ pub struct RuleBreakDataV2<'data> {
124124
/// to be stable, their Rust representation might not be. Use with caution.
125125
/// </div>
126126
#[icu_provider::data_struct(
127-
DictionaryForWordOnlyAutoV1Marker = "segmenter/dictionary/w_auto@1",
128-
DictionaryForWordLineExtendedV1Marker = "segmenter/dictionary/wl_ext@1"
127+
marker(
128+
DictionaryForWordOnlyAutoV1Marker,
129+
"segmenter/dictionary/w_auto@1",
130+
attributes_domain = "segmenter"
131+
),
132+
marker(
133+
DictionaryForWordLineExtendedV1Marker,
134+
"segmenter/dictionary/wl_ext@1",
135+
attributes_domain = "segmenter"
136+
)
129137
)]
130138
#[derive(Debug, PartialEq, Clone)]
131139
#[cfg_attr(

provider/core/macros/src/lib.rs

+18
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ struct DataStructArg {
115115
marker_name: Path,
116116
path_lit: Option<LitStr>,
117117
fallback_by: Option<LitStr>,
118+
attributes_domain: Option<LitStr>,
118119
singleton: bool,
119120
}
120121

@@ -124,6 +125,7 @@ impl DataStructArg {
124125
marker_name,
125126
path_lit: None,
126127
fallback_by: None,
128+
attributes_domain: None,
127129
singleton: false,
128130
}
129131
}
@@ -155,6 +157,7 @@ impl Parse for DataStructArg {
155157
let mut marker_name: Option<Path> = None;
156158
let mut path_lit: Option<LitStr> = None;
157159
let mut fallback_by: Option<LitStr> = None;
160+
let mut attributes_domain: Option<LitStr> = None;
158161
let mut singleton = false;
159162
let punct = content.parse_terminated(DataStructMarkerArg::parse, Token![,])?;
160163

@@ -171,6 +174,13 @@ impl Parse for DataStructArg {
171174
"fallback_by",
172175
paren.span.join(),
173176
)?;
177+
} else if name == "attributes_domain" {
178+
at_most_one_option(
179+
&mut attributes_domain,
180+
value,
181+
"attributes_domain",
182+
paren.span.join(),
183+
)?;
174184
} else {
175185
return Err(parse::Error::new(
176186
name.span(),
@@ -199,6 +209,7 @@ impl Parse for DataStructArg {
199209
marker_name,
200210
path_lit,
201211
fallback_by,
212+
attributes_domain,
202213
singleton,
203214
})
204215
} else {
@@ -282,6 +293,7 @@ fn data_struct_impl(attr: DataStructArgs, input: DeriveInput) -> TokenStream2 {
282293
marker_name,
283294
path_lit,
284295
fallback_by,
296+
attributes_domain,
285297
singleton,
286298
} = single_attr;
287299

@@ -324,12 +336,18 @@ fn data_struct_impl(attr: DataStructArgs, input: DeriveInput) -> TokenStream2 {
324336
} else {
325337
quote! {icu_provider::_internal::LocaleFallbackPriority::const_default()}
326338
};
339+
let attributes_domain_setter = if let Some(attributes_domain_lit) = attributes_domain {
340+
quote! { info.attributes_domain = #attributes_domain_lit; }
341+
} else {
342+
quote!()
343+
};
327344
result.extend(quote!(
328345
impl icu_provider::DataMarker for #marker_name {
329346
const INFO: icu_provider::DataMarkerInfo = {
330347
let mut info = icu_provider::DataMarkerInfo::from_path(icu_provider::marker::data_marker_path!(#path_str));
331348
info.is_singleton = #singleton;
332349
info.fallback_config.priority = #fallback_by_expr;
350+
#attributes_domain_setter
333351
info
334352
};
335353
}

provider/core/src/marker.rs

+3
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,8 @@ pub struct DataMarkerInfo {
531531
/// Useful for reading and writing data to a file system.
532532
pub path: DataMarkerPath,
533533
/// TODO
534+
pub attributes_domain: &'static str,
535+
/// TODO
534536
pub is_singleton: bool,
535537
/// TODO
536538
pub fallback_config: LocaleFallbackConfig,
@@ -560,6 +562,7 @@ impl DataMarkerInfo {
560562
Self {
561563
path,
562564
is_singleton: false,
565+
attributes_domain: "",
563566
fallback_config: LocaleFallbackConfig::const_default(),
564567
}
565568
}

provider/export/src/export_impl.rs

+24-46
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ use icu_provider::prelude::*;
1010
use std::collections::HashMap;
1111
use std::collections::HashSet;
1212
use std::fmt;
13+
use std::sync::Arc;
1314
use std::time::Duration;
1415
use std::time::Instant;
1516
use writeable::Writeable;
@@ -53,8 +54,7 @@ impl ExportDriver {
5354
include_full,
5455
fallbacker,
5556
deduplication_strategy,
56-
additional_collations,
57-
segmenter_models,
57+
attributes_filters,
5858
} = self;
5959

6060
let markers = markers.unwrap_or_else(|| provider.supported_markers());
@@ -177,9 +177,8 @@ impl ExportDriver {
177177
provider,
178178
marker,
179179
&requested_families,
180+
&attributes_filters,
180181
include_full,
181-
&additional_collations,
182-
&segmenter_models,
183182
&fallbacker,
184183
)?;
185184

@@ -262,13 +261,16 @@ impl ExportDriver {
262261

263262
/// Selects the maximal set of locales to export based on a [`DataMarkerInfo`] and this datagen
264263
/// provider's options bag. The locales may be later optionally deduplicated for fallback.
264+
#[allow(clippy::type_complexity)] // sigh
265265
fn select_locales_for_marker<'a>(
266266
provider: &'a dyn ExportableProvider,
267267
marker: DataMarkerInfo,
268268
requested_families: &HashMap<DataLocale, DataLocaleFamilyAnnotations>,
269+
attributes_filters: &HashMap<
270+
String,
271+
Arc<Box<dyn Fn(&DataMarkerAttributes) -> bool + Send + Sync + 'static>>,
272+
>,
269273
include_full: bool,
270-
additional_collations: &HashSet<String>,
271-
segmenter_models: &[String],
272274
fallbacker: &LocaleFallbacker,
273275
) -> Result<HashSet<DataIdentifierCow<'a>>, DataError> {
274276
// Map from all supported DataLocales to their corresponding supported DataIdentifiers.
@@ -283,41 +285,13 @@ fn select_locales_for_marker<'a>(
283285
.insert(id);
284286
}
285287

286-
if marker.path.as_str().starts_with("segmenter/dictionary/") {
287-
supported_map.retain(|_, ids| {
288-
ids.retain(|id| {
289-
segmenter_models
290-
.iter()
291-
.any(|m| **m == **id.marker_attributes)
292-
});
293-
!ids.is_empty()
294-
});
295-
// Don't perform additional locale filtering
296-
return Ok(supported_map.into_values().flatten().collect());
297-
} else if marker.path.as_str().starts_with("segmenter/lstm/") {
298-
supported_map.retain(|_, locales| {
299-
locales.retain(|id| {
300-
segmenter_models
301-
.iter()
302-
.any(|m| **m == **id.marker_attributes)
303-
});
304-
!locales.is_empty()
305-
});
306-
// Don't perform additional locale filtering
307-
return Ok(supported_map.into_values().flatten().collect());
308-
} else if marker.path.as_str().starts_with("collator/") {
309-
supported_map.retain(|_, ids| {
310-
ids.retain(|id| {
311-
id.marker_attributes.is_empty()
312-
|| additional_collations.contains(id.marker_attributes.as_str())
313-
|| if id.marker_attributes.as_str().starts_with("search") {
314-
additional_collations.contains("search*")
315-
} else {
316-
!["big5han", "gb2312"].contains(&id.marker_attributes.as_str())
317-
}
288+
if !marker.attributes_domain.is_empty() {
289+
if let Some(filter) = attributes_filters.get(marker.attributes_domain) {
290+
supported_map.retain(|_, ids| {
291+
ids.retain(|id| filter(&id.marker_attributes));
292+
!ids.is_empty()
318293
});
319-
!ids.is_empty()
320-
});
294+
}
321295
}
322296

323297
if include_full && requested_families.is_empty() {
@@ -510,6 +484,7 @@ impl fmt::Display for DisplayDuration {
510484

511485
#[test]
512486
fn test_collation_filtering() {
487+
use crate::DataLocaleFamily;
513488
use icu::locale::locale;
514489
use std::collections::BTreeSet;
515490

@@ -619,16 +594,19 @@ fn test_collation_filtering() {
619594
},
620595
];
621596
for cas in cases {
597+
let driver = ExportDriver::new(
598+
[DataLocaleFamily::single(cas.language.clone())],
599+
DeduplicationStrategy::None.into(),
600+
LocaleFallbacker::new_without_data(),
601+
)
602+
.with_additional_collations(cas.include_collations.iter().copied().map(String::from));
622603
let resolved_locales = select_locales_for_marker(
623604
&Provider,
624605
icu::collator::provider::CollationDataV1Marker::INFO,
625-
&[(cas.language.clone(), DataLocaleFamilyAnnotations::single())]
626-
.into_iter()
627-
.collect(),
606+
&driver.requested_families,
607+
&driver.attributes_filters,
628608
false,
629-
&HashSet::from_iter(cas.include_collations.iter().copied().map(String::from)),
630-
&[],
631-
&LocaleFallbacker::new_without_data(),
609+
&driver.fallbacker,
632610
)
633611
.unwrap()
634612
.into_iter()

provider/export/src/lib.rs

+43-13
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ use icu_provider::prelude::*;
9292
use std::collections::HashMap;
9393
use std::collections::HashSet;
9494
use std::hash::Hash;
95+
use std::sync::Arc;
9596

9697
/// Configuration for a data export operation.
9798
///
@@ -115,15 +116,29 @@ use std::hash::Hash;
115116
/// )
116117
/// .unwrap();
117118
/// ```
118-
#[derive(Debug, Clone)]
119+
#[derive(Clone)]
119120
pub struct ExportDriver {
120121
markers: Option<HashSet<DataMarkerInfo>>,
121122
requested_families: HashMap<DataLocale, DataLocaleFamilyAnnotations>,
123+
#[allow(clippy::type_complexity)] // sigh
124+
attributes_filters:
125+
HashMap<String, Arc<Box<dyn Fn(&DataMarkerAttributes) -> bool + Send + Sync + 'static>>>,
122126
fallbacker: LocaleFallbacker,
123127
include_full: bool,
124128
deduplication_strategy: DeduplicationStrategy,
125-
additional_collations: HashSet<String>,
126-
segmenter_models: Vec<String>,
129+
}
130+
131+
impl core::fmt::Debug for ExportDriver {
132+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
133+
f.debug_struct("ExportDriver")
134+
.field("markers", &self.markers)
135+
.field("requested_families", &self.requested_families)
136+
.field("attributes_filters", &self.attributes_filters.keys())
137+
.field("fallbacker", &self.fallbacker)
138+
.field("include_full", &self.include_full)
139+
.field("deduplication_strategy", &self.deduplication_strategy)
140+
.finish()
141+
}
127142
}
128143

129144
impl ExportDriver {
@@ -158,13 +173,24 @@ impl ExportDriver {
158173
))
159174
})
160175
.collect(),
176+
attributes_filters: Default::default(),
161177
include_full,
162178
fallbacker,
163179
deduplication_strategy: options.deduplication_strategy,
164-
additional_collations: Default::default(),
165-
segmenter_models: Default::default(),
166180
}
167181
.with_recommended_segmenter_models()
182+
.with_additional_collations([])
183+
}
184+
185+
/// TODO
186+
pub fn with_marker_attributes_filter(
187+
mut self,
188+
domain: &str,
189+
filter: impl Fn(&DataMarkerAttributes) -> bool + Send + Sync + 'static,
190+
) -> Self {
191+
self.attributes_filters
192+
.insert(String::from(domain), Arc::new(Box::new(filter)));
193+
self
168194
}
169195

170196
/// Sets this driver to generate the given data markers.
@@ -187,10 +213,16 @@ impl ExportDriver {
187213
self,
188214
additional_collations: impl IntoIterator<Item = String>,
189215
) -> Self {
190-
Self {
191-
additional_collations: additional_collations.into_iter().collect(),
192-
..self
193-
}
216+
let set = additional_collations.into_iter().collect::<HashSet<_>>();
217+
self.with_marker_attributes_filter("collator", move |attrs| {
218+
attrs.is_empty()
219+
|| set.contains(attrs.as_str())
220+
|| if attrs.as_str().starts_with("search") {
221+
set.contains("search*")
222+
} else {
223+
!["big5han", "gb2312"].contains(&attrs.as_str())
224+
}
225+
})
194226
}
195227

196228
/// This option is only relevant if using `icu::segmenter`.
@@ -236,10 +268,8 @@ impl ExportDriver {
236268
/// If multiple models for the same language and segmentation type (dictionary/LSTM) are
237269
/// listed, the first one will be used.
238270
pub fn with_segmenter_models(self, models: impl IntoIterator<Item = String>) -> Self {
239-
Self {
240-
segmenter_models: models.into_iter().collect(),
241-
..self
242-
}
271+
let set = models.into_iter().collect::<HashSet<_>>();
272+
self.with_marker_attributes_filter("segmenter", move |attrs| set.contains(attrs.as_str()))
243273
}
244274
}
245275

0 commit comments

Comments
 (0)