Skip to content

Commit 552c83d

Browse files
committed
Add compressible_bytes data
As a side effect, always load CollationSpecialPrimaries, since we no longer know at collator instantiation time if some of the data in the struct is going to be used. Preparation for unicode-org#6537
1 parent 5c795cf commit 552c83d

File tree

11 files changed

+154
-107
lines changed

11 files changed

+154
-107
lines changed

components/collator/src/comparison.rs

Lines changed: 48 additions & 57 deletions
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@ use crate::provider::CollationReordering;
3636
use crate::provider::CollationReorderingV1;
3737
use crate::provider::CollationRootV1;
3838
use crate::provider::CollationSpecialPrimaries;
39-
use crate::provider::CollationSpecialPrimariesV1;
39+
use crate::provider::CollationSpecialPrimariesV2;
4040
use crate::provider::CollationTailoringV1;
4141
use core::cmp::Ordering;
4242
use core::convert::TryFrom;
@@ -379,7 +379,7 @@ impl LocaleSpecificDataHolder {
379379
/// Compares strings according to culturally-relevant ordering.
380380
#[derive(Debug)]
381381
pub struct Collator {
382-
special_primaries: Option<DataPayload<CollationSpecialPrimariesV1>>,
382+
special_primaries: DataPayload<CollationSpecialPrimariesV2>,
383383
root: DataPayload<CollationRootV1>,
384384
tailoring: Option<DataPayload<CollationTailoringV1>>,
385385
jamo: DataPayload<CollationJamoV1>,
@@ -395,7 +395,7 @@ impl Collator {
395395
/// Constructs a borrowed version of this type for more efficient querying.
396396
pub fn as_borrowed(&self) -> CollatorBorrowed {
397397
CollatorBorrowed {
398-
special_primaries: self.special_primaries.as_ref().map(|s| s.get()),
398+
special_primaries: self.special_primaries.get(),
399399
root: self.root.get(),
400400
tailoring: self.tailoring.as_ref().map(|s| s.get()),
401401
jamo: self.jamo.get(),
@@ -434,7 +434,7 @@ impl Collator {
434434
options: CollatorOptions,
435435
) -> Result<Self, DataError>
436436
where
437-
D: DataProvider<CollationSpecialPrimariesV1>
437+
D: DataProvider<CollationSpecialPrimariesV2>
438438
+ DataProvider<CollationRootV1>
439439
+ DataProvider<CollationTailoringV1>
440440
+ DataProvider<CollationDiacriticsV1>
@@ -451,7 +451,7 @@ impl Collator {
451451
provider.load(Default::default())?.payload,
452452
provider.load(Default::default())?.payload,
453453
provider.load(Default::default())?.payload,
454-
|| provider.load(Default::default()).map(|r| r.payload),
454+
provider.load(Default::default())?.payload,
455455
prefs,
456456
options,
457457
)
@@ -464,7 +464,7 @@ impl Collator {
464464
decompositions: DataPayload<NormalizerNfdDataV1>,
465465
tables: DataPayload<NormalizerNfdTablesV1>,
466466
jamo: DataPayload<CollationJamoV1>,
467-
special_primaries: impl FnOnce() -> Result<DataPayload<CollationSpecialPrimariesV1>, DataError>,
467+
special_primaries: DataPayload<CollationSpecialPrimariesV2>,
468468
prefs: CollatorPreferences,
469469
options: CollatorOptions,
470470
) -> Result<Self, DataError>
@@ -484,22 +484,14 @@ impl Collator {
484484
return Err(DataError::custom("invalid").with_marker(CollationJamoV1::INFO));
485485
}
486486

487-
let special_primaries = if locale_dependent.merged_options.alternate_handling()
488-
== AlternateHandling::Shifted
489-
|| locale_dependent.merged_options.numeric()
490-
{
491-
let special_primaries = special_primaries()?;
492-
// `variant_count` isn't stable yet:
493-
// https://github.com/rust-lang/rust/issues/73662
494-
if special_primaries.get().last_primaries.len() <= (MaxVariable::Currency as usize) {
495-
return Err(
496-
DataError::custom("invalid").with_marker(CollationSpecialPrimariesV1::INFO)
497-
);
498-
}
499-
Some(special_primaries)
500-
} else {
501-
None
502-
};
487+
// `variant_count` isn't stable yet:
488+
// https://github.com/rust-lang/rust/issues/73662
489+
if special_primaries.get().last_primaries.len() <= (MaxVariable::Currency as usize) {
490+
return Err(DataError::custom("invalid").with_marker(CollationSpecialPrimariesV2::INFO));
491+
}
492+
if special_primaries.get().compressible_bytes.len() != 32 {
493+
return Err(DataError::custom("invalid").with_marker(CollationSpecialPrimariesV2::INFO));
494+
}
503495

504496
Ok(Collator {
505497
special_primaries,
@@ -543,7 +535,7 @@ macro_rules! compare {
543535
/// borrowed version.
544536
#[derive(Debug)]
545537
pub struct CollatorBorrowed<'a> {
546-
special_primaries: Option<&'a CollationSpecialPrimaries<'a>>,
538+
special_primaries: &'a CollationSpecialPrimaries<'a>,
547539
root: &'a CollationData<'a>,
548540
tailoring: Option<&'a CollationData<'a>>,
549541
jamo: &'a CollationJamo<'a>,
@@ -579,23 +571,15 @@ impl CollatorBorrowed<'static> {
579571
return Err(DataError::custom("invalid").with_marker(CollationJamoV1::INFO));
580572
}
581573

582-
let special_primaries = if locale_dependent.merged_options.alternate_handling()
583-
== AlternateHandling::Shifted
584-
|| locale_dependent.merged_options.numeric()
585-
{
586-
let special_primaries =
587-
crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1;
588-
// `variant_count` isn't stable yet:
589-
// https://github.com/rust-lang/rust/issues/73662
590-
if special_primaries.last_primaries.len() <= (MaxVariable::Currency as usize) {
591-
return Err(
592-
DataError::custom("invalid").with_marker(CollationSpecialPrimariesV1::INFO)
593-
);
594-
}
595-
Some(special_primaries)
596-
} else {
597-
None
598-
};
574+
let special_primaries = crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V2;
575+
// `variant_count` isn't stable yet:
576+
// https://github.com/rust-lang/rust/issues/73662
577+
if special_primaries.last_primaries.len() <= (MaxVariable::Currency as usize) {
578+
return Err(DataError::custom("invalid").with_marker(CollationSpecialPrimariesV2::INFO));
579+
}
580+
if special_primaries.compressible_bytes.len() != 32 {
581+
return Err(DataError::custom("invalid").with_marker(CollationSpecialPrimariesV2::INFO));
582+
}
599583

600584
// Attribute belongs closer to `unwrap`, but
601585
// https://github.com/rust-lang/rust/issues/15701
@@ -623,12 +607,7 @@ impl CollatorBorrowed<'static> {
623607
/// compile-time optimizations that are possible with [`CollatorBorrowed`].
624608
pub const fn static_to_owned(self) -> Collator {
625609
Collator {
626-
special_primaries: if let Some(s) = self.special_primaries {
627-
// `map` not available in const context
628-
Some(DataPayload::from_static_ref(s))
629-
} else {
630-
None
631-
},
610+
special_primaries: DataPayload::from_static_ref(self.special_primaries),
632611
root: DataPayload::from_static_ref(self.root),
633612
tailoring: if let Some(s) = self.tailoring {
634613
// `map` not available in const context
@@ -751,10 +730,6 @@ impl CollatorBorrowed<'_> {
751730
} else {
752731
// +1 so that we can use "<" and primary ignorables test out early.
753732
self.special_primaries
754-
.as_ref()
755-
// `unwrap()` is OK, because we've ensured in the constructor that value
756-
// is `Some` if we have alternate handling.
757-
.unwrap()
758733
.last_primary_for_group(self.options.max_variable())
759734
+ 1
760735
};
@@ -763,13 +738,7 @@ impl CollatorBorrowed<'_> {
763738
// https://github.com/rust-lang/rust/issues/15701
764739
#[allow(clippy::unwrap_used)]
765740
let numeric_primary = if self.options.numeric() {
766-
Some(
767-
self.special_primaries
768-
.as_ref()
769-
// `unwrap` is OK, because we've ensured `Some` in the constructor
770-
.unwrap()
771-
.numeric_primary,
772-
)
741+
Some(self.special_primaries.numeric_primary)
773742
} else {
774743
None
775744
};
@@ -1544,3 +1513,25 @@ impl CollatorBorrowed<'_> {
15441513
Ordering::Equal
15451514
}
15461515
}
1516+
1517+
/// Helper for checking if a byte is compressible
1518+
pub(crate) struct CompressibleBytes<'a> {
1519+
arr: &'a [u8; 32],
1520+
}
1521+
1522+
impl<'a> CompressibleBytes<'a> {
1523+
pub(crate) fn new(arr: &'a [u8; 32]) -> Self {
1524+
Self { arr }
1525+
}
1526+
1527+
#[allow(dead_code)]
1528+
pub(crate) fn is_compressible(&self, b: u8) -> bool {
1529+
// Indexing is OK by construction (`b >> 3` is at most 31
1530+
// and `arr` has 32 elements), and pasting this into Compiler
1531+
// Explorer shows that the panic is optimized away.
1532+
#[allow(clippy::indexing_slicing)]
1533+
let field = self.arr[usize::from(b >> 3)];
1534+
let mask = 1 << (b & 0b111);
1535+
(field & mask) != 0
1536+
}
1537+
}

components/collator/src/provider.rs

Lines changed: 42 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@ use zerovec::ule::AsULE;
2525
use zerovec::ZeroVec;
2626
use zerovec::{zeroslice, ZeroSlice};
2727

28+
use crate::comparison::CompressibleBytes;
2829
use crate::elements::CollationElement;
2930
use crate::elements::CollationElement32;
3031
use crate::elements::Tag;
@@ -64,7 +65,7 @@ const _: () = {
6465
impl_collation_diacritics_v1!(Baked);
6566
impl_collation_jamo_v1!(Baked);
6667
impl_collation_metadata_v1!(Baked);
67-
impl_collation_special_primaries_v1!(Baked);
68+
impl_collation_special_primaries_v2!(Baked);
6869
impl_collation_reordering_v1!(Baked);
6970
};
7071

@@ -126,8 +127,8 @@ icu_provider::data_marker!(
126127
);
127128
icu_provider::data_marker!(
128129
/// Data marker for collation special primaries data.
129-
CollationSpecialPrimariesV1,
130-
"collation/special/primaries/v1",
130+
CollationSpecialPrimariesV2,
131+
"collation/special/primaries/v2",
131132
CollationSpecialPrimaries<'static>,
132133
is_singleton = true,
133134
);
@@ -141,7 +142,7 @@ pub const MARKERS: &[DataMarkerInfo] = &[
141142
CollationJamoV1::INFO,
142143
CollationMetadataV1::INFO,
143144
CollationReorderingV1::INFO,
144-
CollationSpecialPrimariesV1::INFO,
145+
CollationSpecialPrimariesV2::INFO,
145146
];
146147

147148
const SINGLE_U32: &ZeroSlice<u32> =
@@ -521,7 +522,26 @@ impl CollationMetadata {
521522
}
522523
}
523524

524-
/// Special primaries associated with the root collation
525+
/// Root-associated additional data that doesn't change in tailorings
526+
///
527+
/// These are the fields that logically belong to the root data but
528+
/// don't belong to the tailoring data and that are on this separate
529+
/// struct, since we have the same struct for a tailoring and the
530+
/// bulk of the root.
531+
///
532+
/// As a practical matter, this struct happens to only carry
533+
/// information about what concrete numeric values for primary
534+
/// weights are special in particular ways. In principle, when the
535+
/// root data is built, the root builder is allowed to assign the
536+
/// numeric values as it sees fit, which is why these aren't
537+
/// hard-coded.
538+
///
539+
/// Note: In 2.0.0 and prior, this struct was loaded only if
540+
/// it was known at collator construction time (based on options)
541+
/// that the data here was going to be needed. With the introduction
542+
/// of collation keys and the decision not to introduce a collator
543+
/// key generator object separate from the collator, this struct
544+
/// is now always loaded.
525545
///
526546
/// <div class="stab unstable">
527547
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
@@ -539,6 +559,15 @@ pub struct CollationSpecialPrimaries<'data> {
539559
/// variants in `MaxVariable`, currently 4.
540560
#[cfg_attr(feature = "serde", serde(borrow))]
541561
pub last_primaries: ZeroVec<'data, u16>,
562+
/// 256 bits to classify every possible byte
563+
/// into compressible or non-compressible.
564+
/// The 256 bits are distributed across 32
565+
/// `u8`s, since we don't have a native `u256`
566+
/// and have to do manual shifts and masks,
567+
/// so we might as well do them with the narrow
568+
/// type that is its own `ULE`.
569+
#[cfg_attr(feature = "serde", serde(borrow))]
570+
pub compressible_bytes: ZeroVec<'data, u8>,
542571
/// The high 8 bits of the numeric primary
543572
pub numeric_primary: u8,
544573
}
@@ -557,4 +586,12 @@ impl CollationSpecialPrimaries<'_> {
557586
// See parse.cpp in genrb and getLastPrimaryForGroup in ICU4C.
558587
(u32::from(self.last_primaries.get(max_variable as usize).unwrap()) << 16) - 1
559588
}
589+
590+
#[allow(dead_code)]
591+
pub(crate) fn get_compressible_bytes(&self) -> CompressibleBytes<'_> {
592+
// The length has already been validated when constructing the collator.
593+
debug_assert_eq!(self.compressible_bytes.len(), 32);
594+
#[allow(clippy::unwrap_used)]
595+
CompressibleBytes::new(<&[u8; 32]>::try_from(self.compressible_bytes.as_bytes()).unwrap())
596+
}
560597
}

components/collator/tests/tests.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,7 @@ const _: () = {
2727
impl_collation_diacritics_v1!(TestingProvider);
2828
impl_collation_jamo_v1!(TestingProvider);
2929
impl_collation_metadata_v1!(TestingProvider);
30-
impl_collation_special_primaries_v1!(TestingProvider);
30+
impl_collation_special_primaries_v2!(TestingProvider);
3131
impl_collation_reordering_v1!(TestingProvider);
3232

3333
icu_normalizer_data::impl_normalizer_nfc_v1!(TestingProvider);

provider/data/collator/data/collation_special_primaries_v1.rs.data renamed to provider/data/collator/data/collation_special_primaries_v2.rs.data

Lines changed: 18 additions & 18 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)