|
12 | 12 | // See the License for the specific language governing permissions and
|
13 | 13 | // limitations under the License.
|
14 | 14 |
|
15 |
| -use std::ops::Not; |
| 15 | +use std::{collections::BTreeSet, ops::Not}; |
16 | 16 |
|
17 | 17 | use bloomfilter::Bloom;
|
18 | 18 | use matrix_sdk_base::deserialized_responses::SyncTimelineEvent;
|
@@ -47,24 +47,42 @@ impl Deduplicator {
|
47 | 47 | where
|
48 | 48 | I: Iterator<Item = SyncTimelineEvent> + 'a,
|
49 | 49 | {
|
50 |
| - events.filter(|event| { |
| 50 | + let mut already_seen = BTreeSet::new(); |
| 51 | + |
| 52 | + events.filter(move |event| { |
51 | 53 | let Some(event_id) = event.event_id() else {
|
52 | 54 | // The event has no `event_id`. Safe path: filter it out.
|
53 | 55 | return false;
|
54 | 56 | };
|
55 | 57 |
|
56 | 58 | if self.bloom_filter.check_and_set(&event_id) {
|
57 |
| - // Bloom filter has false positives. We are NOT sure the event |
58 |
| - // is NOT present. Even if the false positive rate is low, we |
59 |
| - // need to iterate over all events to ensure it isn't present. |
60 |
| - |
61 |
| - room_events |
| 59 | + // Bloom filter has false positives. We are NOT sure the event is NOT present. |
| 60 | + // Even if the false positive rate is low, we need to iterate over all events to |
| 61 | + // ensure it isn't present. |
| 62 | + |
| 63 | + // But first, let's ensure `event` is not a duplicate from `events`, i.e. if the |
| 64 | + // iterator itself contains duplicated events! We use a `BTreetSet`, otherwise |
| 65 | + // using a bloom filter again may generate false positives. |
| 66 | + if already_seen.contains(&event_id) { |
| 67 | + // The iterator contains a duplicated `event`. Let's filter it out. |
| 68 | + return false; |
| 69 | + } |
| 70 | + |
| 71 | + // Now we can iterate over all events to ensure `event` is not present in |
| 72 | + // `room_events`. |
| 73 | + let result = room_events |
62 | 74 | .revents()
|
63 | 75 | .any(|(_position, other_event)| {
|
64 | 76 | other_event.event_id().as_ref() == Some(&event_id)
|
65 | 77 | })
|
66 |
| - .not() |
| 78 | + .not(); |
| 79 | + |
| 80 | + already_seen.insert(event_id); |
| 81 | + |
| 82 | + result |
67 | 83 | } else {
|
| 84 | + already_seen.insert(event_id); |
| 85 | + |
68 | 86 | // Bloom filter has no false negatives. We are sure the event is NOT present: we
|
69 | 87 | // can keep it in the iterator.
|
70 | 88 | true
|
@@ -120,7 +138,39 @@ mod tests {
|
120 | 138 | }
|
121 | 139 |
|
122 | 140 | #[test]
|
123 |
| - fn test_filter_duplicates() { |
| 141 | + fn test_filter_duplicates_in_new_events() { |
| 142 | + let event_builder = EventBuilder::new(); |
| 143 | + |
| 144 | + let event_id_0 = owned_event_id!("$ev0"); |
| 145 | + let event_id_1 = owned_event_id!("$ev1"); |
| 146 | + |
| 147 | + let event_0 = sync_timeline_event(&event_builder, &event_id_0); |
| 148 | + let event_1 = sync_timeline_event(&event_builder, &event_id_1); |
| 149 | + |
| 150 | + let mut deduplicator = Deduplicator::new(); |
| 151 | + let room_events = RoomEvents::new(); |
| 152 | + |
| 153 | + let mut events = deduplicator.filter_and_learn( |
| 154 | + [ |
| 155 | + event_0.clone(), // OK |
| 156 | + event_0, // Not OK |
| 157 | + event_1, // OK |
| 158 | + ] |
| 159 | + .into_iter(), |
| 160 | + &room_events, |
| 161 | + ); |
| 162 | + |
| 163 | + assert_let!(Some(event) = events.next()); |
| 164 | + assert_eq!(event.event_id(), Some(event_id_0)); |
| 165 | + |
| 166 | + assert_let!(Some(event) = events.next()); |
| 167 | + assert_eq!(event.event_id(), Some(event_id_1)); |
| 168 | + |
| 169 | + assert!(events.next().is_none()); |
| 170 | + } |
| 171 | + |
| 172 | + #[test] |
| 173 | + fn test_filter_duplicates_with_existing_events() { |
124 | 174 | let event_builder = EventBuilder::new();
|
125 | 175 |
|
126 | 176 | let event_id_0 = owned_event_id!("$ev0");
|
|
0 commit comments