/**
 * \file countminsketch.hpp
 * \brief Template class implementing the Count-Min Sketch algorithm.
 * Used to estimate the frequency of events in a stream and to efficiently find the top-k
 * most frequent events.
 * \author Damir Zainullin <[email protected]>
 * \date 2024
 */
8+ #pragma once
9+
#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <limits>
#include <queue>
#include <unordered_map>
#include <utility>
#include <vector>
17+
18+ namespace ipxp {
19+
/**
 * @brief Template class implementing the Count-Min Sketch algorithm.
 *
 * Estimates the frequency of events in a stream and keeps a bounded set of candidate
 * events from which the top-k most frequent ones are reported.
 *
 * @tparam EventType Type of the tracked event. It must be copyable, default-constructible
 * and comparable with operator< (used to break frequency ties in get_top_events()).
 * @tparam HashFunctionsCount Count of passed hash functions (rows of the counter table).
 * @tparam TopEventsCount Count of top frequent events to report.
 * @tparam RelativeError Relative error of the algorithm on a scale from 1 to 9999
 * (i.e. RelativeError / 10000), where 1 is the highest precision.
 * Lower error leads to higher memory consumption.
 * @tparam EventsEqual Function object to compare events for equality.
 */
template <typename EventType, std::size_t HashFunctionsCount, std::size_t TopEventsCount = 10,
          std::size_t RelativeError = 100, typename EventsEqual = std::equal_to<EventType>>
class CountMinSketch {
    // Checked at class scope so that invalid parameters are rejected as soon as the
    // template is instantiated, not only when the constructor is used.
    static_assert(TopEventsCount > 0, "TopEventsCount must be greater than 0");
    static_assert(
        RelativeError > 0 && RelativeError < 10000,
        "RelativeError must be in the range [1, 9999]");
    static_assert(HashFunctionsCount > 0, "HashFunctionsCount must be greater than 0");

    /** @brief Event together with its estimated frequency. */
    struct EventCount {
        EventType event;
        std::size_t frequency;
    };

    /** @brief Signature of the hash functions supplied by the user. */
    using HashFunction = std::function<std::size_t(const EventType&)>;

    /** @brief Maximal count of candidate events tracked in the heap and in the map. */
    static constexpr std::size_t MOST_FREQUENT_EVENTS_COUNT = TopEventsCount * 5;

    // e / epsilon, where epsilon = RelativeError / 10000. Euler's number is spelled out as a
    // literal because std::exp and std::ceil are not constexpr in C++17.
    static constexpr double EXACT_ROW_LENGTH = 2.718281828459045 / (RelativeError / 10000.0);
    static constexpr std::size_t ROW_LENGTH_FLOOR = static_cast<std::size_t>(EXACT_ROW_LENGTH);

public:
    /** @brief Length of row for each hash function in table: ceil(e / epsilon). */
    static constexpr std::size_t ROW_LENGTH
        = (static_cast<double>(ROW_LENGTH_FLOOR) < EXACT_ROW_LENGTH) ? ROW_LENGTH_FLOOR + 1
                                                                     : ROW_LENGTH_FLOOR;

    /**
     * @brief Constructor.
     * @param hash_functions Array of hash functions to use, one per table row.
     * The first one is also used as the hasher of the internal candidate map.
     */
    CountMinSketch(std::array<std::function<std::size_t(const EventType&)>, HashFunctionsCount>
                       hash_functions) noexcept
        : m_event_counts{} // value-initialization zeroes every counter
        , m_hash_functions(std::move(hash_functions))
        , m_minimal_heap(
              // "greater" comparator turns std::priority_queue into a min-heap
              [](const EventCount& a, const EventCount& b) { return a.frequency > b.frequency; })
        , m_in_heap(0, m_hash_functions[0], EventsEqual())
    {
    }

    /**
     * @brief Insert event into the sketch.
     *
     * Increments the event's counter in every row, takes the minimum of those counters as
     * the frequency estimate and updates the set of candidate events accordingly.
     *
     * @param event Event to insert.
     */
    void insert(const EventType& event) noexcept
    {
        std::size_t event_frequency = std::numeric_limits<std::size_t>::max();
        for (std::size_t row = 0; row < HashFunctionsCount; ++row) {
            std::size_t& counter = m_event_counts[row][get_event_index(row, event)];
            ++counter;
            event_frequency = std::min(event_frequency, counter);
        }

        // Must happen before the eviction test below so that it compares against the
        // real least frequent candidate.
        update_least_freq_event();

        const auto tracked = m_in_heap.find(event);
        if (tracked != m_in_heap.end()) {
            // Already a candidate: only the map is updated, the heap entry is refreshed
            // lazily once it reaches the top.
            tracked->second = event_frequency;
            return;
        }

        if (m_minimal_heap.size() < MOST_FREQUENT_EVENTS_COUNT) {
            m_minimal_heap.push({event, event_frequency});
            m_in_heap.emplace(event, event_frequency);
            return;
        }

        if (event_frequency > m_minimal_heap.top().frequency) {
            // Replace the least frequent candidate with the new event.
            m_in_heap.erase(m_minimal_heap.top().event);
            m_minimal_heap.pop();
            m_minimal_heap.push({event, event_frequency});
            m_in_heap.emplace(event, event_frequency);
        }
    }

    /**
     * @brief Function to get current most frequent events.
     *
     * Events are ordered by descending frequency; ties are broken by ascending event.
     *
     * @return Pair of array of top frequent events and its real size. Entries past the
     * real size are value-initialized.
     */
    std::pair<std::array<EventCount, TopEventsCount>, uint16_t> get_top_events() const noexcept
    {
        std::array<EventCount, MOST_FREQUENT_EVENTS_COUNT> candidates{};
        // const auto& binds directly to the map's pair<const EventType, size_t>; spelling the
        // pair type without const would copy every element into a temporary.
        const auto candidates_end = std::transform(
            m_in_heap.begin(),
            m_in_heap.end(),
            candidates.begin(),
            [](const auto& entry) -> EventCount { return {entry.first, entry.second}; });

        const auto reported
            = static_cast<uint16_t>(std::min<std::size_t>(m_in_heap.size(), TopEventsCount));
        // Only the filled part of the buffer takes part in the sort.
        std::partial_sort(
            candidates.begin(),
            candidates.begin() + reported,
            candidates_end,
            [](const EventCount& a, const EventCount& b) {
                return a.frequency > b.frequency
                    || (a.frequency == b.frequency && a.event < b.event);
            });

        std::array<EventCount, TopEventsCount> res{};
        std::copy_n(candidates.begin(), reported, res.begin());
        return {res, reported};
    }

private:
    /**
     * @brief Compute the column of the event in the given row of the counter table.
     * @param hash_function_index Index of the row (and of its hash function).
     * @param event Event to hash.
     * @return Column index in range [0, ROW_LENGTH).
     */
    std::size_t
    get_event_index(std::size_t hash_function_index, const EventType& event) const noexcept
    {
        return m_hash_functions[hash_function_index](event) % ROW_LENGTH;
    }

    /**
     * @brief Bring the top of the min-heap up to date with the candidate map.
     *
     * Heap entries are not touched when a tracked event is inserted again, so their stored
     * frequency may lag behind the value in the map. Outdated tops are re-pushed with the
     * current frequency until the top entry is up to date. Every other heap entry stores a
     * frequency that is not lower than the top's, so - as long as frequencies in the map
     * only grow - the resulting top is the least frequent candidate.
     */
    void update_least_freq_event() noexcept
    {
        // Each pass makes one entry current, so heap-size passes are always enough.
        for (std::size_t remaining = m_minimal_heap.size(); remaining > 0; --remaining) {
            const EventCount& top = m_minimal_heap.top();
            const auto tracked = m_in_heap.find(top.event);
            if (tracked == m_in_heap.end() || tracked->second == top.frequency) {
                return;
            }

            // Copy before pop(): `top` refers to the element that is about to be removed.
            EventCount refreshed{top.event, tracked->second};
            m_minimal_heap.pop();
            m_minimal_heap.push(std::move(refreshed));
        }
    }

    /** @brief Counter table: one row of ROW_LENGTH counters per hash function. */
    std::array<std::array<std::size_t, ROW_LENGTH>, HashFunctionsCount> m_event_counts;
    /** @brief Hash functions, one per row of the counter table. */
    std::array<HashFunction, HashFunctionsCount> m_hash_functions;
    /** @brief Min-heap of candidate events ordered by their (possibly outdated) frequency. */
    std::priority_queue<
        EventCount,
        std::vector<EventCount>,
        std::function<bool(const EventCount&, const EventCount&)>>
        m_minimal_heap;
    /** @brief Candidate events mapped to their latest estimated frequency. */
    std::unordered_map<
        EventType,
        std::size_t,
        HashFunction,
        std::function<bool(const EventType&, const EventType&)>>
        m_in_heap;
};
146+
147+ } // namespace ipxp
0 commit comments