Skip to content

Commit 0fe4ca7

Browse files
committed
Top 10 ports - Introduce Count-Min Sketch class
1 parent 6532d55 commit 0fe4ca7

File tree

1 file changed

+147
-0
lines changed

1 file changed

+147
-0
lines changed

input/countminsketch.hpp

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
/**
2+
* \file countminsketch.hpp
3+
* \brief Template class implementing Count-Min Sketch algorithm.
4+
* Used to estimate frequency of events in a stream and effectively find top-k frequent events.
5+
* \author Damir Zainullin <[email protected]>
6+
* \date 2024
7+
*/
8+
#pragma once
9+
10+
#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <limits>
#include <queue>
#include <unordered_map>
#include <utility>
#include <vector>
17+
18+
namespace ipxp {
19+
20+
/**
 * @brief Template class implementing the Count-Min Sketch algorithm.
 *
 * Estimates the frequency of events in a stream and keeps a bounded set of
 * candidates for the most frequent events, so the top-k events can be reported
 * without storing every event.
 *
 * Requirements on EventType visible from the implementation: it must be
 * default-constructible and copyable (candidates are stored in fixed-size arrays)
 * and must provide operator< (used to break frequency ties when reporting).
 *
 * @tparam EventType Type of tracked event.
 * @tparam HashFunctionsCount Count of passed hash functions (rows of the sketch).
 * @tparam TopEventsCount Count of top frequent events to report.
 * @tparam RelativeError Relative error of the algorithm on a scale from 1 to 9999
 * (ten-thousandths), where 1 is the highest precision.
 * Lower error leads to higher memory consumption.
 * @tparam EventsEqual Function object to compare events.
 */
template<typename EventType, size_t HashFunctionsCount, size_t TopEventsCount = 10,
	size_t RelativeError = 100, typename EventsEqual = std::equal_to<EventType>>
class CountMinSketch {
	static_assert(TopEventsCount > 0, "TopEventsCount must be greater than 0");
	static_assert(
		TopEventsCount <= std::numeric_limits<uint16_t>::max(),
		"TopEventsCount must fit into uint16_t");
	static_assert(
		RelativeError > 0 && RelativeError < 10000,
		"RelativeError must be between 1 and 9999");
	static_assert(HashFunctionsCount > 0, "HashFunctionsCount must be greater than 0");

	/** @brief Event together with its estimated frequency. */
	struct EventCount {
		EventType event;
		size_t frequency;
	};

	/** @brief Number of candidate events tracked internally (more than are reported). */
	static constexpr size_t MOST_FREQUENT_EVENTS_COUNT = TopEventsCount * 5;

	// e / epsilon, where epsilon = RelativeError / 10000. Euler's number is spelled out
	// because std::exp and std::ceil are not usable in constant expressions in C++17.
	static constexpr double ROW_LENGTH_EXACT = 2.718281828459045235 / (RelativeError / 10000.0);

public:
	/** @brief Length of row for each hash function in table: ceil(e / epsilon). */
	static constexpr size_t ROW_LENGTH = static_cast<size_t>(ROW_LENGTH_EXACT)
		+ (static_cast<double>(static_cast<size_t>(ROW_LENGTH_EXACT)) < ROW_LENGTH_EXACT ? 1 : 0);

	/**
	 * @brief Constructor.
	 * @param hash_functions Array of hash functions to use, one per sketch row.
	 * The first function is additionally used as the hash of the candidate map.
	 */
	CountMinSketch(std::array<std::function<size_t(const EventType&)>, HashFunctionsCount>
					   hash_functions) noexcept
		: m_event_counts {}
		, m_hash_functions(std::move(hash_functions))
		, m_minimal_heap(
			  [](const EventCount& a, const EventCount& b) { return a.frequency > b.frequency; })
		, m_in_heap(0, m_hash_functions[0], EventsEqual())
	{
	}

	/**
	 * @brief Insert event into the sketch.
	 *
	 * Increments one counter per row and takes the minimum of the touched counters as
	 * the frequency estimate. The event then either updates its existing candidate
	 * entry, fills a free candidate slot, or replaces the least frequent candidate
	 * when its estimate is strictly greater.
	 *
	 * @param event Event to insert.
	 */
	void insert(const EventType& event) noexcept
	{
		size_t event_frequency = std::numeric_limits<size_t>::max();
		for (size_t row = 0; row < HashFunctionsCount; ++row) {
			size_t& counter = m_event_counts[row][get_event_index(row, event)];
			++counter;
			event_frequency = std::min(event_frequency, counter);
		}

		// Already a candidate: only the map is updated; the heap entry is refreshed
		// lazily the next time it surfaces at the top.
		const auto tracked = m_in_heap.find(event);
		if (tracked != m_in_heap.end()) {
			tracked->second = event_frequency;
			return;
		}

		if (m_minimal_heap.size() < MOST_FREQUENT_EVENTS_COUNT) {
			m_minimal_heap.push({event, event_frequency});
			m_in_heap.emplace(event, event_frequency);
			return;
		}

		// The eviction decision must be made against an up-to-date minimum.
		update_least_freq_event();

		if (event_frequency > m_minimal_heap.top().frequency) {
			m_in_heap.erase(m_minimal_heap.top().event);
			m_minimal_heap.pop();
			m_minimal_heap.push({event, event_frequency});
			m_in_heap.emplace(event, event_frequency);
		}
	}

	/**
	 * @brief Function to get current most frequent events.
	 *
	 * Events are ordered by descending estimated frequency; ties are ordered by
	 * ascending event value (operator<).
	 *
	 * @return Pair of array of top frequent events and its real size. Entries past
	 * the real size are value-initialized.
	 */
	std::pair<std::array<EventCount, TopEventsCount>, uint16_t> get_top_events() const noexcept
	{
		std::array<EventCount, MOST_FREQUENT_EVENTS_COUNT> candidates {};
		size_t candidates_count = 0;
		for (const auto& [event, frequency] : m_in_heap) {
			if (candidates_count == candidates.size()) {
				break; // defensive: the map never outgrows the candidate limit
			}
			candidates[candidates_count++] = EventCount {event, frequency};
		}

		const size_t inserted = std::min(candidates_count, TopEventsCount);
		// Sort only the real candidates, not the value-initialized padding.
		std::partial_sort(
			candidates.begin(),
			candidates.begin() + inserted,
			candidates.begin() + candidates_count,
			[](const EventCount& a, const EventCount& b) {
				return a.frequency > b.frequency
					|| (a.frequency == b.frequency && a.event < b.event);
			});

		std::array<EventCount, TopEventsCount> res {};
		std::copy_n(candidates.begin(), inserted, res.begin());
		return {res, static_cast<uint16_t>(inserted)};
	}

private:
	/**
	 * @brief Map an event to its column in the given sketch row.
	 * @param hash_function_index Index of the row / hash function.
	 * @param event Event to hash.
	 * @return Column index in range [0, ROW_LENGTH).
	 */
	size_t get_event_index(size_t hash_function_index, const EventType& event) const noexcept
	{
		return m_hash_functions[hash_function_index](event) % ROW_LENGTH;
	}

	/**
	 * @brief Bring the top of the heap in sync with the candidate map.
	 *
	 * Heap entries keep the frequency they were pushed with, while the map holds the
	 * latest estimate. Stale top entries are re-pushed with the current estimate until
	 * the top is up to date. Each entry is refreshed at most once per call, so the
	 * loop runs at most heap-size times.
	 */
	void update_least_freq_event() noexcept
	{
		while (!m_minimal_heap.empty()) {
			const EventCount& top = m_minimal_heap.top();
			const auto tracked = m_in_heap.find(top.event);
			if (tracked == m_in_heap.end() || tracked->second == top.frequency) {
				return;
			}

			EventCount refreshed {top.event, tracked->second};
			m_minimal_heap.pop();
			m_minimal_heap.push(std::move(refreshed));
		}
	}

	/** Counter table: one row of ROW_LENGTH counters per hash function. */
	std::array<std::array<size_t, ROW_LENGTH>, HashFunctionsCount> m_event_counts;
	/** Hash functions, one per row of the counter table. */
	std::array<std::function<size_t(const EventType&)>, HashFunctionsCount> m_hash_functions;
	/** Min-heap of candidates ordered by (possibly stale) frequency. */
	std::priority_queue<EventCount, std::vector<EventCount>,
		std::function<bool(const EventCount&, const EventCount&)>> m_minimal_heap;
	/** Latest frequency estimate of every candidate currently held in the heap. */
	std::unordered_map<EventType, size_t, std::function<size_t(const EventType&)>,
		std::function<bool(const EventType&, const EventType&)>> m_in_heap;
};
146+
147+
} // namespace ipxp

0 commit comments

Comments
 (0)