
Commit 3894165

Merge branch 'develop' into sweeper-develop-merge
Conflicts:
	priv/riak_kv.schema
	rebar.config
	src/riak_kv_entropy_manager.erl
	src/riak_kv_index_hashtree.erl
	src/riak_kv_vnode.erl
2 parents aaca104 + ff0687f


58 files changed, +3222 -1422 lines

dialyzer.ignore-warnings

+1
@@ -29,6 +29,7 @@ Unknown functions:
   yz_kv:index/3
   yz_kv:index_binary/5
   yz_kv:should_handoff/1
+  object:warning/2
 Unknown types:
   base64:ascii_binary/0
   mochijson2:json_object/0

docs/hll/hll.bib

+53
@@ -0,0 +1,53 @@
@ONLINE{Flatjolet:2007:Online,
  author = "Flajolet, P. and Fusy, E. and Gandouet, O. and Meunier, F.",
  title  = {HyperLogLog: the analysis of a near-optimal
            cardinality estimation algorithm},
  year   = {2007},
  url    = {http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf}
}

@ONLINE{Durand:2003:Online,
  author = "Durand, M. and Flajolet, P.",
  title  = {Loglog Counting of Large Cardinalities},
  year   = {2003},
  url    = {http://algo.inria.fr/flajolet/Publications/DuFl03-LNCS.pdf}
}

@ONLINE{Count-distinct:Online,
  author = "Wikipedia",
  title  = {Count-distinct problem},
  url    = {https://en.wikipedia.org/wiki/Count-distinct_problem}
}

@ONLINE{Harmonic-mean:Online,
  author = "Wikipedia",
  title  = {Harmonic mean},
  url    = {https://en.wikipedia.org/wiki/Harmonic_mean}
}

@ONLINE{Hyper:Online,
  author = "GameAnalytics",
  title  = {Hyper},
  url    = {https://github.com/GameAnalytics/hyper}
}

@ONLINE{Heule:2013:Online,
  author = "Heule, S. and Nunkesser, M. and Hall, A.",
  title  = {HyperLogLog in Practice: Algorithmic Engineering of a State of The Art
            Cardinality Estimation Algorithm},
  month  = mar,
  year   = {2013},
  url    = {http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/40671.pdf}
}

@ONLINE{Kiip:Online,
  author = "Kiip",
  title  = {Sketching \& Scaling: Everyday HyperLogLog},
  url    = {http://blog.kiip.me/engineering/sketching-scaling-everyday-hyperloglog}
}

@ONLINE{Neustar:Online,
  author = "Neustar",
  title  = {Sketch of the Day: HyperLogLog — Cornerstone of a Big Data Infrastructure},
  url    = {https://research.neustar.biz/2012/10/25/sketch-of-the-day-hyperloglog-cornerstone-of-a-big-data-infrastructure/}
}

docs/hll/hll.pdf

325 KB
Binary file not shown.

docs/hll/hll.tex

+287
@@ -0,0 +1,287 @@
\documentclass[12pt]{article}

\usepackage{url}
\usepackage[backend=bibtex,sorting=none]{biblatex}
\usepackage{dirtytalk}
\usepackage{amsmath,amssymb,mathtools,bm,etoolbox}
\usepackage{listings}
\usepackage{graphicx}
\usepackage{float}

\graphicspath{ {images/} }

\lstset{frame=tb,
  language=Erlang,
  aboveskip=3mm,
  belowskip=3mm,
  showstringspaces=false,
  columns=flexible,
  basicstyle={\small\ttfamily},
  numbers=none,
  numberstyle=\tiny,
  breaklines=true,
  breakatwhitespace=true,
  tabsize=2,
  literate={\ \ }{{\ }}1}

\newlength\tindent
\setlength{\tindent}{\parindent}
\setlength{\parindent}{0pt}
\renewcommand{\indent}{\hspace*{\tindent}}

\bibliography{hll}
\begin{document}

\title{HyperLogLog Datatypes in Riak}
\author{Zeeshan Lakhani}

\maketitle
\section{Getting Started}

HyperLogLog (HLL) was conceived by
Flajolet et al.\cite{Flatjolet:2007:Online} in 2007 as an improvement and
extension of the Loglog\cite{Durand:2003:Online} algorithm to tackle the
\textbf{Count-distinct problem}\cite{Count-distinct:Online}: finding the
number of distinct elements in a large file and, later, in a data stream.
Or, as stated in the quintessential 2007 paper:\newline

\say{The purpose of this note is to present and analyse an efficient algorithm
for estimating the number of distinct elements, known as the cardinality, of
large data ensembles, which are referred to here as multisets and are usually
massive streams (read-once sequences). This problem has received a great deal of
attention over the past two decades, finding an ever growing number of
applications in networking and traffic monitoring, such as the detection of worm
propagation, of network attacks (e.g., by Denial of Service), and of link-based
spam on the web.}
\subsection{Why HyperLogLog?}

So, what's a good use case for HLLs? One example would be determining the
number of distinct search queries on \textit{google.com} over a time
period\cite{Heule:2013:Online}.

\indent The goal of HLL is to estimate the number of unique elements in large
sets (large being beyond $10^{9}$) and streams while also keeping memory
low(er). Normally, calculating the exact cardinality of a set requires an
amount of memory proportional to the cardinality when counting the unique
items. With HLLs, the trade-off is less memory in exchange for an
approximated cardinality. Yo performance.\newline
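For a rough feel of that trade-off, here's a shell sketch (sizes are
ballpark and VM-dependent; \texttt{erts\_debug:flat\_size/1} reports words,
so we multiply by 8 for bytes on a 64-bit VM):

\begin{lstlisting}
%% A million distinct binaries.
Elems = [integer_to_binary(I) || I <- lists:seq(1, 1000000)].

%% Exact counting: memory grows linearly with the cardinality.
Exact = sets:from_list(Elems).
sets:size(Exact).                            %% 1000000
erts_debug:flat_size(Exact) * 8.             %% tens of MB

%% HLL at precision 14: 2^14 six-bit registers, ~12KB, ~1% error.
H = lists:foldl(fun hyper:insert/2, hyper:new(14), Elems).
hyper:card(H).                               %% ~1.0e6
erts_debug:flat_size(hyper:compact(H)) * 8.  %% ~12KB
\end{lstlisting}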
As per \cite{Heule:2013:Online}, the key requirements for a cardinality
estimation algorithm are:

\begin{enumerate}
\item \textbf{Accuracy}: For a fixed amount of memory, the algorithm should
provide as accurate an estimate as possible. Especially for small
cardinalities, the results should be near exact.
\item \textbf{Memory efficiency}: The algorithm should use the available memory
efficiently and adapt its memory usage to the cardinality. That is, the
algorithm should use less than the user-specified maximum amount of memory if
the cardinality to be estimated is very small.
\item \textbf{Estimate large cardinalities}: Multisets with cardinalities well
beyond 1 billion occur on a daily basis, and it is important that such large
cardinalities can be estimated with reasonable accuracy.
\item \textbf{Practicality}: The algorithm should be implementable and
maintainable.
\end{enumerate}
There are two generalized categories of cardinality observables
\cite{Flatjolet:2007:Online}:

\begin{enumerate}
\item \textbf{Bit-pattern observables}: these are based on certain patterns of
bits occurring at the beginning of the (binary) $S$-values. For instance,
observing in the stream $S$, at the beginning of a string, the bit-pattern
$0^{p-1}1$ is more or less a likely indication that the cardinality $n$
of $S$ is at least $2^{p}$. HLL is an example of this category.

\item \textbf{Order statistics observables}: these are based on order statistics,
like the smallest (real) values that appear in $S$. For instance,
if $X = min(S)$, we may legitimately hope that $n$ is roughly of the order of
$1/X$ since, as regards expectations, one has $\mathbb{E}(X) = 1/(n + 1)$
(see the toy sketch below).
\end{enumerate}
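For intuition on the second category, here's a toy sketch (illustrative
only, with huge variance; this is not what HLL does):

\begin{lstlisting}
%% Map each element to (0,1] via a hash and track the minimum;
%% since E(min) = 1/(n+1), 1/min - 1 roughly estimates n.
min_estimate(Elems) ->
    Range = 1 bsl 27,
    Min = lists:min([(erlang:phash2(E, Range) + 1) / Range
                     || E <- Elems]),
    1 / Min - 1.
\end{lstlisting}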
\section{The HyperLogLog Algorithm}

\indent The key components of the algorithm are:

\begin{enumerate}
\item \textit{Randomization achieved by a hash function}, $h$, that is applied
to every element that is to be counted.
\item As a hashed value comes in, the first $p$ (\texttt{precision},
\texttt{4..16}) bits are used to determine which register (substream)
will store the maximum number of leading zeros in the rest of the
hash. The \textit{bit-pattern observable} in the HLL approach is this
maximum, i.e. the longest run of zeros in the hash values (after the initial
$p$ bits). $m = 2^p$ is the number of registers maintained (see the sketch
after this list).
\item \textit{Stochastic averaging} across the registers/substreams: the
input $S$ is divided into $m$ substreams $S_i$ to reduce the large
variability of each single measurement.
\item To reduce the effect of dramatic outliers, a \textit{harmonic mean} is
used instead of an arithmetic mean across the estimates, as it tends
strongly toward the least elements of the list\cite{Harmonic-mean:Online}.
\end{enumerate}
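To make the first two steps concrete, here's a minimal sketch of the
bucketing (a hypothetical helper, not the actual library internals),
assuming a 64-bit hash:

\begin{lstlisting}
%% Split a 64-bit hash into a P-bit register index and the rank
%% of the remaining bits, i.e. the position of the leftmost 1-bit.
register_and_rank(Hash, P) ->
    Rest = 64 - P,
    <<Index:P, Bits:Rest>> = <<Hash:64>>,
    {Index, rank(Bits, Rest)}.

%% rank(2#0001, 4) -> 4: three leading zeros, then a one.
rank(0, Width) -> Width + 1;
rank(Bits, Width) ->
    case Bits bsr (Width - 1) of
        1 -> 1;
        0 -> 1 + rank(Bits, Width - 1)
    end.
\end{lstlisting}

Each register keeps the maximum rank it has seen; a register holding $R$
hints that roughly $2^{R}$ distinct values have hashed into it.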
Here's a visualization of the basic idea\cite{Kiip:Online}:

\begin{figure}[H]
\centering
\includegraphics[width=8.1cm, height=4cm]{bucket-run}
\caption{$p=4$; The bucket/register for the hashed value of $0100$ is $8$.}
\label{figurebucketrun}
\end{figure}

\begin{figure}[H]
\centering
\includegraphics[width=8.2cm, height=3cm]{register-store}
\caption{Storing 4 as the max number of leading zeros.}
\label{figurestoreregister}
\end{figure}

\begin{figure}[H]
\centering
\includegraphics[width=8.0cm, height=3cm]{sto-avg}
\caption{To reduce the large variability of single measurements, a stochastic
average is calculated across the registers. \textbf{This is a simple example}.
A normalized, bias-corrected harmonic mean of the estimations is actually used
for the final estimate.}
\label{figurestoaverage}
\end{figure}
The simplified formula that actually defines the HLL distinct-value (DV)
estimator\cite{Neustar:Online} is

$$DV_{HLL} = constant * m^{2} * \Bigg(\sum_{j=1}^{m} 2^{-R_{j}}\Bigg)^{-1}$$

where $R_j$ is the longest run of zeros in the $j^{th}$ bucket. The relative
error is $1.04/\sqrt{m}$, with $m$ the number of registers. More information
about the original algorithm and its additional modifications can be found in
\cite{Flatjolet:2007:Online}.
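The estimator is also small enough to compute directly. A sketch, using the
usual large-$m$ approximation $0.7213/(1 + 1.079/m)$ for the constant:

\begin{lstlisting}
%% Raw HLL estimate from a list of m register values R_j.
raw_estimate(Registers) ->
    M = length(Registers),
    Alpha = 0.7213 / (1 + 1.079 / M),
    Harmonic = lists:sum([math:pow(2, -R) || R <- Registers]),
    Alpha * M * M / Harmonic.
\end{lstlisting}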
\subsection{HyperLogLog++}\label{hll++}

As per \cite{Heule:2013:Online}, going into production with requirements of
estimating multisets with cardinalities beyond 1 billion called for some
changes to the known HLL algorithm, hence \textit{HyperLogLog++}.\newline

The main changes we'll take away for this exercise are:

\begin{enumerate}
\item Use a 64-bit hash function (a sketch follows this list) instead of the
original's\cite{Flatjolet:2007:Online} 32-bit hash function with its special
range correction. Hash collisions therefore only become a problem if we reach
a cardinality of $2^{64}$, which is fine for many real-world data sets.
\item HyperLogLog++ introduces a bias correction, which corrects for bias
using empirically determined data for cardinalities $< 5m$.
\end{enumerate}
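A 64-bit hash is easy to come by in Erlang. One illustrative choice (not
necessarily what the \textit{hyper} library uses internally) is to truncate
a SHA-1 digest:

\begin{lstlisting}
%% First 64 bits of a 160-bit SHA-1 digest.
hash64(Bin) ->
    <<H:64, _/bitstring>> = crypto:hash(sha, Bin),
    H.
\end{lstlisting}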
\section{HLL Datatypes in Riak}

\subsection{Hyper Library}

Our HyperLogLogs (HLLs) are driven by GameAnalytics' Erlang HyperLogLog
implementation\cite{Hyper:Online} under the hood, which includes the bias
correction from \cite{Heule:2013:Online} (mentioned in \ref{hll++}).

\indent Currently, we are using the \textit{hyper\_binary} option as a backend,
which has \say{fixed memory usage $(6 bits * 2^{P})$, fastest on insert, union,
cardinality and serialization. Best default choice.} Registers need 6 bits
instead of 5 because the 64-bit hash function allows longer runs of leading
zeros.
\indent The library gives us the ability to perform unions
(\textit{which look like a merge}), prescribe a precision, reduce precision,
union HLLs of varying precisions (the union is reduced toward the lower
precision), and compact the data structure's buffer before the registers are
needed/calculated from it.\newline

Here's an example of what an insert and card-check look like:
\begin{lstlisting}
H = hyper:insert(<<"foo">>, hyper:insert(<<"qu">>, hyper:new(4))).
{hyper,4,
 {hyper_binary,{dense,<<0,0,0,0,0,0,0,0,0,0,0,2>>,[{0,1}],1,16}}}

hyper:card(H).
2.136502281992361
\end{lstlisting}
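Unions are just as direct. A quick sketch, assuming \texttt{hyper:union/1}
over a list of filters as in the library's README; the union takes the
register-wise maximum, so merging loses nothing:

\begin{lstlisting}
H1 = hyper:insert(<<"foo">>, hyper:new(4)),
H2 = hyper:insert(<<"bar">>, hyper:new(4)),
hyper:card(hyper:union([H1, H2])).
%% ~2.0
\end{lstlisting}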
\subsection{Brief In-Riak Example}

Ok. Here's an example workflow with the Riak Erlang (PB) client:
\begin{lstlisting}
CMod = riakc_pb_socket,
Key = <<"Holy Diver">>,
Bucket = {<<"hll_bucket">>, <<"testbucket1">>},

S0 = riakc_hll:new(),

Item = <<"Jokes">>,
ok = CMod:update_type(
       Pid, Bucket, Key,
       riakc_hll:to_op(riakc_hll:add_element(Item, S0))).

{ok, S1} = CMod:fetch_type(Pid, Bucket, Key),
Items = [<<"are">>, <<"better">>, <<"explained">>],
ok = CMod:update_type(
       Pid, Bucket, Key,
       riakc_hll:to_op(riakc_hll:add_elements(Items, S1))).

{ok, S2} = CMod:fetch_type(Pid, Bucket, Key),
riakc_hll:value(S2) =:= 4.

%% Add a redundant element

ok = CMod:update_type(
       Pid, Bucket, Key,
       riakc_hll:to_op(riakc_hll:add_element(Item, S2))).

{ok, S3} = CMod:fetch_type(Pid, Bucket, Key),
riakc_hll:value(S3) =:= 4.
\end{lstlisting}
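Note the last fetch: re-adding \texttt{<<"Jokes">>} leaves the value at 4.
An element the HLL has already seen maps to the same register with the same
run of zeros, so the max is untouched and duplicates never move the estimate.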
\subsection{Testing within an Error Bound}

\begin{lstlisting}
%% @doc Standard error is sigma ≈ 1.04/sqrt(m), where m is the
%% number of registers. Deviations scale the margin of error away
%% from the actual cardinality, in percentiles:
%% 1σ ≈ 68%, 2σ ≈ 95%, 3σ ≈ 99.7%
margin_of_error(P, Deviations) ->
    M = trunc(math:pow(2, P)),
    Sigma = 1.04 / math:sqrt(M),
    Sigma*Deviations.

%% @doc Check if the estimated cardinality from the HllSet is within
%% an acceptable margin of error determined by the m registers and
%% 3 deviations of the standard error. Use a window of +1 to account
%% for rounding and extremely small cardinalities.
within_error_check(Card, HllSet, HllVal) ->
    case Card > 0 of
        true ->
            Precision = riak_kv_hll:precision(HllSet),
            MarginOfError = margin_of_error(Precision, 3),
            RelativeError = (abs(Card-HllVal)/Card),
            %% Is the relative error within the margin of error times
            %% the estimation *(andalso)* is the value difference less
            %% than the actual cardinality times the margin of error
            BoundCheck1 = RelativeError =< (MarginOfError * HllVal)+1,
            BoundCheck2 = abs(HllVal-Card) =< (Card*MarginOfError)+1,
            BoundCheck1 andalso BoundCheck2;
        _ -> trunc(HllVal) == Card
    end.
\end{lstlisting}
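With Riak's default precision of 14 (see \texttt{?HYPER\_DEFAULT\_PRECISION}
in \texttt{riak\_kv\_types.hrl}), the bound works out as follows:

\begin{lstlisting}
margin_of_error(14, 3).
%% m = 16384, sigma = 1.04/sqrt(16384) = 0.008125,
%% so 3 deviations allow ~2.4% relative error.
0.024375
\end{lstlisting}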
\section{So?}

So, HLLs are a super useful way to count distinct elements in a set, stream,
or multiset while also keeping memory and data structure byte-size down. Win!

\printbibliography

\end{document}

docs/hll/images/bucket-run.png

85.1 KB

docs/hll/images/register-store.png

62.8 KB

docs/hll/images/sto-avg.png

77.9 KB

include/riak_kv_types.hrl

+17 -3
@@ -13,6 +13,11 @@
 -define(SET_TYPE, riak_dt_orswot).
 -define(SET_TYPE(Val), #crdt{mod=?SET_TYPE, ctype="application/riak_set", value=Val}).
 
+-define(HLL_TYPE, riak_kv_hll).
+-define(HLL_TYPE(Val), #crdt{mod=?HLL_TYPE,
+                             ctype="application/riak_hll",
+                             value=Val}).
+
 -define(MAP_TYPE, riak_dt_map).
 -define(MAP_TYPE(Val), #crdt{mod=?MAP_TYPE, ctype="application/riak_map", value=Val}).
 
@@ -23,23 +28,32 @@
 
 -define(V1_TOP_LEVEL_TYPES, [pncounter]).
 -define(V2_TOP_LEVEL_TYPES, [?COUNTER_TYPE, ?SET_TYPE, ?MAP_TYPE]).
--define(TOP_LEVEL_TYPES, ?V1_TOP_LEVEL_TYPES ++ ?V2_TOP_LEVEL_TYPES).
+-define(V3_TOP_LEVEL_TYPES, [?HLL_TYPE]).
+-define(TOP_LEVEL_TYPES, ?V1_TOP_LEVEL_TYPES ++ ?V2_TOP_LEVEL_TYPES ++
+        ?V3_TOP_LEVEL_TYPES).
 -define(ALL_TYPES, ?TOP_LEVEL_TYPES ++ [?FLAG_TYPE, ?REG_TYPE]).
 -define(EMBEDDED_TYPES, [{map, ?MAP_TYPE}, {set, ?SET_TYPE},
                          {counter, ?EMCNTR_TYPE}, {flag, ?FLAG_TYPE},
                          {register, ?REG_TYPE}]).
 
 -define(MOD_MAP, [{map, ?MAP_TYPE}, {set, ?SET_TYPE},
-                  {counter, ?COUNTER_TYPE}]).
+                  {counter, ?COUNTER_TYPE}, {hll, ?HLL_TYPE}]).
 
 -define(DATATYPE_STATS_DEFAULTS, [actor_count]).
+-define(HLL_STATS, [bytes]).
 
 %% These proplists represent the current versions of supported
 %% datatypes. The naming `EN_DATATYPE_VERSIONS' means `Epoch' and
 %% number. `N' is incremented when any new version of any datatype is
 %% introduced, thus bumping the data type `Epoch'.
 -define(E1_DATATYPE_VERSIONS, [{?COUNTER_TYPE, 2}]).
--define(E2_DATATYPE_VERSIONS, [{?MAP_TYPE, 2}, {?SET_TYPE, 2}, {?COUNTER_TYPE, 2}]).
+-define(E2_DATATYPE_VERSIONS, [{?MAP_TYPE, 2},
+                               {?SET_TYPE, 2},
+                               {?COUNTER_TYPE, 2}]).
+-define(E3_DATATYPE_VERSIONS, ?E2_DATATYPE_VERSIONS ++ [{?HLL_TYPE, 1}]).
 
 -type crdt() :: ?CRDT{}.
 -type crdt_op() :: ?CRDT_OP{}.
+
+%% Redis Default Yo Lolz, but a good default.
+-define(HYPER_DEFAULT_PRECISION, 14).
