Skip to content

Commit c41d40c

Browse files
authored
Implement frequency algorithms such as Count Sketch and Count-Min Sketch (#15)
Count Sketch and Count–Min Sketch are simple space-efficient probabilistic data structures that are used to estimate frequencies of elements in data streams and can address the Heavy hitters problem. Count Sketch was proposed by Moses Charikar, Kevin Chen, and Martin Farach-Colton in 2002. Count–Min Sketch was presented in 2003 by Graham Cormode and Shan Muthukrishnan and published in 2005. In the current implementation, we support up to 2^{32} -1 counters (due to 32-bit hash functions) each of 32 bits.
1 parent 528cfa6 commit c41d40c

File tree

11 files changed

+749
-8
lines changed

11 files changed

+749
-8
lines changed

.gitignore

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -112,12 +112,14 @@ cythonize.json
112112

113113

114114
#PDSA
115+
pdsa/cardinality/hyperloglog.cpp
116+
pdsa/cardinality/linear_counter.cpp
117+
pdsa/cardinality/probabilistic_counter.cpp
115118
pdsa/helpers/hashing/mmh.cpp
116119
pdsa/helpers/storage/bitvector.cpp
117120
pdsa/helpers/storage/bitvector_counter.cpp
118-
pdsa/cardinality/linear_counter.cpp
119-
pdsa/cardinality/probabilistic_counter.cpp
120-
pdsa/cardinality/hyperloglog.cpp
121121
pdsa/membership/bloom_filter.cpp
122122
pdsa/membership/counting_bloom_filter.cpp
123123
pdsa/rank/qdigest.cpp
124+
pdsa/frequency/count_min_sketch.cpp
125+
pdsa/frequency/count_sketch.cpp

MANIFEST.in

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,14 @@ global-exclude __pycache__/*
1111
global-exclude *.so
1212
global-exclude .DS_Store
1313

14+
global-exclude pdsa/cardinality/hyperloglog.cpp
1415
global-exclude pdsa/cardinality/linear_counter.cpp
1516
global-exclude pdsa/cardinality/probabilistic_counter.cpp
16-
global-exclude pdsa/cardinality/hyperloglog.cpp
17-
global-exclude pdsa/membership/bloom_filter.cpp
18-
global-exclude pdsa/membership/counting_bloom_filter.cpp
17+
global-exclude pdsa/frequency/count_min_sketch.cpp
18+
global-exclude pdsa/frequency/count_sketch.cpp
1919
global-exclude pdsa/helpers/hashing/mmh.cpp
2020
global-exclude pdsa/helpers/storage/bitvector.cpp
2121
global-exclude pdsa/helpers/storage/bitvector_counter.cpp
22+
global-exclude pdsa/membership/bloom_filter.cpp
23+
global-exclude pdsa/membership/counting_bloom_filter.cpp
2224
global-exclude pdsa/rank/qdigest.cpp
23-

pdsa/frequency/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Public API of the ``pdsa.frequency`` package.
# NOTE(review): CountSketch is declared in count_sketch.pxd in this same
# commit; confirm the compiled count_sketch module is built before release.
from .count_min_sketch import CountMinSketch
from .count_sketch import CountSketch


__all__ = ["CountMinSketch", "CountSketch"]
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from libc.stdint cimport uint64_t, uint32_t, uint8_t
2+
3+
4+
cdef class CountMinSketch:
    # C-level declaration of the Count-Min Sketch extension type.
    # Attribute order is kept as originally declared.
    cdef:
        # Ceiling at which a single counter stops being incremented.
        uint32_t _MAX_COUNTER_VALUE

        # Sketch dimensions: number of counter arrays (rows) and the
        # number of counters in each of them.
        uint8_t num_of_counters
        uint32_t length_of_counter

        # Total number of counters across all arrays.
        uint64_t _length
        # One hash seed per counter array.
        uint8_t[:] _seeds
        # Flat storage holding every counter of every array.
        uint32_t[:] _counter

    # Public API (callable from both Python and Cython).
    cpdef void add(self, object element) except *
    cpdef uint32_t frequency(self, object element) except *
    cpdef size_t sizeof(self)

    # Internal C-only helpers.
    cdef bint _increment_counter(self, const uint64_t index)
    cdef uint32_t _hash(self, object element, uint8_t seed)
Lines changed: 261 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,261 @@
1+
"""
2+
Count-Min Sketch.
3+
4+
Count–Min Sketch is a simple space-efficient probabilistic data structure
5+
that is used to estimate frequencies of elements in data streams and can
6+
address the Heavy hitters problem. It was presented in 2003 [1] by
7+
Graham Cormode and Shan Muthukrishnan and published in 2005 [2].
8+
9+
References
10+
----------
11+
[1] Cormode, G., Muthukrishnan, S.
12+
What's hot and what's not: Tracking most frequent items dynamically
13+
Proceedings of the 22nd ACM SIGMOD-SIGACT-SIGART symposium on Principles
14+
of database systems, San Diego, California - June 09-11, 2003,
15+
pp. 296–306, ACM New York, NY.
16+
http://www.cs.princeton.edu/courses/archive/spr04/cos598B/bib/CormodeM-hot.pdf
17+
[2] Cormode, G., Muthukrishnan, S.
18+
An Improved Data Stream Summary: The Count–Min Sketch and its Applications
19+
Journal of Algorithms, Vol. 55 (1), pp. 58–75.
20+
http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf
21+
22+
"""
23+
import cython
24+
25+
from cpython.array cimport array
26+
from libc.math cimport ceil, log, M_E
27+
from libc.stdint cimport uint64_t, uint32_t, uint8_t
28+
from libc.stdint cimport UINT32_MAX, UINT8_MAX
29+
from libc.stdlib cimport rand, RAND_MAX
30+
31+
from pdsa.helpers.hashing.mmh cimport mmh3_x86_32bit
32+
33+
34+
cdef class CountMinSketch:
    """Count-Min Sketch.

    Count-Min Sketch is a simple data structure that indexes elements
    from a data stream by updating counters, and can then estimate
    the number of times every element has been indexed.

    The sketch consists of `num_of_counters` counter arrays, each holding
    `length_of_counter` counters. All arrays are stored back to back in
    a single flat buffer, so the counter `j` of the array `i` lives at
    position `i * length_of_counter + j`.

    Example
    -------

    >>> from pdsa.frequency.count_min_sketch import CountMinSketch

    >>> cms = CountMinSketch(5, 2000)
    >>> cms.add("hello")
    >>> cms.frequency("hello")
    1


    Note
    -----
        This implementation uses MurmurHash3 family of hash functions
        which yields a 32-bit hash value. Thus, the length of the counters
        is expected to be smaller or equal to the (2^{32} - 1), since
        we cannot access elements with indexes above this value.

    Note
    -----
        This implementation uses 32-bit counters that freeze at their
        maximal values (2^{32} - 1).

    Attributes
    ----------
    num_of_counters : :obj:`int`
        The number of counter arrays used in the sketch.
    length_of_counter : :obj:`int`
        The number of counters in each counter array.

    """

    @cython.cdivision(True)
    def __cinit__(self, const uint8_t num_of_counters, const uint32_t length_of_counter):
        """Create sketch from its dimensions.

        Parameters
        ----------
        num_of_counters : :obj:`int`
            The number of counter arrays used in the sketch.
        length_of_counter : :obj:`int`
            The number of counters in each counter array.

        Raises
        ------
        ValueError
            If `num_of_counters` is less than 1.
        ValueError
            If `length_of_counter` is less than 1.

        """
        if num_of_counters < 1:
            raise ValueError("At least one counter array is required")

        if length_of_counter < 1:
            raise ValueError("The length of the counter array cannot be less then 1")

        self.num_of_counters = num_of_counters
        self.length_of_counter = length_of_counter

        # Widen to 64 bits *before* multiplying: uint8 x uint32 would
        # otherwise be evaluated in 32-bit arithmetic and could wrap.
        self._length = <uint64_t>self.num_of_counters * self.length_of_counter

        self._MAX_COUNTER_VALUE = UINT32_MAX

        # Every counter array needs its own hash seed, otherwise two arrays
        # hash identically and add no information. Dividing rand() by
        # RAND_MAX as C integers truncates to 0 almost always, so instead
        # take a random starting point and hand out consecutive seeds
        # modulo 256; they are distinct because there are at most 255 arrays.
        cdef uint32_t seed_offset = <uint32_t>rand() % (UINT8_MAX + 1)
        self._seeds = array('B', [
            (seed_offset + row) % (UINT8_MAX + 1)
            for row in range(self.num_of_counters)
        ])

        # Allocate all counters already zeroed, in a single step.
        self._counter = array('I', [0]) * self._length

    @classmethod
    def create_from_expected_error(cls, const float deviation, const float error):
        """Create sketch from the expected frequency deviation and error probability.

        Parameters
        ----------
        deviation : float
            The error ε in answering the particular query.
            For example, if we expect 10^7 elements and allow
            the fixed overestimate of 10, the deviation is 10/10^7 = 10^{-6}.
        error : float
            The standard error δ (0 < error < 1).

        Note
        ----
            The Count-Min Sketch is approximate and probabilistic at the same
            time, therefore two parameters, the error ε in answering the
            particular query and the error probability δ, affect the space
            and time requirements. In fact, it provides the guarantee that
            the estimation error for frequencies will not exceed ε x n
            with probability at least 1 - δ.

        Raises
        ------
        ValueError
            If `deviation` is not greater than 10^{-10}, or is so small
            that the required number of counters per array does not fit
            into 32 bits.
        ValueError
            If `error` is not in range (0, 1).

        """
        if deviation <= 0.0000000001:
            raise ValueError("Deviation is too small. Not enough counters")

        # Written as a negated range test so that NaN is rejected as well.
        if not (0 < error < 1):
            raise ValueError("Error rate shell be in (0, 1)")

        # Check the required length as a double first: casting a value
        # above UINT32_MAX (or NaN) to uint32_t is not well defined.
        cdef double required_length = ceil(M_E / deviation)
        if not (required_length <= UINT32_MAX):
            raise ValueError("Deviation is too small. Not enough counters")

        cdef uint8_t num_of_counters = <uint8_t > (ceil(-log(error)))
        cdef uint32_t length_of_counter = <uint32_t > required_length

        return cls(max(1, num_of_counters), max(1, length_of_counter))

    cdef uint32_t _hash(self, object key, uint8_t seed):
        """Compute the 32-bit MurmurHash3 value of `key` for the given `seed`."""
        return mmh3_x86_32bit(key, seed)

    def __dealloc__(self):
        # Nothing to release explicitly here.
        pass

    cdef bint _increment_counter(self, const uint64_t index):
        """Increment counter if the value doesn't exceed maximal allowed.

        Parameters
        ----------
        index : obj:`int`
            The index of the counter to be incremented.

        Returns
        -------
        bint
            True if the counter has been incremented, False if it is
            already at its maximal value.

        Note
        ----
            When counter reaches its maximal value, we simply freeze it there.

        """
        if self._counter[index] < self._MAX_COUNTER_VALUE:
            self._counter[index] += 1
            return True
        return False

    @cython.boundscheck(False)
    @cython.wraparound(False)
    @cython.cdivision(True)
    cpdef void add(self, object element) except *:
        """Index element into the sketch.

        Parameters
        ----------
        element : obj
            The element to be indexed into the sketch.

        """
        cdef uint8_t counter_index
        cdef uint32_t element_index
        cdef uint8_t seed
        cdef uint64_t index
        for counter_index in range(self.num_of_counters):
            seed = self._seeds[counter_index]
            element_index = self._hash(element, seed) % self.length_of_counter
            # Each counter array occupies `length_of_counter` consecutive
            # cells; the offset is computed in 64 bits to avoid wrapping.
            index = <uint64_t>counter_index * self.length_of_counter + element_index
            self._increment_counter(index)

    @cython.boundscheck(False)
    @cython.wraparound(False)
    @cython.cdivision(True)
    cpdef uint32_t frequency(self, object element) except *:
        """Estimate frequency of element.

        The estimate is the minimum over the counters the element maps to,
        one in every counter array.

        Parameters
        ----------
        element : obj
            The element to estimate the frequency for.

        Returns
        -------
        uint32_t
            The frequency of the element.

        """
        cdef uint8_t counter_index
        cdef uint32_t element_index
        cdef uint8_t seed
        cdef uint64_t index
        cdef uint32_t frequency = self._MAX_COUNTER_VALUE
        for counter_index in range(self.num_of_counters):
            seed = self._seeds[counter_index]
            element_index = self._hash(element, seed) % self.length_of_counter
            # Must mirror the addressing used in `add`.
            index = <uint64_t>counter_index * self.length_of_counter + element_index
            frequency = min(frequency, self._counter[index])
        return frequency

    cpdef size_t sizeof(self):
        """Size of the sketch in bytes.

        Returns
        -------
        :obj:`int`
            Number of bytes allocated for the counters of the sketch.

        """
        return self._length * sizeof(uint32_t)

    def __repr__(self):
        return "<CountMinSketch ({} x {})>".format(
            self.num_of_counters,
            self.length_of_counter
        )

    def __len__(self):
        """Get length of the sketch.

        Returns
        -------
        :obj:`int`
            The total number of counters in the sketch.

        """
        return self._length

    def debug(self):
        """Return sketch for debug purposes."""
        return self._counter

pdsa/frequency/count_sketch.pxd

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from libc.stdint cimport uint64_t, uint32_t, uint8_t
2+
from libc.stdint cimport int32_t
3+
4+
5+
cdef class CountSketch:
    # C-level declaration of the Count Sketch extension type.
    # Attribute order is kept as originally declared.
    cdef:
        # Bounds for a single signed 32-bit counter.
        # NOTE(review): presumably counters saturate at these values --
        # confirm against the implementation.
        int32_t _MAX_COUNTER_VALUE
        int32_t _MIN_COUNTER_VALUE

        # Sketch dimensions.
        uint8_t num_of_counters
        uint32_t length_of_counter

        # NOTE(review): presumably the total number of counters -- confirm.
        uint64_t _length
        uint8_t[:] _seeds
        # NOTE(review): presumably seeds of the hashes that choose whether a
        # counter is incremented or decremented -- confirm.
        uint8_t[:] _seeds_for_switcher
        # Signed counter storage.
        int32_t[:] _counter

    # Public API (callable from both Python and Cython).
    cpdef void add(self, object element) except *
    cpdef uint32_t frequency(self, object element) except *
    cpdef size_t sizeof(self)

    # Internal C-only helpers.
    cdef bint _update_counter(self, const uint64_t index, const bint reverse)
    cdef uint32_t _hash(self, object element, uint8_t seed)

0 commit comments

Comments
 (0)