|
| 1 | +""" |
| 2 | +Count-Min Sketch. |
| 3 | +
|
| 4 | +Count–Min Sketch is a simple space-efficient probabilistic data structure |
| 5 | +that is used to estimate frequencies of elements in data streams and can |
| 6 | +address the Heavy hitters problem. It was presented in 2003 [1] by |
| 7 | +Graham Cormode and Shan Muthukrishnan and published in 2005 [2]. |
| 8 | +
|
| 9 | +References |
| 10 | +---------- |
| 11 | +[1] Cormode, G., Muthukrishnan, S. |
| 12 | + What's hot and what's not: Tracking most frequent items dynamically |
| 13 | + Proceedings of the 22nd ACM SIGMOD-SIGACT-SIGART symposium on Principles |
| 14 | + of database systems, San Diego, California - June 09-11, 2003, |
| 15 | + pp. 296–306, ACM New York, NY. |
| 16 | + http://www.cs.princeton.edu/courses/archive/spr04/cos598B/bib/CormodeM-hot.pdf |
| 17 | +[2] Cormode, G., Muthukrishnan, S. |
| 18 | + An Improved Data Stream Summary: The Count–Min Sketch and its Applications |
| 19 | + Journal of Algorithms, Vol. 55 (1), pp. 58–75. |
| 20 | + http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf |
| 21 | +
|
| 22 | +""" |
| 23 | +import cython |
| 24 | + |
| 25 | +from cpython.array cimport array |
| 26 | +from libc.math cimport ceil, log, M_E |
| 27 | +from libc.stdint cimport uint64_t, uint32_t, uint8_t |
| 28 | +from libc.stdint cimport UINT32_MAX, UINT8_MAX |
| 29 | +from libc.stdlib cimport rand, RAND_MAX |
| 30 | + |
| 31 | +from pdsa.helpers.hashing.mmh cimport mmh3_x86_32bit |
| 32 | + |
| 33 | + |
cdef class CountMinSketch:
    """Count-Min Sketch.

    Count-Min Sketch is a simple data structure that indexes elements
    from a data stream by updating a small table of counters, and can
    then estimate how many times each element has been indexed.

    The table is stored as one flat array in row-major order: row ``r``
    (one "counter array") occupies the cells
    ``[r * length_of_counter, (r + 1) * length_of_counter)``.

    Example
    -------

    >>> from pdsa.frequency.count_min_sketch import CountMinSketch

    >>> cms = CountMinSketch(5, 2000)
    >>> cms.add("hello")
    >>> cms.frequency("hello")
    1

    Note
    -----
        This implementation uses MurmurHash3 family of hash functions
        which yields a 32-bit hash value. Thus, the length of the counters
        is expected to be smaller or equal to the (2^{32} - 1), since
        we cannot access elements with indexes above this value.

    Note
    -----
        This implementation uses 32-bits counters that freeze at their
        maximal values (2^{32} - 1).

    Attributes
    ----------
    num_of_counters : :obj:`int`
        The number of counter arrays used in the sketch.
    length_of_counter : :obj:`int`
        The number of counters in each counter array.

    """

    @cython.cdivision(True)
    def __cinit__(self, const uint8_t num_of_counters, const uint32_t length_of_counter):
        """Create sketch from its dimensions.

        Parameters
        ----------
        num_of_counters : :obj:`int`
            The number of counter arrays used in the sketch.
        length_of_counter : :obj:`int`
            The number of counters in each counter array.

        Raises
        ------
        ValueError
            If `num_of_counters` is less than 1.
        ValueError
            If `length_of_counter` is less than 1.

        """
        if num_of_counters < 1:
            raise ValueError("At least one counter array is required")

        if length_of_counter < 1:
            raise ValueError("The length of the counter array cannot be less then 1")

        self.num_of_counters = num_of_counters
        self.length_of_counter = length_of_counter

        # Widen to 64 bits before multiplying: the product of an 8-bit and
        # a 32-bit operand would otherwise be evaluated in 32 bits and
        # could wrap around for large sketches.
        self._length = (<uint64_t>num_of_counters) * length_of_counter

        self._MAX_COUNTER_VALUE = UINT32_MAX

        # One 8-bit hash seed per counter array. A random base is taken
        # with the modulo operator (integer `rand() / RAND_MAX` truncates
        # to 0 almost always, which would give every row the same hash
        # function). Adding the row number modulo 256 keeps the seeds
        # pairwise distinct, since there are at most 255 rows.
        cdef uint32_t seed_base = rand() % (UINT8_MAX + 1)
        self._seeds = array('B', [
            (seed_base + row) % (UINT8_MAX + 1)
            for row in range(self.num_of_counters)
        ])

        # Allocate the zero-filled counter table directly instead of
        # materialising range(_length) and overwriting it afterwards.
        self._counter = array('I', [0]) * self._length

    @classmethod
    def create_from_expected_error(cls, const float deviation, const float error):
        """Create sketch from the expected frequency deviation and error probability.

        The sketch gets ceil(ln(1 / error)) counter arrays, each of
        ceil(e / deviation) counters.

        Parameters
        ----------
        deviation : float
            The error ε in answering the particular query.
            For example, if we expect 10^7 elements and allow
            the fixed overestimate of 10, the deviation is 10/10^7 = 10^{-6}.
        error : float
            The standard error δ (0 < error < 1).

        Note
        ----
            The Count–Min Sketch is approximate and probabilistic at the same
            time, therefore two parameters, the error ε in answering the
            particular query and the error probability δ, affect the space
            and time requirements. In fact, it provides the guarantee that
            the estimation error for frequencies will not exceed ε x n
            with probability at least 1 – δ.

        Raises
        ------
        ValueError
            If `deviation` is not greater than 10^{-10}, or is so small
            that the required number of counters per array does not fit
            into 32 bits.
        ValueError
            If `error` is not in range (0, 1).

        """
        # Written as negated comparisons so that NaN is rejected as well.
        if not (deviation > 0.0000000001):
            raise ValueError("Deviation is too small. Not enough counters")

        if not (0 < error < 1):
            raise ValueError("Error rate shell be in (0, 1)")

        cdef double rows = ceil(-log(error))
        cdef double columns = ceil(M_E / deviation)

        # The row length is stored as a 32-bit value; converting a larger
        # floating point number would be out of range, so refuse it.
        if columns > UINT32_MAX:
            raise ValueError("Deviation is too small. Not enough counters")

        cdef uint8_t num_of_counters = <uint8_t>rows
        cdef uint32_t length_of_counter = <uint32_t>columns

        return cls(max(1, num_of_counters), max(1, length_of_counter))

    cdef uint32_t _hash(self, object key, uint8_t seed):
        """Return the 32-bit hash of `key` for the given `seed`."""
        return mmh3_x86_32bit(key, seed)

    def __dealloc__(self):
        # Nothing to release explicitly here.
        pass

    cdef bint _increment_counter(self, const uint64_t index):
        """Increment counter if the value doesn't exceed maximal allowed.

        Parameters
        ----------
        index : obj:`int`
            The index of the counter to be incremented.

        Returns
        -------
        bint
            True if the counter was incremented, False if it was
            already at its maximal value.

        Note
        ----
            When counter reaches its maximal value, we simply freeze it there.

        """
        if self._counter[index] < self._MAX_COUNTER_VALUE:
            self._counter[index] += 1
            return True
        return False

    @cython.boundscheck(False)
    @cython.wraparound(False)
    @cython.cdivision(True)
    cpdef void add(self, object element) except *:
        """Index element into the sketch.

        For every counter array the element is hashed with that array's
        seed and the selected counter is incremented.

        Parameters
        ----------
        element : obj
            The element to be indexed into the sketch.

        """
        cdef uint8_t counter_index
        cdef uint32_t element_index
        cdef uint8_t seed
        cdef uint64_t index
        for counter_index in range(self.num_of_counters):
            seed = self._seeds[counter_index]
            element_index = self._hash(element, seed) % self.length_of_counter
            # Row-major position: the row stride is the full row length,
            # so rows never share a cell. Widened to 64 bits to avoid
            # wrap-around of the product.
            index = (<uint64_t>counter_index) * self.length_of_counter + element_index
            self._increment_counter(index)

    @cython.boundscheck(False)
    @cython.wraparound(False)
    @cython.cdivision(True)
    cpdef uint32_t frequency(self, object element) except *:
        """Estimate frequency of element.

        The estimate is the minimum over the counters selected for the
        element in each counter array.

        Parameters
        ----------
        element : obj
            The element to estimate the frequency for.

        Returns
        -------
        uint32_t
            The frequency of the element.

        """
        cdef uint8_t counter_index
        cdef uint32_t element_index
        cdef uint8_t seed
        cdef uint64_t index
        cdef uint32_t frequency = self._MAX_COUNTER_VALUE
        for counter_index in range(self.num_of_counters):
            seed = self._seeds[counter_index]
            element_index = self._hash(element, seed) % self.length_of_counter
            # Must use exactly the same cell layout as `add`.
            index = (<uint64_t>counter_index) * self.length_of_counter + element_index
            frequency = min(frequency, self._counter[index])
        return frequency

    cpdef size_t sizeof(self):
        """Size of the sketch in bytes.

        Returns
        -------
        :obj:`int`
            Number of bytes allocated for the sketch, computed as the
            total number of counters times the size of a 32-bit counter.

        """
        return self._length * sizeof(uint32_t)

    def __repr__(self):
        return "<CountMinSketch ({} x {})>".format(
            self.num_of_counters,
            self.length_of_counter
        )

    def __len__(self):
        """Get length of the sketch.

        Returns
        -------
        :obj:`int`
            The total number of counters in the sketch.

        """
        return self._length

    def debug(self):
        """Return sketch for debug purposes."""
        return self._counter
0 commit comments