Line data Source code
1 : /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 : /* This Source Code Form is subject to the terms of the Mozilla Public
4 : * License, v. 2.0. If a copy of the MPL was not distributed with this
5 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 :
7 : /*
8 : * A counting Bloom filter implementation. This allows consumers to
9 : * do fast probabilistic "is item X in set Y?" testing which will
10 : * never answer "no" when the correct answer is "yes" (but might
11 : * incorrectly answer "yes" when the correct answer is "no").
12 : */
13 :
14 : #ifndef mozilla_BloomFilter_h
15 : #define mozilla_BloomFilter_h
16 :
17 : #include "mozilla/Assertions.h"
18 : #include "mozilla/Likely.h"
19 :
20 : #include <stdint.h>
21 : #include <string.h>
22 :
23 : namespace mozilla {
24 :
25 : /*
26 : * This class implements a counting Bloom filter as described at
27 : * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
28 : * 8-bit counters. This allows quick probabilistic answers to the
29 : * question "is object X in set Y?" where the contents of Y might not
30 : * be time-invariant. The probabilistic nature of the test means that
31 : * sometimes the answer will be "yes" when it should be "no". If the
32 : * answer is "no", then X is guaranteed not to be in Y.
33 : *
34 : * The filter is parametrized on KeySize, which is the size of the key
35 : * generated by each of hash functions used by the filter, in bits,
36 : * and the type of object T being added and removed. T must implement
37 : * a |uint32_t hash() const| method which returns a uint32_t hash key
38 : * that will be used to generate the two separate hash functions for
39 : * the Bloom filter. This hash key MUST be well-distributed for good
40 : * results! KeySize is not allowed to be larger than 16.
41 : *
42 : * The filter uses exactly 2**KeySize bytes of memory. From now on we
43 : * will refer to the memory used by the filter as M.
44 : *
45 : * The expected rate of incorrect "yes" answers depends on M and on
46 : * the number N of objects in set Y. As long as N is small compared
47 : * to M, the rate of such answers is expected to be approximately
48 : * 4*(N/M)**2 for this filter. In practice, if Y has a few hundred
49 : * elements then using a KeySize of 12 gives a reasonably low
50 : * incorrect answer rate. A KeySize of 12 has the additional benefit
51 : * of using exactly one page for the filter in typical hardware
52 : * configurations.
53 : */
54 :
55 : template<unsigned KeySize, class T>
56 : class BloomFilter
57 : {
58 : /*
59 : * A counting Bloom filter with 8-bit counters. For now we assume
60 : * that having two hash functions is enough, but we may revisit that
61 : * decision later.
62 : *
63 : * The filter uses an array with 2**KeySize entries.
64 : *
65 : * Assuming a well-distributed hash function, a Bloom filter with
66 : * array size M containing N elements and
67 : * using k hash function has expected false positive rate exactly
68 : *
69 : * $ (1 - (1 - 1/M)^{kN})^k $
70 : *
71 : * because each array slot has a
72 : *
73 : * $ (1 - 1/M)^{kN} $
74 : *
75 : * chance of being 0, and the expected false positive rate is the
76 : * probability that all of the k hash functions will hit a nonzero
77 : * slot.
78 : *
79 : * For reasonable assumptions (M large, kN large, which should both
80 : * hold if we're worried about false positives) about M and kN this
81 : * becomes approximately
82 : *
83 : * $$ (1 - \exp(-kN/M))^k $$
84 : *
85 : * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
86 : * or in other words
87 : *
88 : * $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$
89 : *
90 : * where r is the false positive rate. This can be used to compute
91 : * the desired KeySize for a given load N and false positive rate r.
92 : *
93 : * If N/M is assumed small, then the false positive rate can
94 : * further be approximated as 4*N^2/M^2. So increasing KeySize by
95 : * 1, which doubles M, reduces the false positive rate by about a
96 : * factor of 4, and a false positive rate of 1% corresponds to
97 : * about M/N == 20.
98 : *
99 : * What this means in practice is that for a few hundred keys using a
100 : * KeySize of 12 gives false positive rates on the order of 0.25-4%.
101 : *
102 : * Similarly, using a KeySize of 10 would lead to a 4% false
103 : * positive rate for N == 100 and to quite bad false positive
104 : * rates for larger N.
105 : */
106 : public:
107 141 : BloomFilter()
108 : {
109 : static_assert(KeySize <= kKeyShift, "KeySize too big");
110 :
111 : // Should we have a custom operator new using calloc instead and
112 : // require that we're allocated via the operator?
113 141 : clear();
114 141 : }
115 :
116 : /*
117 : * Clear the filter. This should be done before reusing it, because
118 : * just removing all items doesn't clear counters that hit the upper
119 : * bound.
120 : */
121 : void clear();
122 :
123 : /*
124 : * Add an item to the filter.
125 : */
126 : void add(const T* aValue);
127 :
128 : /*
129 : * Remove an item from the filter.
130 : */
131 : void remove(const T* aValue);
132 :
133 : /*
134 : * Check whether the filter might contain an item. This can
135 : * sometimes return true even if the item is not in the filter,
136 : * but will never return false for items that are actually in the
137 : * filter.
138 : */
139 : bool mightContain(const T* aValue) const;
140 :
141 : /*
142 : * Methods for add/remove/contain when we already have a hash computed
143 : */
144 : void add(uint32_t aHash);
145 : void remove(uint32_t aHash);
146 : bool mightContain(uint32_t aHash) const;
147 :
148 : private:
149 : static const size_t kArraySize = (1 << KeySize);
150 : static const uint32_t kKeyMask = (1 << KeySize) - 1;
151 : static const uint32_t kKeyShift = 16;
152 :
153 81362 : static uint32_t hash1(uint32_t aHash)
154 : {
155 81362 : return aHash & kKeyMask;
156 : }
157 16116 : static uint32_t hash2(uint32_t aHash)
158 : {
159 16116 : return (aHash >> kKeyShift) & kKeyMask;
160 : }
161 :
162 6535 : uint8_t& firstSlot(uint32_t aHash)
163 : {
164 6535 : return mCounters[hash1(aHash)];
165 : }
166 6535 : uint8_t& secondSlot(uint32_t aHash)
167 : {
168 6535 : return mCounters[hash2(aHash)];
169 : }
170 :
171 74827 : const uint8_t& firstSlot(uint32_t aHash) const
172 : {
173 74827 : return mCounters[hash1(aHash)];
174 : }
175 9581 : const uint8_t& secondSlot(uint32_t aHash) const
176 : {
177 9581 : return mCounters[hash2(aHash)];
178 : }
179 :
180 13070 : static bool full(const uint8_t& aSlot) { return aSlot == UINT8_MAX; }
181 :
182 : uint8_t mCounters[kArraySize];
183 : };
184 :
185 : template<unsigned KeySize, class T>
186 : inline void
187 141 : BloomFilter<KeySize, T>::clear()
188 : {
189 141 : memset(mCounters, 0, kArraySize);
190 141 : }
191 :
192 : template<unsigned KeySize, class T>
193 : inline void
194 3989 : BloomFilter<KeySize, T>::add(uint32_t aHash)
195 : {
196 3989 : uint8_t& slot1 = firstSlot(aHash);
197 3989 : if (MOZ_LIKELY(!full(slot1))) {
198 3989 : ++slot1;
199 : }
200 3989 : uint8_t& slot2 = secondSlot(aHash);
201 3989 : if (MOZ_LIKELY(!full(slot2))) {
202 3989 : ++slot2;
203 : }
204 3989 : }
205 :
206 : template<unsigned KeySize, class T>
207 : MOZ_ALWAYS_INLINE void
208 0 : BloomFilter<KeySize, T>::add(const T* aValue)
209 : {
210 0 : uint32_t hash = aValue->hash();
211 0 : return add(hash);
212 : }
213 :
214 : template<unsigned KeySize, class T>
215 : inline void
216 2546 : BloomFilter<KeySize, T>::remove(uint32_t aHash)
217 : {
218 : // If the slots are full, we don't know whether we bumped them to be
219 : // there when we added or not, so just leave them full.
220 2546 : uint8_t& slot1 = firstSlot(aHash);
221 2546 : if (MOZ_LIKELY(!full(slot1))) {
222 2546 : --slot1;
223 : }
224 2546 : uint8_t& slot2 = secondSlot(aHash);
225 2546 : if (MOZ_LIKELY(!full(slot2))) {
226 2546 : --slot2;
227 : }
228 2546 : }
229 :
230 : template<unsigned KeySize, class T>
231 : MOZ_ALWAYS_INLINE void
232 : BloomFilter<KeySize, T>::remove(const T* aValue)
233 : {
234 : uint32_t hash = aValue->hash();
235 : remove(hash);
236 : }
237 :
238 : template<unsigned KeySize, class T>
239 : MOZ_ALWAYS_INLINE bool
240 74827 : BloomFilter<KeySize, T>::mightContain(uint32_t aHash) const
241 : {
242 : // Check that all the slots for this hash contain something
243 74827 : return firstSlot(aHash) && secondSlot(aHash);
244 : }
245 :
246 : template<unsigned KeySize, class T>
247 : MOZ_ALWAYS_INLINE bool
248 0 : BloomFilter<KeySize, T>::mightContain(const T* aValue) const
249 : {
250 0 : uint32_t hash = aValue->hash();
251 0 : return mightContain(hash);
252 : }
253 :
254 : } // namespace mozilla
255 :
256 : #endif /* mozilla_BloomFilter_h */
|