LCOV - code coverage report
Current view: top level - mfbt - BloomFilter.h (source / functions) Hit Total Coverage
Test: output.info Lines: 37 43 86.0 %
Date: 2017-07-14 16:53:18 Functions: 12 27 44.4 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
       2             : /* vim: set ts=8 sts=2 et sw=2 tw=80: */
       3             : /* This Source Code Form is subject to the terms of the Mozilla Public
       4             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       5             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
       6             : 
       7             : /*
       8             :  * A counting Bloom filter implementation.  This allows consumers to
       9             :  * do fast probabilistic "is item X in set Y?" testing which will
      10             :  * never answer "no" when the correct answer is "yes" (but might
      11             :  * incorrectly answer "yes" when the correct answer is "no").
      12             :  */
      13             : 
      14             : #ifndef mozilla_BloomFilter_h
      15             : #define mozilla_BloomFilter_h
      16             : 
      17             : #include "mozilla/Assertions.h"
      18             : #include "mozilla/Likely.h"
      19             : 
      20             : #include <stdint.h>
      21             : #include <string.h>
      22             : 
      23             : namespace mozilla {
      24             : 
      25             : /*
      26             :  * This class implements a counting Bloom filter as described at
      27             :  * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
      28             :  * 8-bit counters.  This allows quick probabilistic answers to the
      29             :  * question "is object X in set Y?" where the contents of Y might not
      30             :  * be time-invariant.  The probabilistic nature of the test means that
      31             :  * sometimes the answer will be "yes" when it should be "no".  If the
      32             :  * answer is "no", then X is guaranteed not to be in Y.
      33             :  *
      34             :  * The filter is parametrized on KeySize, which is the size of the key
      35             :  * generated by each of hash functions used by the filter, in bits,
      36             :  * and the type of object T being added and removed.  T must implement
      37             :  * a |uint32_t hash() const| method which returns a uint32_t hash key
      38             :  * that will be used to generate the two separate hash functions for
      39             :  * the Bloom filter.  This hash key MUST be well-distributed for good
      40             :  * results!  KeySize is not allowed to be larger than 16.
      41             :  *
      42             :  * The filter uses exactly 2**KeySize bytes of memory.  From now on we
      43             :  * will refer to the memory used by the filter as M.
      44             :  *
      45             :  * The expected rate of incorrect "yes" answers depends on M and on
      46             :  * the number N of objects in set Y.  As long as N is small compared
      47             :  * to M, the rate of such answers is expected to be approximately
      48             :  * 4*(N/M)**2 for this filter.  In practice, if Y has a few hundred
      49             :  * elements then using a KeySize of 12 gives a reasonably low
      50             :  * incorrect answer rate.  A KeySize of 12 has the additional benefit
      51             :  * of using exactly one page for the filter in typical hardware
      52             :  * configurations.
      53             :  */
      54             : 
      55             : template<unsigned KeySize, class T>
      56             : class BloomFilter
      57             : {
      58             :   /*
      59             :    * A counting Bloom filter with 8-bit counters.  For now we assume
      60             :    * that having two hash functions is enough, but we may revisit that
      61             :    * decision later.
      62             :    *
      63             :    * The filter uses an array with 2**KeySize entries.
      64             :    *
      65             :    * Assuming a well-distributed hash function, a Bloom filter with
      66             :    * array size M containing N elements and
      67             :    * using k hash function has expected false positive rate exactly
      68             :    *
      69             :    * $  (1 - (1 - 1/M)^{kN})^k  $
      70             :    *
      71             :    * because each array slot has a
      72             :    *
      73             :    * $  (1 - 1/M)^{kN}  $
      74             :    *
      75             :    * chance of being 0, and the expected false positive rate is the
      76             :    * probability that all of the k hash functions will hit a nonzero
      77             :    * slot.
      78             :    *
      79             :    * For reasonable assumptions (M large, kN large, which should both
      80             :    * hold if we're worried about false positives) about M and kN this
      81             :    * becomes approximately
      82             :    *
      83             :    * $$  (1 - \exp(-kN/M))^k   $$
      84             :    *
      85             :    * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
      86             :    * or in other words
      87             :    *
      88             :    * $$    N/M = -0.5 * \ln(1 - \sqrt(r))   $$
      89             :    *
      90             :    * where r is the false positive rate.  This can be used to compute
      91             :    * the desired KeySize for a given load N and false positive rate r.
      92             :    *
      93             :    * If N/M is assumed small, then the false positive rate can
      94             :    * further be approximated as 4*N^2/M^2.  So increasing KeySize by
      95             :    * 1, which doubles M, reduces the false positive rate by about a
      96             :    * factor of 4, and a false positive rate of 1% corresponds to
      97             :    * about M/N == 20.
      98             :    *
      99             :    * What this means in practice is that for a few hundred keys using a
     100             :    * KeySize of 12 gives false positive rates on the order of 0.25-4%.
     101             :    *
     102             :    * Similarly, using a KeySize of 10 would lead to a 4% false
     103             :    * positive rate for N == 100 and to quite bad false positive
     104             :    * rates for larger N.
     105             :    */
     106             : public:
     107         141 :   BloomFilter()
     108             :   {
     109             :     static_assert(KeySize <= kKeyShift, "KeySize too big");
     110             : 
     111             :     // Should we have a custom operator new using calloc instead and
     112             :     // require that we're allocated via the operator?
     113         141 :     clear();
     114         141 :   }
     115             : 
     116             :   /*
     117             :    * Clear the filter.  This should be done before reusing it, because
     118             :    * just removing all items doesn't clear counters that hit the upper
     119             :    * bound.
     120             :    */
     121             :   void clear();
     122             : 
     123             :   /*
     124             :    * Add an item to the filter.
     125             :    */
     126             :   void add(const T* aValue);
     127             : 
     128             :   /*
     129             :    * Remove an item from the filter.
     130             :    */
     131             :   void remove(const T* aValue);
     132             : 
     133             :   /*
     134             :    * Check whether the filter might contain an item.  This can
     135             :    * sometimes return true even if the item is not in the filter,
     136             :    * but will never return false for items that are actually in the
     137             :    * filter.
     138             :    */
     139             :   bool mightContain(const T* aValue) const;
     140             : 
     141             :   /*
     142             :    * Methods for add/remove/contain when we already have a hash computed
     143             :    */
     144             :   void add(uint32_t aHash);
     145             :   void remove(uint32_t aHash);
     146             :   bool mightContain(uint32_t aHash) const;
     147             : 
     148             : private:
     149             :   static const size_t kArraySize = (1 << KeySize);
     150             :   static const uint32_t kKeyMask = (1 << KeySize) - 1;
     151             :   static const uint32_t kKeyShift = 16;
     152             : 
     153       81362 :   static uint32_t hash1(uint32_t aHash)
     154             :   {
     155       81362 :     return aHash & kKeyMask;
     156             :   }
     157       16116 :   static uint32_t hash2(uint32_t aHash)
     158             :   {
     159       16116 :     return (aHash >> kKeyShift) & kKeyMask;
     160             :   }
     161             : 
     162        6535 :   uint8_t& firstSlot(uint32_t aHash)
     163             :   {
     164        6535 :     return mCounters[hash1(aHash)];
     165             :   }
     166        6535 :   uint8_t& secondSlot(uint32_t aHash)
     167             :   {
     168        6535 :     return mCounters[hash2(aHash)];
     169             :   }
     170             : 
     171       74827 :   const uint8_t& firstSlot(uint32_t aHash) const
     172             :   {
     173       74827 :     return mCounters[hash1(aHash)];
     174             :   }
     175        9581 :   const uint8_t& secondSlot(uint32_t aHash) const
     176             :   {
     177        9581 :     return mCounters[hash2(aHash)];
     178             :   }
     179             : 
     180       13070 :   static bool full(const uint8_t& aSlot) { return aSlot == UINT8_MAX; }
     181             : 
     182             :   uint8_t mCounters[kArraySize];
     183             : };
     184             : 
     185             : template<unsigned KeySize, class T>
     186             : inline void
     187         141 : BloomFilter<KeySize, T>::clear()
     188             : {
     189         141 :   memset(mCounters, 0, kArraySize);
     190         141 : }
     191             : 
     192             : template<unsigned KeySize, class T>
     193             : inline void
     194        3989 : BloomFilter<KeySize, T>::add(uint32_t aHash)
     195             : {
     196        3989 :   uint8_t& slot1 = firstSlot(aHash);
     197        3989 :   if (MOZ_LIKELY(!full(slot1))) {
     198        3989 :     ++slot1;
     199             :   }
     200        3989 :   uint8_t& slot2 = secondSlot(aHash);
     201        3989 :   if (MOZ_LIKELY(!full(slot2))) {
     202        3989 :     ++slot2;
     203             :   }
     204        3989 : }
     205             : 
     206             : template<unsigned KeySize, class T>
     207             : MOZ_ALWAYS_INLINE void
     208           0 : BloomFilter<KeySize, T>::add(const T* aValue)
     209             : {
     210           0 :   uint32_t hash = aValue->hash();
     211           0 :   return add(hash);
     212             : }
     213             : 
     214             : template<unsigned KeySize, class T>
     215             : inline void
     216        2546 : BloomFilter<KeySize, T>::remove(uint32_t aHash)
     217             : {
     218             :   // If the slots are full, we don't know whether we bumped them to be
     219             :   // there when we added or not, so just leave them full.
     220        2546 :   uint8_t& slot1 = firstSlot(aHash);
     221        2546 :   if (MOZ_LIKELY(!full(slot1))) {
     222        2546 :     --slot1;
     223             :   }
     224        2546 :   uint8_t& slot2 = secondSlot(aHash);
     225        2546 :   if (MOZ_LIKELY(!full(slot2))) {
     226        2546 :     --slot2;
     227             :   }
     228        2546 : }
     229             : 
     230             : template<unsigned KeySize, class T>
     231             : MOZ_ALWAYS_INLINE void
     232             : BloomFilter<KeySize, T>::remove(const T* aValue)
     233             : {
     234             :   uint32_t hash = aValue->hash();
     235             :   remove(hash);
     236             : }
     237             : 
     238             : template<unsigned KeySize, class T>
     239             : MOZ_ALWAYS_INLINE bool
     240       74827 : BloomFilter<KeySize, T>::mightContain(uint32_t aHash) const
     241             : {
     242             :   // Check that all the slots for this hash contain something
     243       74827 :   return firstSlot(aHash) && secondSlot(aHash);
     244             : }
     245             : 
     246             : template<unsigned KeySize, class T>
     247             : MOZ_ALWAYS_INLINE bool
     248           0 : BloomFilter<KeySize, T>::mightContain(const T* aValue) const
     249             : {
     250           0 :   uint32_t hash = aValue->hash();
     251           0 :   return mightContain(hash);
     252             : }
     253             : 
     254             : } // namespace mozilla
     255             : 
     256             : #endif /* mozilla_BloomFilter_h */

Generated by: LCOV version 1.13