BloomFilter.h

Go to the documentation of this file.
00001 
00022 #ifndef HYPERTABLE_BLOOM_FILTER_H
00023 #define HYPERTABLE_BLOOM_FILTER_H
00024 
00025 #include <cmath>
00026 #include <limits.h>
00027 #include "Common/StaticBuffer.h"
00028 #include "Common/MurmurHash.h"
00029 #include "Common/Logger.h"
00030 #include "Common/StringExt.h"
00031 
00032 namespace Hypertable {
00033 
00038 template <class HasherT = MurmurHash2>
00039 class BasicBloomFilter {
00040 public:
00041   BasicBloomFilter(size_t items_estimate, float false_positive_prob) {
00042     m_items_actual = 0;
00043     m_items_estimate = items_estimate;
00044     m_false_positive_prob = false_positive_prob;
00045     double num_hashes = -std::log(m_false_positive_prob) / std::log(2);
00046     m_num_hash_functions = (size_t)num_hashes;
00047     m_num_bits = (size_t)(m_items_estimate * num_hashes / std::log(2));
00048     if (m_num_bits == 0) {
00049       HT_THROWF(Error::EMPTY_BLOOMFILTER, "Num elements=%lu false_positive_prob=%.3f",
00050                 (Lu)items_estimate, false_positive_prob);
00051     }
00052     m_num_bytes = (m_num_bits / CHAR_BIT) + (m_num_bits % CHAR_BIT ? 1 : 0);
00053     m_bloom_bits = new uint8_t[m_num_bytes];
00054 
00055     memset(m_bloom_bits, 0, m_num_bytes);
00056 
00057     HT_DEBUG_OUT <<"num funcs="<< m_num_hash_functions
00058                  <<" num bits="<< m_num_bits <<" num bytes="<< m_num_bytes
00059                  <<" bits per element="<< double(m_num_bits) / items_estimate
00060                  << HT_END;
00061   }
00062 
00063   BasicBloomFilter(size_t items_estimate, float bits_per_item, size_t num_hashes) {
00064     m_items_actual = 0;
00065     m_items_estimate = items_estimate;
00066     m_false_positive_prob = 0.0;
00067     m_num_hash_functions = num_hashes;
00068     m_num_bits = (size_t)((double)items_estimate * (double)bits_per_item);
00069     if (m_num_bits == 0) {
00070       HT_THROWF(Error::EMPTY_BLOOMFILTER, "Num elements=%lu bits_per_item=%.3f",
00071                 (Lu)items_estimate, bits_per_item);
00072     }
00073     m_num_bytes = (m_num_bits / CHAR_BIT) + (m_num_bits % CHAR_BIT ? 1 : 0);
00074     m_bloom_bits = new uint8_t[m_num_bytes];
00075 
00076     memset(m_bloom_bits, 0, m_num_bytes);
00077 
00078     HT_DEBUG_OUT <<"num funcs="<< m_num_hash_functions
00079                  <<" num bits="<< m_num_bits <<" num bytes="<< m_num_bytes
00080                  <<" bits per element="<< double(m_num_bits) / items_estimate
00081                  << HT_END;
00082   }
00083 
00084   BasicBloomFilter(size_t items_estimate, size_t items_actual,
00085                  int64_t length, size_t num_hashes) {
00086     m_items_actual = items_actual;
00087     m_items_estimate = items_estimate;
00088     m_false_positive_prob = 0.0;
00089     m_num_hash_functions = num_hashes;
00090     m_num_bits = (size_t)length;
00091     if (m_num_bits == 0) {
00092       HT_THROWF(Error::EMPTY_BLOOMFILTER, "Estimated items=%lu actual items=%lu length=%lld num hashes=%lu",
00093                 (Lu)items_estimate, (Lu)items_actual, (Lld)length, (Lu)num_hashes);
00094     }
00095     m_num_bytes = (m_num_bits / CHAR_BIT) + (m_num_bits % CHAR_BIT ? 1 : 0);
00096     m_bloom_bits = new uint8_t[m_num_bytes];
00097 
00098     memset(m_bloom_bits, 0, m_num_bytes);
00099 
00100     HT_DEBUG_OUT <<"num funcs="<< m_num_hash_functions
00101                  <<" num bits="<< m_num_bits <<" num bytes="<< m_num_bytes
00102                  <<" bits per element="<< double(m_num_bits) / items_estimate
00103                  << HT_END;
00104   }
00105 
00106   ~BasicBloomFilter() {
00107     delete[] m_bloom_bits;
00108   }
00109 
00110   /* XXX/review static functions to expose the bloom filter parameters, given
00111    1) probablility and # keys
00112    2) #keys and fix the total size (m), to calculate the optimal
00113       probability - # hash functions to use
00114   */
00115 
00116   void insert(const void *key, size_t len) {
00117     uint32_t hash = len;
00118 
00119     for (size_t i = 0; i < m_num_hash_functions; ++i) {
00120       hash = m_hasher(key, len, hash) % m_num_bits;
00121       m_bloom_bits[hash / CHAR_BIT] |= (1 << (hash % CHAR_BIT));
00122     }
00123     m_items_actual++;
00124   }
00125 
00126   void insert(const String& key) {
00127     insert(key.c_str(), key.length());
00128   }
00129 
00130   bool may_contain(const void *key, size_t len) const {
00131     uint32_t hash = len;
00132     uint8_t byte_mask;
00133     uint8_t byte;
00134 
00135     for (size_t i = 0; i < m_num_hash_functions; ++i) {
00136       hash = m_hasher(key, len, hash) % m_num_bits;
00137       byte = m_bloom_bits[hash / CHAR_BIT];
00138       byte_mask = (1 << (hash % CHAR_BIT));
00139 
00140       if ( (byte & byte_mask) == 0 ) {
00141         return false;
00142       }
00143     }
00144     return true;
00145   }
00146 
00147   bool may_contain(const String& key) const {
00148     return may_contain(key.c_str(), key.length());
00149   }
00150 
00151   void serialize(StaticBuffer& buf) {
00152     buf.set(m_bloom_bits, m_num_bytes, false);
00153   }
00154 
00155   uint8_t* ptr(void) {
00156     return m_bloom_bits;
00157   }
00158 
00159   size_t size(void) {
00160     return m_num_bytes;
00161   }
00162 
00163   size_t get_num_hashes() { return m_num_hash_functions; }
00164 
00165   size_t get_length_bits() { return m_num_bits; }
00166 
00167   size_t get_items_estimate() { return m_items_estimate; }
00168 
00169   size_t get_items_actual() { return m_items_actual; }
00170 
00171 private:
00172   HasherT    m_hasher;
00173   size_t     m_items_estimate;
00174   size_t     m_items_actual;
00175   float      m_false_positive_prob;
00176   size_t     m_num_hash_functions;
00177   size_t     m_num_bits;
00178   size_t     m_num_bytes;
00179   uint8_t   *m_bloom_bits;
00180 };
00181 
00182 typedef BasicBloomFilter<> BloomFilter;
00183 
00184 } //namespace Hypertable
00185 
00186 #endif // HYPERTABLE_BLOOM_FILTER_H