Go to the documentation of this file.00001
00022 #ifndef HYPERTABLE_BLOOM_FILTER_H
00023 #define HYPERTABLE_BLOOM_FILTER_H
00024
00025 #include <cmath>
00026 #include <limits.h>
00027 #include "Common/StaticBuffer.h"
00028 #include "Common/MurmurHash.h"
00029 #include "Common/Logger.h"
00030 #include "Common/StringExt.h"
00031
00032 namespace Hypertable {
00033
00038 template <class HasherT = MurmurHash2>
00039 class BasicBloomFilter {
00040 public:
00041 BasicBloomFilter(size_t items_estimate, float false_positive_prob) {
00042 m_items_actual = 0;
00043 m_items_estimate = items_estimate;
00044 m_false_positive_prob = false_positive_prob;
00045 double num_hashes = -std::log(m_false_positive_prob) / std::log(2);
00046 m_num_hash_functions = (size_t)num_hashes;
00047 m_num_bits = (size_t)(m_items_estimate * num_hashes / std::log(2));
00048 if (m_num_bits == 0) {
00049 HT_THROWF(Error::EMPTY_BLOOMFILTER, "Num elements=%lu false_positive_prob=%.3f",
00050 (Lu)items_estimate, false_positive_prob);
00051 }
00052 m_num_bytes = (m_num_bits / CHAR_BIT) + (m_num_bits % CHAR_BIT ? 1 : 0);
00053 m_bloom_bits = new uint8_t[m_num_bytes];
00054
00055 memset(m_bloom_bits, 0, m_num_bytes);
00056
00057 HT_DEBUG_OUT <<"num funcs="<< m_num_hash_functions
00058 <<" num bits="<< m_num_bits <<" num bytes="<< m_num_bytes
00059 <<" bits per element="<< double(m_num_bits) / items_estimate
00060 << HT_END;
00061 }
00062
00063 BasicBloomFilter(size_t items_estimate, float bits_per_item, size_t num_hashes) {
00064 m_items_actual = 0;
00065 m_items_estimate = items_estimate;
00066 m_false_positive_prob = 0.0;
00067 m_num_hash_functions = num_hashes;
00068 m_num_bits = (size_t)((double)items_estimate * (double)bits_per_item);
00069 if (m_num_bits == 0) {
00070 HT_THROWF(Error::EMPTY_BLOOMFILTER, "Num elements=%lu bits_per_item=%.3f",
00071 (Lu)items_estimate, bits_per_item);
00072 }
00073 m_num_bytes = (m_num_bits / CHAR_BIT) + (m_num_bits % CHAR_BIT ? 1 : 0);
00074 m_bloom_bits = new uint8_t[m_num_bytes];
00075
00076 memset(m_bloom_bits, 0, m_num_bytes);
00077
00078 HT_DEBUG_OUT <<"num funcs="<< m_num_hash_functions
00079 <<" num bits="<< m_num_bits <<" num bytes="<< m_num_bytes
00080 <<" bits per element="<< double(m_num_bits) / items_estimate
00081 << HT_END;
00082 }
00083
00084 BasicBloomFilter(size_t items_estimate, size_t items_actual,
00085 int64_t length, size_t num_hashes) {
00086 m_items_actual = items_actual;
00087 m_items_estimate = items_estimate;
00088 m_false_positive_prob = 0.0;
00089 m_num_hash_functions = num_hashes;
00090 m_num_bits = (size_t)length;
00091 if (m_num_bits == 0) {
00092 HT_THROWF(Error::EMPTY_BLOOMFILTER, "Estimated items=%lu actual items=%lu length=%lld num hashes=%lu",
00093 (Lu)items_estimate, (Lu)items_actual, (Lld)length, (Lu)num_hashes);
00094 }
00095 m_num_bytes = (m_num_bits / CHAR_BIT) + (m_num_bits % CHAR_BIT ? 1 : 0);
00096 m_bloom_bits = new uint8_t[m_num_bytes];
00097
00098 memset(m_bloom_bits, 0, m_num_bytes);
00099
00100 HT_DEBUG_OUT <<"num funcs="<< m_num_hash_functions
00101 <<" num bits="<< m_num_bits <<" num bytes="<< m_num_bytes
00102 <<" bits per element="<< double(m_num_bits) / items_estimate
00103 << HT_END;
00104 }
00105
00106 ~BasicBloomFilter() {
00107 delete[] m_bloom_bits;
00108 }
00109
00110
00111
00112
00113
00114
00115
00116 void insert(const void *key, size_t len) {
00117 uint32_t hash = len;
00118
00119 for (size_t i = 0; i < m_num_hash_functions; ++i) {
00120 hash = m_hasher(key, len, hash) % m_num_bits;
00121 m_bloom_bits[hash / CHAR_BIT] |= (1 << (hash % CHAR_BIT));
00122 }
00123 m_items_actual++;
00124 }
00125
00126 void insert(const String& key) {
00127 insert(key.c_str(), key.length());
00128 }
00129
00130 bool may_contain(const void *key, size_t len) const {
00131 uint32_t hash = len;
00132 uint8_t byte_mask;
00133 uint8_t byte;
00134
00135 for (size_t i = 0; i < m_num_hash_functions; ++i) {
00136 hash = m_hasher(key, len, hash) % m_num_bits;
00137 byte = m_bloom_bits[hash / CHAR_BIT];
00138 byte_mask = (1 << (hash % CHAR_BIT));
00139
00140 if ( (byte & byte_mask) == 0 ) {
00141 return false;
00142 }
00143 }
00144 return true;
00145 }
00146
00147 bool may_contain(const String& key) const {
00148 return may_contain(key.c_str(), key.length());
00149 }
00150
00151 void serialize(StaticBuffer& buf) {
00152 buf.set(m_bloom_bits, m_num_bytes, false);
00153 }
00154
00155 uint8_t* ptr(void) {
00156 return m_bloom_bits;
00157 }
00158
00159 size_t size(void) {
00160 return m_num_bytes;
00161 }
00162
00163 size_t get_num_hashes() { return m_num_hash_functions; }
00164
00165 size_t get_length_bits() { return m_num_bits; }
00166
00167 size_t get_items_estimate() { return m_items_estimate; }
00168
00169 size_t get_items_actual() { return m_items_actual; }
00170
00171 private:
00172 HasherT m_hasher;
00173 size_t m_items_estimate;
00174 size_t m_items_actual;
00175 float m_false_positive_prob;
00176 size_t m_num_hash_functions;
00177 size_t m_num_bits;
00178 size_t m_num_bytes;
00179 uint8_t *m_bloom_bits;
00180 };
00181
00182 typedef BasicBloomFilter<> BloomFilter;
00183
00184 }
00185
00186 #endif // HYPERTABLE_BLOOM_FILTER_H