bloom.h
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2014 Freie Universität Berlin
3  *
4  * This file is subject to the terms and conditions of the GNU Lesser
5  * General Public License v2.1. See the file LICENSE in the top level
6  * directory for more details.
7  */
8 
9 /*
10  * bloom.c
11  *
12  * Bloom filters
13  *
14  * HISTORY
15  * {x, y, z}
16  * A Bloom filter is a probibalistic : : :
17  * data structure with several interesting /|\ /|\ /|\
18  * properties, such as low memory usage, / | X | X | \
19  * asymmetric query confidence, and a very / |/ \|/ \| \
20  * speedy O(k) membership test. / | | \ \
21  * / /| /|\ |\ \
22  * Because a Bloom filter can . . . . . . . . .
23  * accept any input that can be 00000000001000101010101010100010000000000
24  * hashed effectively (such as " " "
25  * strings), that membership test \ | /
26  * tends to draw a crowd. TNSTAAFL, but \ | /
27  * as caveats go, the Bloom filters' are \ | /
28  * more interesting than incapacitating. \|/
29  * :
30  * Most notably, it can tell you with certainty {w}
31  * that an item 'i' is *not* a member of set 's',
32  * but it can only tell you with some finite
33  * probability whether an item 'i' *is* a member
34  * of set 's'.
35  *
36  * Still, along with the intriguing possibility of using bitwise AND and OR
37  * to compute the logical union and intersection of two filters, the cheap
38  * cost of adding elements to the filter set, and the low memory requirements,
39  * the Bloom filter is a good choice for many applications.
40  *
41  * NOTES
42  *
43  * Let's look more closely at the probability values.
44  *
45  * Assume that a hash function selects each array position with equal
46  * probability. If m is the number of bits in the array, and k is the number
47  * of hash functions, then the probability that a certain bit is not set
48  * to 1 by a certain hash function during the insertion of an element is
49  *
50  * 1-(1/m).
51  *
52  * The probability that it is not set to 1 by any of the hash functions is
53  *
54  * (1-(1/m))^k.
55  *
56  * If we have inserted n elements, the probability that a certain bit is
57  * set 0 is
58  *
59  * (1-(1/m))^kn,
60  *
61  * Meaning that the probability said bit is set to 1 is therefore
62  *
63  * 1-([1-(1/m)]^kn).
64  *
65  * Now test membership of an element that is not in the set. Each of the k
66  * array positions computed by the hash functions is 1 with a probability
67  * as above. The probability of all of them being 1, which would cause the
68  * algorithm to erroneously claim that the element is in the set, is often
69  * given as
70  *
71  * (1-[1-(1/m)]^kn)^k ~~ (1 - e^(-kn/m))^k.
72  *
73  * This is not strictly correct as it assumes independence for the
74  * probabilities of each bit being set. However, assuming it is a close
75  * approximation we have that the probability of false positives descreases
76  * as m (the number of bits in the array) increases, and increases as n
77  * (the number of inserted elements) increases. For a given m and n, the
78  * value of k (the number of hash functions) that minimizes the probability
79  * is
80  *
81  * (m/n)ln(2) ~~ 0.7(m/n),
82  *
83  * which gives the false positive probability of
84  *
85  * 2^-k ~~ 0.6185^(m/n).
86  *
87  * The required number of bits m, given n and a desired false positive
88  * probability p (and assuming the optimal value of k is used) can be
89  * computed by substituting the optimal value of k in the probability
90  * expression above:
91  *
92  * p = (1 - e^(-(((m/n)ln(2))*(n/m))))^((m/n)ln(2)),
93  *
94  * which simplifies to
95  *
96  * ln(p) = -(m/n) * (ln2)^2.
97  *
98  * This results in the equation
99  *
100  * m = -((n*ln(p)) / ((ln(2))^2))
101  *
102  * The classic filter uses
103  *
104  * 1.44*log2(1/eta)
105  *
106  * bits of space per inserted key, where eta is the false positive rate of
107  * the Bloom filter.
108  *
109  */
110 
123 #ifndef BLOOM_H
124 #define BLOOM_H
125 
126 #include <stdlib.h>
127 #include <stdbool.h>
128 #include <stdint.h>
129 
130 #ifdef __cplusplus
131 extern "C" {
132 #endif
133 
137 typedef uint32_t (*hashfp_t)(const uint8_t *, int len);
138 
142 typedef struct {
144  size_t m;
146  size_t k;
148  uint8_t *a;
151 } bloom_t;
152 
166 void bloom_init(bloom_t *bloom, size_t size, uint8_t *bitfield, hashfp_t *hashes, int hashes_numof);
167 
175 void bloom_del(bloom_t *bloom);
176 
189 void bloom_add(bloom_t *bloom, const uint8_t *buf, size_t len);
190 
229 bool bloom_check(bloom_t *bloom, const uint8_t *buf, size_t len);
230 
231 #ifdef __cplusplus
232 }
233 #endif
234 
236 #endif /* BLOOM_H */
bloom_del
void bloom_del(bloom_t *bloom)
Delete a Bloom filter.
hashfp_t
uint32_t(* hashfp_t)(const uint8_t *, int len)
hash function to use in thee filter
Definition: bloom.h:137
bloom_t::a
uint8_t * a
the bloom array
Definition: bloom.h:148
bloom_t
bloom_t bloom filter object
Definition: bloom.h:142
bloom_t::m
size_t m
number of bits in the bloom array
Definition: bloom.h:144
bloom_init
void bloom_init(bloom_t *bloom, size_t size, uint8_t *bitfield, hashfp_t *hashes, int hashes_numof)
Initialize a Bloom Filter.
bloom_t::hash
hashfp_t * hash
the hash functions
Definition: bloom.h:150
bloom_check
bool bloom_check(bloom_t *bloom, const uint8_t *buf, size_t len)
Determine if a string is in the Bloom filter.
bloom_t::k
size_t k
number of hash functions
Definition: bloom.h:146
bloom_add
void bloom_add(bloom_t *bloom, const uint8_t *buf, size_t len)
Add a string to a Bloom filter.