bloom.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453
  1. /*
  2. Package bloom provides data structures and methods for creating Bloom filters.
  3. A Bloom filter is a representation of a set of _n_ items, where the main
  4. requirement is to make membership queries; _i.e._, whether an item is a
  5. member of a set.
  6. A Bloom filter has two parameters: _m_, a maximum size (typically a reasonably large
  7. multiple of the cardinality of the set to represent) and _k_, the number of hashing
  8. functions on elements of the set. (The actual hashing functions are important, too,
  9. but this is not a parameter for this implementation). A Bloom filter is backed by
  10. a BitSet; a key is represented in the filter by setting the bits at each value of the
  11. hashing functions (modulo _m_). Set membership is done by _testing_ whether the
  12. bits at each value of the hashing functions (again, modulo _m_) are set. If so,
  13. the item is in the set. If the item is actually in the set, a Bloom filter will
  14. never fail (the true positive rate is 1.0); but it is susceptible to false
  15. positives. The art is to choose _k_ and _m_ correctly.
  16. In this implementation, the hashing functions used is murmurhash,
  17. a non-cryptographic hashing function.
  18. This implementation accepts keys for setting as testing as []byte. Thus, to
  19. add a string item, "Love":
  20. uint n = 1000
  21. filter := bloom.New(20*n, 5) // load of 20, 5 keys
  22. filter.Add([]byte("Love"))
  23. Similarly, to test if "Love" is in bloom:
  24. if filter.Test([]byte("Love"))
  25. For numeric data, I recommend that you look into the binary/encoding library. But,
  26. for example, to add a uint32 to the filter:
  27. i := uint32(100)
  28. n1 := make([]byte,4)
  29. binary.BigEndian.PutUint32(n1,i)
  30. f.Add(n1)
  31. Finally, there is a method to estimate the false positive rate of a
  32. Bloom filter with _m_ bits and _k_ hashing functions for a set of size _n_:
  33. if bloom.EstimateFalsePositiveRate(20*n, 5, n) > 0.001 ...
  34. You can use it to validate the computed m, k parameters:
  35. m, k := bloom.EstimateParameters(n, fp)
  36. ActualfpRate := bloom.EstimateFalsePositiveRate(m, k, n)
  37. or
  38. f := bloom.NewWithEstimates(n, fp)
  39. ActualfpRate := bloom.EstimateFalsePositiveRate(f.m, f.k, n)
  40. You would expect ActualfpRate to be close to the desired fp in these cases.
  41. The EstimateFalsePositiveRate function creates a temporary Bloom filter. It is
  42. also relatively expensive and only meant for validation.
  43. */
  44. package bloom
  45. import (
  46. "bytes"
  47. "encoding/binary"
  48. "encoding/json"
  49. "fmt"
  50. "io"
  51. "math"
  52. "github.com/bits-and-blooms/bitset"
  53. )
  54. // A BloomFilter is a representation of a set of _n_ items, where the main
  55. // requirement is to make membership queries; _i.e._, whether an item is a
  56. // member of a set.
  57. type BloomFilter struct {
  58. m uint
  59. k uint
  60. b *bitset.BitSet
  61. }
  62. func max(x, y uint) uint {
  63. if x > y {
  64. return x
  65. }
  66. return y
  67. }
  68. // New creates a new Bloom filter with _m_ bits and _k_ hashing functions
  69. // We force _m_ and _k_ to be at least one to avoid panics.
  70. func New(m uint, k uint) *BloomFilter {
  71. return &BloomFilter{max(1, m), max(1, k), bitset.New(m)}
  72. }
  73. // From creates a new Bloom filter with len(_data_) * 64 bits and _k_ hashing
  74. // functions. The data slice is not going to be reset.
  75. func From(data []uint64, k uint) *BloomFilter {
  76. m := uint(len(data) * 64)
  77. return FromWithM(data, m, k)
  78. }
  79. // FromWithM creates a new Bloom filter with _m_ length, _k_ hashing functions.
  80. // The data slice is not going to be reset.
  81. func FromWithM(data []uint64, m, k uint) *BloomFilter {
  82. return &BloomFilter{m, k, bitset.From(data)}
  83. }
  84. // baseHashes returns the four hash values of data that are used to create k
  85. // hashes
  86. func baseHashes(data []byte) [4]uint64 {
  87. var d digest128 // murmur hashing
  88. hash1, hash2, hash3, hash4 := d.sum256(data)
  89. return [4]uint64{
  90. hash1, hash2, hash3, hash4,
  91. }
  92. }
  93. // location returns the ith hashed location using the four base hash values
  94. func location(h [4]uint64, i uint) uint64 {
  95. ii := uint64(i)
  96. return h[ii%2] + ii*h[2+(((ii+(ii%2))%4)/2)]
  97. }
  98. // location returns the ith hashed location using the four base hash values
  99. func (f *BloomFilter) location(h [4]uint64, i uint) uint {
  100. return uint(location(h, i) % uint64(f.m))
  101. }
  102. // EstimateParameters estimates requirements for m and k.
  103. // Based on https://bitbucket.org/ww/bloom/src/829aa19d01d9/bloom.go
  104. // used with permission.
  105. func EstimateParameters(n uint, p float64) (m uint, k uint) {
  106. m = uint(math.Ceil(-1 * float64(n) * math.Log(p) / math.Pow(math.Log(2), 2)))
  107. k = uint(math.Ceil(math.Log(2) * float64(m) / float64(n)))
  108. return
  109. }
  110. // NewWithEstimates creates a new Bloom filter for about n items with fp
  111. // false positive rate
  112. func NewWithEstimates(n uint, fp float64) *BloomFilter {
  113. m, k := EstimateParameters(n, fp)
  114. return New(m, k)
  115. }
  116. // Cap returns the capacity, _m_, of a Bloom filter
  117. func (f *BloomFilter) Cap() uint {
  118. return f.m
  119. }
  120. // K returns the number of hash functions used in the BloomFilter
  121. func (f *BloomFilter) K() uint {
  122. return f.k
  123. }
  124. // BitSet returns the underlying bitset for this filter.
  125. func (f *BloomFilter) BitSet() *bitset.BitSet {
  126. return f.b
  127. }
  128. // Add data to the Bloom Filter. Returns the filter (allows chaining)
  129. func (f *BloomFilter) Add(data []byte) *BloomFilter {
  130. h := baseHashes(data)
  131. for i := uint(0); i < f.k; i++ {
  132. f.b.Set(f.location(h, i))
  133. }
  134. return f
  135. }
  136. // Merge the data from two Bloom Filters.
  137. func (f *BloomFilter) Merge(g *BloomFilter) error {
  138. // Make sure the m's and k's are the same, otherwise merging has no real use.
  139. if f.m != g.m {
  140. return fmt.Errorf("m's don't match: %d != %d", f.m, g.m)
  141. }
  142. if f.k != g.k {
  143. return fmt.Errorf("k's don't match: %d != %d", f.m, g.m)
  144. }
  145. f.b.InPlaceUnion(g.b)
  146. return nil
  147. }
  148. // Copy creates a copy of a Bloom filter.
  149. func (f *BloomFilter) Copy() *BloomFilter {
  150. fc := New(f.m, f.k)
  151. fc.Merge(f) // #nosec
  152. return fc
  153. }
  154. // AddString to the Bloom Filter. Returns the filter (allows chaining)
  155. func (f *BloomFilter) AddString(data string) *BloomFilter {
  156. return f.Add([]byte(data))
  157. }
  158. // Test returns true if the data is in the BloomFilter, false otherwise.
  159. // If true, the result might be a false positive. If false, the data
  160. // is definitely not in the set.
  161. func (f *BloomFilter) Test(data []byte) bool {
  162. h := baseHashes(data)
  163. for i := uint(0); i < f.k; i++ {
  164. if !f.b.Test(f.location(h, i)) {
  165. return false
  166. }
  167. }
  168. return true
  169. }
  170. // TestString returns true if the string is in the BloomFilter, false otherwise.
  171. // If true, the result might be a false positive. If false, the data
  172. // is definitely not in the set.
  173. func (f *BloomFilter) TestString(data string) bool {
  174. return f.Test([]byte(data))
  175. }
  176. // TestLocations returns true if all locations are set in the BloomFilter, false
  177. // otherwise.
  178. func (f *BloomFilter) TestLocations(locs []uint64) bool {
  179. for i := 0; i < len(locs); i++ {
  180. if !f.b.Test(uint(locs[i] % uint64(f.m))) {
  181. return false
  182. }
  183. }
  184. return true
  185. }
  186. // TestAndAdd is equivalent to calling Test(data) then Add(data).
  187. // The filter is written to unconditionnally: even if the element is present,
  188. // the corresponding bits are still set. See also TestOrAdd.
  189. // Returns the result of Test.
  190. func (f *BloomFilter) TestAndAdd(data []byte) bool {
  191. present := true
  192. h := baseHashes(data)
  193. for i := uint(0); i < f.k; i++ {
  194. l := f.location(h, i)
  195. if !f.b.Test(l) {
  196. present = false
  197. }
  198. f.b.Set(l)
  199. }
  200. return present
  201. }
  202. // TestAndAddString is the equivalent to calling Test(string) then Add(string).
  203. // The filter is written to unconditionnally: even if the string is present,
  204. // the corresponding bits are still set. See also TestOrAdd.
  205. // Returns the result of Test.
  206. func (f *BloomFilter) TestAndAddString(data string) bool {
  207. return f.TestAndAdd([]byte(data))
  208. }
  209. // TestOrAdd is equivalent to calling Test(data) then if not present Add(data).
  210. // If the element is already in the filter, then the filter is unchanged.
  211. // Returns the result of Test.
  212. func (f *BloomFilter) TestOrAdd(data []byte) bool {
  213. present := true
  214. h := baseHashes(data)
  215. for i := uint(0); i < f.k; i++ {
  216. l := f.location(h, i)
  217. if !f.b.Test(l) {
  218. present = false
  219. f.b.Set(l)
  220. }
  221. }
  222. return present
  223. }
  224. // TestOrAddString is the equivalent to calling Test(string) then if not present Add(string).
  225. // If the string is already in the filter, then the filter is unchanged.
  226. // Returns the result of Test.
  227. func (f *BloomFilter) TestOrAddString(data string) bool {
  228. return f.TestOrAdd([]byte(data))
  229. }
  230. // ClearAll clears all the data in a Bloom filter, removing all keys
  231. func (f *BloomFilter) ClearAll() *BloomFilter {
  232. f.b.ClearAll()
  233. return f
  234. }
  235. // EstimateFalsePositiveRate returns, for a BloomFilter of m bits
  236. // and k hash functions, an estimation of the false positive rate when
  237. //
  238. // storing n entries. This is an empirical, relatively slow
  239. //
  240. // test using integers as keys.
  241. // This function is useful to validate the implementation.
  242. func EstimateFalsePositiveRate(m, k, n uint) (fpRate float64) {
  243. rounds := uint32(100000)
  244. // We construct a new filter.
  245. f := New(m, k)
  246. n1 := make([]byte, 4)
  247. // We populate the filter with n values.
  248. for i := uint32(0); i < uint32(n); i++ {
  249. binary.BigEndian.PutUint32(n1, i)
  250. f.Add(n1)
  251. }
  252. fp := 0
  253. // test for number of rounds
  254. for i := uint32(0); i < rounds; i++ {
  255. binary.BigEndian.PutUint32(n1, i+uint32(n)+1)
  256. if f.Test(n1) {
  257. fp++
  258. }
  259. }
  260. fpRate = float64(fp) / (float64(rounds))
  261. return
  262. }
  263. // Approximating the number of items
  264. // https://en.wikipedia.org/wiki/Bloom_filter#Approximating_the_number_of_items_in_a_Bloom_filter
  265. func (f *BloomFilter) ApproximatedSize() uint32 {
  266. x := float64(f.b.Count())
  267. m := float64(f.Cap())
  268. k := float64(f.K())
  269. size := -1 * m / k * math.Log(1-x/m) / math.Log(math.E)
  270. return uint32(math.Floor(size + 0.5)) // round
  271. }
  272. // bloomFilterJSON is an unexported type for marshaling/unmarshaling BloomFilter struct.
  273. type bloomFilterJSON struct {
  274. M uint `json:"m"`
  275. K uint `json:"k"`
  276. B *bitset.BitSet `json:"b"`
  277. }
  278. // MarshalJSON implements json.Marshaler interface.
  279. func (f BloomFilter) MarshalJSON() ([]byte, error) {
  280. return json.Marshal(bloomFilterJSON{f.m, f.k, f.b})
  281. }
  282. // UnmarshalJSON implements json.Unmarshaler interface.
  283. func (f *BloomFilter) UnmarshalJSON(data []byte) error {
  284. var j bloomFilterJSON
  285. err := json.Unmarshal(data, &j)
  286. if err != nil {
  287. return err
  288. }
  289. f.m = j.M
  290. f.k = j.K
  291. f.b = j.B
  292. return nil
  293. }
  294. // WriteTo writes a binary representation of the BloomFilter to an i/o stream.
  295. // It returns the number of bytes written.
  296. //
  297. // Performance: if this function is used to write to a disk or network
  298. // connection, it might be beneficial to wrap the stream in a bufio.Writer.
  299. // E.g.,
  300. //
  301. // f, err := os.Create("myfile")
  302. // w := bufio.NewWriter(f)
  303. func (f *BloomFilter) WriteTo(stream io.Writer) (int64, error) {
  304. err := binary.Write(stream, binary.BigEndian, uint64(f.m))
  305. if err != nil {
  306. return 0, err
  307. }
  308. err = binary.Write(stream, binary.BigEndian, uint64(f.k))
  309. if err != nil {
  310. return 0, err
  311. }
  312. numBytes, err := f.b.WriteTo(stream)
  313. return numBytes + int64(2*binary.Size(uint64(0))), err
  314. }
  315. // ReadFrom reads a binary representation of the BloomFilter (such as might
  316. // have been written by WriteTo()) from an i/o stream. It returns the number
  317. // of bytes read.
  318. //
  319. // Performance: if this function is used to read from a disk or network
  320. // connection, it might be beneficial to wrap the stream in a bufio.Reader.
  321. // E.g.,
  322. //
  323. // f, err := os.Open("myfile")
  324. // r := bufio.NewReader(f)
  325. func (f *BloomFilter) ReadFrom(stream io.Reader) (int64, error) {
  326. var m, k uint64
  327. err := binary.Read(stream, binary.BigEndian, &m)
  328. if err != nil {
  329. return 0, err
  330. }
  331. err = binary.Read(stream, binary.BigEndian, &k)
  332. if err != nil {
  333. return 0, err
  334. }
  335. b := &bitset.BitSet{}
  336. numBytes, err := b.ReadFrom(stream)
  337. if err != nil {
  338. return 0, err
  339. }
  340. f.m = uint(m)
  341. f.k = uint(k)
  342. f.b = b
  343. return numBytes + int64(2*binary.Size(uint64(0))), nil
  344. }
  345. // GobEncode implements gob.GobEncoder interface.
  346. func (f *BloomFilter) GobEncode() ([]byte, error) {
  347. var buf bytes.Buffer
  348. _, err := f.WriteTo(&buf)
  349. if err != nil {
  350. return nil, err
  351. }
  352. return buf.Bytes(), nil
  353. }
  354. // GobDecode implements gob.GobDecoder interface.
  355. func (f *BloomFilter) GobDecode(data []byte) error {
  356. buf := bytes.NewBuffer(data)
  357. _, err := f.ReadFrom(buf)
  358. return err
  359. }
  360. // MarshalBinary implements binary.BinaryMarshaler interface.
  361. func (f *BloomFilter) MarshalBinary() ([]byte, error) {
  362. var buf bytes.Buffer
  363. _, err := f.WriteTo(&buf)
  364. if err != nil {
  365. return nil, err
  366. }
  367. return buf.Bytes(), nil
  368. }
  369. // UnmarshalBinary implements binary.BinaryUnmarshaler interface.
  370. func (f *BloomFilter) UnmarshalBinary(data []byte) error {
  371. buf := bytes.NewBuffer(data)
  372. _, err := f.ReadFrom(buf)
  373. return err
  374. }
  375. // Equal tests for the equality of two Bloom filters
  376. func (f *BloomFilter) Equal(g *BloomFilter) bool {
  377. return f.m == g.m && f.k == g.k && f.b.Equal(g.b)
  378. }
  379. // Locations returns a list of hash locations representing a data item.
  380. func Locations(data []byte, k uint) []uint64 {
  381. locs := make([]uint64, k)
  382. // calculate locations
  383. h := baseHashes(data)
  384. for i := uint(0); i < k; i++ {
  385. locs[i] = location(h, i)
  386. }
  387. return locs
  388. }