| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453 |
- /*
- Package bloom provides data structures and methods for creating Bloom filters.
- A Bloom filter is a representation of a set of _n_ items, where the main
- requirement is to make membership queries; _i.e._, whether an item is a
- member of a set.
- A Bloom filter has two parameters: _m_, a maximum size (typically a reasonably large
- multiple of the cardinality of the set to represent) and _k_, the number of hashing
- functions on elements of the set. (The actual hashing functions are important, too,
- but this is not a parameter for this implementation). A Bloom filter is backed by
- a BitSet; a key is represented in the filter by setting the bits at each value of the
- hashing functions (modulo _m_). Set membership is done by _testing_ whether the
- bits at each value of the hashing functions (again, modulo _m_) are set. If so,
- the item is in the set. If the item is actually in the set, a Bloom filter will
- never fail (the true positive rate is 1.0); but it is susceptible to false
- positives. The art is to choose _k_ and _m_ correctly.
- In this implementation, the hashing functions used is murmurhash,
- a non-cryptographic hashing function.
- This implementation accepts keys for setting as testing as []byte. Thus, to
- add a string item, "Love":
- uint n = 1000
- filter := bloom.New(20*n, 5) // load of 20, 5 keys
- filter.Add([]byte("Love"))
- Similarly, to test if "Love" is in bloom:
- if filter.Test([]byte("Love"))
- For numeric data, I recommend that you look into the binary/encoding library. But,
- for example, to add a uint32 to the filter:
- i := uint32(100)
- n1 := make([]byte,4)
- binary.BigEndian.PutUint32(n1,i)
- f.Add(n1)
- Finally, there is a method to estimate the false positive rate of a
- Bloom filter with _m_ bits and _k_ hashing functions for a set of size _n_:
- if bloom.EstimateFalsePositiveRate(20*n, 5, n) > 0.001 ...
- You can use it to validate the computed m, k parameters:
- m, k := bloom.EstimateParameters(n, fp)
- ActualfpRate := bloom.EstimateFalsePositiveRate(m, k, n)
- or
- f := bloom.NewWithEstimates(n, fp)
- ActualfpRate := bloom.EstimateFalsePositiveRate(f.m, f.k, n)
- You would expect ActualfpRate to be close to the desired fp in these cases.
- The EstimateFalsePositiveRate function creates a temporary Bloom filter. It is
- also relatively expensive and only meant for validation.
- */
- package bloom
- import (
- "bytes"
- "encoding/binary"
- "encoding/json"
- "fmt"
- "io"
- "math"
- "github.com/bits-and-blooms/bitset"
- )
- // A BloomFilter is a representation of a set of _n_ items, where the main
- // requirement is to make membership queries; _i.e._, whether an item is a
- // member of a set.
- type BloomFilter struct {
- m uint
- k uint
- b *bitset.BitSet
- }
- func max(x, y uint) uint {
- if x > y {
- return x
- }
- return y
- }
- // New creates a new Bloom filter with _m_ bits and _k_ hashing functions
- // We force _m_ and _k_ to be at least one to avoid panics.
- func New(m uint, k uint) *BloomFilter {
- return &BloomFilter{max(1, m), max(1, k), bitset.New(m)}
- }
- // From creates a new Bloom filter with len(_data_) * 64 bits and _k_ hashing
- // functions. The data slice is not going to be reset.
- func From(data []uint64, k uint) *BloomFilter {
- m := uint(len(data) * 64)
- return FromWithM(data, m, k)
- }
- // FromWithM creates a new Bloom filter with _m_ length, _k_ hashing functions.
- // The data slice is not going to be reset.
- func FromWithM(data []uint64, m, k uint) *BloomFilter {
- return &BloomFilter{m, k, bitset.From(data)}
- }
- // baseHashes returns the four hash values of data that are used to create k
- // hashes
- func baseHashes(data []byte) [4]uint64 {
- var d digest128 // murmur hashing
- hash1, hash2, hash3, hash4 := d.sum256(data)
- return [4]uint64{
- hash1, hash2, hash3, hash4,
- }
- }
- // location returns the ith hashed location using the four base hash values
- func location(h [4]uint64, i uint) uint64 {
- ii := uint64(i)
- return h[ii%2] + ii*h[2+(((ii+(ii%2))%4)/2)]
- }
- // location returns the ith hashed location using the four base hash values
- func (f *BloomFilter) location(h [4]uint64, i uint) uint {
- return uint(location(h, i) % uint64(f.m))
- }
- // EstimateParameters estimates requirements for m and k.
- // Based on https://bitbucket.org/ww/bloom/src/829aa19d01d9/bloom.go
- // used with permission.
- func EstimateParameters(n uint, p float64) (m uint, k uint) {
- m = uint(math.Ceil(-1 * float64(n) * math.Log(p) / math.Pow(math.Log(2), 2)))
- k = uint(math.Ceil(math.Log(2) * float64(m) / float64(n)))
- return
- }
- // NewWithEstimates creates a new Bloom filter for about n items with fp
- // false positive rate
- func NewWithEstimates(n uint, fp float64) *BloomFilter {
- m, k := EstimateParameters(n, fp)
- return New(m, k)
- }
- // Cap returns the capacity, _m_, of a Bloom filter
- func (f *BloomFilter) Cap() uint {
- return f.m
- }
- // K returns the number of hash functions used in the BloomFilter
- func (f *BloomFilter) K() uint {
- return f.k
- }
- // BitSet returns the underlying bitset for this filter.
- func (f *BloomFilter) BitSet() *bitset.BitSet {
- return f.b
- }
- // Add data to the Bloom Filter. Returns the filter (allows chaining)
- func (f *BloomFilter) Add(data []byte) *BloomFilter {
- h := baseHashes(data)
- for i := uint(0); i < f.k; i++ {
- f.b.Set(f.location(h, i))
- }
- return f
- }
- // Merge the data from two Bloom Filters.
- func (f *BloomFilter) Merge(g *BloomFilter) error {
- // Make sure the m's and k's are the same, otherwise merging has no real use.
- if f.m != g.m {
- return fmt.Errorf("m's don't match: %d != %d", f.m, g.m)
- }
- if f.k != g.k {
- return fmt.Errorf("k's don't match: %d != %d", f.m, g.m)
- }
- f.b.InPlaceUnion(g.b)
- return nil
- }
- // Copy creates a copy of a Bloom filter.
- func (f *BloomFilter) Copy() *BloomFilter {
- fc := New(f.m, f.k)
- fc.Merge(f) // #nosec
- return fc
- }
- // AddString to the Bloom Filter. Returns the filter (allows chaining)
- func (f *BloomFilter) AddString(data string) *BloomFilter {
- return f.Add([]byte(data))
- }
- // Test returns true if the data is in the BloomFilter, false otherwise.
- // If true, the result might be a false positive. If false, the data
- // is definitely not in the set.
- func (f *BloomFilter) Test(data []byte) bool {
- h := baseHashes(data)
- for i := uint(0); i < f.k; i++ {
- if !f.b.Test(f.location(h, i)) {
- return false
- }
- }
- return true
- }
- // TestString returns true if the string is in the BloomFilter, false otherwise.
- // If true, the result might be a false positive. If false, the data
- // is definitely not in the set.
- func (f *BloomFilter) TestString(data string) bool {
- return f.Test([]byte(data))
- }
- // TestLocations returns true if all locations are set in the BloomFilter, false
- // otherwise.
- func (f *BloomFilter) TestLocations(locs []uint64) bool {
- for i := 0; i < len(locs); i++ {
- if !f.b.Test(uint(locs[i] % uint64(f.m))) {
- return false
- }
- }
- return true
- }
- // TestAndAdd is equivalent to calling Test(data) then Add(data).
- // The filter is written to unconditionnally: even if the element is present,
- // the corresponding bits are still set. See also TestOrAdd.
- // Returns the result of Test.
- func (f *BloomFilter) TestAndAdd(data []byte) bool {
- present := true
- h := baseHashes(data)
- for i := uint(0); i < f.k; i++ {
- l := f.location(h, i)
- if !f.b.Test(l) {
- present = false
- }
- f.b.Set(l)
- }
- return present
- }
- // TestAndAddString is the equivalent to calling Test(string) then Add(string).
- // The filter is written to unconditionnally: even if the string is present,
- // the corresponding bits are still set. See also TestOrAdd.
- // Returns the result of Test.
- func (f *BloomFilter) TestAndAddString(data string) bool {
- return f.TestAndAdd([]byte(data))
- }
- // TestOrAdd is equivalent to calling Test(data) then if not present Add(data).
- // If the element is already in the filter, then the filter is unchanged.
- // Returns the result of Test.
- func (f *BloomFilter) TestOrAdd(data []byte) bool {
- present := true
- h := baseHashes(data)
- for i := uint(0); i < f.k; i++ {
- l := f.location(h, i)
- if !f.b.Test(l) {
- present = false
- f.b.Set(l)
- }
- }
- return present
- }
- // TestOrAddString is the equivalent to calling Test(string) then if not present Add(string).
- // If the string is already in the filter, then the filter is unchanged.
- // Returns the result of Test.
- func (f *BloomFilter) TestOrAddString(data string) bool {
- return f.TestOrAdd([]byte(data))
- }
- // ClearAll clears all the data in a Bloom filter, removing all keys
- func (f *BloomFilter) ClearAll() *BloomFilter {
- f.b.ClearAll()
- return f
- }
- // EstimateFalsePositiveRate returns, for a BloomFilter of m bits
- // and k hash functions, an estimation of the false positive rate when
- //
- // storing n entries. This is an empirical, relatively slow
- //
- // test using integers as keys.
- // This function is useful to validate the implementation.
- func EstimateFalsePositiveRate(m, k, n uint) (fpRate float64) {
- rounds := uint32(100000)
- // We construct a new filter.
- f := New(m, k)
- n1 := make([]byte, 4)
- // We populate the filter with n values.
- for i := uint32(0); i < uint32(n); i++ {
- binary.BigEndian.PutUint32(n1, i)
- f.Add(n1)
- }
- fp := 0
- // test for number of rounds
- for i := uint32(0); i < rounds; i++ {
- binary.BigEndian.PutUint32(n1, i+uint32(n)+1)
- if f.Test(n1) {
- fp++
- }
- }
- fpRate = float64(fp) / (float64(rounds))
- return
- }
- // Approximating the number of items
- // https://en.wikipedia.org/wiki/Bloom_filter#Approximating_the_number_of_items_in_a_Bloom_filter
- func (f *BloomFilter) ApproximatedSize() uint32 {
- x := float64(f.b.Count())
- m := float64(f.Cap())
- k := float64(f.K())
- size := -1 * m / k * math.Log(1-x/m) / math.Log(math.E)
- return uint32(math.Floor(size + 0.5)) // round
- }
- // bloomFilterJSON is an unexported type for marshaling/unmarshaling BloomFilter struct.
- type bloomFilterJSON struct {
- M uint `json:"m"`
- K uint `json:"k"`
- B *bitset.BitSet `json:"b"`
- }
- // MarshalJSON implements json.Marshaler interface.
- func (f BloomFilter) MarshalJSON() ([]byte, error) {
- return json.Marshal(bloomFilterJSON{f.m, f.k, f.b})
- }
- // UnmarshalJSON implements json.Unmarshaler interface.
- func (f *BloomFilter) UnmarshalJSON(data []byte) error {
- var j bloomFilterJSON
- err := json.Unmarshal(data, &j)
- if err != nil {
- return err
- }
- f.m = j.M
- f.k = j.K
- f.b = j.B
- return nil
- }
- // WriteTo writes a binary representation of the BloomFilter to an i/o stream.
- // It returns the number of bytes written.
- //
- // Performance: if this function is used to write to a disk or network
- // connection, it might be beneficial to wrap the stream in a bufio.Writer.
- // E.g.,
- //
- // f, err := os.Create("myfile")
- // w := bufio.NewWriter(f)
- func (f *BloomFilter) WriteTo(stream io.Writer) (int64, error) {
- err := binary.Write(stream, binary.BigEndian, uint64(f.m))
- if err != nil {
- return 0, err
- }
- err = binary.Write(stream, binary.BigEndian, uint64(f.k))
- if err != nil {
- return 0, err
- }
- numBytes, err := f.b.WriteTo(stream)
- return numBytes + int64(2*binary.Size(uint64(0))), err
- }
- // ReadFrom reads a binary representation of the BloomFilter (such as might
- // have been written by WriteTo()) from an i/o stream. It returns the number
- // of bytes read.
- //
- // Performance: if this function is used to read from a disk or network
- // connection, it might be beneficial to wrap the stream in a bufio.Reader.
- // E.g.,
- //
- // f, err := os.Open("myfile")
- // r := bufio.NewReader(f)
- func (f *BloomFilter) ReadFrom(stream io.Reader) (int64, error) {
- var m, k uint64
- err := binary.Read(stream, binary.BigEndian, &m)
- if err != nil {
- return 0, err
- }
- err = binary.Read(stream, binary.BigEndian, &k)
- if err != nil {
- return 0, err
- }
- b := &bitset.BitSet{}
- numBytes, err := b.ReadFrom(stream)
- if err != nil {
- return 0, err
- }
- f.m = uint(m)
- f.k = uint(k)
- f.b = b
- return numBytes + int64(2*binary.Size(uint64(0))), nil
- }
- // GobEncode implements gob.GobEncoder interface.
- func (f *BloomFilter) GobEncode() ([]byte, error) {
- var buf bytes.Buffer
- _, err := f.WriteTo(&buf)
- if err != nil {
- return nil, err
- }
- return buf.Bytes(), nil
- }
- // GobDecode implements gob.GobDecoder interface.
- func (f *BloomFilter) GobDecode(data []byte) error {
- buf := bytes.NewBuffer(data)
- _, err := f.ReadFrom(buf)
- return err
- }
- // MarshalBinary implements binary.BinaryMarshaler interface.
- func (f *BloomFilter) MarshalBinary() ([]byte, error) {
- var buf bytes.Buffer
- _, err := f.WriteTo(&buf)
- if err != nil {
- return nil, err
- }
- return buf.Bytes(), nil
- }
- // UnmarshalBinary implements binary.BinaryUnmarshaler interface.
- func (f *BloomFilter) UnmarshalBinary(data []byte) error {
- buf := bytes.NewBuffer(data)
- _, err := f.ReadFrom(buf)
- return err
- }
- // Equal tests for the equality of two Bloom filters
- func (f *BloomFilter) Equal(g *BloomFilter) bool {
- return f.m == g.m && f.k == g.k && f.b.Equal(g.b)
- }
- // Locations returns a list of hash locations representing a data item.
- func Locations(data []byte, k uint) []uint64 {
- locs := make([]uint64, k)
- // calculate locations
- h := baseHashes(data)
- for i := uint(0); i < k; i++ {
- locs[i] = location(h, i)
- }
- return locs
- }
|