| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284 |
- // The MIT License (MIT)
- // Copyright (c) 2014 Andreas Briese, eduToolbox@Bri-C GmbH, Sarstedt
- // Permission is hereby granted, free of charge, to any person obtaining a copy of
- // this software and associated documentation files (the "Software"), to deal in
- // the Software without restriction, including without limitation the rights to
- // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
- // the Software, and to permit persons to whom the Software is furnished to do so,
- // subject to the following conditions:
- // The above copyright notice and this permission notice shall be included in all
- // copies or substantial portions of the Software.
- // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
- // FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
- // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
- // IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- // 2019/08/25 code revision to reduce unsafe use
- // Parts are adopted from the fork at ipfs/bbloom after performance rev by
- // Steve Allen (https://github.com/Stebalien)
- // (see https://github.com/ipfs/bbloom/blob/master/bbloom.go)
- // -> func Has
- // -> func set
- // -> func add
- package bbloom
- import (
- "bytes"
- "encoding/json"
- "log"
- "math"
- "sync"
- "unsafe"
- )
- // helper
- // not needed anymore by Set
- // var mask = []uint8{1, 2, 4, 8, 16, 32, 64, 128}
- func getSize(ui64 uint64) (size uint64, exponent uint64) {
- if ui64 < uint64(512) {
- ui64 = uint64(512)
- }
- size = uint64(1)
- for size < ui64 {
- size <<= 1
- exponent++
- }
- return size, exponent
- }
- func calcSizeByWrongPositives(numEntries, wrongs float64) (uint64, uint64) {
- size := -1 * numEntries * math.Log(wrongs) / math.Pow(float64(0.69314718056), 2)
- locs := math.Ceil(float64(0.69314718056) * size / numEntries)
- return uint64(size), uint64(locs)
- }
- // New
- // returns a new bloomfilter
- func New(params ...float64) (bloomfilter Bloom) {
- var entries, locs uint64
- if len(params) == 2 {
- if params[1] < 1 {
- entries, locs = calcSizeByWrongPositives(params[0], params[1])
- } else {
- entries, locs = uint64(params[0]), uint64(params[1])
- }
- } else {
- log.Fatal("usage: New(float64(number_of_entries), float64(number_of_hashlocations)) i.e. New(float64(1000), float64(3)) or New(float64(number_of_entries), float64(number_of_hashlocations)) i.e. New(float64(1000), float64(0.03))")
- }
- size, exponent := getSize(uint64(entries))
- bloomfilter = Bloom{
- Mtx: &sync.Mutex{},
- sizeExp: exponent,
- size: size - 1,
- setLocs: locs,
- shift: 64 - exponent,
- }
- bloomfilter.Size(size)
- return bloomfilter
- }
- // NewWithBoolset
- // takes a []byte slice and number of locs per entry
- // returns the bloomfilter with a bitset populated according to the input []byte
- func NewWithBoolset(bs *[]byte, locs uint64) (bloomfilter Bloom) {
- bloomfilter = New(float64(len(*bs)<<3), float64(locs))
- for i, b := range *bs {
- *(*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(&bloomfilter.bitset[0])) + uintptr(i))) = b
- }
- return bloomfilter
- }
- // bloomJSONImExport
- // Im/Export structure used by JSONMarshal / JSONUnmarshal
- type bloomJSONImExport struct {
- FilterSet []byte
- SetLocs uint64
- }
- // JSONUnmarshal
- // takes JSON-Object (type bloomJSONImExport) as []bytes
- // returns Bloom object
- func JSONUnmarshal(dbData []byte) Bloom {
- bloomImEx := bloomJSONImExport{}
- json.Unmarshal(dbData, &bloomImEx)
- buf := bytes.NewBuffer(bloomImEx.FilterSet)
- bs := buf.Bytes()
- bf := NewWithBoolset(&bs, bloomImEx.SetLocs)
- return bf
- }
- //
- // Bloom filter
- type Bloom struct {
- Mtx *sync.Mutex
- ElemNum uint64
- bitset []uint64
- sizeExp uint64
- size uint64
- setLocs uint64
- shift uint64
- }
- // <--- http://www.cse.yorku.ca/~oz/hash.html
- // modified Berkeley DB Hash (32bit)
- // hash is casted to l, h = 16bit fragments
- // func (bl Bloom) absdbm(b *[]byte) (l, h uint64) {
- // hash := uint64(len(*b))
- // for _, c := range *b {
- // hash = uint64(c) + (hash << 6) + (hash << bl.sizeExp) - hash
- // }
- // h = hash >> bl.shift
- // l = hash << bl.shift >> bl.shift
- // return l, h
- // }
- // Update: found sipHash of Jean-Philippe Aumasson & Daniel J. Bernstein to be even faster than absdbm()
- // https://131002.net/siphash/
- // siphash was implemented for Go by Dmitry Chestnykh https://github.com/dchest/siphash
- // Add
- // set the bit(s) for entry; Adds an entry to the Bloom filter
- func (bl *Bloom) Add(entry []byte) {
- l, h := bl.sipHash(entry)
- for i := uint64(0); i < bl.setLocs; i++ {
- bl.set((h + i*l) & bl.size)
- bl.ElemNum++
- }
- }
- // AddTS
- // Thread safe: Mutex.Lock the bloomfilter for the time of processing the entry
- func (bl *Bloom) AddTS(entry []byte) {
- bl.Mtx.Lock()
- defer bl.Mtx.Unlock()
- bl.Add(entry)
- }
- // Has
- // check if bit(s) for entry is/are set
- // returns true if the entry was added to the Bloom Filter
- func (bl Bloom) Has(entry []byte) bool {
- l, h := bl.sipHash(entry)
- res := true
- for i := uint64(0); i < bl.setLocs; i++ {
- res = res && bl.isSet((h+i*l)&bl.size)
- // https://github.com/ipfs/bbloom/commit/84e8303a9bfb37b2658b85982921d15bbb0fecff
- // // Branching here (early escape) is not worth it
- // // This is my conclusion from benchmarks
- // // (prevents loop unrolling)
- // switch bl.IsSet((h + i*l) & bl.size) {
- // case false:
- // return false
- // }
- }
- return res
- }
- // HasTS
- // Thread safe: Mutex.Lock the bloomfilter for the time of processing the entry
- func (bl *Bloom) HasTS(entry []byte) bool {
- bl.Mtx.Lock()
- defer bl.Mtx.Unlock()
- return bl.Has(entry)
- }
- // AddIfNotHas
- // Only Add entry if it's not present in the bloomfilter
- // returns true if entry was added
- // returns false if entry was allready registered in the bloomfilter
- func (bl Bloom) AddIfNotHas(entry []byte) (added bool) {
- if bl.Has(entry) {
- return added
- }
- bl.Add(entry)
- return true
- }
- // AddIfNotHasTS
- // Tread safe: Only Add entry if it's not present in the bloomfilter
- // returns true if entry was added
- // returns false if entry was allready registered in the bloomfilter
- func (bl *Bloom) AddIfNotHasTS(entry []byte) (added bool) {
- bl.Mtx.Lock()
- defer bl.Mtx.Unlock()
- return bl.AddIfNotHas(entry)
- }
- // Size
- // make Bloom filter with as bitset of size sz
- func (bl *Bloom) Size(sz uint64) {
- bl.bitset = make([]uint64, sz>>6)
- }
- // Clear
- // resets the Bloom filter
- func (bl *Bloom) Clear() {
- bs := bl.bitset
- for i := range bs {
- bs[i] = 0
- }
- }
- // Set
- // set the bit[idx] of bitsit
- func (bl *Bloom) set(idx uint64) {
- // ommit unsafe
- // *(*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[idx>>6])) + uintptr((idx%64)>>3))) |= mask[idx%8]
- bl.bitset[idx>>6] |= 1 << (idx % 64)
- }
- // IsSet
- // check if bit[idx] of bitset is set
- // returns true/false
- func (bl *Bloom) isSet(idx uint64) bool {
- // ommit unsafe
- // return (((*(*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[idx>>6])) + uintptr((idx%64)>>3)))) >> (idx % 8)) & 1) == 1
- return bl.bitset[idx>>6]&(1<<(idx%64)) != 0
- }
- // JSONMarshal
- // returns JSON-object (type bloomJSONImExport) as []byte
- func (bl Bloom) JSONMarshal() []byte {
- bloomImEx := bloomJSONImExport{}
- bloomImEx.SetLocs = uint64(bl.setLocs)
- bloomImEx.FilterSet = make([]byte, len(bl.bitset)<<3)
- for i := range bloomImEx.FilterSet {
- bloomImEx.FilterSet[i] = *(*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[0])) + uintptr(i)))
- }
- data, err := json.Marshal(bloomImEx)
- if err != nil {
- log.Fatal("json.Marshal failed: ", err)
- }
- return data
- }
- // // alternative hashFn
- // func (bl Bloom) fnv64a(b *[]byte) (l, h uint64) {
- // h64 := fnv.New64a()
- // h64.Write(*b)
- // hash := h64.Sum64()
- // h = hash >> 32
- // l = hash << 32 >> 32
- // return l, h
- // }
- //
- // // <-- http://partow.net/programming/hashfunctions/index.html
- // // citation: An algorithm proposed by Donald E. Knuth in The Art Of Computer Programming Volume 3,
- // // under the topic of sorting and search chapter 6.4.
- // // modified to fit with boolset-length
- // func (bl Bloom) DEKHash(b *[]byte) (l, h uint64) {
- // hash := uint64(len(*b))
- // for _, c := range *b {
- // hash = ((hash << 5) ^ (hash >> bl.shift)) ^ uint64(c)
- // }
- // h = hash >> bl.shift
- // l = hash << bl.sizeExp >> bl.sizeExp
- // return l, h
- // }
|