bbloom.go 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284
  1. // The MIT License (MIT)
  2. // Copyright (c) 2014 Andreas Briese, eduToolbox@Bri-C GmbH, Sarstedt
  3. // Permission is hereby granted, free of charge, to any person obtaining a copy of
  4. // this software and associated documentation files (the "Software"), to deal in
  5. // the Software without restriction, including without limitation the rights to
  6. // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
  7. // the Software, and to permit persons to whom the Software is furnished to do so,
  8. // subject to the following conditions:
  9. // The above copyright notice and this permission notice shall be included in all
  10. // copies or substantial portions of the Software.
  11. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  12. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
  13. // FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
  14. // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
  15. // IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  16. // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  17. // 2019/08/25 code revision to reduce unsafe use
  18. // Parts are adopted from the fork at ipfs/bbloom after performance rev by
  19. // Steve Allen (https://github.com/Stebalien)
  20. // (see https://github.com/ipfs/bbloom/blob/master/bbloom.go)
  21. // -> func Has
  22. // -> func set
  23. // -> func add
  24. package bbloom
  25. import (
  26. "bytes"
  27. "encoding/json"
  28. "log"
  29. "math"
  30. "sync"
  31. "unsafe"
  32. )
  33. // helper
  34. // not needed anymore by Set
  35. // var mask = []uint8{1, 2, 4, 8, 16, 32, 64, 128}
  36. func getSize(ui64 uint64) (size uint64, exponent uint64) {
  37. if ui64 < uint64(512) {
  38. ui64 = uint64(512)
  39. }
  40. size = uint64(1)
  41. for size < ui64 {
  42. size <<= 1
  43. exponent++
  44. }
  45. return size, exponent
  46. }
  47. func calcSizeByWrongPositives(numEntries, wrongs float64) (uint64, uint64) {
  48. size := -1 * numEntries * math.Log(wrongs) / math.Pow(float64(0.69314718056), 2)
  49. locs := math.Ceil(float64(0.69314718056) * size / numEntries)
  50. return uint64(size), uint64(locs)
  51. }
  52. // New
  53. // returns a new bloomfilter
  54. func New(params ...float64) (bloomfilter Bloom) {
  55. var entries, locs uint64
  56. if len(params) == 2 {
  57. if params[1] < 1 {
  58. entries, locs = calcSizeByWrongPositives(params[0], params[1])
  59. } else {
  60. entries, locs = uint64(params[0]), uint64(params[1])
  61. }
  62. } else {
  63. log.Fatal("usage: New(float64(number_of_entries), float64(number_of_hashlocations)) i.e. New(float64(1000), float64(3)) or New(float64(number_of_entries), float64(number_of_hashlocations)) i.e. New(float64(1000), float64(0.03))")
  64. }
  65. size, exponent := getSize(uint64(entries))
  66. bloomfilter = Bloom{
  67. Mtx: &sync.Mutex{},
  68. sizeExp: exponent,
  69. size: size - 1,
  70. setLocs: locs,
  71. shift: 64 - exponent,
  72. }
  73. bloomfilter.Size(size)
  74. return bloomfilter
  75. }
  76. // NewWithBoolset
  77. // takes a []byte slice and number of locs per entry
  78. // returns the bloomfilter with a bitset populated according to the input []byte
  79. func NewWithBoolset(bs *[]byte, locs uint64) (bloomfilter Bloom) {
  80. bloomfilter = New(float64(len(*bs)<<3), float64(locs))
  81. for i, b := range *bs {
  82. *(*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(&bloomfilter.bitset[0])) + uintptr(i))) = b
  83. }
  84. return bloomfilter
  85. }
  86. // bloomJSONImExport
  87. // Im/Export structure used by JSONMarshal / JSONUnmarshal
  88. type bloomJSONImExport struct {
  89. FilterSet []byte
  90. SetLocs uint64
  91. }
  92. // JSONUnmarshal
  93. // takes JSON-Object (type bloomJSONImExport) as []bytes
  94. // returns Bloom object
  95. func JSONUnmarshal(dbData []byte) Bloom {
  96. bloomImEx := bloomJSONImExport{}
  97. json.Unmarshal(dbData, &bloomImEx)
  98. buf := bytes.NewBuffer(bloomImEx.FilterSet)
  99. bs := buf.Bytes()
  100. bf := NewWithBoolset(&bs, bloomImEx.SetLocs)
  101. return bf
  102. }
  103. //
  104. // Bloom filter
  105. type Bloom struct {
  106. Mtx *sync.Mutex
  107. ElemNum uint64
  108. bitset []uint64
  109. sizeExp uint64
  110. size uint64
  111. setLocs uint64
  112. shift uint64
  113. }
  114. // <--- http://www.cse.yorku.ca/~oz/hash.html
  115. // modified Berkeley DB Hash (32bit)
  116. // hash is casted to l, h = 16bit fragments
  117. // func (bl Bloom) absdbm(b *[]byte) (l, h uint64) {
  118. // hash := uint64(len(*b))
  119. // for _, c := range *b {
  120. // hash = uint64(c) + (hash << 6) + (hash << bl.sizeExp) - hash
  121. // }
  122. // h = hash >> bl.shift
  123. // l = hash << bl.shift >> bl.shift
  124. // return l, h
  125. // }
  126. // Update: found sipHash of Jean-Philippe Aumasson & Daniel J. Bernstein to be even faster than absdbm()
  127. // https://131002.net/siphash/
  128. // siphash was implemented for Go by Dmitry Chestnykh https://github.com/dchest/siphash
  129. // Add
  130. // set the bit(s) for entry; Adds an entry to the Bloom filter
  131. func (bl *Bloom) Add(entry []byte) {
  132. l, h := bl.sipHash(entry)
  133. for i := uint64(0); i < bl.setLocs; i++ {
  134. bl.set((h + i*l) & bl.size)
  135. bl.ElemNum++
  136. }
  137. }
  138. // AddTS
  139. // Thread safe: Mutex.Lock the bloomfilter for the time of processing the entry
  140. func (bl *Bloom) AddTS(entry []byte) {
  141. bl.Mtx.Lock()
  142. defer bl.Mtx.Unlock()
  143. bl.Add(entry)
  144. }
  145. // Has
  146. // check if bit(s) for entry is/are set
  147. // returns true if the entry was added to the Bloom Filter
  148. func (bl Bloom) Has(entry []byte) bool {
  149. l, h := bl.sipHash(entry)
  150. res := true
  151. for i := uint64(0); i < bl.setLocs; i++ {
  152. res = res && bl.isSet((h+i*l)&bl.size)
  153. // https://github.com/ipfs/bbloom/commit/84e8303a9bfb37b2658b85982921d15bbb0fecff
  154. // // Branching here (early escape) is not worth it
  155. // // This is my conclusion from benchmarks
  156. // // (prevents loop unrolling)
  157. // switch bl.IsSet((h + i*l) & bl.size) {
  158. // case false:
  159. // return false
  160. // }
  161. }
  162. return res
  163. }
  164. // HasTS
  165. // Thread safe: Mutex.Lock the bloomfilter for the time of processing the entry
  166. func (bl *Bloom) HasTS(entry []byte) bool {
  167. bl.Mtx.Lock()
  168. defer bl.Mtx.Unlock()
  169. return bl.Has(entry)
  170. }
  171. // AddIfNotHas
  172. // Only Add entry if it's not present in the bloomfilter
  173. // returns true if entry was added
  174. // returns false if entry was allready registered in the bloomfilter
  175. func (bl Bloom) AddIfNotHas(entry []byte) (added bool) {
  176. if bl.Has(entry) {
  177. return added
  178. }
  179. bl.Add(entry)
  180. return true
  181. }
  182. // AddIfNotHasTS
  183. // Tread safe: Only Add entry if it's not present in the bloomfilter
  184. // returns true if entry was added
  185. // returns false if entry was allready registered in the bloomfilter
  186. func (bl *Bloom) AddIfNotHasTS(entry []byte) (added bool) {
  187. bl.Mtx.Lock()
  188. defer bl.Mtx.Unlock()
  189. return bl.AddIfNotHas(entry)
  190. }
  191. // Size
  192. // make Bloom filter with as bitset of size sz
  193. func (bl *Bloom) Size(sz uint64) {
  194. bl.bitset = make([]uint64, sz>>6)
  195. }
  196. // Clear
  197. // resets the Bloom filter
  198. func (bl *Bloom) Clear() {
  199. bs := bl.bitset
  200. for i := range bs {
  201. bs[i] = 0
  202. }
  203. }
  204. // Set
  205. // set the bit[idx] of bitsit
  206. func (bl *Bloom) set(idx uint64) {
  207. // ommit unsafe
  208. // *(*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[idx>>6])) + uintptr((idx%64)>>3))) |= mask[idx%8]
  209. bl.bitset[idx>>6] |= 1 << (idx % 64)
  210. }
  211. // IsSet
  212. // check if bit[idx] of bitset is set
  213. // returns true/false
  214. func (bl *Bloom) isSet(idx uint64) bool {
  215. // ommit unsafe
  216. // return (((*(*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[idx>>6])) + uintptr((idx%64)>>3)))) >> (idx % 8)) & 1) == 1
  217. return bl.bitset[idx>>6]&(1<<(idx%64)) != 0
  218. }
  219. // JSONMarshal
  220. // returns JSON-object (type bloomJSONImExport) as []byte
  221. func (bl Bloom) JSONMarshal() []byte {
  222. bloomImEx := bloomJSONImExport{}
  223. bloomImEx.SetLocs = uint64(bl.setLocs)
  224. bloomImEx.FilterSet = make([]byte, len(bl.bitset)<<3)
  225. for i := range bloomImEx.FilterSet {
  226. bloomImEx.FilterSet[i] = *(*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[0])) + uintptr(i)))
  227. }
  228. data, err := json.Marshal(bloomImEx)
  229. if err != nil {
  230. log.Fatal("json.Marshal failed: ", err)
  231. }
  232. return data
  233. }
  234. // // alternative hashFn
  235. // func (bl Bloom) fnv64a(b *[]byte) (l, h uint64) {
  236. // h64 := fnv.New64a()
  237. // h64.Write(*b)
  238. // hash := h64.Sum64()
  239. // h = hash >> 32
  240. // l = hash << 32 >> 32
  241. // return l, h
  242. // }
  243. //
  244. // // <-- http://partow.net/programming/hashfunctions/index.html
  245. // // citation: An algorithm proposed by Donald E. Knuth in The Art Of Computer Programming Volume 3,
  246. // // under the topic of sorting and search chapter 6.4.
  247. // // modified to fit with boolset-length
  248. // func (bl Bloom) DEKHash(b *[]byte) (l, h uint64) {
  249. // hash := uint64(len(*b))
  250. // for _, c := range *b {
  251. // hash = ((hash << 5) ^ (hash >> bl.shift)) ^ uint64(c)
  252. // }
  253. // h = hash >> bl.shift
  254. // l = hash << bl.sizeExp >> bl.sizeExp
  255. // return l, h
  256. // }