hyperloglog.go 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354
  1. package hyperloglog
  2. import (
  3. "encoding/binary"
  4. "errors"
  5. "fmt"
  6. "math"
  7. "slices"
  8. )
  9. const (
  10. pp = uint8(25)
  11. mp = uint32(1) << pp
  12. version = 2
  13. )
  14. type Sketch struct {
  15. p uint8
  16. m uint32
  17. alpha float64
  18. tmpSet set
  19. sparseList *compressedList
  20. regs []uint8
  21. }
  22. // New returns a HyperLogLog Sketch with 2^14 registers (precision 14)
  23. func New() *Sketch { return New14() }
  24. // New14 returns a HyperLogLog Sketch with 2^14 registers (precision 14)
  25. func New14() *Sketch { return newSketchNoError(14, true) }
  26. // New16 returns a HyperLogLog Sketch with 2^16 registers (precision 16)
  27. func New16() *Sketch { return newSketchNoError(16, true) }
  28. // NewNoSparse returns a HyperLogLog Sketch with 2^14 registers (precision 14) that will not use a sparse representation
  29. func NewNoSparse() *Sketch { return newSketchNoError(14, false) }
  30. // New16NoSparse returns a HyperLogLog Sketch with 2^16 registers (precision 16) that will not use a sparse representation
  31. func New16NoSparse() *Sketch { return newSketchNoError(16, false) }
  32. func newSketchNoError(precision uint8, sparse bool) *Sketch {
  33. sk, _ := NewSketch(precision, sparse)
  34. return sk
  35. }
  36. func NewSketch(precision uint8, sparse bool) (*Sketch, error) {
  37. if precision < 4 || precision > 18 {
  38. return nil, fmt.Errorf("p has to be >= 4 and <= 18")
  39. }
  40. m := uint32(1) << precision
  41. s := &Sketch{
  42. m: m,
  43. p: precision,
  44. alpha: alpha(float64(m)),
  45. }
  46. if sparse {
  47. s.tmpSet = makeSet(0)
  48. s.sparseList = newCompressedList(0)
  49. } else {
  50. s.regs = make([]uint8, m)
  51. }
  52. return s, nil
  53. }
  54. func (sk *Sketch) sparse() bool { return sk.sparseList != nil }
  55. // Clone returns a deep copy of sk.
  56. func (sk *Sketch) Clone() *Sketch {
  57. clone := *sk
  58. clone.regs = append([]uint8(nil), sk.regs...)
  59. clone.tmpSet = sk.tmpSet.Clone()
  60. clone.sparseList = sk.sparseList.Clone()
  61. return &clone
  62. }
  63. func (sk *Sketch) maybeToNormal() {
  64. if uint32(sk.tmpSet.Len())*100 > sk.m {
  65. sk.mergeSparse()
  66. if uint32(sk.sparseList.Len()) > sk.m {
  67. sk.toNormal()
  68. }
  69. }
  70. }
  71. func (sk *Sketch) Merge(other *Sketch) error {
  72. if other == nil {
  73. return nil
  74. }
  75. if sk.p != other.p {
  76. return errors.New("precisions must be equal")
  77. }
  78. if sk.sparse() && other.sparse() {
  79. sk.mergeSparseSketch(other)
  80. } else {
  81. sk.mergeDenseSketch(other)
  82. }
  83. return nil
  84. }
  85. func (sk *Sketch) mergeSparseSketch(other *Sketch) {
  86. sk.tmpSet.Merge(other.tmpSet)
  87. for iter := other.sparseList.Iter(); iter.HasNext(); {
  88. sk.tmpSet.add(iter.Next())
  89. }
  90. sk.maybeToNormal()
  91. }
  92. func (sk *Sketch) mergeDenseSketch(other *Sketch) {
  93. if sk.sparse() {
  94. sk.toNormal()
  95. }
  96. if other.sparse() {
  97. other.tmpSet.ForEach(func(k uint32) {
  98. i, r := decodeHash(k, other.p, pp)
  99. sk.insert(i, r)
  100. })
  101. for iter := other.sparseList.Iter(); iter.HasNext(); {
  102. i, r := decodeHash(iter.Next(), other.p, pp)
  103. sk.insert(i, r)
  104. }
  105. } else {
  106. for i, v := range other.regs {
  107. if v > sk.regs[i] {
  108. sk.regs[i] = v
  109. }
  110. }
  111. }
  112. }
  113. func (sk *Sketch) toNormal() {
  114. if sk.tmpSet.Len() > 0 {
  115. sk.mergeSparse()
  116. }
  117. sk.regs = make([]uint8, sk.m)
  118. for iter := sk.sparseList.Iter(); iter.HasNext(); {
  119. i, r := decodeHash(iter.Next(), sk.p, pp)
  120. sk.insert(i, r)
  121. }
  122. sk.tmpSet = nilSet
  123. sk.sparseList = nil
  124. }
  125. func (sk *Sketch) insert(i uint32, r uint8) { sk.regs[i] = max(r, sk.regs[i]) }
  126. func (sk *Sketch) Insert(e []byte) { sk.InsertHash(hash(e)) }
  127. func (sk *Sketch) InsertHash(x uint64) {
  128. if sk.sparse() {
  129. if sk.tmpSet.add(encodeHash(x, sk.p, pp)) {
  130. sk.maybeToNormal()
  131. }
  132. return
  133. }
  134. i, r := getPosVal(x, sk.p)
  135. sk.insert(uint32(i), r)
  136. }
  137. func (sk *Sketch) Estimate() uint64 {
  138. if sk.sparse() {
  139. sk.mergeSparse()
  140. return uint64(linearCount(mp, mp-sk.sparseList.count))
  141. }
  142. sum, ez := sumAndZeros(sk.regs)
  143. m := float64(sk.m)
  144. est := sk.alpha * m * (m - ez) / (sum + beta(sk.p, ez))
  145. return uint64(est + 0.5)
  146. }
  147. func (sk *Sketch) mergeSparse() {
  148. if sk.tmpSet.Len() == 0 {
  149. return
  150. }
  151. keys := make([]uint32, 0, sk.tmpSet.Len())
  152. sk.tmpSet.ForEach(func(k uint32) {
  153. keys = append(keys, k)
  154. })
  155. slices.Sort(keys)
  156. newList := newCompressedList(4*sk.tmpSet.Len() + sk.sparseList.Len())
  157. for iter, i := sk.sparseList.Iter(), 0; iter.HasNext() || i < len(keys); {
  158. if !iter.HasNext() {
  159. newList.Append(keys[i])
  160. i++
  161. continue
  162. }
  163. if i >= len(keys) {
  164. newList.Append(iter.Next())
  165. continue
  166. }
  167. x1, adv := iter.Peek()
  168. x2 := keys[i]
  169. if x1 == x2 {
  170. newList.Append(x1)
  171. iter.Advance(x1, adv)
  172. i++
  173. } else if x1 > x2 {
  174. newList.Append(x2)
  175. i++
  176. } else {
  177. newList.Append(x1)
  178. iter.Advance(x1, adv)
  179. }
  180. }
  181. sk.sparseList = newList
  182. sk.tmpSet = makeSet(0)
  183. }
  184. // MarshalBinary implements the encoding.BinaryMarshaler interface.
  185. //
  186. // When the result will be appended to another buffer, consider using
  187. // AppendBinary to avoid additional allocations and copying.
  188. func (sk *Sketch) MarshalBinary() (data []byte, err error) {
  189. return sk.AppendBinary(nil)
  190. }
  191. // AppendBinary implements the encoding.BinaryAppender interface.
  192. func (sk *Sketch) AppendBinary(data []byte) ([]byte, error) {
  193. data = slices.Grow(data, 8+len(sk.regs))
  194. // Marshal a version marker.
  195. data = append(data, version)
  196. // Marshal p.
  197. data = append(data, sk.p)
  198. // Marshal b
  199. data = append(data, 0)
  200. if sk.sparse() {
  201. // It's using the sparse Sketch.
  202. data = append(data, byte(1))
  203. // Add the tmp_set
  204. data, err := sk.tmpSet.AppendBinary(data)
  205. if err != nil {
  206. return nil, err
  207. }
  208. // Add the sparse Sketch
  209. return sk.sparseList.AppendBinary(data)
  210. }
  211. // It's using the dense Sketch.
  212. data = append(data, byte(0))
  213. // Add the dense sketch Sketch.
  214. sz := len(sk.regs)
  215. data = append(data,
  216. byte(sz>>24),
  217. byte(sz>>16),
  218. byte(sz>>8),
  219. byte(sz),
  220. )
  221. // Marshal each element in the list.
  222. for _, v := range sk.regs {
  223. data = append(data, byte(v))
  224. }
  225. return data, nil
  226. }
  227. // ErrorTooShort is an error that UnmarshalBinary try to parse too short
  228. // binary.
  229. var ErrorTooShort = errors.New("too short binary")
  230. // UnmarshalBinary implements the encoding.BinaryUnmarshaler interface.
  231. func (sk *Sketch) UnmarshalBinary(data []byte) error {
  232. if len(data) < 8 {
  233. return ErrorTooShort
  234. }
  235. // Unmarshal version. We may need this in the future if we make
  236. // non-compatible changes.
  237. v := data[0]
  238. // Unmarshal p.
  239. p := data[1]
  240. // Unmarshal b.
  241. b := data[2]
  242. // Determine if we need a sparse Sketch
  243. sparse := data[3] == byte(1)
  244. // Make a newSketch Sketch if the precision doesn't match or if the Sketch was used
  245. if sk.p != p || sk.regs != nil || sk.tmpSet.Len() > 0 || (sk.sparseList != nil && sk.sparseList.Len() > 0) {
  246. newh, err := NewSketch(p, sparse)
  247. if err != nil {
  248. return err
  249. }
  250. *sk = *newh
  251. }
  252. // h is now initialised with the correct p. We just need to fill the
  253. // rest of the details out.
  254. if sparse {
  255. // Using the sparse Sketch.
  256. // Unmarshal the tmp_set.
  257. tssz := binary.BigEndian.Uint32(data[4:8])
  258. sk.tmpSet = makeSet(int(tssz))
  259. // We need to unmarshal tssz values in total, and each value requires us
  260. // to read 4 bytes.
  261. tsLastByte := int((tssz * 4) + 8)
  262. for i := 8; i < tsLastByte; i += 4 {
  263. k := binary.BigEndian.Uint32(data[i : i+4])
  264. sk.tmpSet.add(k)
  265. }
  266. // Unmarshal the sparse Sketch.
  267. return sk.sparseList.UnmarshalBinary(data[tsLastByte:])
  268. }
  269. // Using the dense Sketch.
  270. sk.sparseList = nil
  271. sk.tmpSet = nilSet
  272. if v == 1 {
  273. return sk.unmarshalBinaryV1(data[8:], b)
  274. }
  275. return sk.unmarshalBinaryV2(data)
  276. }
  277. func sumAndZeros(regs []uint8) (res, ez float64) {
  278. for _, v := range regs {
  279. if v == 0 {
  280. ez++
  281. }
  282. res += 1.0 / math.Pow(2.0, float64(v))
  283. }
  284. return res, ez
  285. }
  286. func (sk *Sketch) unmarshalBinaryV1(data []byte, b uint8) error {
  287. sk.regs = make([]uint8, len(data)*2)
  288. for i, v := range data {
  289. sk.regs[i*2] = uint8((v >> 4)) + b
  290. sk.regs[i*2+1] = uint8((v<<4)>>4) + b
  291. }
  292. return nil
  293. }
  294. func (sk *Sketch) unmarshalBinaryV2(data []byte) error {
  295. sk.regs = data[8:]
  296. return nil
  297. }