// hash128_amd64.s
//go:build amd64 && !appengine && !gccgo
// +build amd64,!appengine,!gccgo

// This is a translation of the gcc output of FloodyBerry's pure-C public
// domain siphash implementation at https://github.com/floodyberry/siphash

// This assembly code has been modified from the 64-bit output to produce the
// experimental 128-bit output.
  // func Hash128(k0, k1 uint64, b []byte) (r0 uint64, r1 uint64)
//
// Hash128 runs the SipHash state machine over b, keyed with k0/k1, and stores
// two 64-bit result words. Each 8-byte block of b is absorbed with two rounds;
// the first result word is produced after four finalization rounds and the
// second after four more.
//
// Frame ($0-56): no local stack; 56 bytes of arguments and results, read and
// written through the FP pseudo-register:
//   k0+0  k1+8  b_base+16  b_len+24  (the slice capacity at 32 is never read)
//   ret+40  ret1+48
// The TEXT flag 4 is the value of NOSPLIT in Go's textflag.h.
// NOTE(review): the results are stored through the slot names ret/ret1 while
// the signature comment above names them r0/r1; confirm against the Go
// declaration of Hash128.
//
// Register roles while absorbing input (setup, loopBody, tail switch):
//   R9  = v0    AX = v1    BX = v2    CX = v3
//   SI  = b_base, DI = byte offset into b
//   R10 = len(b) with the low three bits cleared (bytes held in whole blocks)
//   R11 = len(b)<<56, later OR-ed with the 0..7 tail bytes (the final block)
//   R8  = block currently being absorbed
//   DX  = len(b), then len(b)&7, then scratch for one tail byte
//
// Register roles from afterSwitch to RET:
//   SI = v0    AX = v1    CX = v2    DX = v3    BX = scratch for the result
//
// All rotations are written as right-rotations: RORQ by 0x33/0x30/0x2F/0x2B/0x20
// rotates right by 51/48/47/43/32 bits, i.e. left by 13/16/17/21/32 bits.
TEXT ·Hash128(SB),4,$0-56
	// State setup; the instructions are interleaved as gcc scheduled them.
	MOVQ	k0+0(FP),CX
	MOVQ	$0x736F6D6570736575,R9
	MOVQ	k1+8(FP),DI
	MOVQ	$0x6C7967656E657261,BX
	MOVQ	$0x646F72616E646F6D,AX
	MOVQ	b_len+24(FP),DX
	XORQ	$0xEE,AX			// v1 tweak applied only in this 128-bit variant
	MOVQ	DX,R11
	MOVQ	DX,R10
	XORQ	CX,R9				// v0 = constant ^ k0
	XORQ	CX,BX				// v2 = constant ^ k0
	MOVQ	$0x7465646279746573,CX
	XORQ	DI,AX				// v1 = constant ^ 0xEE ^ k1
	XORQ	DI,CX				// v3 = constant ^ k1
	SHLQ	$0x38,R11			// R11 = len(b) << 56
	XORQ	DI,DI				// byte offset = 0
	MOVQ	b_base+16(FP),SI
	ANDQ	$0xFFFFFFFFFFFFFFF8,R10		// R10 = len(b) rounded down to a multiple of 8
	JE	afterLoop			// no whole 8-byte block to absorb
	XCHGQ	AX,AX				// exchanges AX with itself; presumably gcc's alignment padding

loopBody:
	// Absorb one 8-byte block m: v3 ^= m, two rounds, v0 ^= m.
	MOVQ	0(SI)(DI*1),R8			// m = 8 bytes at b[DI:]
	ADDQ	AX,R9				// v0 += v1
	RORQ	$0x33,AX
	XORQ	R9,AX				// v1 ^= v0
	RORQ	$0x20,R9
	ADDQ	$0x8,DI				// advance to the next block
	XORQ	R8,CX				// v3 ^= m
	ADDQ	CX,BX				// v2 += v3
	RORQ	$0x30,CX
	XORQ	BX,CX				// v3 ^= v2
	ADDQ	AX,BX				// v2 += v1
	RORQ	$0x2F,AX
	ADDQ	CX,R9				// v0 += v3
	RORQ	$0x2B,CX
	XORQ	BX,AX				// v1 ^= v2
	XORQ	R9,CX				// v3 ^= v0
	RORQ	$0x20,BX
	// second round for this block
	ADDQ	AX,R9
	ADDQ	CX,BX
	RORQ	$0x33,AX
	RORQ	$0x30,CX
	XORQ	R9,AX
	XORQ	BX,CX
	RORQ	$0x20,R9
	ADDQ	AX,BX
	ADDQ	CX,R9
	RORQ	$0x2F,AX
	RORQ	$0x2B,CX
	XORQ	BX,AX
	RORQ	$0x20,BX
	XORQ	R9,CX
	XORQ	R8,R9				// v0 ^= m
	CMPQ	R10,DI
	JA	loopBody			// unsigned: loop while R10 > DI

afterLoop:
	ANDL	$7,DX				// DX = number of tail bytes (0..7)
	JZ	afterSwitch

	// no support for jump tables
	CMPQ	DX,$0x7
	JE	sw7
	CMPQ	DX,$0x6
	JE	sw6
	CMPQ	DX,$0x5
	JE	sw5
	CMPQ	DX,$0x4
	JE	sw4
	CMPQ	DX,$0x3
	JE	sw3
	CMPQ	DX,$0x2
	JE	sw2
	JMP	sw1				// DX is non-zero and none of 2..7, so it is 1

	// Fall-through chain: entering at swN ORs tail bytes N-1 down to 0 into
	// R11, each shifted to byte position N-1 .. 0.
sw7:
	MOVBQZX	6(SI)(DI*1),DX
	SHLQ	$0x30,DX
	ORQ	DX,R11
sw6:
	MOVBQZX	0x5(SI)(DI*1),DX
	SHLQ	$0x28,DX
	ORQ	DX,R11
sw5:
	MOVBQZX	0x4(SI)(DI*1),DX
	SHLQ	$0x20,DX
	ORQ	DX,R11
sw4:
	MOVBQZX	0x3(SI)(DI*1),DX
	SHLQ	$0x18,DX
	ORQ	DX,R11
sw3:
	MOVBQZX	0x2(SI)(DI*1),DX
	SHLQ	$0x10,DX
	ORQ	DX,R11
sw2:
	MOVBQZX	0x1(SI)(DI*1),DX
	SHLQ	$0x8,DX
	ORQ	DX,R11
sw1:
	MOVBQZX	0(SI)(DI*1),DX
	ORQ	DX,R11

afterSwitch:
	// Absorb the final block held in R11 (two rounds). The state moves to
	// its second register assignment here: SI = v0, AX = v1, CX = v2, DX = v3.
	LEAQ	(AX)(R9*1),SI			// SI = v0 + v1
	XORQ	R11,CX				// v3 ^= final block
	RORQ	$0x33,AX
	ADDQ	CX,BX				// v2 += v3
	MOVQ	CX,DX				// v3 now lives in DX
	XORQ	SI,AX
	RORQ	$0x30,DX
	RORQ	$0x20,SI
	LEAQ	0(BX)(AX*1),CX			// CX = v2 + v1; v2 now lives in CX
	XORQ	BX,DX
	RORQ	$0x2F,AX
	ADDQ	DX,SI
	RORQ	$0x2B,DX
	XORQ	CX,AX
	XORQ	SI,DX
	RORQ	$0x20,CX
	// second round for the final block
	ADDQ	AX,SI
	RORQ	$0x33,AX
	ADDQ	DX,CX
	XORQ	SI,AX
	RORQ	$0x30,DX
	RORQ	$0x20,SI
	XORQ	CX,DX
	ADDQ	AX,CX
	RORQ	$0x2F,AX
	ADDQ	DX,SI
	XORQ	CX,AX
	RORQ	$0x2B,DX
	RORQ	$0x20,CX
	XORQ	SI,DX
	XORQ	R11,SI				// v0 ^= final block

	// First finalization stage: v2 ^= 0xEE, then four rounds.
	XORB	$0xEE,CL
	// round 1 of 4
	ADDQ	AX,SI
	RORQ	$0x33,AX
	ADDQ	DX,CX
	RORQ	$0x30,DX
	XORQ	SI,AX
	XORQ	CX,DX
	RORQ	$0x20,SI
	ADDQ	AX,CX
	ADDQ	DX,SI
	RORQ	$0x2F,AX
	RORQ	$0x2B,DX
	XORQ	CX,AX
	XORQ	SI,DX
	RORQ	$0x20,CX
	// round 2 of 4
	ADDQ	AX,SI
	ADDQ	DX,CX
	RORQ	$0x33,AX
	RORQ	$0x30,DX
	XORQ	SI,AX
	RORQ	$0x20,SI
	XORQ	CX,DX
	ADDQ	AX,CX
	RORQ	$0x2F,AX
	ADDQ	DX,SI
	RORQ	$0x2B,DX
	XORQ	CX,AX
	XORQ	SI,DX
	RORQ	$0x20,CX
	// round 3 of 4
	ADDQ	AX,SI
	ADDQ	DX,CX
	RORQ	$0x33,AX
	RORQ	$0x30,DX
	XORQ	CX,DX
	XORQ	SI,AX
	RORQ	$0x20,SI
	ADDQ	DX,SI
	ADDQ	AX,CX
	RORQ	$0x2F,AX
	XORQ	CX,AX
	RORQ	$0x2B,DX
	RORQ	$0x20,CX
	XORQ	SI,DX

	// gcc optimized the tail end of this function differently. However,
	// we need to preserve our registers to carry out the second stage of
	// the finalization. This is a duplicate of an earlier finalization
	// round.
	// round 4 of 4
	ADDQ	AX,SI
	RORQ	$0x33,AX
	ADDQ	DX,CX
	RORQ	$0x30,DX
	XORQ	SI,AX
	XORQ	CX,DX
	RORQ	$0x20,SI
	ADDQ	AX,CX
	ADDQ	DX,SI
	RORQ	$0x2F,AX
	RORQ	$0x2B,DX
	XORQ	CX,AX
	XORQ	SI,DX
	RORQ	$0x20,CX

	// Stuff the result into BX instead of AX as gcc had done, so that the
	// state in SI/AX/CX/DX stays intact for the second stage.
	MOVQ	SI,BX
	XORQ	AX,BX
	XORQ	DX,BX
	XORQ	CX,BX
	MOVQ	BX,ret+40(FP)			// first result word = v0 ^ v1 ^ v2 ^ v3

	// Start the second finalization round: v1 ^= 0xDD, then four rounds.
	XORB	$0xDD,AL
	// round 1 of 4
	ADDQ	AX,SI
	RORQ	$0x33,AX
	ADDQ	DX,CX
	RORQ	$0x30,DX
	XORQ	SI,AX
	XORQ	CX,DX
	RORQ	$0x20,SI
	ADDQ	AX,CX
	ADDQ	DX,SI
	RORQ	$0x2F,AX
	RORQ	$0x2B,DX
	XORQ	CX,AX
	XORQ	SI,DX
	RORQ	$0x20,CX
	// round 2 of 4
	ADDQ	AX,SI
	ADDQ	DX,CX
	RORQ	$0x33,AX
	RORQ	$0x30,DX
	XORQ	SI,AX
	RORQ	$0x20,SI
	XORQ	CX,DX
	ADDQ	AX,CX
	RORQ	$0x2F,AX
	ADDQ	DX,SI
	RORQ	$0x2B,DX
	XORQ	CX,AX
	XORQ	SI,DX
	RORQ	$0x20,CX
	// round 3 of 4
	ADDQ	AX,SI
	ADDQ	DX,CX
	RORQ	$0x33,AX
	RORQ	$0x30,DX
	XORQ	CX,DX
	XORQ	SI,AX
	RORQ	$0x20,SI
	ADDQ	DX,SI
	ADDQ	AX,CX
	RORQ	$0x2F,AX
	XORQ	CX,AX
	RORQ	$0x2B,DX
	RORQ	$0x20,CX
	XORQ	SI,DX
	// round 4 of 4
	ADDQ	AX,SI
	RORQ	$0x33,AX
	ADDQ	DX,CX
	RORQ	$0x30,DX
	XORQ	SI,AX
	XORQ	CX,DX
	RORQ	$0x20,SI
	ADDQ	AX,CX
	ADDQ	DX,SI
	RORQ	$0x2F,AX
	RORQ	$0x2B,DX
	XORQ	CX,AX
	XORQ	SI,DX
	RORQ	$0x20,CX

	MOVQ	SI,BX
	XORQ	AX,BX
	XORQ	DX,BX
	XORQ	CX,BX
	MOVQ	BX,ret1+48(FP)			// second result word = v0 ^ v1 ^ v2 ^ v3
	RET