hash128_amd64.s 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292
  1. // +build amd64,!appengine,!gccgo
  2. // This is a translation of the gcc output of FloodyBerry's pure-C public
  3. // domain siphash implementation at https://github.com/floodyberry/siphash
  4. // This assembly code has been modified from the 64-bit output to the experimental 128-bit output.
  // func Hash128(k0, k1 uint64, b []byte) (r0 uint64, r1 uint64)
  //
  // Computes a 128-bit SipHash of b keyed with (k0, k1) and returns it as two
  // 64-bit words: r0 is produced by the first finalization stage, r1 by the
  // second. Two rounds are applied per 8-byte message word and four rounds
  // per finalization stage.
  //
  // Frame ($0-56: no locals, 56 bytes of arguments and results):
  //   k0+0(FP)       first key word
  //   k1+8(FP)       second key word
  //   b_base+16(FP)  pointer to the first byte of b
  //   b_len+24(FP)   len(b)  (the capacity word at offset 32 is never read)
  //   r0+40(FP)      first result word
  //   r1+48(FP)      second result word
  //
  // Register roles:
  //   setup and loopBody:   R9 = v0, AX = v1, BX = v2, CX = v3
  //   from afterSwitch on:  SI = v0, AX = v1, CX = v2, DX = v3
  //   SI  = b_base and DI = byte offset into b (until afterSwitch)
  //   R8  = current 8-byte message word
  //   R10 = len(b) rounded down to a multiple of 8
  //   R11 = final block: len(b)<<56 OR'd with the 0..7 trailing bytes
  //
  // The result slots are addressed by their declared names (r0, r1) rather
  // than ret/ret1 so that `go vet` (asmdecl) can match them against the Go
  // prototype. TEXT flag 4 is the numeric value of NOSPLIT.
TEXT ·Hash128(SB),4,$0-56
	// State setup. The four 64-bit constants are the ASCII string
	// "somepseudorandomlygeneratedbytes" split into 8-byte pieces.
	MOVQ	k0+0(FP),CX
	MOVQ	$0x736F6D6570736575,R9
	MOVQ	k1+8(FP),DI
	MOVQ	$0x6C7967656E657261,BX
	MOVQ	$0x646F72616E646F6D,AX
	MOVQ	b_len+24(FP),DX
	XORQ	$0xEE,AX			// 128-bit variant tweak on v1
	MOVQ	DX,R11
	MOVQ	DX,R10
	XORQ	CX,R9				// v0 = const ^ k0
	XORQ	CX,BX				// v2 = const ^ k0
	MOVQ	$0x7465646279746573,CX
	XORQ	DI,AX				// v1 = const ^ 0xEE ^ k1
	XORQ	DI,CX				// v3 = const ^ k1
	SHLQ	$0x38,R11			// len(b) into the top byte of the final block
	XORQ	DI,DI				// byte offset = 0
	MOVQ	b_base+16(FP),SI
	ANDQ	$0xFFFFFFFFFFFFFFF8,R10		// number of bytes covered by whole words
	JE	afterLoop			// fewer than 8 bytes: no full word to absorb
	XCHGQ	AX,AX				// no-op retained from the gcc output

	// Absorb one 8-byte word per iteration: v3 ^= m, two rounds, v0 ^= m.
	// The two rounds are interleaved by the compiler's scheduling.
loopBody:
	MOVQ	0(SI)(DI*1),R8
	ADDQ	AX,R9
	RORQ	$0x33,AX
	XORQ	R9,AX
	RORQ	$0x20,R9
	ADDQ	$0x8,DI
	XORQ	R8,CX
	ADDQ	CX,BX
	RORQ	$0x30,CX
	XORQ	BX,CX
	ADDQ	AX,BX
	RORQ	$0x2F,AX
	ADDQ	CX,R9
	RORQ	$0x2B,CX
	XORQ	BX,AX
	XORQ	R9,CX
	RORQ	$0x20,BX
	ADDQ	AX,R9
	ADDQ	CX,BX
	RORQ	$0x33,AX
	RORQ	$0x30,CX
	XORQ	R9,AX
	XORQ	BX,CX
	RORQ	$0x20,R9
	ADDQ	AX,BX
	ADDQ	CX,R9
	RORQ	$0x2F,AX
	RORQ	$0x2B,CX
	XORQ	BX,AX
	RORQ	$0x20,BX
	XORQ	R9,CX
	XORQ	R8,R9
	CMPQ	R10,DI
	JA	loopBody			// continue while R10 > DI (unsigned)

afterLoop:
	SUBQ	R10,DX				// DX = number of trailing bytes (0..7)

	// DX cannot exceed 7 at this point, so this branch is never taken;
	// it is the range check of the original C switch statement.
	CMPQ	DX,$0x7
	JA	afterSwitch

	// no support for jump tables

	CMPQ	DX,$0x7
	JE	sw7
	CMPQ	DX,$0x6
	JE	sw6
	CMPQ	DX,$0x5
	JE	sw5
	CMPQ	DX,$0x4
	JE	sw4
	CMPQ	DX,$0x3
	JE	sw3
	CMPQ	DX,$0x2
	JE	sw2
	CMPQ	DX,$0x1
	JE	sw1
	JMP	afterSwitch

	// Each case ORs one trailing byte into its little-endian position in
	// R11 and falls through to the next lower case.
sw7:	MOVBQZX	6(SI)(DI*1),DX
	SHLQ	$0x30,DX
	ORQ	DX,R11
sw6:	MOVBQZX	0x5(SI)(DI*1),DX
	SHLQ	$0x28,DX
	ORQ	DX,R11
sw5:	MOVBQZX	0x4(SI)(DI*1),DX
	SHLQ	$0x20,DX
	ORQ	DX,R11
sw4:	MOVBQZX	0x3(SI)(DI*1),DX
	SHLQ	$0x18,DX
	ORQ	DX,R11
sw3:	MOVBQZX	0x2(SI)(DI*1),DX
	SHLQ	$0x10,DX
	ORQ	DX,R11
sw2:	MOVBQZX	0x1(SI)(DI*1),DX
	SHLQ	$0x8,DX
	ORQ	DX,R11
sw1:	MOVBQZX	0(SI)(DI*1),DX
	ORQ	DX,R11

	// Absorb the final block in R11: v3 ^= b, two rounds, v0 ^= b.
	// The state moves to SI = v0, AX = v1, CX = v2, DX = v3 here.
afterSwitch:
	LEAQ	(AX)(R9*1),SI
	XORQ	R11,CX
	RORQ	$0x33,AX
	ADDQ	CX,BX
	MOVQ	CX,DX
	XORQ	SI,AX
	RORQ	$0x30,DX
	RORQ	$0x20,SI
	LEAQ	0(BX)(AX*1),CX
	XORQ	BX,DX
	RORQ	$0x2F,AX
	ADDQ	DX,SI
	RORQ	$0x2B,DX
	XORQ	CX,AX
	XORQ	SI,DX
	RORQ	$0x20,CX

	ADDQ	AX,SI
	RORQ	$0x33,AX
	ADDQ	DX,CX
	XORQ	SI,AX
	RORQ	$0x30,DX
	RORQ	$0x20,SI
	XORQ	CX,DX
	ADDQ	AX,CX
	RORQ	$0x2F,AX
	ADDQ	DX,SI
	XORQ	CX,AX
	RORQ	$0x2B,DX
	RORQ	$0x20,CX
	XORQ	SI,DX
	XORQ	R11,SI				// v0 ^= final block

	// First finalization stage: v2 ^= 0xEE, then four rounds.
	XORB	$0xEE,CL

	ADDQ	AX,SI
	RORQ	$0x33,AX
	ADDQ	DX,CX
	RORQ	$0x30,DX
	XORQ	SI,AX
	XORQ	CX,DX
	RORQ	$0x20,SI
	ADDQ	AX,CX
	ADDQ	DX,SI
	RORQ	$0x2F,AX
	RORQ	$0x2B,DX
	XORQ	CX,AX
	XORQ	SI,DX
	RORQ	$0x20,CX

	ADDQ	AX,SI
	ADDQ	DX,CX
	RORQ	$0x33,AX
	RORQ	$0x30,DX
	XORQ	SI,AX
	RORQ	$0x20,SI
	XORQ	CX,DX
	ADDQ	AX,CX
	RORQ	$0x2F,AX
	ADDQ	DX,SI
	RORQ	$0x2B,DX
	XORQ	CX,AX
	XORQ	SI,DX
	RORQ	$0x20,CX

	ADDQ	AX,SI
	ADDQ	DX,CX
	RORQ	$0x33,AX
	RORQ	$0x30,DX
	XORQ	CX,DX
	XORQ	SI,AX
	RORQ	$0x20,SI
	ADDQ	DX,SI
	ADDQ	AX,CX
	RORQ	$0x2F,AX
	XORQ	CX,AX
	RORQ	$0x2B,DX
	RORQ	$0x20,CX
	XORQ	SI,DX

	// gcc optimized the tail end of this function differently. However,
	// we need to preserve our registers to carry out the second stage of
	// the finalization, so the fourth round is written out in full, the
	// same way as the earlier finalization rounds.
	ADDQ	AX,SI
	RORQ	$0x33,AX
	ADDQ	DX,CX
	RORQ	$0x30,DX
	XORQ	SI,AX
	XORQ	CX,DX
	RORQ	$0x20,SI
	ADDQ	AX,CX
	ADDQ	DX,SI
	RORQ	$0x2F,AX
	RORQ	$0x2B,DX
	XORQ	CX,AX
	XORQ	SI,DX
	RORQ	$0x20,CX

	// r0 = v0 ^ v1 ^ v2 ^ v3. The value is built in BX (gcc used AX) so
	// that AX/CX/DX/SI stay intact for the second stage.
	MOVQ	SI,BX
	XORQ	AX,BX
	XORQ	DX,BX
	XORQ	CX,BX
	MOVQ	BX,r0+40(FP)

	// Second finalization stage: v1 ^= 0xDD, then four more rounds.
	XORB	$0xDD,AL

	ADDQ	AX,SI
	RORQ	$0x33,AX
	ADDQ	DX,CX
	RORQ	$0x30,DX
	XORQ	SI,AX
	XORQ	CX,DX
	RORQ	$0x20,SI
	ADDQ	AX,CX
	ADDQ	DX,SI
	RORQ	$0x2F,AX
	RORQ	$0x2B,DX
	XORQ	CX,AX
	XORQ	SI,DX
	RORQ	$0x20,CX

	ADDQ	AX,SI
	ADDQ	DX,CX
	RORQ	$0x33,AX
	RORQ	$0x30,DX
	XORQ	SI,AX
	RORQ	$0x20,SI
	XORQ	CX,DX
	ADDQ	AX,CX
	RORQ	$0x2F,AX
	ADDQ	DX,SI
	RORQ	$0x2B,DX
	XORQ	CX,AX
	XORQ	SI,DX
	RORQ	$0x20,CX

	ADDQ	AX,SI
	ADDQ	DX,CX
	RORQ	$0x33,AX
	RORQ	$0x30,DX
	XORQ	CX,DX
	XORQ	SI,AX
	RORQ	$0x20,SI
	ADDQ	DX,SI
	ADDQ	AX,CX
	RORQ	$0x2F,AX
	XORQ	CX,AX
	RORQ	$0x2B,DX
	RORQ	$0x20,CX
	XORQ	SI,DX

	ADDQ	AX,SI
	RORQ	$0x33,AX
	ADDQ	DX,CX
	RORQ	$0x30,DX
	XORQ	SI,AX
	XORQ	CX,DX
	RORQ	$0x20,SI
	ADDQ	AX,CX
	ADDQ	DX,SI
	RORQ	$0x2F,AX
	RORQ	$0x2B,DX
	XORQ	CX,AX
	XORQ	SI,DX
	RORQ	$0x20,CX

	// r1 = v0 ^ v1 ^ v2 ^ v3 after the second stage.
	MOVQ	SI,BX
	XORQ	AX,BX
	XORQ	DX,BX
	XORQ	CX,BX
	MOVQ	BX,r1+48(FP)
	RET