- #!/usr/bin/env python3
- #
- # To the extent possible under law, Yawning Angel has waived all copyright
- # and related or neighboring rights to chacha20, using the Creative
- # Commons "CC0" public domain dedication. See LICENSE or
- # <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
- #
- # cgo sucks. Plan 9 assembly sucks. Real languages have SIMD intrinsics.
- # The least terrible option is to use a Python code generator, so
- # that's what I did.
- #
- # Code based on Ted Krovetz's vec128 C implementation, with corrections
- # to use a 64 bit counter instead of 32 bit, and to allow unaligned input and
- # output pointers.
- #
- # Dependencies: https://github.com/Maratyszcza/PeachPy
- #
- # python3 -m peachpy.x86_64 -mabi=goasm -S -o chacha20_amd64.s chacha20_amd64.py
- #
- from peachpy import *
- from peachpy.x86_64 import *
- x = Argument(ptr(uint32_t))
- inp = Argument(ptr(const_uint8_t))
- outp = Argument(ptr(uint8_t))
- nrBlocks = Argument(ptr(size_t))
- #
- # SSE2 helper functions. A temporary register is explicitly passed in because
- # the main fast loop uses every single register (and even spills) so manual
- # control is needed.
- #
- # This used to also have a DQROUNDS helper that did 2 rounds of ChaCha like
- # in the C code, but the C code has the luxury of an optimizer reordering
- # everything, while this does not.
- #
- def ROTW16_sse2(tmp, d):
- MOVDQA(tmp, d)
- PSLLD(tmp, 16)
- PSRLD(d, 16)
- PXOR(d, tmp)
- def ROTW12_sse2(tmp, b):
- MOVDQA(tmp, b)
- PSLLD(tmp, 12)
- PSRLD(b, 20)
- PXOR(b, tmp)
- def ROTW8_sse2(tmp, d):
- MOVDQA(tmp, d)
- PSLLD(tmp, 8)
- PSRLD(d, 24)
- PXOR(d, tmp)
- def ROTW7_sse2(tmp, b):
- MOVDQA(tmp, b)
- PSLLD(tmp, 7)
- PSRLD(b, 25)
- PXOR(b, tmp)
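- # NOTE: SSE2 has no packed 32 bit rotate, so each ROTWn_sse2 helper above
- # synthesizes rotl32(x, n) as (x << n) | (x >> (32 - n)) with a shift pair
- # and a PXOR; the caller-supplied tmp register keeps the helpers from
- # clobbering live state.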
- def WriteXor_sse2(tmp, inp, outp, d, v0, v1, v2, v3):
- MOVDQU(tmp, [inp+d])
- PXOR(tmp, v0)
- MOVDQU([outp+d], tmp)
- MOVDQU(tmp, [inp+d+16])
- PXOR(tmp, v1)
- MOVDQU([outp+d+16], tmp)
- MOVDQU(tmp, [inp+d+32])
- PXOR(tmp, v2)
- MOVDQU([outp+d+32], tmp)
- MOVDQU(tmp, [inp+d+48])
- PXOR(tmp, v3)
- MOVDQU([outp+d+48], tmp)
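- # Reference only, never emitted into the assembly: a minimal pure-Python
- # sketch of what each blocks function below is intended to compute,
- # assuming the usual ChaCha20 layout in x[0..15] (constants, key, 64 bit
- # little-endian block counter in x[12..13], nonce in x[14..15]). x is a
- # list of 16 ints and, like the real state, has its counter updated in
- # place. Handy for spot-checking the output against known-answer tests.
- def _chacha20_blocks_ref(x, inp, nr_blocks):
-     def rotl32(v, n):
-         return ((v << n) | (v >> (32 - n))) & 0xffffffff
-     def quarter(s, a, b, c, d):
-         s[a] = (s[a] + s[b]) & 0xffffffff; s[d] = rotl32(s[d] ^ s[a], 16)
-         s[c] = (s[c] + s[d]) & 0xffffffff; s[b] = rotl32(s[b] ^ s[c], 12)
-         s[a] = (s[a] + s[b]) & 0xffffffff; s[d] = rotl32(s[d] ^ s[a], 8)
-         s[c] = (s[c] + s[d]) & 0xffffffff; s[b] = rotl32(s[b] ^ s[c], 7)
-     out = bytearray(nr_blocks * 64)
-     for blk in range(nr_blocks):
-         s = list(x)
-         for _ in range(10):  # 20 rounds == 10 double rounds
-             quarter(s, 0, 4, 8, 12); quarter(s, 1, 5, 9, 13)
-             quarter(s, 2, 6, 10, 14); quarter(s, 3, 7, 11, 15)
-             quarter(s, 0, 5, 10, 15); quarter(s, 1, 6, 11, 12)
-             quarter(s, 2, 7, 8, 13); quarter(s, 3, 4, 9, 14)
-         for i in range(16):
-             word = (s[i] + x[i]) & 0xffffffff
-             for j in range(4):
-                 off = blk * 64 + i * 4 + j
-                 out[off] = inp[off] ^ ((word >> (8 * j)) & 0xff)
-         ctr = ((x[13] << 32) | x[12]) + 1  # 64 bit block counter
-         x[12], x[13] = ctr & 0xffffffff, (ctr >> 32) & 0xffffffff
-     return bytes(out)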
- # SSE2 ChaCha20 (aka vec128). Does not handle partial blocks, and will
- # process 4/2/1 blocks at a time.
- with Function("blocksAmd64SSE2", (x, inp, outp, nrBlocks)):
- reg_x = GeneralPurposeRegister64()
- reg_inp = GeneralPurposeRegister64()
- reg_outp = GeneralPurposeRegister64()
- reg_blocks = GeneralPurposeRegister64()
- reg_sp_save = GeneralPurposeRegister64()
- LOAD.ARGUMENT(reg_x, x)
- LOAD.ARGUMENT(reg_inp, inp)
- LOAD.ARGUMENT(reg_outp, outp)
- LOAD.ARGUMENT(reg_blocks, nrBlocks)
- # Align the stack to a 32 byte boundary.
- MOV(reg_sp_save, registers.rsp)
- AND(registers.rsp, 0xffffffffffffffe0)
- SUB(registers.rsp, 0x20)
- # Build the counter increment vector on the stack, and allocate the scratch
- # space
- xmm_v0 = XMMRegister()
- PXOR(xmm_v0, xmm_v0)
- SUB(registers.rsp, 16+16)
- MOVDQA([registers.rsp], xmm_v0)
- reg_tmp = GeneralPurposeRegister32()
- MOV(reg_tmp, 0x00000001)
- MOV([registers.rsp], reg_tmp)
- mem_one = [registers.rsp] # (Stack) Counter increment vector
- mem_tmp0 = [registers.rsp+16] # (Stack) Scratch space.
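- # mem_one is (1, 0) viewed as two uint64s, so PADDQ against it bumps only
- # the 64 bit block counter in state words [12..13] and leaves the nonce in
- # words [14..15] untouched.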
- mem_s0 = [reg_x] # (Memory) Cipher state [0..3]
- mem_s1 = [reg_x+16] # (Memory) Cipher state [4..7]
- mem_s2 = [reg_x+32] # (Memory) Cipher state [8..11]
- mem_s3 = [reg_x+48] # (Memory) Cipher state [12..15]
- # xmm_v0 allocated above...
- xmm_v1 = XMMRegister()
- xmm_v2 = XMMRegister()
- xmm_v3 = XMMRegister()
- xmm_v4 = XMMRegister()
- xmm_v5 = XMMRegister()
- xmm_v6 = XMMRegister()
- xmm_v7 = XMMRegister()
- xmm_v8 = XMMRegister()
- xmm_v9 = XMMRegister()
- xmm_v10 = XMMRegister()
- xmm_v11 = XMMRegister()
- xmm_v12 = XMMRegister()
- xmm_v13 = XMMRegister()
- xmm_v14 = XMMRegister()
- xmm_v15 = XMMRegister()
- xmm_tmp = xmm_v12
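- # NOTE: xmm_v12 does double duty as the scratch register in the 4 block
- # loop; whenever it is needed as a temporary it is spilled to mem_tmp0
- # (the Save/Restore pairs below) and reloaded afterwards.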
- #
- # 4 blocks at a time.
- #
- reg_rounds = GeneralPurposeRegister64()
- vector_loop4 = Loop()
- SUB(reg_blocks, 4)
- JB(vector_loop4.end)
- with vector_loop4:
- MOVDQU(xmm_v0, mem_s0)
- MOVDQU(xmm_v1, mem_s1)
- MOVDQU(xmm_v2, mem_s2)
- MOVDQU(xmm_v3, mem_s3)
- MOVDQA(xmm_v4, xmm_v0)
- MOVDQA(xmm_v5, xmm_v1)
- MOVDQA(xmm_v6, xmm_v2)
- MOVDQA(xmm_v7, xmm_v3)
- PADDQ(xmm_v7, mem_one)
- MOVDQA(xmm_v8, xmm_v0)
- MOVDQA(xmm_v9, xmm_v1)
- MOVDQA(xmm_v10, xmm_v2)
- MOVDQA(xmm_v11, xmm_v7)
- PADDQ(xmm_v11, mem_one)
- MOVDQA(xmm_v12, xmm_v0)
- MOVDQA(xmm_v13, xmm_v1)
- MOVDQA(xmm_v14, xmm_v2)
- MOVDQA(xmm_v15, xmm_v11)
- PADDQ(xmm_v15, mem_one)
- MOV(reg_rounds, 20)
- rounds_loop4 = Loop()
- with rounds_loop4:
- # a += b; d ^= a; d = ROTW16(d);
- PADDD(xmm_v0, xmm_v1)
- PADDD(xmm_v4, xmm_v5)
- PADDD(xmm_v8, xmm_v9)
- PADDD(xmm_v12, xmm_v13)
- PXOR(xmm_v3, xmm_v0)
- PXOR(xmm_v7, xmm_v4)
- PXOR(xmm_v11, xmm_v8)
- PXOR(xmm_v15, xmm_v12)
- MOVDQA(mem_tmp0, xmm_tmp) # Save
- ROTW16_sse2(xmm_tmp, xmm_v3)
- ROTW16_sse2(xmm_tmp, xmm_v7)
- ROTW16_sse2(xmm_tmp, xmm_v11)
- ROTW16_sse2(xmm_tmp, xmm_v15)
- # c += d; b ^= c; b = ROTW12(b);
- PADDD(xmm_v2, xmm_v3)
- PADDD(xmm_v6, xmm_v7)
- PADDD(xmm_v10, xmm_v11)
- PADDD(xmm_v14, xmm_v15)
- PXOR(xmm_v1, xmm_v2)
- PXOR(xmm_v5, xmm_v6)
- PXOR(xmm_v9, xmm_v10)
- PXOR(xmm_v13, xmm_v14)
- ROTW12_sse2(xmm_tmp, xmm_v1)
- ROTW12_sse2(xmm_tmp, xmm_v5)
- ROTW12_sse2(xmm_tmp, xmm_v9)
- ROTW12_sse2(xmm_tmp, xmm_v13)
- # a += b; d ^= a; d = ROTW8(d);
- MOVDQA(xmm_tmp, mem_tmp0) # Restore
- PADDD(xmm_v0, xmm_v1)
- PADDD(xmm_v4, xmm_v5)
- PADDD(xmm_v8, xmm_v9)
- PADDD(xmm_v12, xmm_v13)
- PXOR(xmm_v3, xmm_v0)
- PXOR(xmm_v7, xmm_v4)
- PXOR(xmm_v11, xmm_v8)
- PXOR(xmm_v15, xmm_v12)
- MOVDQA(mem_tmp0, xmm_tmp) # Save
- ROTW8_sse2(xmm_tmp, xmm_v3)
- ROTW8_sse2(xmm_tmp, xmm_v7)
- ROTW8_sse2(xmm_tmp, xmm_v11)
- ROTW8_sse2(xmm_tmp, xmm_v15)
- # c += d; b ^= c; b = ROTW7(b)
- PADDD(xmm_v2, xmm_v3)
- PADDD(xmm_v6, xmm_v7)
- PADDD(xmm_v10, xmm_v11)
- PADDD(xmm_v14, xmm_v15)
- PXOR(xmm_v1, xmm_v2)
- PXOR(xmm_v5, xmm_v6)
- PXOR(xmm_v9, xmm_v10)
- PXOR(xmm_v13, xmm_v14)
- ROTW7_sse2(xmm_tmp, xmm_v1)
- ROTW7_sse2(xmm_tmp, xmm_v5)
- ROTW7_sse2(xmm_tmp, xmm_v9)
- ROTW7_sse2(xmm_tmp, xmm_v13)
- # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
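- # The PSHUFD immediates rotate the 32 bit lanes: 0x39, 0x4e and 0x93 give
- # dst[i] = src[(i+1)%4], src[(i+2)%4] and src[(i+3)%4] respectively, moving
- # the state from column order to diagonal order; the second half of the
- # double round undoes this with the inverse shuffles (0x93/0x4e/0x39).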
- PSHUFD(xmm_v1, xmm_v1, 0x39)
- PSHUFD(xmm_v5, xmm_v5, 0x39)
- PSHUFD(xmm_v9, xmm_v9, 0x39)
- PSHUFD(xmm_v13, xmm_v13, 0x39)
- PSHUFD(xmm_v2, xmm_v2, 0x4e)
- PSHUFD(xmm_v6, xmm_v6, 0x4e)
- PSHUFD(xmm_v10, xmm_v10, 0x4e)
- PSHUFD(xmm_v14, xmm_v14, 0x4e)
- PSHUFD(xmm_v3, xmm_v3, 0x93)
- PSHUFD(xmm_v7, xmm_v7, 0x93)
- PSHUFD(xmm_v11, xmm_v11, 0x93)
- PSHUFD(xmm_v15, xmm_v15, 0x93)
- MOVDQA(xmm_tmp, mem_tmp0) # Restore
- # a += b; d ^= a; d = ROTW16(d);
- PADDD(xmm_v0, xmm_v1)
- PADDD(xmm_v4, xmm_v5)
- PADDD(xmm_v8, xmm_v9)
- PADDD(xmm_v12, xmm_v13)
- PXOR(xmm_v3, xmm_v0)
- PXOR(xmm_v7, xmm_v4)
- PXOR(xmm_v11, xmm_v8)
- PXOR(xmm_v15, xmm_v12)
- MOVDQA(mem_tmp0, xmm_tmp) # Save
- ROTW16_sse2(xmm_tmp, xmm_v3)
- ROTW16_sse2(xmm_tmp, xmm_v7)
- ROTW16_sse2(xmm_tmp, xmm_v11)
- ROTW16_sse2(xmm_tmp, xmm_v15)
- # c += d; b ^= c; b = ROTW12(b);
- PADDD(xmm_v2, xmm_v3)
- PADDD(xmm_v6, xmm_v7)
- PADDD(xmm_v10, xmm_v11)
- PADDD(xmm_v14, xmm_v15)
- PXOR(xmm_v1, xmm_v2)
- PXOR(xmm_v5, xmm_v6)
- PXOR(xmm_v9, xmm_v10)
- PXOR(xmm_v13, xmm_v14)
- ROTW12_sse2(xmm_tmp, xmm_v1)
- ROTW12_sse2(xmm_tmp, xmm_v5)
- ROTW12_sse2(xmm_tmp, xmm_v9)
- ROTW12_sse2(xmm_tmp, xmm_v13)
- # a += b; d ^= a; d = ROTW8(d);
- MOVDQA(xmm_tmp, mem_tmp0) # Restore
- PADDD(xmm_v0, xmm_v1)
- PADDD(xmm_v4, xmm_v5)
- PADDD(xmm_v8, xmm_v9)
- PADDD(xmm_v12, xmm_v13)
- PXOR(xmm_v3, xmm_v0)
- PXOR(xmm_v7, xmm_v4)
- PXOR(xmm_v11, xmm_v8)
- PXOR(xmm_v15, xmm_v12)
- MOVDQA(mem_tmp0, xmm_tmp) # Save
- ROTW8_sse2(xmm_tmp, xmm_v3)
- ROTW8_sse2(xmm_tmp, xmm_v7)
- ROTW8_sse2(xmm_tmp, xmm_v11)
- ROTW8_sse2(xmm_tmp, xmm_v15)
- # c += d; b ^= c; b = ROTW7(b)
- PADDD(xmm_v2, xmm_v3)
- PADDD(xmm_v6, xmm_v7)
- PADDD(xmm_v10, xmm_v11)
- PADDD(xmm_v14, xmm_v15)
- PXOR(xmm_v1, xmm_v2)
- PXOR(xmm_v5, xmm_v6)
- PXOR(xmm_v9, xmm_v10)
- PXOR(xmm_v13, xmm_v14)
- ROTW7_sse2(xmm_tmp, xmm_v1)
- ROTW7_sse2(xmm_tmp, xmm_v5)
- ROTW7_sse2(xmm_tmp, xmm_v9)
- ROTW7_sse2(xmm_tmp, xmm_v13)
- # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
- PSHUFD(xmm_v1, xmm_v1, 0x93)
- PSHUFD(xmm_v5, xmm_v5, 0x93)
- PSHUFD(xmm_v9, xmm_v9, 0x93)
- PSHUFD(xmm_v13, xmm_v13, 0x93)
- PSHUFD(xmm_v2, xmm_v2, 0x4e)
- PSHUFD(xmm_v6, xmm_v6, 0x4e)
- PSHUFD(xmm_v10, xmm_v10, 0x4e)
- PSHUFD(xmm_v14, xmm_v14, 0x4e)
- PSHUFD(xmm_v3, xmm_v3, 0x39)
- PSHUFD(xmm_v7, xmm_v7, 0x39)
- PSHUFD(xmm_v11, xmm_v11, 0x39)
- PSHUFD(xmm_v15, xmm_v15, 0x39)
- MOVDQA(xmm_tmp, mem_tmp0) # Restore
- SUB(reg_rounds, 2)
- JNZ(rounds_loop4.begin)
- MOVDQA(mem_tmp0, xmm_tmp)
- PADDD(xmm_v0, mem_s0)
- PADDD(xmm_v1, mem_s1)
- PADDD(xmm_v2, mem_s2)
- PADDD(xmm_v3, mem_s3)
- WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
- MOVDQU(xmm_v3, mem_s3)
- PADDQ(xmm_v3, mem_one)
- PADDD(xmm_v4, mem_s0)
- PADDD(xmm_v5, mem_s1)
- PADDD(xmm_v6, mem_s2)
- PADDD(xmm_v7, xmm_v3)
- WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7)
- PADDQ(xmm_v3, mem_one)
- PADDD(xmm_v8, mem_s0)
- PADDD(xmm_v9, mem_s1)
- PADDD(xmm_v10, mem_s2)
- PADDD(xmm_v11, xmm_v3)
- WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 128, xmm_v8, xmm_v9, xmm_v10, xmm_v11)
- PADDQ(xmm_v3, mem_one)
- MOVDQA(xmm_tmp, mem_tmp0)
- PADDD(xmm_v12, mem_s0)
- PADDD(xmm_v13, mem_s1)
- PADDD(xmm_v14, mem_s2)
- PADDD(xmm_v15, xmm_v3)
- WriteXor_sse2(xmm_v0, reg_inp, reg_outp, 192, xmm_v12, xmm_v13, xmm_v14, xmm_v15)
- PADDQ(xmm_v3, mem_one)
- MOVDQU(mem_s3, xmm_v3)
- ADD(reg_inp, 4 * 64)
- ADD(reg_outp, 4 * 64)
- SUB(reg_blocks, 4)
- JAE(vector_loop4.begin)
- ADD(reg_blocks, 4)
- out = Label()
- JZ(out)
- # Past this point, we no longer need to use every single register to hold
- # the in progress state.
- xmm_s0 = xmm_v8
- xmm_s1 = xmm_v9
- xmm_s2 = xmm_v10
- xmm_s3 = xmm_v11
- xmm_one = xmm_v13
- MOVDQU(xmm_s0, mem_s0)
- MOVDQU(xmm_s1, mem_s1)
- MOVDQU(xmm_s2, mem_s2)
- MOVDQU(xmm_s3, mem_s3)
- MOVDQA(xmm_one, mem_one)
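- # From here on the state rows are cached in xmm_s0..xmm_s3; xmm_s3 carries
- # the live block counter and is flushed back to mem_s3 at out_serial below.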
- #
- # 2 blocks at a time.
- #
- process_1_block = Label()
- SUB(reg_blocks, 2)
- JB(process_1_block) # < 2 blocks remaining.
- MOVDQA(xmm_v0, xmm_s0)
- MOVDQA(xmm_v1, xmm_s1)
- MOVDQA(xmm_v2, xmm_s2)
- MOVDQA(xmm_v3, xmm_s3)
- MOVDQA(xmm_v4, xmm_v0)
- MOVDQA(xmm_v5, xmm_v1)
- MOVDQA(xmm_v6, xmm_v2)
- MOVDQA(xmm_v7, xmm_v3)
- PADDQ(xmm_v7, xmm_one)
- MOV(reg_rounds, 20)
- rounds_loop2 = Loop()
- with rounds_loop2:
- # a += b; d ^= a; d = ROTW16(d);
- PADDD(xmm_v0, xmm_v1)
- PADDD(xmm_v4, xmm_v5)
- PXOR(xmm_v3, xmm_v0)
- PXOR(xmm_v7, xmm_v4)
- ROTW16_sse2(xmm_tmp, xmm_v3)
- ROTW16_sse2(xmm_tmp, xmm_v7)
- # c += d; b ^= c; b = ROTW12(b);
- PADDD(xmm_v2, xmm_v3)
- PADDD(xmm_v6, xmm_v7)
- PXOR(xmm_v1, xmm_v2)
- PXOR(xmm_v5, xmm_v6)
- ROTW12_sse2(xmm_tmp, xmm_v1)
- ROTW12_sse2(xmm_tmp, xmm_v5)
- # a += b; d ^= a; d = ROTW8(d);
- PADDD(xmm_v0, xmm_v1)
- PADDD(xmm_v4, xmm_v5)
- PXOR(xmm_v3, xmm_v0)
- PXOR(xmm_v7, xmm_v4)
- ROTW8_sse2(xmm_tmp, xmm_v3)
- ROTW8_sse2(xmm_tmp, xmm_v7)
- # c += d; b ^= c; b = ROTW7(b)
- PADDD(xmm_v2, xmm_v3)
- PADDD(xmm_v6, xmm_v7)
- PXOR(xmm_v1, xmm_v2)
- PXOR(xmm_v5, xmm_v6)
- ROTW7_sse2(xmm_tmp, xmm_v1)
- ROTW7_sse2(xmm_tmp, xmm_v5)
- # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
- PSHUFD(xmm_v1, xmm_v1, 0x39)
- PSHUFD(xmm_v5, xmm_v5, 0x39)
- PSHUFD(xmm_v2, xmm_v2, 0x4e)
- PSHUFD(xmm_v6, xmm_v6, 0x4e)
- PSHUFD(xmm_v3, xmm_v3, 0x93)
- PSHUFD(xmm_v7, xmm_v7, 0x93)
- # a += b; d ^= a; d = ROTW16(d);
- PADDD(xmm_v0, xmm_v1)
- PADDD(xmm_v4, xmm_v5)
- PXOR(xmm_v3, xmm_v0)
- PXOR(xmm_v7, xmm_v4)
- ROTW16_sse2(xmm_tmp, xmm_v3)
- ROTW16_sse2(xmm_tmp, xmm_v7)
- # c += d; b ^= c; b = ROTW12(b);
- PADDD(xmm_v2, xmm_v3)
- PADDD(xmm_v6, xmm_v7)
- PXOR(xmm_v1, xmm_v2)
- PXOR(xmm_v5, xmm_v6)
- ROTW12_sse2(xmm_tmp, xmm_v1)
- ROTW12_sse2(xmm_tmp, xmm_v5)
- # a += b; d ^= a; d = ROTW8(d);
- PADDD(xmm_v0, xmm_v1)
- PADDD(xmm_v4, xmm_v5)
- PXOR(xmm_v3, xmm_v0)
- PXOR(xmm_v7, xmm_v4)
- ROTW8_sse2(xmm_tmp, xmm_v3)
- ROTW8_sse2(xmm_tmp, xmm_v7)
- # c += d; b ^= c; b = ROTW7(b)
- PADDD(xmm_v2, xmm_v3)
- PADDD(xmm_v6, xmm_v7)
- PXOR(xmm_v1, xmm_v2)
- PXOR(xmm_v5, xmm_v6)
- ROTW7_sse2(xmm_tmp, xmm_v1)
- ROTW7_sse2(xmm_tmp, xmm_v5)
- # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
- PSHUFD(xmm_v1, xmm_v1, 0x93)
- PSHUFD(xmm_v5, xmm_v5, 0x93)
- PSHUFD(xmm_v2, xmm_v2, 0x4e)
- PSHUFD(xmm_v6, xmm_v6, 0x4e)
- PSHUFD(xmm_v3, xmm_v3, 0x39)
- PSHUFD(xmm_v7, xmm_v7, 0x39)
- SUB(reg_rounds, 2)
- JNZ(rounds_loop2.begin)
- PADDD(xmm_v0, xmm_s0)
- PADDD(xmm_v1, xmm_s1)
- PADDD(xmm_v2, xmm_s2)
- PADDD(xmm_v3, xmm_s3)
- WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
- PADDQ(xmm_s3, xmm_one)
- PADDD(xmm_v4, xmm_s0)
- PADDD(xmm_v5, xmm_s1)
- PADDD(xmm_v6, xmm_s2)
- PADDD(xmm_v7, xmm_s3)
- WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7)
- PADDQ(xmm_s3, xmm_one)
- ADD(reg_inp, 2 * 64)
- ADD(reg_outp, 2 * 64)
- SUB(reg_blocks, 2)
- LABEL(process_1_block)
- ADD(reg_blocks, 2)
- out_serial = Label()
- JZ(out_serial)
- #
- # 1 block at a time. Only executed once, because if there were more than
- # one block left, the parallel code above would have already processed them.
- #
- MOVDQA(xmm_v0, xmm_s0)
- MOVDQA(xmm_v1, xmm_s1)
- MOVDQA(xmm_v2, xmm_s2)
- MOVDQA(xmm_v3, xmm_s3)
- MOV(reg_rounds, 20)
- rounds_loop1 = Loop()
- with rounds_loop1:
- # a += b; d ^= a; d = ROTW16(d);
- PADDD(xmm_v0, xmm_v1)
- PXOR(xmm_v3, xmm_v0)
- ROTW16_sse2(xmm_tmp, xmm_v3)
- # c += d; b ^= c; b = ROTW12(b);
- PADDD(xmm_v2, xmm_v3)
- PXOR(xmm_v1, xmm_v2)
- ROTW12_sse2(xmm_tmp, xmm_v1)
- # a += b; d ^= a; d = ROTW8(d);
- PADDD(xmm_v0, xmm_v1)
- PXOR(xmm_v3, xmm_v0)
- ROTW8_sse2(xmm_tmp, xmm_v3)
- # c += d; b ^= c; b = ROTW7(b)
- PADDD(xmm_v2, xmm_v3)
- PXOR(xmm_v1, xmm_v2)
- ROTW7_sse2(xmm_tmp, xmm_v1)
- # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
- PSHUFD(xmm_v1, xmm_v1, 0x39)
- PSHUFD(xmm_v2, xmm_v2, 0x4e)
- PSHUFD(xmm_v3, xmm_v3, 0x93)
- # a += b; d ^= a; d = ROTW16(d);
- PADDD(xmm_v0, xmm_v1)
- PXOR(xmm_v3, xmm_v0)
- ROTW16_sse2(xmm_tmp, xmm_v3)
- # c += d; b ^= c; b = ROTW12(b);
- PADDD(xmm_v2, xmm_v3)
- PXOR(xmm_v1, xmm_v2)
- ROTW12_sse2(xmm_tmp, xmm_v1)
- # a += b; d ^= a; d = ROTW8(d);
- PADDD(xmm_v0, xmm_v1)
- PXOR(xmm_v3, xmm_v0)
- ROTW8_sse2(xmm_tmp, xmm_v3)
- # c += d; b ^= c; b = ROTW7(b)
- PADDD(xmm_v2, xmm_v3)
- PXOR(xmm_v1, xmm_v2)
- ROTW7_sse2(xmm_tmp, xmm_v1)
- # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
- PSHUFD(xmm_v1, xmm_v1, 0x93)
- PSHUFD(xmm_v2, xmm_v2, 0x4e)
- PSHUFD(xmm_v3, xmm_v3, 0x39)
- SUB(reg_rounds, 2)
- JNZ(rounds_loop1.begin)
- PADDD(xmm_v0, xmm_s0)
- PADDD(xmm_v1, xmm_s1)
- PADDD(xmm_v2, xmm_s2)
- PADDD(xmm_v3, xmm_s3)
- WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
- PADDQ(xmm_s3, xmm_one)
- LABEL(out_serial)
- # Write back the updated counter. Stopping at 2^70 bytes is the user's
- # problem, not mine. (Skipped if there's exactly a multiple of 4 blocks
- # because the counter is incremented in memory while looping.)
- MOVDQU(mem_s3, xmm_s3)
- LABEL(out)
- # Paranoia, cleanse the scratch space.
- PXOR(xmm_v0, xmm_v0)
- MOVDQA(mem_tmp0, xmm_v0)
- # Remove our stack allocation.
- MOV(registers.rsp, reg_sp_save)
- RETURN()
- #
- # AVX2 helpers. Like the SSE2 equivalents, the scratch register is explicit,
- # and more helpers are used to increase readability for destructive operations.
- #
- # XXX/Performance: ROTW16_avx2/ROTW8_avx2 both can use VPSHUFB.
- #
- def ADD_avx2(dst, src):
- VPADDD(dst, dst, src)
- def XOR_avx2(dst, src):
- VPXOR(dst, dst, src)
- def ROTW16_avx2(tmp, d):
- VPSLLD(tmp, d, 16)
- VPSRLD(d, d, 16)
- XOR_avx2(d, tmp)
- def ROTW12_avx2(tmp, b):
- VPSLLD(tmp, b, 12)
- VPSRLD(b, b, 20)
- XOR_avx2(b, tmp)
- def ROTW8_avx2(tmp, d):
- VPSLLD(tmp, d, 8)
- VPSRLD(d, d, 24)
- XOR_avx2(d, tmp)
- def ROTW7_avx2(tmp, b):
- VPSLLD(tmp, b, 7)
- VPSRLD(b, b, 25)
- XOR_avx2(b, tmp)
- def WriteXor_avx2(tmp, inp, outp, d, v0, v1, v2, v3):
- # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20));
- VPERM2I128(tmp, v0, v1, 0x20)
- VPXOR(tmp, tmp, [inp+d])
- VMOVDQU([outp+d], tmp)
- # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20));
- VPERM2I128(tmp, v2, v3, 0x20)
- VPXOR(tmp, tmp, [inp+d+32])
- VMOVDQU([outp+d+32], tmp)
- # XOR_WRITE(out+64, in+64, _mm256_permute2x128_si256(v0,v1,0x31));
- VPERM2I128(tmp, v0, v1, 0x31)
- VPXOR(tmp, tmp, [inp+d+64])
- VMOVDQU([outp+d+64], tmp)
- # XOR_WRITE(out+96, in+96, _mm256_permute2x128_si256(v2,v3,0x31));
- VPERM2I128(tmp, v2, v3, 0x31)
- VPXOR(tmp, tmp, [inp+d+96])
- VMOVDQU([outp+d+96], tmp)
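- # In the AVX2 path each ymm register holds the same state row for two
- # consecutive blocks (block n in the low 128 bit lane, block n+1 in the
- # high lane), so WriteXor_avx2 uses VPERM2I128 with 0x20/0x31 to gather
- # the low/high lanes of a row pair back into memory order before XORing.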
- # AVX2 ChaCha20 (aka avx2). Does not handle partial blocks, will process
- # 8/4/2 blocks at a time.
- with Function("blocksAmd64AVX2", (x, inp, outp, nrBlocks), target=uarch.broadwell):
- reg_x = GeneralPurposeRegister64()
- reg_inp = GeneralPurposeRegister64()
- reg_outp = GeneralPurposeRegister64()
- reg_blocks = GeneralPurposeRegister64()
- reg_sp_save = GeneralPurposeRegister64()
- LOAD.ARGUMENT(reg_x, x)
- LOAD.ARGUMENT(reg_inp, inp)
- LOAD.ARGUMENT(reg_outp, outp)
- LOAD.ARGUMENT(reg_blocks, nrBlocks)
- # Align the stack to a 32 byte boundary.
- MOV(reg_sp_save, registers.rsp)
- AND(registers.rsp, 0xffffffffffffffe0)
- SUB(registers.rsp, 0x20)
- x_s0 = [reg_x] # (Memory) Cipher state [0..3]
- x_s1 = [reg_x+16] # (Memory) Cipher state [4..7]
- x_s2 = [reg_x+32] # (Memory) Cipher state [8..11]
- x_s3 = [reg_x+48] # (Memory) Cipher state [12..15]
- ymm_v0 = YMMRegister()
- ymm_v1 = YMMRegister()
- ymm_v2 = YMMRegister()
- ymm_v3 = YMMRegister()
- ymm_v4 = YMMRegister()
- ymm_v5 = YMMRegister()
- ymm_v6 = YMMRegister()
- ymm_v7 = YMMRegister()
- ymm_v8 = YMMRegister()
- ymm_v9 = YMMRegister()
- ymm_v10 = YMMRegister()
- ymm_v11 = YMMRegister()
- ymm_v12 = YMMRegister()
- ymm_v13 = YMMRegister()
- ymm_v14 = YMMRegister()
- ymm_v15 = YMMRegister()
- ymm_tmp0 = ymm_v12
- # Allocate the necessary stack space for the counter vector and two ymm
- # registers that we will spill.
- SUB(registers.rsp, 96)
- mem_tmp0 = [registers.rsp+64] # (Stack) Scratch space.
- mem_s3 = [registers.rsp+32] # (Stack) Working copy of s3. (8x)
- mem_inc = [registers.rsp] # (Stack) Counter increment vector.
- # Increment the block counter for the second (high) lane of the state vector.
- VPXOR(ymm_tmp0, ymm_tmp0, ymm_tmp0)
- VMOVDQU(mem_inc, ymm_tmp0)
- reg_tmp = GeneralPurposeRegister32()
- MOV(reg_tmp, 0x00000001)
- MOV([registers.rsp+16], reg_tmp)
- VBROADCASTI128(ymm_v3, x_s3)
- VPADDQ(ymm_v3, ymm_v3, [registers.rsp])
- VMOVDQA(mem_s3, ymm_v3)
- # Since we process 2xN blocks at a time, the counter increment for both
- # sides of the state vector is 2.
- MOV(reg_tmp, 0x00000002)
- MOV([registers.rsp], reg_tmp)
- MOV([registers.rsp+16], reg_tmp)
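- # mem_s3 now holds row 3 for two consecutive blocks (counter n in the low
- # lane, n+1 in the high lane), and mem_inc holds 2 in the counter position
- # of both lanes, so a single VPADDQ advances both lanes by two blocks.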
- out_write_even = Label()
- out_write_odd = Label()
- #
- # 8 blocks at a time. Ted Krovetz's avx2 code does not do this, but it's
- # a decent gain despite all the pain...
- #
- reg_rounds = GeneralPurposeRegister64()
- vector_loop8 = Loop()
- SUB(reg_blocks, 8)
- JB(vector_loop8.end)
- with vector_loop8:
- VBROADCASTI128(ymm_v0, x_s0)
- VBROADCASTI128(ymm_v1, x_s1)
- VBROADCASTI128(ymm_v2, x_s2)
- VMOVDQA(ymm_v3, mem_s3)
- VMOVDQA(ymm_v4, ymm_v0)
- VMOVDQA(ymm_v5, ymm_v1)
- VMOVDQA(ymm_v6, ymm_v2)
- VPADDQ(ymm_v7, ymm_v3, mem_inc)
- VMOVDQA(ymm_v8, ymm_v0)
- VMOVDQA(ymm_v9, ymm_v1)
- VMOVDQA(ymm_v10, ymm_v2)
- VPADDQ(ymm_v11, ymm_v7, mem_inc)
- VMOVDQA(ymm_v12, ymm_v0)
- VMOVDQA(ymm_v13, ymm_v1)
- VMOVDQA(ymm_v14, ymm_v2)
- VPADDQ(ymm_v15, ymm_v11, mem_inc)
- MOV(reg_rounds, 20)
- rounds_loop8 = Loop()
- with rounds_loop8:
- # a += b; d ^= a; d = ROTW16(d);
- ADD_avx2(ymm_v0, ymm_v1)
- ADD_avx2(ymm_v4, ymm_v5)
- ADD_avx2(ymm_v8, ymm_v9)
- ADD_avx2(ymm_v12, ymm_v13)
- XOR_avx2(ymm_v3, ymm_v0)
- XOR_avx2(ymm_v7, ymm_v4)
- XOR_avx2(ymm_v11, ymm_v8)
- XOR_avx2(ymm_v15, ymm_v12)
- VMOVDQA(mem_tmp0, ymm_tmp0) # Save
- ROTW16_avx2(ymm_tmp0, ymm_v3)
- ROTW16_avx2(ymm_tmp0, ymm_v7)
- ROTW16_avx2(ymm_tmp0, ymm_v11)
- ROTW16_avx2(ymm_tmp0, ymm_v15)
- # c += d; b ^= c; b = ROTW12(b);
- ADD_avx2(ymm_v2, ymm_v3)
- ADD_avx2(ymm_v6, ymm_v7)
- ADD_avx2(ymm_v10, ymm_v11)
- ADD_avx2(ymm_v14, ymm_v15)
- XOR_avx2(ymm_v1, ymm_v2)
- XOR_avx2(ymm_v5, ymm_v6)
- XOR_avx2(ymm_v9, ymm_v10)
- XOR_avx2(ymm_v13, ymm_v14)
- ROTW12_avx2(ymm_tmp0, ymm_v1)
- ROTW12_avx2(ymm_tmp0, ymm_v5)
- ROTW12_avx2(ymm_tmp0, ymm_v9)
- ROTW12_avx2(ymm_tmp0, ymm_v13)
- # a += b; d ^= a; d = ROTW8(d);
- VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
- ADD_avx2(ymm_v0, ymm_v1)
- ADD_avx2(ymm_v4, ymm_v5)
- ADD_avx2(ymm_v8, ymm_v9)
- ADD_avx2(ymm_v12, ymm_v13)
- XOR_avx2(ymm_v3, ymm_v0)
- XOR_avx2(ymm_v7, ymm_v4)
- XOR_avx2(ymm_v11, ymm_v8)
- XOR_avx2(ymm_v15, ymm_v12)
- VMOVDQA(mem_tmp0, ymm_tmp0) # Save
- ROTW8_avx2(ymm_tmp0, ymm_v3)
- ROTW8_avx2(ymm_tmp0, ymm_v7)
- ROTW8_avx2(ymm_tmp0, ymm_v11)
- ROTW8_avx2(ymm_tmp0, ymm_v15)
- # c += d; b ^= c; b = ROTW7(b)
- ADD_avx2(ymm_v2, ymm_v3)
- ADD_avx2(ymm_v6, ymm_v7)
- ADD_avx2(ymm_v10, ymm_v11)
- ADD_avx2(ymm_v14, ymm_v15)
- XOR_avx2(ymm_v1, ymm_v2)
- XOR_avx2(ymm_v5, ymm_v6)
- XOR_avx2(ymm_v9, ymm_v10)
- XOR_avx2(ymm_v13, ymm_v14)
- ROTW7_avx2(ymm_tmp0, ymm_v1)
- ROTW7_avx2(ymm_tmp0, ymm_v5)
- ROTW7_avx2(ymm_tmp0, ymm_v9)
- ROTW7_avx2(ymm_tmp0, ymm_v13)
- # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
- VPSHUFD(ymm_v1, ymm_v1, 0x39)
- VPSHUFD(ymm_v5, ymm_v5, 0x39)
- VPSHUFD(ymm_v9, ymm_v9, 0x39)
- VPSHUFD(ymm_v13, ymm_v13, 0x39)
- VPSHUFD(ymm_v2, ymm_v2, 0x4e)
- VPSHUFD(ymm_v6, ymm_v6, 0x4e)
- VPSHUFD(ymm_v10, ymm_v10, 0x4e)
- VPSHUFD(ymm_v14, ymm_v14, 0x4e)
- VPSHUFD(ymm_v3, ymm_v3, 0x93)
- VPSHUFD(ymm_v7, ymm_v7, 0x93)
- VPSHUFD(ymm_v11, ymm_v11, 0x93)
- VPSHUFD(ymm_v15, ymm_v15, 0x93)
- # a += b; d ^= a; d = ROTW16(d);
- VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
- ADD_avx2(ymm_v0, ymm_v1)
- ADD_avx2(ymm_v4, ymm_v5)
- ADD_avx2(ymm_v8, ymm_v9)
- ADD_avx2(ymm_v12, ymm_v13)
- XOR_avx2(ymm_v3, ymm_v0)
- XOR_avx2(ymm_v7, ymm_v4)
- XOR_avx2(ymm_v11, ymm_v8)
- XOR_avx2(ymm_v15, ymm_v12)
- VMOVDQA(mem_tmp0, ymm_tmp0) # Save
- ROTW16_avx2(ymm_tmp0, ymm_v3)
- ROTW16_avx2(ymm_tmp0, ymm_v7)
- ROTW16_avx2(ymm_tmp0, ymm_v11)
- ROTW16_avx2(ymm_tmp0, ymm_v15)
- # c += d; b ^= c; b = ROTW12(b);
- ADD_avx2(ymm_v2, ymm_v3)
- ADD_avx2(ymm_v6, ymm_v7)
- ADD_avx2(ymm_v10, ymm_v11)
- ADD_avx2(ymm_v14, ymm_v15)
- XOR_avx2(ymm_v1, ymm_v2)
- XOR_avx2(ymm_v5, ymm_v6)
- XOR_avx2(ymm_v9, ymm_v10)
- XOR_avx2(ymm_v13, ymm_v14)
- ROTW12_avx2(ymm_tmp0, ymm_v1)
- ROTW12_avx2(ymm_tmp0, ymm_v5)
- ROTW12_avx2(ymm_tmp0, ymm_v9)
- ROTW12_avx2(ymm_tmp0, ymm_v13)
- # a += b; d ^= a; d = ROTW8(d);
- VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
- ADD_avx2(ymm_v0, ymm_v1)
- ADD_avx2(ymm_v4, ymm_v5)
- ADD_avx2(ymm_v8, ymm_v9)
- ADD_avx2(ymm_v12, ymm_v13)
- XOR_avx2(ymm_v3, ymm_v0)
- XOR_avx2(ymm_v7, ymm_v4)
- XOR_avx2(ymm_v11, ymm_v8)
- XOR_avx2(ymm_v15, ymm_v12)
- VMOVDQA(mem_tmp0, ymm_tmp0) # Save
- ROTW8_avx2(ymm_tmp0, ymm_v3)
- ROTW8_avx2(ymm_tmp0, ymm_v7)
- ROTW8_avx2(ymm_tmp0, ymm_v11)
- ROTW8_avx2(ymm_tmp0, ymm_v15)
- # c += d; b ^= c; b = ROTW7(b)
- ADD_avx2(ymm_v2, ymm_v3)
- ADD_avx2(ymm_v6, ymm_v7)
- ADD_avx2(ymm_v10, ymm_v11)
- ADD_avx2(ymm_v14, ymm_v15)
- XOR_avx2(ymm_v1, ymm_v2)
- XOR_avx2(ymm_v5, ymm_v6)
- XOR_avx2(ymm_v9, ymm_v10)
- XOR_avx2(ymm_v13, ymm_v14)
- ROTW7_avx2(ymm_tmp0, ymm_v1)
- ROTW7_avx2(ymm_tmp0, ymm_v5)
- ROTW7_avx2(ymm_tmp0, ymm_v9)
- ROTW7_avx2(ymm_tmp0, ymm_v13)
- # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
- VPSHUFD(ymm_v1, ymm_v1, 0x93)
- VPSHUFD(ymm_v5, ymm_v5, 0x93)
- VPSHUFD(ymm_v9, ymm_v9, 0x93)
- VPSHUFD(ymm_v13, ymm_v13, 0x93)
- VPSHUFD(ymm_v2, ymm_v2, 0x4e)
- VPSHUFD(ymm_v6, ymm_v6, 0x4e)
- VPSHUFD(ymm_v10, ymm_v10, 0x4e)
- VPSHUFD(ymm_v14, ymm_v14, 0x4e)
- VPSHUFD(ymm_v3, ymm_v3, 0x39)
- VPSHUFD(ymm_v7, ymm_v7, 0x39)
- VPSHUFD(ymm_v11, ymm_v11, 0x39)
- VPSHUFD(ymm_v15, ymm_v15, 0x39)
- VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
- SUB(reg_rounds, 2)
- JNZ(rounds_loop8.begin)
- # ymm_v12 is in mem_tmp0 and is current....
- # XXX: I assume VBROADCASTI128 is about as fast as VMOVDQA....
- VBROADCASTI128(ymm_tmp0, x_s0)
- ADD_avx2(ymm_v0, ymm_tmp0)
- ADD_avx2(ymm_v4, ymm_tmp0)
- ADD_avx2(ymm_v8, ymm_tmp0)
- ADD_avx2(ymm_tmp0, mem_tmp0)
- VMOVDQA(mem_tmp0, ymm_tmp0)
- VBROADCASTI128(ymm_tmp0, x_s1)
- ADD_avx2(ymm_v1, ymm_tmp0)
- ADD_avx2(ymm_v5, ymm_tmp0)
- ADD_avx2(ymm_v9, ymm_tmp0)
- ADD_avx2(ymm_v13, ymm_tmp0)
- VBROADCASTI128(ymm_tmp0, x_s2)
- ADD_avx2(ymm_v2, ymm_tmp0)
- ADD_avx2(ymm_v6, ymm_tmp0)
- ADD_avx2(ymm_v10, ymm_tmp0)
- ADD_avx2(ymm_v14, ymm_tmp0)
- ADD_avx2(ymm_v3, mem_s3)
- WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3)
- VMOVDQA(ymm_v3, mem_s3)
- ADD_avx2(ymm_v3, mem_inc)
- ADD_avx2(ymm_v7, ymm_v3)
- WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7)
- ADD_avx2(ymm_v3, mem_inc)
- ADD_avx2(ymm_v11, ymm_v3)
- WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 256, ymm_v8, ymm_v9, ymm_v10, ymm_v11)
- ADD_avx2(ymm_v3, mem_inc)
- VMOVDQA(ymm_v12, mem_tmp0)
- ADD_avx2(ymm_v15, ymm_v3)
- WriteXor_avx2(ymm_v0, reg_inp, reg_outp, 384, ymm_v12, ymm_v13, ymm_v14, ymm_v15)
- ADD_avx2(ymm_v3, mem_inc)
- VMOVDQA(mem_s3, ymm_v3)
- ADD(reg_inp, 8 * 64)
- ADD(reg_outp, 8 * 64)
- SUB(reg_blocks, 8)
- JAE(vector_loop8.begin)
- # ymm_v3 contains a current copy of mem_s3 either from when it was built,
- # or because the loop updates it. Copy this before we mess with the block
- # counter in case we need to write it back and return.
- ymm_s3 = ymm_v11
- VMOVDQA(ymm_s3, ymm_v3)
- ADD(reg_blocks, 8)
- JZ(out_write_even)
- # We now actually can do everything in registers.
- ymm_s0 = ymm_v8
- VBROADCASTI128(ymm_s0, x_s0)
- ymm_s1 = ymm_v9
- VBROADCASTI128(ymm_s1, x_s1)
- ymm_s2 = ymm_v10
- VBROADCASTI128(ymm_s2, x_s2)
- ymm_inc = ymm_v14
- VMOVDQA(ymm_inc, mem_inc)
- #
- # 4 blocks at a time.
- #
- process_2_blocks = Label()
- SUB(reg_blocks, 4)
- JB(process_2_blocks) # < 4 blocks remaining.
- VMOVDQA(ymm_v0, ymm_s0)
- VMOVDQA(ymm_v1, ymm_s1)
- VMOVDQA(ymm_v2, ymm_s2)
- VMOVDQA(ymm_v3, ymm_s3)
- VMOVDQA(ymm_v4, ymm_v0)
- VMOVDQA(ymm_v5, ymm_v1)
- VMOVDQA(ymm_v6, ymm_v2)
- VPADDQ(ymm_v7, ymm_v3, ymm_inc)
- MOV(reg_rounds, 20)
- rounds_loop4 = Loop()
- with rounds_loop4:
- # a += b; d ^= a; d = ROTW16(d);
- ADD_avx2(ymm_v0, ymm_v1)
- ADD_avx2(ymm_v4, ymm_v5)
- XOR_avx2(ymm_v3, ymm_v0)
- XOR_avx2(ymm_v7, ymm_v4)
- ROTW16_avx2(ymm_tmp0, ymm_v3)
- ROTW16_avx2(ymm_tmp0, ymm_v7)
- # c += d; b ^= c; b = ROTW12(b);
- ADD_avx2(ymm_v2, ymm_v3)
- ADD_avx2(ymm_v6, ymm_v7)
- XOR_avx2(ymm_v1, ymm_v2)
- XOR_avx2(ymm_v5, ymm_v6)
- ROTW12_avx2(ymm_tmp0, ymm_v1)
- ROTW12_avx2(ymm_tmp0, ymm_v5)
- # a += b; d ^= a; d = ROTW8(d);
- ADD_avx2(ymm_v0, ymm_v1)
- ADD_avx2(ymm_v4, ymm_v5)
- XOR_avx2(ymm_v3, ymm_v0)
- XOR_avx2(ymm_v7, ymm_v4)
- ROTW8_avx2(ymm_tmp0, ymm_v3)
- ROTW8_avx2(ymm_tmp0, ymm_v7)
- # c += d; b ^= c; b = ROTW7(b)
- ADD_avx2(ymm_v2, ymm_v3)
- ADD_avx2(ymm_v6, ymm_v7)
- XOR_avx2(ymm_v1, ymm_v2)
- XOR_avx2(ymm_v5, ymm_v6)
- ROTW7_avx2(ymm_tmp0, ymm_v1)
- ROTW7_avx2(ymm_tmp0, ymm_v5)
- # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
- VPSHUFD(ymm_v1, ymm_v1, 0x39)
- VPSHUFD(ymm_v5, ymm_v5, 0x39)
- VPSHUFD(ymm_v2, ymm_v2, 0x4e)
- VPSHUFD(ymm_v6, ymm_v6, 0x4e)
- VPSHUFD(ymm_v3, ymm_v3, 0x93)
- VPSHUFD(ymm_v7, ymm_v7, 0x93)
- # a += b; d ^= a; d = ROTW16(d);
- ADD_avx2(ymm_v0, ymm_v1)
- ADD_avx2(ymm_v4, ymm_v5)
- XOR_avx2(ymm_v3, ymm_v0)
- XOR_avx2(ymm_v7, ymm_v4)
- ROTW16_avx2(ymm_tmp0, ymm_v3)
- ROTW16_avx2(ymm_tmp0, ymm_v7)
- # c += d; b ^= c; b = ROTW12(b);
- ADD_avx2(ymm_v2, ymm_v3)
- ADD_avx2(ymm_v6, ymm_v7)
- XOR_avx2(ymm_v1, ymm_v2)
- XOR_avx2(ymm_v5, ymm_v6)
- ROTW12_avx2(ymm_tmp0, ymm_v1)
- ROTW12_avx2(ymm_tmp0, ymm_v5)
- # a += b; d ^= a; d = ROTW8(d);
- ADD_avx2(ymm_v0, ymm_v1)
- ADD_avx2(ymm_v4, ymm_v5)
- XOR_avx2(ymm_v3, ymm_v0)
- XOR_avx2(ymm_v7, ymm_v4)
- ROTW8_avx2(ymm_tmp0, ymm_v3)
- ROTW8_avx2(ymm_tmp0, ymm_v7)
- # c += d; b ^= c; b = ROTW7(b)
- ADD_avx2(ymm_v2, ymm_v3)
- ADD_avx2(ymm_v6, ymm_v7)
- XOR_avx2(ymm_v1, ymm_v2)
- XOR_avx2(ymm_v5, ymm_v6)
- ROTW7_avx2(ymm_tmp0, ymm_v1)
- ROTW7_avx2(ymm_tmp0, ymm_v5)
- # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
- VPSHUFD(ymm_v1, ymm_v1, 0x93)
- VPSHUFD(ymm_v5, ymm_v5, 0x93)
- VPSHUFD(ymm_v2, ymm_v2, 0x4e)
- VPSHUFD(ymm_v6, ymm_v6, 0x4e)
- VPSHUFD(ymm_v3, ymm_v3, 0x39)
- VPSHUFD(ymm_v7, ymm_v7, 0x39)
- SUB(reg_rounds, 2)
- JNZ(rounds_loop4.begin)
- ADD_avx2(ymm_v0, ymm_s0)
- ADD_avx2(ymm_v1, ymm_s1)
- ADD_avx2(ymm_v2, ymm_s2)
- ADD_avx2(ymm_v3, ymm_s3)
- WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3)
- ADD_avx2(ymm_s3, ymm_inc)
- ADD_avx2(ymm_v4, ymm_s0)
- ADD_avx2(ymm_v5, ymm_s1)
- ADD_avx2(ymm_v6, ymm_s2)
- ADD_avx2(ymm_v7, ymm_s3)
- WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7)
- ADD_avx2(ymm_s3, ymm_inc)
- ADD(reg_inp, 4 * 64)
- ADD(reg_outp, 4 * 64)
- SUB(reg_blocks, 4)
- LABEL(process_2_blocks)
- ADD(reg_blocks, 4)
- JZ(out_write_even) # 0 blocks left.
- #
- # 2/1 blocks at a time. The two codepaths are unified because
- # with AVX2 we do 2 blocks at a time anyway, and this only gets called
- # if 3/2/1 blocks are remaining, so the extra branches don't hurt that
- # much.
- #
- vector_loop2 = Loop()
- with vector_loop2:
- VMOVDQA(ymm_v0, ymm_s0)
- VMOVDQA(ymm_v1, ymm_s1)
- VMOVDQA(ymm_v2, ymm_s2)
- VMOVDQA(ymm_v3, ymm_s3)
- MOV(reg_rounds, 20)
- rounds_loop2 = Loop()
- with rounds_loop2:
- # a += b; d ^= a; d = ROTW16(d);
- ADD_avx2(ymm_v0, ymm_v1)
- XOR_avx2(ymm_v3, ymm_v0)
- ROTW16_avx2(ymm_tmp0, ymm_v3)
- # c += d; b ^= c; b = ROTW12(b);
- ADD_avx2(ymm_v2, ymm_v3)
- XOR_avx2(ymm_v1, ymm_v2)
- ROTW12_avx2(ymm_tmp0, ymm_v1)
- # a += b; d ^= a; d = ROTW8(d);
- ADD_avx2(ymm_v0, ymm_v1)
- XOR_avx2(ymm_v3, ymm_v0)
- ROTW8_avx2(ymm_tmp0, ymm_v3)
- # c += d; b ^= c; b = ROTW7(b)
- ADD_avx2(ymm_v2, ymm_v3)
- XOR_avx2(ymm_v1, ymm_v2)
- ROTW7_avx2(ymm_tmp0, ymm_v1)
- # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
- VPSHUFD(ymm_v1, ymm_v1, 0x39)
- VPSHUFD(ymm_v2, ymm_v2, 0x4e)
- VPSHUFD(ymm_v3, ymm_v3, 0x93)
- # a += b; d ^= a; d = ROTW16(d);
- ADD_avx2(ymm_v0, ymm_v1)
- XOR_avx2(ymm_v3, ymm_v0)
- ROTW16_avx2(ymm_tmp0, ymm_v3)
- # c += d; b ^= c; b = ROTW12(b);
- ADD_avx2(ymm_v2, ymm_v3)
- XOR_avx2(ymm_v1, ymm_v2)
- ROTW12_avx2(ymm_tmp0, ymm_v1)
- # a += b; d ^= a; d = ROTW8(d);
- ADD_avx2(ymm_v0, ymm_v1)
- XOR_avx2(ymm_v3, ymm_v0)
- ROTW8_avx2(ymm_tmp0, ymm_v3)
- # c += d; b ^= c; b = ROTW7(b)
- ADD_avx2(ymm_v2, ymm_v3)
- XOR_avx2(ymm_v1, ymm_v2)
- ROTW7_avx2(ymm_tmp0, ymm_v1)
- # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
- VPSHUFD(ymm_v1, ymm_v1, 0x93)
- VPSHUFD(ymm_v2, ymm_v2, 0x4e)
- VPSHUFD(ymm_v3, ymm_v3, 0x39)
- SUB(reg_rounds, 2)
- JNZ(rounds_loop2.begin)
- ADD_avx2(ymm_v0, ymm_s0)
- ADD_avx2(ymm_v1, ymm_s1)
- ADD_avx2(ymm_v2, ymm_s2)
- ADD_avx2(ymm_v3, ymm_s3)
- # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20));
- VPERM2I128(ymm_tmp0, ymm_v0, ymm_v1, 0x20)
- VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp])
- VMOVDQU([reg_outp], ymm_tmp0)
- # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20));
- VPERM2I128(ymm_tmp0, ymm_v2, ymm_v3, 0x20)
- VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+32])
- VMOVDQU([reg_outp+32], ymm_tmp0)
- SUB(reg_blocks, 1)
- JZ(out_write_odd)
- ADD_avx2(ymm_s3, ymm_inc)
- # XOR_WRITE(out+64, in+64, _mm256_permute2x128_si256(v0,v1,0x31));
- VPERM2I128(ymm_tmp0, ymm_v0, ymm_v1, 0x31)
- VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+64])
- VMOVDQU([reg_outp+64], ymm_tmp0)
- # XOR_WRITE(out+96, in+96, _mm256_permute2x128_si256(v2,v3,0x31));
- VPERM2I128(ymm_tmp0, ymm_v2, ymm_v3, 0x31)
- VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+96])
- VMOVDQU([reg_outp+96], ymm_tmp0)
- SUB(reg_blocks, 1)
- JZ(out_write_even)
- ADD(reg_inp, 2 * 64)
- ADD(reg_outp, 2 * 64)
- JMP(vector_loop2.begin)
- LABEL(out_write_odd)
- VPERM2I128(ymm_s3, ymm_s3, ymm_s3, 0x01) # Odd number of blocks.
- LABEL(out_write_even)
- VMOVDQU(x_s3, ymm_s3.as_xmm) # Write back the low lane of ymm_s3 to x_s3
- # Paranoia, cleanse the scratch space.
- VPXOR(ymm_v0, ymm_v0, ymm_v0)
- VMOVDQA(mem_tmp0, ymm_v0)
- VMOVDQA(mem_s3, ymm_v0)
- # Clear all YMM (and XMM) registers.
- VZEROALL()
- # Remove our stack allocation.
- MOV(registers.rsp, reg_sp_save)
- RETURN()
- #
- # CPUID
- #
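- # cpuidParams points at a uint32[4]; on entry [0] holds the leaf (EAX) and
- # [2] the subleaf (ECX), and on return [0..3] hold EAX/EBX/ECX/EDX.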
- cpuidParams = Argument(ptr(uint32_t))
- with Function("cpuidAmd64", (cpuidParams,)):
- reg_params = registers.r15
- LOAD.ARGUMENT(reg_params, cpuidParams)
- MOV(registers.eax, [reg_params])
- MOV(registers.ecx, [reg_params+8])
- CPUID()
- MOV([reg_params], registers.eax)
- MOV([reg_params+4], registers.ebx)
- MOV([reg_params+8], registers.ecx)
- MOV([reg_params+12], registers.edx)
- RETURN()
- #
- # XGETBV (ECX = 0)
- #
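- # xcrVec points at a uint32[2] receiving XCR0 (EAX in [0], EDX in [1]).
- # The Go caller presumably checks that XCR0 bits 1 and 2 (SSE and AVX
- # state) are both set before taking the AVX2 path.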
- xcrVec = Argument(ptr(uint32_t))
- with Function("xgetbv0Amd64", (xcrVec,)):
- reg_vec = GeneralPurposeRegister64()
- LOAD.ARGUMENT(reg_vec, xcrVec)
- XOR(registers.ecx, registers.ecx)
- XGETBV()
- MOV([reg_vec], registers.eax)
- MOV([reg_vec+4], registers.edx)
- RETURN()