# salsa20_word_p4 version 20050425
# D. J. Bernstein
# Public domain.
#
# Syntax : GNU as, AT&T operand order (op src,dst), `#` comments.
# Target : 32-bit x86 with SSE2.
#
# Arguments (read from the caller's stack, relative to %esp at entry):
#    4(%esp) = out   destination for sixteen 32-bit words
#    8(%esp) = in    source of sixteen 32-bit words
#
# Registers: %ebx, %esi, %edi and %ebp are spilled on entry and reloaded
# on exit.  %ecx, %edx, %xmm0-%xmm7 and EFLAGS are modified.  %eax is
# reloaded from its spill slot, which holds the frame size, so it carries
# no meaningful result.
#
# Frame (after %esp has been lowered to a 32-byte boundary):
#    0(%esp)  frame size (added back to %esp at exit)
#    4(%esp)  saved %ebx        8(%esp)  saved %esi
#   12(%esp)  saved %edi       16(%esp)  saved %ebp
#   32(%esp)  s0   48(%esp)  s1   64(%esp)  s2   80(%esp)  s3
#
# Word layout of the four working vectors (lane 0 first):
#   diag0 = in0  in5  in10 in15      diag1 = in12 in1  in6  in11
#   diag2 = in8  in13 in2  in7       diag3 = in4  in9  in14 in3
#
# NOTE(review): this listing was rebuilt from a damaged copy in which all
# line breaks had been removed and every span between '<' and the next
# '>' had been stripped.  The prologue/epilogue, all loads, movd, pshufd,
# movdqa, the register-zeroing pxor, `mov $20`, the label and `ja` were
# present verbatim.  Every paddd, pslld, psrld, data pxor, every store to
# `out` and the `sub $4` were restored from the surviving qhasm
# annotations and the Salsa20 round structure.  The left-shift counts
# (7, 9, 13, 18), the `sub $4` step and its position are inferred, not
# read from the damaged text.  Verify against known Salsa20 core test
# vectors before relying on this file.

        .text
        .p2align 5
        .globl  _salsa20_word_p4
        .globl  salsa20_word_p4
_salsa20_word_p4:
salsa20_word_p4:
        # eax = (esp & 31) + 96; lowering esp by eax leaves it 32-byte
        # aligned with at least 96 bytes of scratch space.
        mov     %esp,%eax
        and     $31,%eax
        add     $96,%eax
        sub     %eax,%esp

        movl    8(%esp,%eax),%ecx       # in  = in_stack
        movl    4(%esp,%eax),%edx       # out = out_stack

        movl    %eax,0(%esp)            # eax_stack = frame size
        movl    %ebx,4(%esp)            # ebx_stack
        movl    %esi,8(%esp)            # esi_stack
        movl    %edi,12(%esp)           # edi_stack
        movl    %ebp,16(%esp)           # ebp_stack

        # ---- gather the 16 input words into diag0..diag3 -------------
        # Each pass: rotate every vector left by one lane, then add a
        # word into lane 0 (the other lanes of a0..a3 are zero after movd).
        movl    60(%ecx),%eax           # in15
        movd    %eax,%xmm0              # diag0 = in15
        movl    44(%ecx),%eax           # in11
        movd    %eax,%xmm1              # diag1 = in11
        movl    28(%ecx),%eax           # in7
        movd    %eax,%xmm2              # diag2 = in7
        movl    12(%ecx),%eax           # in3
        movd    %eax,%xmm3              # diag3 = in3

        movl    40(%ecx),%esi           # in10
        movl    24(%ecx),%ebx           # in6
        movl    8(%ecx),%eax            # in2
        movl    56(%ecx),%edi           # in14
        pshufd  $0x93,%xmm0,%xmm0       # diag0 <<<= 32
        pshufd  $0x93,%xmm1,%xmm1       # diag1 <<<= 32
        pshufd  $0x93,%xmm2,%xmm2       # diag2 <<<= 32
        pshufd  $0x93,%xmm3,%xmm3       # diag3 <<<= 32
        movd    %esi,%xmm4              # a0 = in10
        movd    %ebx,%xmm5              # a1 = in6
        movd    %eax,%xmm6              # a2 = in2
        movd    %edi,%xmm7              # a3 = in14
        paddd   %xmm4,%xmm0             # diag0 += a0
        paddd   %xmm5,%xmm1             # diag1 += a1
        paddd   %xmm6,%xmm2             # diag2 += a2
        paddd   %xmm7,%xmm3             # diag3 += a3

        movl    20(%ecx),%ebx           # in5
        movl    4(%ecx),%eax            # in1
        movl    52(%ecx),%edi           # in13
        movl    36(%ecx),%esi           # in9
        pshufd  $0x93,%xmm0,%xmm0       # diag0 <<<= 32
        pshufd  $0x93,%xmm1,%xmm1       # diag1 <<<= 32
        pshufd  $0x93,%xmm2,%xmm2       # diag2 <<<= 32
        pshufd  $0x93,%xmm3,%xmm3       # diag3 <<<= 32
        movd    %ebx,%xmm4              # a0 = in5
        movd    %eax,%xmm5              # a1 = in1
        movd    %edi,%xmm6              # a2 = in13
        movd    %esi,%xmm7              # a3 = in9
        paddd   %xmm4,%xmm0             # diag0 += a0
        paddd   %xmm5,%xmm1             # diag1 += a1
        paddd   %xmm6,%xmm2             # diag2 += a2
        paddd   %xmm7,%xmm3             # diag3 += a3

        movl    0(%ecx),%eax            # in0
        movl    48(%ecx),%esi           # in12
        movl    32(%ecx),%ebx           # in8
        movl    16(%ecx),%ecx           # in4 (last use of the in pointer)
        pshufd  $0x93,%xmm0,%xmm0       # diag0 <<<= 32
        pshufd  $0x93,%xmm1,%xmm1       # diag1 <<<= 32
        pshufd  $0x93,%xmm2,%xmm2       # diag2 <<<= 32
        pshufd  $0x93,%xmm3,%xmm3       # diag3 <<<= 32
        movd    %eax,%xmm4              # a0 = in0
        movd    %esi,%xmm5              # a1 = in12
        movd    %ebx,%xmm6              # a2 = in8
        movd    %ecx,%xmm7              # a3 = in4

        # Finish the gather and keep a copy of the input state for the
        # final feed-forward addition.
        paddd   %xmm4,%xmm0             # diag0 += a0
        movdqa  %xmm0,32(%esp)          # s0 = diag0
        paddd   %xmm5,%xmm1             # diag1 += a1
        movdqa  %xmm1,48(%esp)          # s1 = diag1
        paddd   %xmm6,%xmm2             # diag2 += a2
        movdqa  %xmm2,64(%esp)          # s2 = diag2
        paddd   %xmm7,%xmm3             # diag3 += a3
        movdqa  %xmm3,80(%esp)          # s3 = diag3

        # ---- rounds --------------------------------------------------
        # Loop invariant at ._mainloop: a0 (xmm4) is a copy of diag1 and
        # b0 (xmm6) is zero.  A rotate-left by r is built as
        # (a << r) ^ (b >> (32 - r)), where b is a copy of a made by
        # "b = 0; b += a".  pshufd $0xe4 is an identity shuffle used as a
        # register copy.  One trip through the body is four rounds.
        pshufd  $0xe4,%xmm1,%xmm4       # a0 = diag1
        pxor    %xmm6,%xmm6             # b0 = 0
        mov     $20,%eax                # i = 20

._mainloop:
        # -- rounds 1-2 of this trip --
        paddd   %xmm0,%xmm4             # a0 += diag0
        pshufd  $0xe4,%xmm0,%xmm5       # a1 = diag0
        paddd   %xmm4,%xmm6             # b0 += a0
        pslld   $7,%xmm4                # a0 <<= 7
        pxor    %xmm7,%xmm7             # b1 = 0
        psrld   $25,%xmm6               # b0 >>= 25
        pxor    %xmm4,%xmm3             # diag3 ^= a0
        pxor    %xmm6,%xmm3             # diag3 ^= b0
        paddd   %xmm3,%xmm5             # a1 += diag3
        pshufd  $0xe4,%xmm3,%xmm4       # a2 = diag3
        paddd   %xmm5,%xmm7             # b1 += a1
        pslld   $9,%xmm5                # a1 <<= 9
        pxor    %xmm6,%xmm6             # b2 = 0
        psrld   $23,%xmm7               # b1 >>= 23
        pxor    %xmm5,%xmm2             # diag2 ^= a1
        pshufd  $0x93,%xmm3,%xmm3       # diag3 <<<= 32
        pxor    %xmm7,%xmm2             # diag2 ^= b1
        paddd   %xmm2,%xmm4             # a2 += diag2
        pshufd  $0xe4,%xmm2,%xmm5       # a3 = diag2
        paddd   %xmm4,%xmm6             # b2 += a2
        pslld   $13,%xmm4               # a2 <<= 13
        pxor    %xmm7,%xmm7             # b3 = 0
        psrld   $19,%xmm6               # b2 >>= 19
        pxor    %xmm4,%xmm1             # diag1 ^= a2
        pshufd  $0x4e,%xmm2,%xmm2       # diag2 <<<= 64
        pxor    %xmm6,%xmm1             # diag1 ^= b2
        paddd   %xmm1,%xmm5             # a3 += diag1
        pshufd  $0xe4,%xmm3,%xmm4       # a4 = diag3
        paddd   %xmm5,%xmm7             # b3 += a3
        pslld   $18,%xmm5               # a3 <<= 18
        pxor    %xmm6,%xmm6             # b4 = 0
        psrld   $14,%xmm7               # b3 >>= 14
        pxor    %xmm5,%xmm0             # diag0 ^= a3
        pshufd  $0x39,%xmm1,%xmm1       # diag1 <<<= 96
        pxor    %xmm7,%xmm0             # diag0 ^= b3

        paddd   %xmm0,%xmm4             # a4 += diag0
        pshufd  $0xe4,%xmm0,%xmm5       # a5 = diag0
        paddd   %xmm4,%xmm6             # b4 += a4
        pslld   $7,%xmm4                # a4 <<= 7
        pxor    %xmm7,%xmm7             # b5 = 0
        psrld   $25,%xmm6               # b4 >>= 25
        pxor    %xmm4,%xmm1             # diag1 ^= a4
        pxor    %xmm6,%xmm1             # diag1 ^= b4
        paddd   %xmm1,%xmm5             # a5 += diag1
        pshufd  $0xe4,%xmm1,%xmm4       # a6 = diag1
        paddd   %xmm5,%xmm7             # b5 += a5
        pslld   $9,%xmm5                # a5 <<= 9
        pxor    %xmm6,%xmm6             # b6 = 0
        psrld   $23,%xmm7               # b5 >>= 23
        pxor    %xmm5,%xmm2             # diag2 ^= a5
        pshufd  $0x93,%xmm1,%xmm1       # diag1 <<<= 32
        pxor    %xmm7,%xmm2             # diag2 ^= b5
        paddd   %xmm2,%xmm4             # a6 += diag2
        pshufd  $0xe4,%xmm2,%xmm5       # a7 = diag2
        paddd   %xmm4,%xmm6             # b6 += a6
        pslld   $13,%xmm4               # a6 <<= 13
        pxor    %xmm7,%xmm7             # b7 = 0
        psrld   $19,%xmm6               # b6 >>= 19
        pxor    %xmm4,%xmm3             # diag3 ^= a6
        pshufd  $0x4e,%xmm2,%xmm2       # diag2 <<<= 64
        pxor    %xmm6,%xmm3             # diag3 ^= b6
        paddd   %xmm3,%xmm5             # a7 += diag3
        pshufd  $0xe4,%xmm1,%xmm4       # a0 = diag1
        paddd   %xmm5,%xmm7             # b7 += a7
        pslld   $18,%xmm5               # a7 <<= 18
        pxor    %xmm6,%xmm6             # b0 = 0
        psrld   $14,%xmm7               # b7 >>= 14
        pxor    %xmm5,%xmm0             # diag0 ^= a7
        pshufd  $0x39,%xmm3,%xmm3       # diag3 <<<= 96
        pxor    %xmm7,%xmm0             # diag0 ^= b7

        # -- rounds 3-4 of this trip (same sequence again) --
        paddd   %xmm0,%xmm4             # a0 += diag0
        pshufd  $0xe4,%xmm0,%xmm5       # a1 = diag0
        paddd   %xmm4,%xmm6             # b0 += a0
        pslld   $7,%xmm4                # a0 <<= 7
        pxor    %xmm7,%xmm7             # b1 = 0
        psrld   $25,%xmm6               # b0 >>= 25
        pxor    %xmm4,%xmm3             # diag3 ^= a0
        pxor    %xmm6,%xmm3             # diag3 ^= b0
        paddd   %xmm3,%xmm5             # a1 += diag3
        pshufd  $0xe4,%xmm3,%xmm4       # a2 = diag3
        paddd   %xmm5,%xmm7             # b1 += a1
        pslld   $9,%xmm5                # a1 <<= 9
        pxor    %xmm6,%xmm6             # b2 = 0
        psrld   $23,%xmm7               # b1 >>= 23
        pxor    %xmm5,%xmm2             # diag2 ^= a1
        pshufd  $0x93,%xmm3,%xmm3       # diag3 <<<= 32
        pxor    %xmm7,%xmm2             # diag2 ^= b1
        paddd   %xmm2,%xmm4             # a2 += diag2
        pshufd  $0xe4,%xmm2,%xmm5       # a3 = diag2
        paddd   %xmm4,%xmm6             # b2 += a2
        pslld   $13,%xmm4               # a2 <<= 13
        pxor    %xmm7,%xmm7             # b3 = 0
        psrld   $19,%xmm6               # b2 >>= 19
        pxor    %xmm4,%xmm1             # diag1 ^= a2
        pshufd  $0x4e,%xmm2,%xmm2       # diag2 <<<= 64
        pxor    %xmm6,%xmm1             # diag1 ^= b2
        paddd   %xmm1,%xmm5             # a3 += diag1
        pshufd  $0xe4,%xmm3,%xmm4       # a4 = diag3
        paddd   %xmm5,%xmm7             # b3 += a3
        pslld   $18,%xmm5               # a3 <<= 18
        pxor    %xmm6,%xmm6             # b4 = 0
        psrld   $14,%xmm7               # b3 >>= 14
        pxor    %xmm5,%xmm0             # diag0 ^= a3
        pshufd  $0x39,%xmm1,%xmm1       # diag1 <<<= 96
        pxor    %xmm7,%xmm0             # diag0 ^= b3

        paddd   %xmm0,%xmm4             # a4 += diag0
        pshufd  $0xe4,%xmm0,%xmm5       # a5 = diag0
        paddd   %xmm4,%xmm6             # b4 += a4
        pslld   $7,%xmm4                # a4 <<= 7
        pxor    %xmm7,%xmm7             # b5 = 0
        psrld   $25,%xmm6               # b4 >>= 25
        pxor    %xmm4,%xmm1             # diag1 ^= a4
        pxor    %xmm6,%xmm1             # diag1 ^= b4
        paddd   %xmm1,%xmm5             # a5 += diag1
        pshufd  $0xe4,%xmm1,%xmm4       # a6 = diag1
        paddd   %xmm5,%xmm7             # b5 += a5
        pslld   $9,%xmm5                # a5 <<= 9
        pxor    %xmm6,%xmm6             # b6 = 0
        psrld   $23,%xmm7               # b5 >>= 23
        pxor    %xmm5,%xmm2             # diag2 ^= a5
        pshufd  $0x93,%xmm1,%xmm1       # diag1 <<<= 32
        pxor    %xmm7,%xmm2             # diag2 ^= b5
        paddd   %xmm2,%xmm4             # a6 += diag2
        pshufd  $0xe4,%xmm2,%xmm5       # a7 = diag2
        paddd   %xmm4,%xmm6             # b6 += a6
        pslld   $13,%xmm4               # a6 <<= 13
        pxor    %xmm7,%xmm7             # b7 = 0
        psrld   $19,%xmm6               # b6 >>= 19
        pxor    %xmm4,%xmm3             # diag3 ^= a6
        pshufd  $0x4e,%xmm2,%xmm2       # diag2 <<<= 64
        pxor    %xmm6,%xmm3             # diag3 ^= b6
        paddd   %xmm3,%xmm5             # a7 += diag3
        pshufd  $0xe4,%xmm1,%xmm4       # a0 = diag1 (re-establishes the invariant)
        paddd   %xmm5,%xmm7             # b7 += a7
        pslld   $18,%xmm5               # a7 <<= 18
        pxor    %xmm6,%xmm6             # b0 = 0   (re-establishes the invariant)
        psrld   $14,%xmm7               # b7 >>= 14
        pxor    %xmm5,%xmm0             # diag0 ^= a7
        pshufd  $0x39,%xmm3,%xmm3       # diag3 <<<= 96
        pxor    %xmm7,%xmm0             # diag0 ^= b7

        # NOTE(review): the decrement was lost from the damaged text; a
        # step of 4 is inferred from i = 20 and four rounds per trip.
        sub     $4,%eax                 # unsigned>? i -= 4
        ja      ._mainloop              # goto mainloop if unsigned>

        # ---- feed-forward: add the saved input state -----------------
        paddd   32(%esp),%xmm0          # diag0 += s0
        paddd   48(%esp),%xmm1          # diag1 += s1
        paddd   64(%esp),%xmm2          # diag2 += s2
        paddd   80(%esp),%xmm3          # diag3 += s3

        # ---- scatter the 16 result words to out ----------------------
        # Each pass: take lane 0 of every vector, rotate the next lane
        # down into lane 0, then store the four words just extracted.
        movd    %xmm0,%eax              # out0  = diag0
        movd    %xmm1,%esi              # out12 = diag1
        movd    %xmm2,%ebx              # out8  = diag2
        movd    %xmm3,%ecx              # out4  = diag3
        pshufd  $0x39,%xmm0,%xmm0       # diag0 <<<= 96
        pshufd  $0x39,%xmm1,%xmm1       # diag1 <<<= 96
        pshufd  $0x39,%xmm2,%xmm2       # diag2 <<<= 96
        pshufd  $0x39,%xmm3,%xmm3       # diag3 <<<= 96
        movl    %eax,0(%edx)            # *(uint32 *) (out + 0)  = out0
        movl    %esi,48(%edx)           # *(uint32 *) (out + 48) = out12
        movl    %ebx,32(%edx)           # *(uint32 *) (out + 32) = out8
        movl    %ecx,16(%edx)           # *(uint32 *) (out + 16) = out4

        movd    %xmm0,%ecx              # out5  = diag0
        movd    %xmm1,%eax              # out1  = diag1
        movd    %xmm2,%esi              # out13 = diag2
        movd    %xmm3,%ebx              # out9  = diag3
        pshufd  $0x39,%xmm0,%xmm0       # diag0 <<<= 96
        pshufd  $0x39,%xmm1,%xmm1       # diag1 <<<= 96
        pshufd  $0x39,%xmm2,%xmm2       # diag2 <<<= 96
        pshufd  $0x39,%xmm3,%xmm3       # diag3 <<<= 96
        movl    %ecx,20(%edx)           # *(uint32 *) (out + 20) = out5
        movl    %eax,4(%edx)            # *(uint32 *) (out + 4)  = out1
        movl    %esi,52(%edx)           # *(uint32 *) (out + 52) = out13
        movl    %ebx,36(%edx)           # *(uint32 *) (out + 36) = out9

        movd    %xmm0,%ebx              # out10 = diag0
        movd    %xmm1,%ecx              # out6  = diag1
        movd    %xmm2,%eax              # out2  = diag2
        movd    %xmm3,%esi              # out14 = diag3
        pshufd  $0x39,%xmm0,%xmm0       # diag0 <<<= 96
        pshufd  $0x39,%xmm1,%xmm1       # diag1 <<<= 96
        pshufd  $0x39,%xmm2,%xmm2       # diag2 <<<= 96
        pshufd  $0x39,%xmm3,%xmm3       # diag3 <<<= 96
        movl    %ebx,40(%edx)           # *(uint32 *) (out + 40) = out10
        movl    %ecx,24(%edx)           # *(uint32 *) (out + 24) = out6
        movl    %eax,8(%edx)            # *(uint32 *) (out + 8)  = out2
        movl    %esi,56(%edx)           # *(uint32 *) (out + 56) = out14

        movd    %xmm0,%esi              # out15 = diag0
        movd    %xmm1,%ebx              # out11 = diag1
        movd    %xmm2,%ecx              # out7  = diag2
        movd    %xmm3,%eax              # out3  = diag3
        movl    %esi,60(%edx)           # *(uint32 *) (out + 60) = out15
        movl    %ebx,44(%edx)           # *(uint32 *) (out + 44) = out11
        movl    %ecx,28(%edx)           # *(uint32 *) (out + 28) = out7
        movl    %eax,12(%edx)           # *(uint32 *) (out + 12) = out3

        # ---- restore registers and release the frame -----------------
        movl    0(%esp),%eax            # eax = eax_stack (frame size)
        movl    4(%esp),%ebx            # ebx = ebx_stack
        movl    8(%esp),%esi            # esi = esi_stack
        movl    12(%esp),%edi           # edi = edi_stack
        movl    16(%esp),%ebp           # ebp = ebp_stack
        add     %eax,%esp               # leave
        ret