# salsa20_word_p4 version 20050425
# D. J. Bernstein
# Public domain.
#
# Purpose: read sixteen 32-bit words from the buffer `in`, mix them for
# 20 rounds of add / rotate / xor steps, add the original words back in,
# and write sixteen 32-bit words to the buffer `out`.
# By its name and its 7/9/13/18 rotation amounts this appears to be the
# Salsa20 core; the comments below describe only what the statements show.
#
# Inputs:  out_stack = address of the 64-byte output buffer
#          in_stack  = address of the 64-byte input buffer
# Output:  bytes 0..63 of the output buffer are overwritten.
# The five registers declared `caller` are saved on entry and restored
# before `leave`.
#
# NOTE(review): statement order matters here. Copies needed by the next
# step are issued in the middle of the current one, and the loop branch
# consumes flags produced several statements earlier. Do not reorder.

# Incoming arguments.
stack32 out_stack
stack32 in_stack

# Registers that must be preserved for the caller, and their save slots.
int32 eax
int32 ebx
int32 esi
int32 edi
int32 ebp
stack32 eax_stack
stack32 ebx_stack
stack32 esi_stack
stack32 edi_stack
stack32 ebp_stack

# Working copies of the two buffer addresses.
int32 in
int32 out

# Scalar temporaries, one per input word (inN is read from in + 4*N).
int32 in0
int32 in1
int32 in2
int32 in3
int32 in4
int32 in5
int32 in6
int32 in7
int32 in8
int32 in9
int32 in10
int32 in11
int32 in12
int32 in13
int32 in14
int32 in15

# Scalar temporaries, one per output word (outN is written to out + 4*N).
int32 out0
int32 out1
int32 out2
int32 out3
int32 out4
int32 out5
int32 out6
int32 out7
int32 out8
int32 out9
int32 out10
int32 out11
int32 out12
int32 out13
int32 out14
int32 out15

# Round counter: starts at 20 and drops by 4 per loop pass.
int32 i

# Saved copies of the four packed input vectors, added back after the loop.
stack128 s0
stack128 s1
stack128 s2
stack128 s3

# The 16-word state, packed as four vectors of four 32-bit words.
int6464 diag0
int6464 diag1
int6464 diag2
int6464 diag3

# situation at beginning of first round:
# diag0: x0 x5 x10 x15
# diag1: x12 x1 x6 x11
# diag2: x8 x13 x2 x7
# diag3: x4 x9 x14 x3

# situation at beginning of second round:
# diag0: x0 x5 x10 x15
# diag1: x1 x6 x11 x12
# diag2: x2 x7 x8 x13
# diag3: x3 x4 x9 x14

# Vector temporaries. In each step aN carries the sum shifted left and bN
# carries the same sum shifted right; xoring both into the target word
# amounts to xoring in the sum rotated left.
int6464 a0
int6464 a1
int6464 a2
int6464 a3
int6464 a4
int6464 a5
int6464 a6
int6464 a7
int6464 b0
int6464 b1
int6464 b2
int6464 b3
int6464 b4
int6464 b5
int6464 b6
int6464 b7

enter salsa20_word_p4
input out_stack
input in_stack
caller eax
caller ebx
caller esi
caller edi
caller ebp

# Fetch the buffer addresses from the argument slots.
in = in_stack
out = out_stack

# Save the caller's registers; they are restored just before `leave`.
eax_stack = eax
ebx_stack = ebx
esi_stack = esi
edi_stack = edi
ebp_stack = ebp

# ---- Pack the 16 input words into diag0..diag3 ----
# Each vector is built one word at a time: load a word, rotate the vector
# by 32 bits, then add the next word in. Four such steps give the layout
# listed under "situation at beginning of first round".
# NOTE(review): this assumes an int32 -> int6464 assignment fills only the
# low 32 bits and clears the rest, so the `+=` acts as an insert; confirm
# against the qhasm machine description.

# First word of each vector: x15, x11, x7, x3.
in15 = *(uint32 *) (in + 60)
diag0 = in15
in11 = *(uint32 *) (in + 44)
diag1 = in11
in7 = *(uint32 *) (in + 28)
diag2 = in7
in3 = *(uint32 *) (in + 12)
diag3 = in3

# Second word of each vector: x10, x6, x2, x14.
in10 = *(uint32 *) (in + 40)
in6 = *(uint32 *) (in + 24)
in2 = *(uint32 *) (in + 8)
in14 = *(uint32 *) (in + 56)
diag0 <<<= 32
diag1 <<<= 32
diag2 <<<= 32
diag3 <<<= 32
a0 = in10
a1 = in6
a2 = in2
a3 = in14
uint32323232 diag0 += a0
uint32323232 diag1 += a1
uint32323232 diag2 += a2
uint32323232 diag3 += a3

# Third word of each vector: x5, x1, x13, x9.
in5 = *(uint32 *) (in + 20)
in1 = *(uint32 *) (in + 4)
in13 = *(uint32 *) (in + 52)
in9 = *(uint32 *) (in + 36)
diag0 <<<= 32
diag1 <<<= 32
diag2 <<<= 32
diag3 <<<= 32
a0 = in5
a1 = in1
a2 = in13
a3 = in9
uint32323232 diag0 += a0
uint32323232 diag1 += a1
uint32323232 diag2 += a2
uint32323232 diag3 += a3

# Fourth word of each vector: x0, x12, x8, x4. Each finished vector is also
# saved to s0..s3 so the input can be added back after the rounds.
in0 = *(uint32 *) (in + 0)
in12 = *(uint32 *) (in + 48)
in8 = *(uint32 *) (in + 32)
in4 = *(uint32 *) (in + 16)
diag0 <<<= 32
diag1 <<<= 32
diag2 <<<= 32
diag3 <<<= 32
a0 = in0
a1 = in12
a2 = in8
a3 = in4
uint32323232 diag0 += a0
s0 = diag0
uint32323232 diag1 += a1
s1 = diag1
uint32323232 diag2 += a2
s2 = diag2
uint32323232 diag3 += a3
s3 = diag3

# Prime the temporaries for the first step of the loop: a0 is a copy of
# diag1 (rotate by 0) and b0 starts at zero.
a0 = diag1 <<< 0
b0 = 0

# 20 rounds in total; each loop pass below performs four of them.
i = 20

# ---- Main loop ----
# Every step has the same shape, applied to all four words of a vector:
#     a  = copy of one diag vector      (issued during the previous step)
#     a += another diag vector
#     b  = 0 ; b += a                   (b becomes a copy of the sum)
#     a <<= r ; b >>= 32 - r
#     target ^= a ; target ^= b         (xor in the sum rotated left by r)
# with r = 7, 9, 13, 18 in turn. The `diagN <<<= 32/64/96` statements
# between steps re-align the vectors for the following round, as described
# by the two "situation at beginning of ... round" comments.
mainloop:

# Round 1 of this pass (first-round layout).
# diag3 ^= (diag1 + diag0) rotated left by 7
uint32323232 a0 += diag0
a1 = diag0 <<< 0
uint32323232 b0 += a0
uint32323232 a0 <<= 7
b1 = 0
uint32323232 b0 >>= 25
diag3 ^= a0
diag3 ^= b0
# diag2 ^= (diag0 + diag3) rotated left by 9
uint32323232 a1 += diag3
a2 = diag3 <<< 0
uint32323232 b1 += a1
uint32323232 a1 <<= 9
b2 = 0
uint32323232 b1 >>= 23
diag2 ^= a1
diag3 <<<= 32
diag2 ^= b1
# diag1 ^= (diag3 + diag2) rotated left by 13 (a2 holds diag3 as it was
# before its realignment)
uint32323232 a2 += diag2
a3 = diag2 <<< 0
uint32323232 b2 += a2
uint32323232 a2 <<= 13
b3 = 0
uint32323232 b2 >>= 19
diag1 ^= a2
diag2 <<<= 64
diag1 ^= b2
# diag0 ^= (diag2 + diag1) rotated left by 18 (a3 holds diag2 as it was
# before its realignment)
uint32323232 a3 += diag1
a4 = diag3 <<< 0
uint32323232 b3 += a3
uint32323232 a3 <<= 18
b4 = 0
uint32323232 b3 >>= 14
diag0 ^= a3
diag1 <<<= 96
diag0 ^= b3

# Round 2 of this pass (second-round layout; diag1 and diag3 swap roles).
# diag1 ^= (diag3 + diag0) rotated left by 7
uint32323232 a4 += diag0
a5 = diag0 <<< 0
uint32323232 b4 += a4
uint32323232 a4 <<= 7
b5 = 0
uint32323232 b4 >>= 25
diag1 ^= a4
diag1 ^= b4
# diag2 ^= (diag0 + diag1) rotated left by 9
uint32323232 a5 += diag1
a6 = diag1 <<< 0
uint32323232 b5 += a5
uint32323232 a5 <<= 9
b6 = 0
uint32323232 b5 >>= 23
diag2 ^= a5
diag1 <<<= 32
diag2 ^= b5
# diag3 ^= (diag1 + diag2) rotated left by 13
uint32323232 a6 += diag2
a7 = diag2 <<< 0
uint32323232 b6 += a6
uint32323232 a6 <<= 13
b7 = 0
uint32323232 b6 >>= 19
diag3 ^= a6
diag2 <<<= 64
diag3 ^= b6
# diag0 ^= (diag2 + diag3) rotated left by 18; a0/b0 are re-primed for the
# next round.
uint32323232 a7 += diag3
a0 = diag1 <<< 0
uint32323232 b7 += a7
uint32323232 a7 <<= 18
b0 = 0
uint32323232 b7 >>= 14
diag0 ^= a7
diag3 <<<= 96
diag0 ^= b7

# Round 3 of this pass: same statements as round 1.
uint32323232 a0 += diag0
a1 = diag0 <<< 0
uint32323232 b0 += a0
uint32323232 a0 <<= 7
b1 = 0
uint32323232 b0 >>= 25
diag3 ^= a0
diag3 ^= b0
uint32323232 a1 += diag3
a2 = diag3 <<< 0
uint32323232 b1 += a1
uint32323232 a1 <<= 9
b2 = 0
uint32323232 b1 >>= 23
diag2 ^= a1
diag3 <<<= 32
diag2 ^= b1
uint32323232 a2 += diag2
a3 = diag2 <<< 0
uint32323232 b2 += a2
uint32323232 a2 <<= 13
b3 = 0
uint32323232 b2 >>= 19
diag1 ^= a2
diag2 <<<= 64
diag1 ^= b2
uint32323232 a3 += diag1
a4 = diag3 <<< 0
uint32323232 b3 += a3
uint32323232 a3 <<= 18
b4 = 0
uint32323232 b3 >>= 14
diag0 ^= a3
diag1 <<<= 96
diag0 ^= b3

# Round 4 of this pass: same statements as round 2, plus the counter update.
uint32323232 a4 += diag0
a5 = diag0 <<< 0
uint32323232 b4 += a4
uint32323232 a4 <<= 7
b5 = 0
uint32323232 b4 >>= 25
diag1 ^= a4
diag1 ^= b4
uint32323232 a5 += diag1
a6 = diag1 <<< 0
uint32323232 b5 += a5
uint32323232 a5 <<= 9
b6 = 0
uint32323232 b5 >>= 23
diag2 ^= a5
diag1 <<<= 32
diag2 ^= b5
uint32323232 a6 += diag2
a7 = diag2 <<< 0
uint32323232 b6 += a6
uint32323232 a6 <<= 13
b7 = 0
uint32323232 b6 >>= 19
diag3 ^= a6
diag2 <<<= 64
diag3 ^= b6
# Four rounds done: decrement the counter here, ahead of the branch.
# NOTE(review): the `goto ... if unsigned>` below tests the result of this
# subtraction, so the vector statements in between must leave the flags
# untouched; confirm against the qhasm machine description before moving
# anything.
i -= 4
uint32323232 a7 += diag3
a0 = diag1 <<< 0
uint32323232 b7 += a7
uint32323232 a7 <<= 18
b0 = 0
uint32323232 b7 >>= 14
diag0 ^= a7
diag3 <<<= 96
diag0 ^= b7
# Repeat while i is still above zero (20 -> 16 -> 12 -> 8 -> 4 -> 0,
# i.e. five passes).
goto mainloop if unsigned>

# ---- Feed-forward: add the saved input vectors to the mixed state ----
uint32323232 diag0 += s0
uint32323232 diag1 += s1
uint32323232 diag2 += s2
uint32323232 diag3 += s3

# ---- Unpack and store the 16 output words ----
# The vectors are back in the first-round layout, so this mirrors the load
# sequence: take one word from each vector, store it at out + 4*index, then
# rotate each vector by 96 bits to expose the next word.
# NOTE(review): assumes an int6464 -> int32 assignment takes the low
# 32 bits; confirm against the qhasm machine description.

# Words x0, x12, x8, x4.
out0 = diag0
out12 = diag1
out8 = diag2
out4 = diag3
diag0 <<<= 96
diag1 <<<= 96
diag2 <<<= 96
diag3 <<<= 96
*(uint32 *) (out + 0) = out0
*(uint32 *) (out + 48) = out12
*(uint32 *) (out + 32) = out8
*(uint32 *) (out + 16) = out4

# Words x5, x1, x13, x9.
out5 = diag0
out1 = diag1
out13 = diag2
out9 = diag3
diag0 <<<= 96
diag1 <<<= 96
diag2 <<<= 96
diag3 <<<= 96
*(uint32 *) (out + 20) = out5
*(uint32 *) (out + 4) = out1
*(uint32 *) (out + 52) = out13
*(uint32 *) (out + 36) = out9

# Words x10, x6, x2, x14.
out10 = diag0
out6 = diag1
out2 = diag2
out14 = diag3
diag0 <<<= 96
diag1 <<<= 96
diag2 <<<= 96
diag3 <<<= 96
*(uint32 *) (out + 40) = out10
*(uint32 *) (out + 24) = out6
*(uint32 *) (out + 8) = out2
*(uint32 *) (out + 56) = out14

# Words x15, x11, x7, x3 (no further rotation needed afterwards).
out15 = diag0
out11 = diag1
out7 = diag2
out3 = diag3
*(uint32 *) (out + 60) = out15
*(uint32 *) (out + 44) = out11
*(uint32 *) (out + 28) = out7
*(uint32 *) (out + 12) = out3

# Restore the caller's registers and return.
eax = eax_stack
ebx = ebx_stack
esi = esi_stack
edi = edi_stack
ebp = ebp_stack
leave