Changeset 19087
- Timestamp:
- 04/23/12 04:19:49 (14 months ago)
- Location:
- src/router/openssl/crypto/sha
- Files:
-
- 2 added
- 4 edited
-
Makefile (modified) (1 diff)
-
asm/sha1-armv4-large.pl (modified) (5 diffs)
-
asm/sha1-mips.pl (added)
-
asm/sha256-armv4.pl (modified) (7 diffs)
-
asm/sha512-armv4.pl (modified) (16 diffs)
-
asm/sha512-mips.pl (added)
Legend:
- Unmodified
- Added
- Removed
-
src/router/openssl/crypto/sha/Makefile
r14063 r19087 65 65 sha1-armv4-large.s: asm/sha1-armv4-large.pl 66 66 $(PERL) $< $@ 67 68 sha256-mips.s: asm/sha512-mips.pl 69 $(PERL) $< > $@ 70 71 sha512-mips.s: asm/sha512-mips.pl 72 $(PERL) $< > $@ 73 74 sha1-mips.s: asm/sha1-mips.pl 75 $(PERL) $< > $@ 76 67 77 68 78 sha1-ppc.s: asm/sha1-ppc.pl; $(PERL) asm/sha1-ppc.pl linux32 $@ -
src/router/openssl/crypto/sha/asm/sha1-armv4-large.pl
r14159 r19087 38 38 # the same job in Thumb, therefore the code is never twice as 39 39 # small and always slower. 40 # [***] which is also ~35% better than compiler generated code. 41 42 $output=shift; 40 # [***] which is also ~35% better than compiler generated code. Dual- 41 # issue Cortex A8 core was measured to process input block in 42 # ~990 cycles. 43 44 # August 2010. 45 # 46 # Rescheduling for dual-issue pipeline resulted in 13% improvement on 47 # Cortex A8 core and in absolute terms ~870 cycles per input block 48 # [or 13.6 cycles per byte]. 49 50 # February 2011. 51 # 52 # Profiler-assisted and platform-specific optimization resulted in 10% 53 # improvement on Cortex A8 core and 12.2 cycles per byte. 54 55 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} 43 56 open STDOUT,">$output"; 44 57 … … 59 72 @V=($a,$b,$c,$d,$e); 60 73 61 # One can optimize this for aligned access on big-endian architecture,62 # but code's endian neutrality makes it too pretty:-)63 sub Xload {64 my ($a,$b,$c,$d,$e)=@_;65 $code.=<<___;66 ldrb $t0,[$inp],#467 ldrb $t1,[$inp,#-3]68 ldrb $t2,[$inp,#-2]69 ldrb $t3,[$inp,#-1]70 add $e,$K,$e,ror#2 @ E+=K_00_1971 orr $t0,$t1,$t0,lsl#872 add $e,$e,$a,ror#27 @ E+=ROR(A,27)73 orr $t0,$t2,$t0,lsl#874 eor $t1,$c,$d @ F_xx_xx75 orr $t0,$t3,$t0,lsl#876 add $e,$e,$t0 @ E+=X[i]77 str $t0,[$Xi,#-4]!78 ___79 }80 74 sub Xupdate { 81 my ($a,$b,$c,$d,$e,$ flag)=@_;75 my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_; 82 76 $code.=<<___; 83 77 ldr $t0,[$Xi,#15*4] 84 78 ldr $t1,[$Xi,#13*4] 85 79 ldr $t2,[$Xi,#7*4] 80 add $e,$K,$e,ror#2 @ E+=K_xx_xx 86 81 ldr $t3,[$Xi,#2*4] 87 add $e,$K,$e,ror#2 @ E+=K_xx_xx88 82 eor $t0,$t0,$t1 89 eor $t0,$t0,$t2 90 eor $t0,$t0,$t3 83 eor $t2,$t2,$t3 @ 1 cycle stall 84 eor $t1,$c,$d @ F_xx_xx 85 mov $t0,$t0,ror#31 91 86 add $e,$e,$a,ror#27 @ E+=ROR(A,27) 92 ___ 93 $code.=<<___ if (!defined($flag)); 94 eor $t1,$c,$d @ F_xx_xx, but not in 40_59 95 ___ 96 $code.=<<___; 97 mov $t0,$t0,ror#31 87 eor $t0,$t0,$t2,ror#31 88 str 
$t0,[$Xi,#-4]! 89 $opt1 @ F_xx_xx 90 $opt2 @ F_xx_xx 98 91 add $e,$e,$t0 @ E+=X[i] 92 ___ 93 } 94 95 sub BODY_00_15 { 96 my ($a,$b,$c,$d,$e)=@_; 97 $code.=<<___; 98 #if __ARM_ARCH__<7 99 ldrb $t1,[$inp,#2] 100 ldrb $t0,[$inp,#3] 101 ldrb $t2,[$inp,#1] 102 add $e,$K,$e,ror#2 @ E+=K_00_19 103 ldrb $t3,[$inp],#4 104 orr $t0,$t0,$t1,lsl#8 105 eor $t1,$c,$d @ F_xx_xx 106 orr $t0,$t0,$t2,lsl#16 107 add $e,$e,$a,ror#27 @ E+=ROR(A,27) 108 orr $t0,$t0,$t3,lsl#24 109 #else 110 ldr $t0,[$inp],#4 @ handles unaligned 111 add $e,$K,$e,ror#2 @ E+=K_00_19 112 eor $t1,$c,$d @ F_xx_xx 113 add $e,$e,$a,ror#27 @ E+=ROR(A,27) 114 #ifdef __ARMEL__ 115 rev $t0,$t0 @ byte swap 116 #endif 117 #endif 118 and $t1,$b,$t1,ror#2 119 add $e,$e,$t0 @ E+=X[i] 120 eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) 99 121 str $t0,[$Xi,#-4]! 100 ___ 101 } 102 103 sub BODY_00_15 { 104 my ($a,$b,$c,$d,$e)=@_; 105 &Xload(@_);106 $code.=<<___;107 and $t1,$b,$t1,ror#2 122 add $e,$e,$t1 @ E+=F_00_19(B,C,D) 123 ___ 124 } 125 126 sub BODY_16_19 { 127 my ($a,$b,$c,$d,$e)=@_; 128 &Xupdate(@_,"and $t1,$b,$t1,ror#2"); 129 $code.=<<___; 108 130 eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) 109 131 add $e,$e,$t1 @ E+=F_00_19(B,C,D) … … 111 133 } 112 134 113 sub BODY_16_19 {114 my ($a,$b,$c,$d,$e)=@_;115 &Xupdate(@_);116 $code.=<<___;117 and $t1,$b,$t1,ror#2118 eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)119 add $e,$e,$t1 @ E+=F_00_19(B,C,D)120 ___121 }122 123 135 sub BODY_20_39 { 124 136 my ($a,$b,$c,$d,$e)=@_; 125 &Xupdate(@_); 126 $code.=<<___; 127 eor $t1,$b,$t1,ror#2 @ F_20_39(B,C,D) 137 &Xupdate(@_,"eor $t1,$b,$t1,ror#2"); 138 $code.=<<___; 128 139 add $e,$e,$t1 @ E+=F_20_39(B,C,D) 129 140 ___ … … 132 143 sub BODY_40_59 { 133 144 my ($a,$b,$c,$d,$e)=@_; 134 &Xupdate(@_,1); 135 $code.=<<___; 136 and $t1,$b,$c,ror#2 137 orr $t2,$b,$c,ror#2 138 and $t2,$t2,$d,ror#2 139 orr $t1,$t1,$t2 @ F_40_59(B,C,D) 145 &Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d"); 146 $code.=<<___; 140 147 add $e,$e,$t1 @ E+=F_40_59(B,C,D) 148 add $e,$e,$t2,ror#2 
141 149 ___ 142 150 } 143 151 144 152 $code=<<___; 153 #include "arm_arch.h" 154 145 155 .text 146 156 … … 216 226 bne .Lloop @ [+18], total 1307 217 227 228 #if __ARM_ARCH__>=5 229 ldmia sp!,{r4-r12,pc} 230 #else 218 231 ldmia sp!,{r4-r12,lr} 219 232 tst lr,#1 220 233 moveq pc,lr @ be binary compatible with V4, yet 221 234 bx lr @ interoperable with Thumb ISA:-) 235 #endif 222 236 .align 2 223 237 .LK_00_19: .word 0x5a827999 -
src/router/openssl/crypto/sha/asm/sha256-armv4.pl
r14159 r19087 12 12 # Performance is ~2x better than gcc 3.4 generated code and in "abso- 13 13 # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per 14 # byte. 15 16 $output=shift; 14 # byte [on single-issue Xscale PXA250 core]. 15 16 # July 2010. 17 # 18 # Rescheduling for dual-issue pipeline resulted in 22% improvement on 19 # Cortex A8 core and ~20 cycles per processed byte. 20 21 # February 2011. 22 # 23 # Profiler-assisted and platform-specific optimization resulted in 16% 24 # improvement on Cortex A8 core and ~17 cycles per processed byte. 25 26 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} 17 27 open STDOUT,">$output"; 18 28 19 29 $ctx="r0"; $t0="r0"; 20 $inp="r1"; 30 $inp="r1"; $t3="r1"; 21 31 $len="r2"; $t1="r2"; 22 32 $T1="r3"; … … 42 52 43 53 $code.=<<___ if ($i<16); 54 #if __ARM_ARCH__>=7 55 ldr $T1,[$inp],#4 56 #else 44 57 ldrb $T1,[$inp,#3] @ $i 45 58 ldrb $t2,[$inp,#2] … … 49 62 orr $T1,$T1,$t1,lsl#16 50 63 orr $T1,$T1,$t0,lsl#24 51 `"str $inp,[sp,#17*4]" if ($i==15)` 64 #endif 52 65 ___ 53 66 $code.=<<___; 67 mov $t0,$e,ror#$Sigma1[0] 54 68 ldr $t2,[$Ktbl],#4 @ *K256++ 69 eor $t0,$t0,$e,ror#$Sigma1[1] 70 eor $t1,$f,$g 71 #if $i>=16 72 add $T1,$T1,$t3 @ from BODY_16_xx 73 #elif __ARM_ARCH__>=7 && defined(__ARMEL__) 74 rev $T1,$T1 75 #endif 76 #if $i==15 77 str $inp,[sp,#17*4] @ leave room for $t3 78 #endif 79 eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e) 80 and $t1,$t1,$e 55 81 str $T1,[sp,#`$i%16`*4] 56 mov $t0,$e,ror#$Sigma1[0]57 eor $t0,$t0,$e,ror#$Sigma1[1]58 eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)59 82 add $T1,$T1,$t0 60 eor $t1,$f,$g61 and $t1,$t1,$e62 83 eor $t1,$t1,$g @ Ch(e,f,g) 84 add $T1,$T1,$h 85 mov $h,$a,ror#$Sigma0[0] 63 86 add $T1,$T1,$t1 64 add $T1,$T1,$h87 eor $h,$h,$a,ror#$Sigma0[1] 65 88 add $T1,$T1,$t2 66 mov $h,$a,ror#$Sigma0[0]67 eor $h,$h,$a,ror#$Sigma0[1]68 89 eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a) 90 #if $i>=15 91 ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx 92 #endif 69 93 orr $t0,$a,$b 94 and 
$t1,$a,$b 70 95 and $t0,$t0,$c 71 a nd $t1,$a,$b96 add $h,$h,$T1 72 97 orr $t0,$t0,$t1 @ Maj(a,b,c) 98 add $d,$d,$T1 73 99 add $h,$h,$t0 74 add $d,$d,$T175 add $h,$h,$T176 100 ___ 77 101 } … … 81 105 82 106 $code.=<<___; 83 ldr $t1,[sp,#`($i+1)%16`*4]@ $i107 @ ldr $t3,[sp,#`($i+1)%16`*4] @ $i 84 108 ldr $t2,[sp,#`($i+14)%16`*4] 109 mov $t0,$t3,ror#$sigma0[0] 85 110 ldr $T1,[sp,#`($i+0)%16`*4] 86 ldr $inp,[sp,#`($i+9)%16`*4] 87 mov $t0,$t1,ror#$sigma0[0] 88 eor $t0,$t0,$t1,ror#$sigma0[1] 89 eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) 90 mov $t1,$t2,ror#$sigma1[0] 91 eor $t1,$t1,$t2,ror#$sigma1[1] 92 eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14]) 111 eor $t0,$t0,$t3,ror#$sigma0[1] 112 ldr $t1,[sp,#`($i+9)%16`*4] 113 eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1]) 114 mov $t3,$t2,ror#$sigma1[0] 93 115 add $T1,$T1,$t0 116 eor $t3,$t3,$t2,ror#$sigma1[1] 94 117 add $T1,$T1,$t1 95 add $T1,$T1,$inp 118 eor $t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14]) 119 @ add $T1,$T1,$t3 96 120 ___ 97 121 &BODY_00_15(@_); … … 99 123 100 124 $code=<<___; 125 #include "arm_arch.h" 126 101 127 .text 102 128 .code 32 … … 128 154 sub r3,pc,#8 @ sha256_block_data_order 129 155 add $len,$inp,$len,lsl#6 @ len to point at the end of inp 130 stmdb sp!,{$ctx,$inp,$len,r4-r1 2,lr}156 stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} 131 157 ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} 132 158 sub $Ktbl,r3,#256 @ K256 … … 167 193 168 194 add sp,sp,#`16+3`*4 @ destroy frame 169 ldmia sp!,{r4-r12,lr} 195 #if __ARM_ARCH__>=5 196 ldmia sp!,{r4-r11,pc} 197 #else 198 ldmia sp!,{r4-r11,lr} 170 199 tst lr,#1 171 200 moveq pc,lr @ be binary compatible with V4, yet 172 201 bx lr @ interoperable with Thumb ISA:-) 202 #endif 173 203 .size sha256_block_data_order,.-sha256_block_data_order 174 204 .asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" -
src/router/openssl/crypto/sha/asm/sha512-armv4.pl
r14159 r19087 11 11 12 12 # This code is ~4.5 (four and a half) times faster than code generated 13 13 # by gcc 3.4 and it spends ~72 clock cycles per byte. 13 # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue 14 # Xscale PXA250 core]. 15 # 16 # July 2010. 17 # 18 # Rescheduling for dual-issue pipeline resulted in 6% improvement on 19 # Cortex A8 core and ~40 cycles per processed byte. 20 21 # February 2011. 22 # 23 # Profiler-assisted and platform-specific optimization resulted in 7% 24 # improvement on Cortex A8 core and ~38 cycles per byte. 25 26 # March 2011. 27 # 28 # Add NEON implementation. On Cortex A8 it was measured to process 29 # one byte in 25.5 cycles or 47% faster than integer-only code. 14 30 15 31 # Byte order [in]dependence. ========================================= 16 32 # 17 # Caller is expected to maintain specific *dword* order in h[0-7],18 # namely with most significant dword at *lower* address, which is19 # reflected in below two parameters. *Byte* order within these dwords20 # in turn is whatever *native* byte order on current platform.21 $hi= 0;22 $lo= 4;33 # Originally caller was expected to maintain specific *dword* order in 34 # h[0-7], namely with most significant dword at *lower* address, which 35 # was reflected in below two parameters as 0 and 4. Now caller is 36 # expected to maintain native byte order for whole 64-bit values.
37 $hi="HI"; 38 $lo="LO"; 23 39 # ==================================================================== 24 40 25 $output=shift; 41 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} 26 42 open STDOUT,">$output"; 27 43 28 $ctx="r0"; 44 $ctx="r0"; # parameter block 29 45 $inp="r1"; 30 46 $len="r2"; 47 31 48 $Tlo="r3"; 32 49 $Thi="r4"; … … 56 73 my $magic = shift; 57 74 $code.=<<___; 58 ldr $t2,[sp,#$Hoff+0] @ h.lo59 ldr $t3,[sp,#$Hoff+4] @ h.hi60 75 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) 61 76 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 62 77 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 63 78 mov $t0,$Elo,lsr#14 79 str $Tlo,[sp,#$Xoff+0] 64 80 mov $t1,$Ehi,lsr#14 81 str $Thi,[sp,#$Xoff+4] 65 82 eor $t0,$t0,$Ehi,lsl#18 83 ldr $t2,[sp,#$Hoff+0] @ h.lo 66 84 eor $t1,$t1,$Elo,lsl#18 85 ldr $t3,[sp,#$Hoff+4] @ h.hi 67 86 eor $t0,$t0,$Elo,lsr#18 68 87 eor $t1,$t1,$Ehi,lsr#18 … … 74 93 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e) 75 94 adds $Tlo,$Tlo,$t0 95 ldr $t0,[sp,#$Foff+0] @ f.lo 76 96 adc $Thi,$Thi,$t1 @ T += Sigma1(e) 97 ldr $t1,[sp,#$Foff+4] @ f.hi 77 98 adds $Tlo,$Tlo,$t2 99 ldr $t2,[sp,#$Goff+0] @ g.lo 78 100 adc $Thi,$Thi,$t3 @ T += h 79 80 ldr $t0,[sp,#$Foff+0] @ f.lo81 ldr $t1,[sp,#$Foff+4] @ f.hi82 ldr $t2,[sp,#$Goff+0] @ g.lo83 101 ldr $t3,[sp,#$Goff+4] @ g.hi 102 103 eor $t0,$t0,$t2 84 104 str $Elo,[sp,#$Eoff+0] 105 eor $t1,$t1,$t3 85 106 str $Ehi,[sp,#$Eoff+4] 107 and $t0,$t0,$Elo 86 108 str $Alo,[sp,#$Aoff+0] 109 and $t1,$t1,$Ehi 87 110 str $Ahi,[sp,#$Aoff+4] 88 89 111 eor $t0,$t0,$t2 90 eor $t1,$t1,$t3 91 and $t0,$t0,$Elo 92 and $t1,$t1,$Ehi 93 eor $t0,$t0,$t2 112 ldr $t2,[$Ktbl,#$lo] @ K[i].lo 94 113 eor $t1,$t1,$t3 @ Ch(e,f,g) 95 96 ldr $t2,[$Ktbl,#4] @ K[i].lo 97 ldr $t3,[$Ktbl,#0] @ K[i].hi114 ldr $t3,[$Ktbl,#$hi] @ K[i].hi 115 116 adds $Tlo,$Tlo,$t0 98 117 ldr $Elo,[sp,#$Doff+0] @ d.lo 118 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) 99 119 ldr $Ehi,[sp,#$Doff+4] @ d.hi 100 101 adds $Tlo,$Tlo,$t0102 adc $Thi,$Thi,$t1 @ T += 
Ch(e,f,g)103 120 adds $Tlo,$Tlo,$t2 121 and $t0,$t2,#0xff 104 122 adc $Thi,$Thi,$t3 @ T += K[i] 105 123 adds $Elo,$Elo,$Tlo 124 ldr $t2,[sp,#$Boff+0] @ b.lo 106 125 adc $Ehi,$Ehi,$Thi @ d += T 107 108 and $t0,$t2,#0xff109 126 teq $t0,#$magic 127 128 ldr $t3,[sp,#$Coff+0] @ c.lo 110 129 orreq $Ktbl,$Ktbl,#1 111 112 ldr $t2,[sp,#$Boff+0] @ b.lo113 ldr $t3,[sp,#$Coff+0] @ c.lo114 130 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) 115 131 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 … … 128 144 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a) 129 145 adds $Tlo,$Tlo,$t0 146 and $t0,$Alo,$t2 130 147 adc $Thi,$Thi,$t1 @ T += Sigma0(a) 131 148 132 and $t0,$Alo,$t2149 ldr $t1,[sp,#$Boff+4] @ b.hi 133 150 orr $Alo,$Alo,$t2 134 ldr $t1,[sp,#$Boff+4] @ b.hi135 151 ldr $t2,[sp,#$Coff+4] @ c.hi 136 152 and $Alo,$Alo,$t3 137 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo138 153 and $t3,$Ahi,$t1 139 154 orr $Ahi,$Ahi,$t1 155 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo 140 156 and $Ahi,$Ahi,$t2 157 adds $Alo,$Alo,$Tlo 141 158 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi 142 adds $Alo,$Alo,$Tlo159 sub sp,sp,#8 143 160 adc $Ahi,$Ahi,$Thi @ h += T 144 145 sub sp,sp,#8 161 tst $Ktbl,#1 146 162 add $Ktbl,$Ktbl,#8 147 163 ___ 148 164 } 149 165 $code=<<___; 166 #include "arm_arch.h" 167 #ifdef __ARMEL__ 168 # define LO 0 169 # define HI 4 170 # define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 171 #else 172 # define HI 0 173 # define LO 4 174 # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 175 #endif 176 150 177 .text 151 178 .code 32 … … 153 180 .align 5 154 181 K512: 155 .word 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd 156 .word 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc 157 .word 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019 158 .word 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118 159 .word 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe 160 .word 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2 161 .word 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1 162 .word 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694 163 .word 
0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3 164 .word 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65 165 .word 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483 166 .word 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5 167 .word 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210 168 .word 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4 169 .word 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725 170 .word 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70 171 .word 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926 172 .word 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df 173 .word 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8 174 .word 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b 175 .word 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001 176 .word 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30 177 .word 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910 178 .word 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8 179 .word 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53 180 .word 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8 181 .word 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb 182 .word 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3 183 .word 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60 184 .word 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec 185 .word 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9 186 .word 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b 187 .word 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207 188 .word 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178 189 .word 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6 190 .word 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b 191 .word 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493 192 .word 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c 193 .word 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a 194 .word 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817 182 WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) 183 WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) 184 WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) 185 WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) 186 
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) 187 WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) 188 WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) 189 WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) 190 WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) 191 WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) 192 WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) 193 WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) 194 WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) 195 WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) 196 WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) 197 WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) 198 WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) 199 WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) 200 WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) 201 WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) 202 WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) 203 WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) 204 WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) 205 WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) 206 WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) 207 WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) 208 WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) 209 WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) 210 WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) 211 WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) 212 WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) 213 WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) 214 WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) 215 WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) 216 WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) 217 WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) 218 WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) 219 WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) 220 WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) 221 
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) 195 222 .size K512,.-K512 223 .LOPENSSL_armcap: 224 .word OPENSSL_armcap_P-sha512_block_data_order 225 .skip 32-4 196 226 197 227 .global sha512_block_data_order … … 200 230 sub r3,pc,#8 @ sha512_block_data_order 201 231 add $len,$inp,$len,lsl#7 @ len to point at the end of inp 232 #if __ARM_ARCH__>=7 233 ldr r12,.LOPENSSL_armcap 234 ldr r12,[r3,r12] @ OPENSSL_armcap_P 235 tst r12,#1 236 bne .LNEON 237 #endif 202 238 stmdb sp!,{r4-r12,lr} 203 sub $Ktbl,r3,#6 40@ K512239 sub $Ktbl,r3,#672 @ K512 204 240 sub sp,sp,#9*8 205 241 … … 235 271 236 272 .L00_15: 273 #if __ARM_ARCH__<7 237 274 ldrb $Tlo,[$inp,#7] 238 275 ldrb $t0, [$inp,#6] … … 249 286 orr $Thi,$Thi,$t0,lsl#16 250 287 orr $Thi,$Thi,$t1,lsl#24 251 str $Tlo,[sp,#$Xoff+0] 252 str $Thi,[sp,#$Xoff+4] 288 #else 289 ldr $Tlo,[$inp,#4] 290 ldr $Thi,[$inp],#8 291 #ifdef __ARMEL__ 292 rev $Tlo,$Tlo 293 rev $Thi,$Thi 294 #endif 295 #endif 253 296 ___ 254 297 &BODY_00_15(0x94); … … 256 299 tst $Ktbl,#1 257 300 beq .L00_15 258 bic $Ktbl,$Ktbl,#1259 260 .L16_79:261 301 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0] 262 302 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4] 263 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] 264 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] 265 303 bic $Ktbl,$Ktbl,#1 304 .L16_79: 266 305 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) 267 306 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 268 307 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 269 308 mov $Tlo,$t0,lsr#1 309 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] 270 310 mov $Thi,$t1,lsr#1 311 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] 271 312 eor $Tlo,$Tlo,$t1,lsl#31 272 313 eor $Thi,$Thi,$t0,lsl#31 … … 292 333 eor $t0,$t0,$t2,lsr#6 293 334 eor $t1,$t1,$t3,lsr#6 335 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0] 294 336 eor $t0,$t0,$t3,lsl#26 295 337 296 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]297 338 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4] 298 339 adds $Tlo,$Tlo,$t0 340 ldr $t0,[sp,#`$Xoff+8*16`+0] 299 341 adc $Thi,$Thi,$t1 300 342 301 ldr $t0,[sp,#`$Xoff+8*16`+0]302 343 ldr 
$t1,[sp,#`$Xoff+8*16`+4] 303 344 adds $Tlo,$Tlo,$t2 … … 305 346 adds $Tlo,$Tlo,$t0 306 347 adc $Thi,$Thi,$t1 307 str $Tlo,[sp,#$Xoff+0]308 str $Thi,[sp,#$Xoff+4]309 348 ___ 310 349 &BODY_00_15(0x17); 311 350 $code.=<<___; 312 tst $Ktbl,#1 351 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0] 352 ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4] 313 353 beq .L16_79 314 354 bic $Ktbl,$Ktbl,#1 … … 321 361 ldr $t3, [$ctx,#$Boff+$hi] 322 362 adds $t0,$Alo,$t0 363 str $t0, [$ctx,#$Aoff+$lo] 323 364 adc $t1,$Ahi,$t1 365 str $t1, [$ctx,#$Aoff+$hi] 324 366 adds $t2,$Tlo,$t2 367 str $t2, [$ctx,#$Boff+$lo] 325 368 adc $t3,$Thi,$t3 326 str $t0, [$ctx,#$Aoff+$lo]327 str $t1, [$ctx,#$Aoff+$hi]328 str $t2, [$ctx,#$Boff+$lo]329 369 str $t3, [$ctx,#$Boff+$hi] 330 370 … … 338 378 ldr $t3, [$ctx,#$Doff+$hi] 339 379 adds $t0,$Alo,$t0 380 str $t0, [$ctx,#$Coff+$lo] 340 381 adc $t1,$Ahi,$t1 382 str $t1, [$ctx,#$Coff+$hi] 341 383 adds $t2,$Tlo,$t2 384 str $t2, [$ctx,#$Doff+$lo] 342 385 adc $t3,$Thi,$t3 343 str $t0, [$ctx,#$Coff+$lo]344 str $t1, [$ctx,#$Coff+$hi]345 str $t2, [$ctx,#$Doff+$lo]346 386 str $t3, [$ctx,#$Doff+$hi] 347 387 … … 353 393 ldr $t3, [$ctx,#$Foff+$hi] 354 394 adds $Elo,$Elo,$t0 395 str $Elo,[$ctx,#$Eoff+$lo] 355 396 adc $Ehi,$Ehi,$t1 397 str $Ehi,[$ctx,#$Eoff+$hi] 356 398 adds $t2,$Tlo,$t2 399 str $t2, [$ctx,#$Foff+$lo] 357 400 adc $t3,$Thi,$t3 358 str $Elo,[$ctx,#$Eoff+$lo]359 str $Ehi,[$ctx,#$Eoff+$hi]360 str $t2, [$ctx,#$Foff+$lo]361 401 str $t3, [$ctx,#$Foff+$hi] 362 402 … … 370 410 ldr $t3, [$ctx,#$Hoff+$hi] 371 411 adds $t0,$Alo,$t0 412 str $t0, [$ctx,#$Goff+$lo] 372 413 adc $t1,$Ahi,$t1 414 str $t1, [$ctx,#$Goff+$hi] 373 415 adds $t2,$Tlo,$t2 416 str $t2, [$ctx,#$Hoff+$lo] 374 417 adc $t3,$Thi,$t3 375 str $t0, [$ctx,#$Goff+$lo]376 str $t1, [$ctx,#$Goff+$hi]377 str $t2, [$ctx,#$Hoff+$lo]378 418 str $t3, [$ctx,#$Hoff+$hi] 379 419 … … 385 425 386 426 add sp,sp,#8*9 @ destroy frame 427 #if __ARM_ARCH__>=5 428 ldmia sp!,{r4-r12,pc} 429 #else 387 430 ldmia sp!,{r4-r12,lr} 388 431 tst lr,#1 389 
432 moveq pc,lr @ be binary compatible with V4, yet 390 433 bx lr @ interoperable with Thumb ISA:-) 391 .size sha512_block_data_order,.-sha512_block_data_order 392 .asciz "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" 434 #endif 435 ___ 436 437 { 438 my @Sigma0=(28,34,39); 439 my @Sigma1=(14,18,41); 440 my @sigma0=(1, 8, 7); 441 my @sigma1=(19,61,6); 442 443 my $Ktbl="r3"; 444 my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch 445 446 my @X=map("d$_",(0..15)); 447 my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23)); 448 449 sub NEON_00_15() { 450 my $i=shift; 451 my ($a,$b,$c,$d,$e,$f,$g,$h)=@_; 452 my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps 453 454 $code.=<<___ if ($i<16 || $i&1); 455 vshr.u64 $t0,$e,#@Sigma1[0] @ $i 456 #if $i<16 457 vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned 458 #endif 459 vshr.u64 $t1,$e,#@Sigma1[1] 460 vshr.u64 $t2,$e,#@Sigma1[2] 461 ___ 462 $code.=<<___; 463 vld1.64 {$K},[$Ktbl,:64]! @ K[i++] 464 vsli.64 $t0,$e,#`64-@Sigma1[0]` 465 vsli.64 $t1,$e,#`64-@Sigma1[1]` 466 vsli.64 $t2,$e,#`64-@Sigma1[2]` 467 #if $i<16 && defined(__ARMEL__) 468 vrev64.8 @X[$i],@X[$i] 469 #endif 470 vadd.i64 $T1,$K,$h 471 veor $Ch,$f,$g 472 veor $t0,$t1 473 vand $Ch,$e 474 veor $t0,$t2 @ Sigma1(e) 475 veor $Ch,$g @ Ch(e,f,g) 476 vadd.i64 $T1,$t0 477 vshr.u64 $t0,$a,#@Sigma0[0] 478 vadd.i64 $T1,$Ch 479 vshr.u64 $t1,$a,#@Sigma0[1] 480 vshr.u64 $t2,$a,#@Sigma0[2] 481 vsli.64 $t0,$a,#`64-@Sigma0[0]` 482 vsli.64 $t1,$a,#`64-@Sigma0[1]` 483 vsli.64 $t2,$a,#`64-@Sigma0[2]` 484 vadd.i64 $T1,@X[$i%16] 485 vorr $Maj,$a,$c 486 vand $Ch,$a,$c 487 veor $h,$t0,$t1 488 vand $Maj,$b 489 veor $h,$t2 @ Sigma0(a) 490 vorr $Maj,$Ch @ Maj(a,b,c) 491 vadd.i64 $h,$T1 492 vadd.i64 $d,$T1 493 vadd.i64 $h,$Maj 494 ___ 495 } 496 497 sub NEON_16_79() { 498 my $i=shift; 499 500 if ($i&1) { &NEON_00_15($i,@_); return; } 501 502 # 2x-vectorized, therefore runs every 2nd round 503 my @X=map("q$_",(0..7)); # view @X as 128-bit 
vector 504 my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps 505 my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15 506 my $e=@_[4]; # $e from NEON_00_15 507 $i /= 2; 508 $code.=<<___; 509 vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0] 510 vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1] 511 vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2] 512 vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]` 513 vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1] 514 vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]` 515 veor $s1,$t0 516 vshr.u64 $t0,$s0,#@sigma0[0] 517 veor $s1,$t1 @ sigma1(X[i+14]) 518 vshr.u64 $t1,$s0,#@sigma0[1] 519 vadd.i64 @X[$i%8],$s1 520 vshr.u64 $s1,$s0,#@sigma0[2] 521 vsli.64 $t0,$s0,#`64-@sigma0[0]` 522 vsli.64 $t1,$s0,#`64-@sigma0[1]` 523 vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9] 524 veor $s1,$t0 525 vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15 526 vadd.i64 @X[$i%8],$s0 527 vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15 528 veor $s1,$t1 @ sigma0(X[i+1]) 529 vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15 530 vadd.i64 @X[$i%8],$s1 531 ___ 532 &NEON_00_15(2*$i,@_); 533 } 534 535 $code.=<<___; 536 #if __ARM_ARCH__>=7 537 .fpu neon 538 539 .align 4 540 .LNEON: 541 dmb @ errata #451034 on early Cortex A8 542 vstmdb sp!,{d8-d15} @ ABI specification says so 543 sub $Ktbl,r3,#672 @ K512 544 vldmia $ctx,{$A-$H} @ load context 545 .Loop_neon: 546 ___ 547 for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); } 548 $code.=<<___; 549 mov $cnt,#4 550 .L16_79_neon: 551 subs $cnt,#1 552 ___ 553 for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); } 554 $code.=<<___; 555 bne .L16_79_neon 556 557 vldmia $ctx,{d24-d31} @ load context to temp 558 vadd.i64 q8,q12 @ vectorized accumulate 559 vadd.i64 q9,q13 560 vadd.i64 q10,q14 561 vadd.i64 q11,q15 562 vstmia $ctx,{$A-$H} @ save context 563 teq $inp,$len 564 sub $Ktbl,#640 @ rewind K512 565 bne .Loop_neon 566 567 vldmia sp!,{d8-d15} @ epilogue 568 bx lr 569 #endif 570 ___ 571 } 572 $code.=<<___; 573 .size 
sha512_block_data_order,.-sha512_block_data_order 574 .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" 393 575 .align 2 576 .comm OPENSSL_armcap_P,4,4 394 577 ___ 395 578
Note: See TracChangeset
for help on using the changeset viewer.
