From 3405db97e5448c784729b56837f3f8c776a01067 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@openssl.org>
Date: Fri, 15 Feb 2019 09:44:39 +0100
Subject: [PATCH] ARM assembly pack: make it Windows-friendly.
"Windows friendliness" means a) flipping .thumb and .text directives,
b) always generate Thumb-2 code when asked(*); c) Windows-specific
references to external OPENSSL_armcap_P.
(*) so far *some* modules were compiled as .code 32 even if Thumb-2
was targeted. It works at hardware level because processor can alternate
between the modes with no overhead. But clang --target=arm-windows's
builtin assembler just refuses to compile .code 32...
Reviewed-by: Paul Dale <paul.dale@oracle.com>
Reviewed-by: Richard Levitte <levitte@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/8252)
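Point c) recurs in every module that consults OPENSSL_armcap_P. A composite
sketch of the scheme the hunks below implement (the .Lanchor label is
hypothetical, standing in for each module's own anchor point):

	ldr	r12,.LOPENSSL_armcap	@ PC-relative offset (ELF/Apple) or absolute address (_WIN32)
# if !defined(_WIN32)
	adr	r10,.Lanchor		@ run-time address of the anchor
	ldr	r12,[r12,r10]		@ offset + anchor = &OPENSSL_armcap_P, load through it
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r12,[r12]		@ loaded word is itself a pointer: dereference once more
# endif
	tst	r12,#ARMV7_NEON

.LOPENSSL_armcap:
# ifdef _WIN32
	.word	OPENSSL_armcap_P	@ absolute, resolved by the linker/loader
# else
	.word	OPENSSL_armcap_P-.Lanchor	@ position-independent link-time constant
# endif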
---
crypto/aes/asm/aes-armv4.pl | 3 +-
crypto/aes/asm/aesv8-armx.pl | 24 ++++++++++----
crypto/aes/asm/bsaes-armv7.pl | 7 ++--
crypto/armv4cpuid.pl | 3 +-
crypto/bn/asm/armv4-gf2m.pl | 13 ++++++--
crypto/bn/asm/armv4-mont.pl | 17 +++++++---
crypto/chacha/asm/chacha-armv4.pl | 11 ++++--
crypto/ec/asm/ecp_nistz256-armv4.pl | 4 ++-
crypto/modes/asm/ghash-armv4.pl | 3 +-
crypto/modes/asm/ghashv8-armx.pl | 26 +++++++++++----
crypto/perlasm/arm-xlate.pl | 7 ++++
crypto/poly1305/asm/poly1305-armv4.pl | 48 +++++++++++++--------------
crypto/sha/asm/keccak1600-armv4.pl | 34 +++++++++++++++++--
crypto/sha/asm/sha1-armv4-large.pl | 20 +++++++----
crypto/sha/asm/sha256-armv4.pl | 18 +++++++---
crypto/sha/asm/sha512-armv4.pl | 20 ++++++++---
16 files changed, 185 insertions(+), 73 deletions(-)
diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl
index abb2cc79a3b6..456a46967917 100644
--- a/crypto/aes/asm/aes-armv4.pl
+++ b/crypto/aes/asm/aes-armv4.pl
@@ -76,7 +76,6 @@
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
#endif
-.text
#if defined(__thumb2__) && !defined(__APPLE__)
.syntax unified
.thumb
@@ -85,6 +84,8 @@
#undef __thumb2__
#endif
+.text
+
.type AES_Te,%object
.align 5
AES_Te:
diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl
index 9ab2158c7db7..81bc1cbf1c22 100755
--- a/crypto/aes/asm/aesv8-armx.pl
+++ b/crypto/aes/asm/aesv8-armx.pl
@@ -53,18 +53,27 @@
$prefix="aes_v8";
+$_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
+
$code=<<___;
#include "arm_arch.h"
#if __ARM_MAX_ARCH__>=7
-.text
___
-$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
+$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
$code.=<<___ if ($flavour !~ /64/);
.arch armv7-a // don't confuse not-so-latest binutils with armv8 :-)
.fpu neon
+#ifdef __thumb2__
+.syntax unified
+.thumb
+# define INST(a,b,c,d) $_byte c,d|0xc,a,b
+#else
.code 32
-#undef __thumb2__
+# define INST(a,b,c,d) $_byte a,b,c,d
+#endif
+
+.text
___
# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
@@ -955,7 +964,7 @@ ()
# since ARMv7 instructions are always encoded little-endian.
# correct solution is to use .inst directive, but older
# assemblers don't implement it:-(
- sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+ sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
@@ -996,14 +1005,17 @@ ()
s/\],#[0-9]+/]!/o;
s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
- s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
+ s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
s/vmov\.32\s+(.*)/unvmov32($1)/geo or
s/^(\s+)b\./$1b/o or
- s/^(\s+)mov\./$1mov/o or
s/^(\s+)ret/$1bx\tlr/o;
+ if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
+ print " it $2\n";
+ }
+
print $_,"\n";
}
}
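The INST() macro above is the Thumb-2 half of the story: as the in-file
comment says, these crypto instructions are emitted as raw bytes because
older assemblers lack .inst, and A32 and T32 lay those bytes out
differently. For a hypothetical encoding word 0xDDCCBBAA (placeholder
bytes, not a real opcode) the two branches expand to:

	@ A32: one little-endian 32-bit word
	.byte	0xAA,0xBB,0xCC,0xDD
	@ T32: two little-endian half-words, so the byte pairs swap places,
	@ and the top byte is patched (|0xc) from the A32 prefix to its T32 form
	.byte	0xCC,0xDD|0xc,0xAA,0xBB

On the win flavour $_byte turns .byte into armasm's DCB, which is why the
macro takes the directive from a variable rather than hard-coding it.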
diff --git a/crypto/aes/asm/bsaes-armv7.pl b/crypto/aes/asm/bsaes-armv7.pl
index 5df195b9efb6..7f5219bc7520 100644
--- a/crypto/aes/asm/bsaes-armv7.pl
+++ b/crypto/aes/asm/bsaes-armv7.pl
@@ -728,7 +728,6 @@ sub bitslice {
.arch armv7-a
.fpu neon
-.text
.syntax unified @ ARMv7-capable assembler is expected to handle this
#if defined(__thumb2__) && !defined(__APPLE__)
.thumb
@@ -737,6 +736,8 @@ sub bitslice {
# undef __thumb2__
#endif
+.text
+
.type _bsaes_decrypt8,%function
.align 4
_bsaes_decrypt8:
@@ -1125,9 +1126,9 @@ sub bitslice_key {
#ifndef __thumb__
blo AES_cbc_encrypt
#else
- bhs 1f
+ bhs .Lcbc_do_bsaes
b AES_cbc_encrypt
-1:
+.Lcbc_do_bsaes:
#endif
#endif
diff --git a/crypto/armv4cpuid.pl b/crypto/armv4cpuid.pl
index 3a7be6e54136..f8aeec64f0e2 100644
--- a/crypto/armv4cpuid.pl
+++ b/crypto/armv4cpuid.pl
@@ -21,7 +21,6 @@
$code.=<<___;
#include "arm_arch.h"
-.text
#if defined(__thumb2__) && !defined(__APPLE__)
.syntax unified
.thumb
@@ -30,6 +29,8 @@
#undef __thumb2__
#endif
+.text
+
.align 5
.global OPENSSL_atomic_add
.type OPENSSL_atomic_add,%function
diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl
index 442ae4695399..0bf6f63ec577 100644
--- a/crypto/bn/asm/armv4-gf2m.pl
+++ b/crypto/bn/asm/armv4-gf2m.pl
@@ -57,13 +57,14 @@
$code=<<___;
#include "arm_arch.h"
-.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
+
+.text
___
################
# private interface to mul_1x1_ialu
@@ -176,11 +177,13 @@
#if __ARM_MAX_ARCH__>=7
stmdb sp!,{r10,lr}
ldr r12,.LOPENSSL_armcap
+# if !defined(_WIN32)
adr r10,.LOPENSSL_armcap
ldr r12,[r12,r10]
-#ifdef __APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
ldr r12,[r12]
-#endif
+# endif
tst r12,#ARMV7_NEON
itt ne
ldrne r10,[sp],#8
@@ -310,7 +313,11 @@
#if __ARM_MAX_ARCH__>=7
.align 5
.LOPENSSL_armcap:
+# ifdef _WIN32
+.word OPENSSL_armcap_P
+# else
.word OPENSSL_armcap_P-.
+# endif
#endif
.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl
index b4f6c06633b4..7e0a4d814597 100644
--- a/crypto/bn/asm/armv4-mont.pl
+++ b/crypto/bn/asm/armv4-mont.pl
@@ -97,7 +97,6 @@
$code=<<___;
#include "arm_arch.h"
-.text
#if defined(__thumb2__)
.syntax unified
.thumb
@@ -105,10 +104,16 @@
.code 32
#endif
+.text
+
#if __ARM_MAX_ARCH__>=7
.align 5
.LOPENSSL_armcap:
+# ifdef _WIN32
+.word OPENSSL_armcap_P
+# else
.word OPENSSL_armcap_P-.Lbn_mul_mont
+# endif
#endif
.global bn_mul_mont
@@ -122,12 +127,14 @@
#if __ARM_MAX_ARCH__>=7
tst ip,#7
bne .Lialu
- adr r0,.Lbn_mul_mont
- ldr r2,.LOPENSSL_armcap
+ ldr r0,.LOPENSSL_armcap
+#if !defined(_WIN32)
+ adr r2,.Lbn_mul_mont
ldr r0,[r0,r2]
-#ifdef __APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
ldr r0,[r0]
-#endif
+# endif
tst r0,#ARMV7_NEON @ NEON available?
ldmia sp, {r0,r2}
beq .Lialu
diff --git a/crypto/chacha/asm/chacha-armv4.pl b/crypto/chacha/asm/chacha-armv4.pl
index 9bbfc6b376cd..c4402961d47c 100755
--- a/crypto/chacha/asm/chacha-armv4.pl
+++ b/crypto/chacha/asm/chacha-armv4.pl
@@ -171,7 +171,6 @@ sub ROUND {
$code.=<<___;
#include "arm_arch.h"
-.text
#if defined(__thumb2__) || defined(__clang__)
.syntax unified
#endif
@@ -185,6 +184,8 @@ sub ROUND {
#define ldrhsb ldrbhs
#endif
+.text
+
.align 5
.Lsigma:
.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
@@ -192,7 +193,11 @@ sub ROUND {
.long 1,0,0,0
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
+# ifdef _WIN32
+.word OPENSSL_armcap_P
+# else
.word OPENSSL_armcap_P-.LChaCha20_ctr32
+# endif
#else
.word -1
#endif
@@ -219,8 +224,10 @@ sub ROUND {
cmp r2,#192 @ test len
bls .Lshort
ldr r4,[r14,#-32]
+# if !defined(_WIN32)
ldr r4,[r14,r4]
-# ifdef __APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
ldr r4,[r4]
# endif
tst r4,#ARMV7_NEON
diff --git a/crypto/ec/asm/ecp_nistz256-armv4.pl b/crypto/ec/asm/ecp_nistz256-armv4.pl
index 4005a6fbdcf8..43a675b4b2d0 100755
--- a/crypto/ec/asm/ecp_nistz256-armv4.pl
+++ b/crypto/ec/asm/ecp_nistz256-armv4.pl
@@ -51,7 +51,6 @@
$code.=<<___;
#include "arm_arch.h"
-.text
#if defined(__thumb2__)
.syntax unified
.thumb
@@ -80,6 +79,7 @@
die "insane number of elements" if ($#arr != 64*16*37-1);
$code.=<<___;
+.rodata
.globl ecp_nistz256_precomputed
.type ecp_nistz256_precomputed,%object
.align 12
@@ -104,6 +104,8 @@
}
$code.=<<___;
.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
+
+.text
.align 5
.LRR: @ 2^512 mod P precomputed for NIST P256 polynomial
.long 0x00000003, 0x00000000, 0xffffffff, 0xfffffffb
diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl
index 759d29c49fdb..1391b1b6e014 100644
--- a/crypto/modes/asm/ghash-armv4.pl
+++ b/crypto/modes/asm/ghash-armv4.pl
@@ -142,7 +142,6 @@ ()
$code=<<___;
#include "arm_arch.h"
-.text
#if defined(__thumb2__) || defined(__clang__)
.syntax unified
#define ldrplb ldrbpl
@@ -154,6 +153,8 @@ ()
.code 32
#endif
+.text
+
.type rem_4bit,%object
.align 5
rem_4bit:
diff --git a/crypto/modes/asm/ghashv8-armx.pl b/crypto/modes/asm/ghashv8-armx.pl
index ea05950309d1..e89158331209 100644
--- a/crypto/modes/asm/ghashv8-armx.pl
+++ b/crypto/modes/asm/ghashv8-armx.pl
@@ -66,18 +66,26 @@
{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
+my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
$code=<<___;
#include "arm_arch.h"
#if __ARM_MAX_ARCH__>=7
-.text
___
-$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
-$code.=<<___ if ($flavour !~ /64/);
+$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
+$code.=<<___ if ($flavour !~ /64/);
.fpu neon
-.code 32
-#undef __thumb2__
+#ifdef __thumb2__
+.syntax unified
+.thumb
+# define INST(a,b,c,d) $_byte c,0xef,a,b
+#else
+.code 32
+# define INST(a,b,c,d) $_byte a,b,c,0xf2
+#endif
+
+.text
___
################################################################################
@@ -752,7 +760,7 @@
# since ARMv7 instructions are always encoded little-endian.
# correct solution is to use .inst directive, but older
# assemblers don't implement it:-(
- sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+ sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
@@ -767,13 +775,17 @@
# fix up remaining new-style suffixes
s/\],#[0-9]+/]!/o;
- s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
+ s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/^(\s+)b\./$1b/o or
s/^(\s+)ret/$1bx\tlr/o;
+ if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
+ print " it $2\n";
+ }
+
print $_,"\n";
}
}
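The cclr rework in both this file and aesv8-armx.pl reflects a Thumb-2
rule: a conditional instruction such as movne must be introduced by an IT
instruction. The handler therefore keeps the condition visible as
mov.<cc> until output time, then emits the it itself. A sketch of the
result for a hypothetical cclr r3,ne in the perlasm source:

	it	ne		@ required in T32; unified-syntax assemblers
				@ tolerate it when producing A32 code
	movne	r3,#0		@ the conditional clear itself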
diff --git a/crypto/perlasm/arm-xlate.pl b/crypto/perlasm/arm-xlate.pl
index d78a8baac128..b953f1fc19ef 100755
--- a/crypto/perlasm/arm-xlate.pl
+++ b/crypto/perlasm/arm-xlate.pl
@@ -28,6 +28,13 @@
if ($flavour =~ /linux/) { ".fpu\t".join(',',@_); }
else { ""; }
};
+my $rodata = sub {
+ SWITCH: for ($flavour) {
+ /linux/ && return ".section\t.rodata";
+ /ios/ && return ".section\t__TEXT,__const";
+ last;
+ }
+};
my $hidden = sub {
if ($flavour =~ /ios/) { ".private_extern\t".join(',',@_); }
else { ".hidden\t".join(',',@_); }
diff --git a/crypto/poly1305/asm/poly1305-armv4.pl b/crypto/poly1305/asm/poly1305-armv4.pl
index 7003cc7770af..38622af1ab0e 100755
--- a/crypto/poly1305/asm/poly1305-armv4.pl
+++ b/crypto/poly1305/asm/poly1305-armv4.pl
@@ -48,7 +48,6 @@
$code.=<<___;
#include "arm_arch.h"
-.text
#if defined(__thumb2__)
.syntax unified
.thumb
@@ -56,6 +55,8 @@
.code 32
#endif
+.text
+
.globl poly1305_emit
.globl poly1305_blocks
.globl poly1305_init
@@ -100,8 +101,10 @@
and r4,r4,r10
#if __ARM_MAX_ARCH__>=7
+# if !defined(_WIN32)
ldr r12,[r11,r12] @ OPENSSL_armcap_P
-# ifdef __APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
ldr r12,[r12]
# endif
#endif
@@ -116,31 +119,21 @@
#if __ARM_MAX_ARCH__>=7
tst r12,#ARMV7_NEON @ check for NEON
-# ifdef __APPLE__
- adr r9,poly1305_blocks_neon
- adr r11,poly1305_blocks
-# ifdef __thumb2__
- it ne
-# endif
+# ifdef __thumb2__
+ adr r9,.Lpoly1305_blocks_neon
+ adr r11,.Lpoly1305_blocks
+ adr r12,.Lpoly1305_emit
+ adr r10,.Lpoly1305_emit_neon
+ itt ne
movne r11,r9
- adr r12,poly1305_emit
- adr r10,poly1305_emit_neon
-# ifdef __thumb2__
- it ne
-# endif
movne r12,r10
+ orr r11,r11,#1 @ thumb-ify address
+ orr r12,r12,#1
# else
-# ifdef __thumb2__
- itete eq
-# endif
- addeq r12,r11,#(poly1305_emit-.Lpoly1305_init)
- addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
- addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init)
- addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
-# endif
-# ifdef __thumb2__
- orr r12,r12,#1 @ thumb-ify address
- orr r11,r11,#1
+ addeq r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
+ addne r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
+ addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
+ addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
#endif
ldrb r9,[$inp,#11]
@@ -352,6 +345,7 @@
.type poly1305_emit,%function
.align 5
poly1305_emit:
+.Lpoly1305_emit:
stmdb sp!,{r4-r11}
.Lpoly1305_emit_enter:
@@ -671,6 +665,7 @@
.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
ldr ip,[$ctx,#36] @ is_base2_26
ands $len,$len,#-16
beq .Lno_data_neon
@@ -1157,6 +1152,7 @@
.type poly1305_emit_neon,%function
.align 5
poly1305_emit_neon:
+.Lpoly1305_emit_neon:
ldr ip,[$ctx,#36] @ is_base2_26
stmdb sp!,{r4-r11}
@@ -1229,7 +1225,11 @@
.Lzeros:
.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
+# ifdef _WIN32
+.word OPENSSL_armcap_P
+# else
.word OPENSSL_armcap_P-.Lpoly1305_init
+# endif
#endif
___
} }
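The thumb-ify pair is about interworking: adr materializes an even
address, but the function pointers this dispatcher hands back are
eventually called through BX/BLX, where bit 0 selects the instruction
set. A hedged annotation of the two lines:

	adr	r11,.Lpoly1305_blocks	@ even address, state bit clear
	orr	r11,r11,#1		@ set bit 0 so a BX to it stays in Thumb

The A32 branch of the #ifdef leaves the bit clear, which is exactly what
BX expects for ARM-state code.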
diff --git a/crypto/sha/asm/keccak1600-armv4.pl b/crypto/sha/asm/keccak1600-armv4.pl
index 44504fa8acdd..be411ed74b00 100755
--- a/crypto/sha/asm/keccak1600-armv4.pl
+++ b/crypto/sha/asm/keccak1600-armv4.pl
@@ -113,8 +113,6 @@
$code.=<<___;
#include "arm_arch.h"
-.text
-
#if defined(__thumb2__)
.syntax unified
.thumb
@@ -122,6 +120,8 @@
.code 32
#endif
+.text
+
.type iotas32, %object
.align 5
iotas32:
@@ -691,7 +691,14 @@ sub Round {
$code.=<<___;
blo .Lround2x
+#if __ARM_ARCH__>=5
ldr pc,[sp,#440]
+#else
+ ldr lr,[sp,#440]
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
.size KeccakF1600_int,.-KeccakF1600_int
.type KeccakF1600, %function
@@ -730,7 +737,14 @@ sub Round {
stmia @E[1], {@C[0]-@C[9]}
add sp,sp,#440+20
+#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
.size KeccakF1600,.-KeccakF1600
___
{ my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14));
@@ -905,7 +919,14 @@ sub Round {
.Labsorb_abort:
add sp,sp,#456+32
mov r0,$len @ return value
+#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
+#else
+ ldmia sp!,{r4-r12,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
.size SHA3_absorb,.-SHA3_absorb
___
}
@@ -1055,7 +1076,14 @@ sub Round {
.align 4
.Lsqueeze_done:
add sp,sp,#24
+#if __ARM_ARCH__>=5
ldmia sp!,{r4-r10,pc}
+#else
+ ldmia sp!,{r4-r10,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
.size SHA3_squeeze,.-SHA3_squeeze
___
}
@@ -1265,7 +1293,7 @@ sub Round {
subs r3, r3, #1
bne .Loop_neon
- bx lr
+ ret
.size KeccakF1600_neon,.-KeccakF1600_neon
.global SHA3_absorb_neon
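The epilogue added four times above is the classic pre-ARMv5 interworking
return; on ARMv5 and later a load straight into pc interworks by itself,
hence the #if. Annotated:

	tst	lr,#1		@ bit 0 of the return address: Thumb caller?
	moveq	pc,lr		@ no:  plain return, legal even on ARMv4
	bx	lr		@ yes: interworking return; any Thumb caller
				@ implies at least ARMv4T, where bx exists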
diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl
index cdacbd4633cd..cd0b95ade8d5 100644
--- a/crypto/sha/asm/sha1-armv4-large.pl
+++ b/crypto/sha/asm/sha1-armv4-large.pl
@@ -187,7 +187,6 @@ sub BODY_40_59 {
$code=<<___;
#include "arm_arch.h"
-.text
#if defined(__thumb2__)
.syntax unified
.thumb
@@ -195,6 +194,8 @@ sub BODY_40_59 {
.code 32
#endif
+.text
+
.global sha1_block_data_order
.type sha1_block_data_order,%function
@@ -202,12 +203,14 @@ sub BODY_40_59 {
sha1_block_data_order:
#if __ARM_MAX_ARCH__>=7
.Lsha1_block:
- adr r3,.Lsha1_block
ldr r12,.LOPENSSL_armcap
+# if !defined(_WIN32)
+ adr r3,.Lsha1_block
ldr r12,[r3,r12] @ OPENSSL_armcap_P
-#ifdef __APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
ldr r12,[r12]
-#endif
+# endif
tst r12,#ARMV8_SHA1
bne .LARMv8
tst r12,#ARMV7_NEON
@@ -311,7 +314,11 @@ sub BODY_40_59 {
.LK_60_79: .word 0xca62c1d6
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
+# ifdef _WIN32
+.word OPENSSL_armcap_P
+# else
.word OPENSSL_armcap_P-.Lsha1_block
+# endif
#endif
.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
@@ -613,14 +620,15 @@ ()
my @MSG=map("q$_",(4..7));
my @Kxx=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));
+my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
# if defined(__thumb2__)
-# define INST(a,b,c,d) .byte c,d|0xf,a,b
+# define INST(a,b,c,d) $_byte c,d|0xf,a,b
# else
-# define INST(a,b,c,d) .byte a,b,c,d|0x10
+# define INST(a,b,c,d) $_byte a,b,c,d|0x10
# endif
.type sha1_block_data_order_armv8,%function
diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl
index e0512b4d411f..cd01fcb57bb9 100644
--- a/crypto/sha/asm/sha256-armv4.pl
+++ b/crypto/sha/asm/sha256-armv4.pl
@@ -181,7 +181,6 @@ sub BODY_16_XX {
# define __ARM_MAX_ARCH__ 7
#endif
-.text
#if defined(__thumb2__)
.syntax unified
.thumb
@@ -189,6 +188,8 @@ sub BODY_16_XX {
.code 32
#endif
+.text
+
.type K256,%object
.align 5
K256:
@@ -212,7 +213,11 @@ sub BODY_16_XX {
.word 0 @ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
+# ifdef _WIN32
+.word OPENSSL_armcap_P
+# else
.word OPENSSL_armcap_P-.Lsha256_block_data_order
+# endif
#endif
.align 5
@@ -227,10 +232,12 @@ sub BODY_16_XX {
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
+# if !defined(_WIN32)
ldr r12,[r3,r12] @ OPENSSL_armcap_P
-#ifdef __APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
ldr r12,[r12]
-#endif
+# endif
tst r12,#ARMV8_SHA256
bne .LARMv8
tst r12,#ARMV7_NEON
@@ -598,14 +605,15 @@ ()
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";
+my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
# if defined(__thumb2__)
-# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# define INST(a,b,c,d) $_byte c,d|0xc,a,b
# else
-# define INST(a,b,c,d) .byte a,b,c,d
+# define INST(a,b,c,d) $_byte a,b,c,d
# endif
.type sha256_block_data_order_armv8,%function
diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl
index 795c4cbb459e..39c943b4499e 100644
--- a/crypto/sha/asm/sha512-armv4.pl
+++ b/crypto/sha/asm/sha512-armv4.pl
@@ -196,6 +196,9 @@ ()
add $Ktbl,$Ktbl,#8
___
}
+
+my $_word = ($flavour =~ /win/ ? "DCDU" : ".word");
+
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
@@ -211,14 +214,13 @@ ()
#ifdef __ARMEL__
# define LO 0
# define HI 4
-# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
+# define WORD64(hi0,lo0,hi1,lo1) $_word lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
-# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
+# define WORD64(hi0,lo0,hi1,lo1) $_word hi0,lo0, hi1,lo1
#endif
-.text
#if defined(__thumb2__)
.syntax unified
.thumb
@@ -227,6 +229,8 @@ ()
.code 32
#endif
+.text
+
.type K512,%object
.align 5
K512:
@@ -273,7 +277,11 @@ ()
.size K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
+# ifdef _WIN32
+.word OPENSSL_armcap_P
+# else
.word OPENSSL_armcap_P-.Lsha512_block_data_order
+# endif
.skip 32-4
#else
.skip 32
@@ -290,10 +298,12 @@ ()
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
+# if !defined(_WIN32)
ldr r12,[r3,r12] @ OPENSSL_armcap_P
-#ifdef __APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
ldr r12,[r12]
-#endif
+# endif
tst r12,#ARMV7_NEON
bne .LNEON
#endif
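A closing note on the win-flavour substitutions used throughout the
series ($_byte here and in the sha1/sha256/aesv8/ghashv8 files, $_word
above): they swap GNU as data directives for what I read as their armasm
equivalents:

	.byte	a,b,c,d		@ GNU as	->	DCB	a,b,c,d
	.word	lo0,hi0		@ GNU as	->	DCDU	lo0,hi0

DCDU being the unaligned variant, presumably chosen so armasm inserts no
padding inside the 64-bit constant tables that WORD64 builds.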