| /* SPDX-License-Identifier: GPL-2.0-or-later */ | 
 | /* | 
 |  * Fast AES implementation for SPE instruction set (PPC) | 
 |  * | 
 |  * This code makes use of the SPE SIMD instruction set as defined in | 
 |  * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf | 
 |  * Implementation is based on optimization guide notes from | 
 |  * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf | 
 |  * | 
 |  * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> | 
 |  */ | 
 |  | 
 | #include <asm/ppc_asm.h> | 
 | #include "aes-spe-regs.h" | 
 |  | 
 | #define	EAD(in, bpos) \ | 
 | 	rlwimi		rT0,in,28-((bpos+3)%4)*8,20,27;	/* rT0 bits 20-27 = byte bpos of in (0 = least significant), i.e. index * 16 */ | 
 |  | 
 | #define DAD(in, bpos) \ | 
 | 	rlwimi		rT1,in,24-((bpos+3)%4)*8,24,31;	/* rT1 bits 24-31 = byte bpos of in (0 = least significant) */ | 
 |  | 
 | #define LWH(out, off) \ | 
 | 	evlwwsplat	out,off(rT0);	/* load word high		*/ | 
 |  | 
 | #define LWL(out, off) \ | 
 | 	lwz		out,off(rT0);	/* load word low		*/ | 
 |  | 
 | #define LBZ(out, tab, off) \ | 
 | 	lbz		out,off(tab);	/* load byte			*/ | 
 |  | 
 | #define LAH(out, in, bpos, off) \ | 
 | 	EAD(in, bpos)			/* calc addr + load word high	*/ \ | 
 | 	LWH(out, off) | 
 |  | 
 | #define LAL(out, in, bpos, off) \ | 
 | 	EAD(in, bpos)			/* calc addr + load word low	*/ \ | 
 | 	LWL(out, off) | 
 |  | 
 | #define LAE(out, in, bpos) \ | 
 | 	EAD(in, bpos)			/* calc addr + load enc byte	*/ \ | 
 | 	LBZ(out, rT0, 8) | 
 |  | 
 | #define LBE(out) \ | 
 | 	LBZ(out, rT0, 8)		/* load enc byte		*/ | 
 |  | 
 | #define LAD(out, in, bpos) \ | 
 | 	DAD(in, bpos)			/* calc addr + load dec byte	*/ \ | 
 | 	LBZ(out, rT1, 0) | 
 |  | 
 | #define LBD(out) \ | 
 | 	LBZ(out, rT1, 0)		/* load dec byte (address already in rT1) */ | 
 |  | 
 | /* | 
 |  * ppc_encrypt_block: The central encryption function for a single 16-byte | 
 |  * block. It does no stack handling or register saving to support fast calls | 
 |  * via bl/blr. It expects that caller has pre-xored input data with first | 
 |  * 4 words of encryption key into rD0-rD3. Pointer/counter registers must | 
 |  * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3 | 
 |  * and rW0-rW3 and caller must execute a final xor on the output registers. | 
 |  * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing. | 
 |  * | 
 |  */ | 
 | _GLOBAL(ppc_encrypt_block)	/* in: rD0-rD3 pre-xored input, rT0 table, rKP key, CTR count */ | 
 | 	LAH(rW4, rD1, 2, 4)	/* prime the three lookups that each loop half ends with */ | 
 | 	LAH(rW6, rD0, 3, 0) | 
 | 	LAH(rW3, rD0, 1, 8) | 
 | ppc_encrypt_block_loop:	/* one pass reads key bytes 16..47 relative to rKP */ | 
 | 	LAH(rW0, rD3, 0, 12)	/* first half: remaining word lookups via rT0 */ | 
 | 	LAL(rW0, rD0, 0, 12) | 
 | 	LAH(rW1, rD1, 0, 12) | 
 | 	LAH(rW2, rD2, 1, 8) | 
 | 	LAL(rW2, rD3, 1, 8) | 
 | 	LAL(rW3, rD1, 1, 8) | 
 | 	LAL(rW4, rD2, 2, 4) | 
 | 	LAL(rW6, rD1, 3, 0) | 
 | 	LAH(rW5, rD3, 2, 4) | 
 | 	LAL(rW5, rD0, 2, 4) | 
 | 	LAH(rW7, rD2, 3, 0) | 
 | 	evldw		rD1,16(rKP)	/* 8 bytes of key material */ | 
 | 	EAD(rD3, 3) | 
 | 	evxor		rW2,rW2,rW4	/* combine lookup results (64-bit xor) */ | 
 | 	LWL(rW7, 0) | 
 | 	evxor		rW2,rW2,rW6 | 
 | 	EAD(rD2, 0) | 
 | 	evxor		rD1,rD1,rW2	/* fold lookups into the key material */ | 
 | 	LWL(rW1, 12) | 
 | 	evxor		rD1,rD1,rW0 | 
 | 	evldw		rD3,24(rKP)	/* next 8 bytes of key material */ | 
 | 	evmergehi	rD0,rD0,rD1	/* rD0 = { rD0.hi, rD1.hi } */ | 
 | 	EAD(rD1, 2)	/* begin lookups for the next half (same three as at entry) */ | 
 | 	evxor		rW3,rW3,rW5 | 
 | 	LWH(rW4, 4) | 
 | 	evxor		rW3,rW3,rW7 | 
 | 	EAD(rD0, 3) | 
 | 	evxor		rD3,rD3,rW3 | 
 | 	LWH(rW6, 0) | 
 | 	evxor		rD3,rD3,rW1 | 
 | 	EAD(rD0, 1) | 
 | 	evmergehi	rD2,rD2,rD3	/* rD2 = { rD2.hi, rD3.hi } */ | 
 | 	LWH(rW3, 8) | 
 | 	LAH(rW0, rD3, 0, 12)	/* second half: same sequence, key offsets 32/40 */ | 
 | 	LAL(rW0, rD0, 0, 12) | 
 | 	LAH(rW1, rD1, 0, 12) | 
 | 	LAH(rW2, rD2, 1, 8) | 
 | 	LAL(rW2, rD3, 1, 8) | 
 | 	LAL(rW3, rD1, 1, 8) | 
 | 	LAL(rW4, rD2, 2, 4) | 
 | 	LAL(rW6, rD1, 3, 0) | 
 | 	LAH(rW5, rD3, 2, 4) | 
 | 	LAL(rW5, rD0, 2, 4) | 
 | 	LAH(rW7, rD2, 3, 0) | 
 | 	evldw		rD1,32(rKP) | 
 | 	EAD(rD3, 3) | 
 | 	evxor		rW2,rW2,rW4 | 
 | 	LWL(rW7, 0) | 
 | 	evxor		rW2,rW2,rW6 | 
 | 	EAD(rD2, 0) | 
 | 	evxor		rD1,rD1,rW2 | 
 | 	LWL(rW1, 12) | 
 | 	evxor		rD1,rD1,rW0 | 
 | 	evldw		rD3,40(rKP) | 
 | 	evmergehi	rD0,rD0,rD1 | 
 | 	EAD(rD1, 2) | 
 | 	evxor		rW3,rW3,rW5 | 
 | 	LWH(rW4, 4) | 
 | 	evxor		rW3,rW3,rW7 | 
 | 	EAD(rD0, 3) | 
 | 	evxor		rD3,rD3,rW3 | 
 | 	LWH(rW6, 0) | 
 | 	evxor		rD3,rD3,rW1 | 
 | 	EAD(rD0, 1) | 
 | 	evmergehi	rD2,rD2,rD3 | 
 | 	LWH(rW3, 8) | 
 | 	addi		rKP,rKP,32	/* step past the 32 key bytes just used */ | 
 | 	bdnz		ppc_encrypt_block_loop	/* decrement CTR, loop while it is non-zero */ | 
 | 	LAH(rW0, rD3, 0, 12)	/* after the loop: one more word-lookup pass, key offsets 16/24 */ | 
 | 	LAL(rW0, rD0, 0, 12) | 
 | 	LAH(rW1, rD1, 0, 12) | 
 | 	LAH(rW2, rD2, 1, 8) | 
 | 	LAL(rW2, rD3, 1, 8) | 
 | 	LAL(rW3, rD1, 1, 8) | 
 | 	LAL(rW4, rD2, 2, 4) | 
 | 	LAH(rW5, rD3, 2, 4) | 
 | 	LAL(rW6, rD1, 3, 0) | 
 | 	LAL(rW5, rD0, 2, 4) | 
 | 	LAH(rW7, rD2, 3, 0) | 
 | 	evldw		rD1,16(rKP) | 
 | 	EAD(rD3, 3) | 
 | 	evxor		rW2,rW2,rW4 | 
 | 	LWL(rW7, 0) | 
 | 	evxor		rW2,rW2,rW6 | 
 | 	EAD(rD2, 0) | 
 | 	evxor		rD1,rD1,rW2 | 
 | 	LWL(rW1, 12) | 
 | 	evxor		rD1,rD1,rW0 | 
 | 	evldw		rD3,24(rKP) | 
 | 	evmergehi	rD0,rD0,rD1 | 
 | 	EAD(rD1, 0)	/* from here on: byte lookups (LBE/LAE read the byte at 8(rT0)) */ | 
 | 	evxor		rW3,rW3,rW5 | 
 | 	LBE(rW2) | 
 | 	evxor		rW3,rW3,rW7 | 
 | 	EAD(rD0, 1) | 
 | 	evxor		rD3,rD3,rW3 | 
 | 	LBE(rW6) | 
 | 	evxor		rD3,rD3,rW1 | 
 | 	EAD(rD0, 0) | 
 | 	evmergehi	rD2,rD2,rD3 | 
 | 	LBE(rW1)	/* NOTE(review): LAE(rW1, rD0, 0) two lines below loads this same byte again - confirm before changing */ | 
 | 	LAE(rW0, rD3, 0) | 
 | 	LAE(rW1, rD0, 0) | 
 | 	LAE(rW4, rD2, 1) | 
 | 	LAE(rW5, rD3, 1) | 
 | 	LAE(rW3, rD2, 0) | 
 | 	LAE(rW7, rD1, 1) | 
 | 	rlwimi		rW0,rW4,8,16,23	/* pack looked-up bytes into rW0-rW3: this group fills bits 16-23 */ | 
 | 	rlwimi		rW1,rW5,8,16,23 | 
 | 	LAE(rW4, rD1, 2) | 
 | 	LAE(rW5, rD2, 2) | 
 | 	rlwimi		rW2,rW6,8,16,23 | 
 | 	rlwimi		rW3,rW7,8,16,23 | 
 | 	LAE(rW6, rD3, 2) | 
 | 	LAE(rW7, rD0, 2) | 
 | 	rlwimi		rW0,rW4,16,8,15	/* this group fills bits 8-15 */ | 
 | 	rlwimi		rW1,rW5,16,8,15 | 
 | 	LAE(rW4, rD0, 3) | 
 | 	LAE(rW5, rD1, 3) | 
 | 	rlwimi		rW2,rW6,16,8,15 | 
 | 	lwz		rD0,32(rKP)	/* rD0-rD3 = key words at 32..44(rKP); presumably the operand of the caller's final xor */ | 
 | 	rlwimi		rW3,rW7,16,8,15 | 
 | 	lwz		rD1,36(rKP) | 
 | 	LAE(rW6, rD2, 3) | 
 | 	LAE(rW7, rD3, 3) | 
 | 	rlwimi		rW0,rW4,24,0,7	/* this group fills bits 0-7 */ | 
 | 	lwz		rD2,40(rKP) | 
 | 	rlwimi		rW1,rW5,24,0,7 | 
 | 	lwz		rD3,44(rKP) | 
 | 	rlwimi		rW2,rW6,24,0,7 | 
 | 	rlwimi		rW3,rW7,24,0,7 | 
 | 	blr	/* return to caller; results are in rW0-rW3 and rD0-rD3 */ | 
 |  | 
 | /* | 
 |  * ppc_decrypt_block: The central decryption function for a single 16-byte | 
 |  * block. It does no stack handling or register saving to support fast calls | 
 |  * via bl/blr. It expects that caller has pre-xored input data with first | 
 |  * 4 words of encryption key into rD0-rD3. Pointer/counter registers must | 
 |  * have also been set up before (rT0, rT1, rKP, CTR). Output is stored in | 
 |  * rD0-rD3 and rW0-rW3 and caller must execute a final xor on the output | 
 |  * registers. All working registers rD0-rD3 & rW0-rW7 are overwritten during | 
 |  * processing. | 
 |  * | 
 |  */ | 
 | _GLOBAL(ppc_decrypt_block)	/* in: rD0-rD3 pre-xored input, rT0/rT1 tables, rKP key, CTR count */ | 
 | 	LAH(rW0, rD1, 0, 12)	/* prime the three lookups that each loop half ends with */ | 
 | 	LAH(rW6, rD0, 3, 0) | 
 | 	LAH(rW3, rD0, 1, 8) | 
 | ppc_decrypt_block_loop:	/* one pass reads key bytes 16..47 relative to rKP */ | 
 | 	LAH(rW1, rD3, 0, 12)	/* first half: remaining word lookups via rT0 */ | 
 | 	LAL(rW0, rD2, 0, 12) | 
 | 	LAH(rW2, rD2, 1, 8) | 
 | 	LAL(rW2, rD3, 1, 8) | 
 | 	LAH(rW4, rD3, 2, 4) | 
 | 	LAL(rW4, rD0, 2, 4) | 
 | 	LAL(rW6, rD1, 3, 0) | 
 | 	LAH(rW5, rD1, 2, 4) | 
 | 	LAH(rW7, rD2, 3, 0) | 
 | 	LAL(rW7, rD3, 3, 0) | 
 | 	LAL(rW3, rD1, 1, 8) | 
 | 	evldw		rD1,16(rKP)	/* 8 bytes of key material */ | 
 | 	EAD(rD0, 0) | 
 | 	evxor		rW4,rW4,rW6	/* combine lookup results (64-bit xor) */ | 
 | 	LWL(rW1, 12) | 
 | 	evxor		rW0,rW0,rW4 | 
 | 	EAD(rD2, 2) | 
 | 	evxor		rW0,rW0,rW2 | 
 | 	LWL(rW5, 4) | 
 | 	evxor		rD1,rD1,rW0	/* fold lookups into the key material */ | 
 | 	evldw		rD3,24(rKP)	/* next 8 bytes of key material */ | 
 | 	evmergehi	rD0,rD0,rD1	/* rD0 = { rD0.hi, rD1.hi } */ | 
 | 	EAD(rD1, 0)	/* begin lookups for the next half (same three as at entry) */ | 
 | 	evxor		rW3,rW3,rW7 | 
 | 	LWH(rW0, 12) | 
 | 	evxor		rW3,rW3,rW1 | 
 | 	EAD(rD0, 3) | 
 | 	evxor		rD3,rD3,rW3 | 
 | 	LWH(rW6, 0) | 
 | 	evxor		rD3,rD3,rW5 | 
 | 	EAD(rD0, 1) | 
 | 	evmergehi	rD2,rD2,rD3	/* rD2 = { rD2.hi, rD3.hi } */ | 
 | 	LWH(rW3, 8) | 
 | 	LAH(rW1, rD3, 0, 12)	/* second half: same sequence, key offsets 32/40 */ | 
 | 	LAL(rW0, rD2, 0, 12) | 
 | 	LAH(rW2, rD2, 1, 8) | 
 | 	LAL(rW2, rD3, 1, 8) | 
 | 	LAH(rW4, rD3, 2, 4) | 
 | 	LAL(rW4, rD0, 2, 4) | 
 | 	LAL(rW6, rD1, 3, 0) | 
 | 	LAH(rW5, rD1, 2, 4) | 
 | 	LAH(rW7, rD2, 3, 0) | 
 | 	LAL(rW7, rD3, 3, 0) | 
 | 	LAL(rW3, rD1, 1, 8) | 
 | 	evldw		 rD1,32(rKP) | 
 | 	EAD(rD0, 0) | 
 | 	evxor		rW4,rW4,rW6 | 
 | 	LWL(rW1, 12) | 
 | 	evxor		rW0,rW0,rW4 | 
 | 	EAD(rD2, 2) | 
 | 	evxor		rW0,rW0,rW2 | 
 | 	LWL(rW5, 4) | 
 | 	evxor		rD1,rD1,rW0 | 
 | 	evldw		rD3,40(rKP) | 
 | 	evmergehi	rD0,rD0,rD1 | 
 | 	EAD(rD1, 0) | 
 | 	evxor		rW3,rW3,rW7 | 
 | 	LWH(rW0, 12) | 
 | 	evxor		rW3,rW3,rW1 | 
 | 	EAD(rD0, 3) | 
 | 	evxor		rD3,rD3,rW3 | 
 | 	LWH(rW6, 0) | 
 | 	evxor		rD3,rD3,rW5 | 
 | 	EAD(rD0, 1) | 
 | 	evmergehi	rD2,rD2,rD3 | 
 | 	LWH(rW3, 8) | 
 | 	addi		rKP,rKP,32	/* step past the 32 key bytes just used */ | 
 | 	bdnz		ppc_decrypt_block_loop	/* decrement CTR, loop while it is non-zero */ | 
 | 	LAH(rW1, rD3, 0, 12)	/* after the loop: one more word-lookup pass, key offsets 16/24 */ | 
 | 	LAL(rW0, rD2, 0, 12) | 
 | 	LAH(rW2, rD2, 1, 8) | 
 | 	LAL(rW2, rD3, 1, 8) | 
 | 	LAH(rW4, rD3, 2, 4) | 
 | 	LAL(rW4, rD0, 2, 4) | 
 | 	LAL(rW6, rD1, 3, 0) | 
 | 	LAH(rW5, rD1, 2, 4) | 
 | 	LAH(rW7, rD2, 3, 0) | 
 | 	LAL(rW7, rD3, 3, 0) | 
 | 	LAL(rW3, rD1, 1, 8) | 
 | 	evldw		 rD1,16(rKP) | 
 | 	EAD(rD0, 0) | 
 | 	evxor		rW4,rW4,rW6 | 
 | 	LWL(rW1, 12) | 
 | 	evxor		rW0,rW0,rW4 | 
 | 	EAD(rD2, 2) | 
 | 	evxor		rW0,rW0,rW2 | 
 | 	LWL(rW5, 4) | 
 | 	evxor		rD1,rD1,rW0 | 
 | 	evldw		rD3,24(rKP) | 
 | 	evmergehi	rD0,rD0,rD1 | 
 | 	DAD(rD1, 0)	/* from here on: byte lookups addressed through rT1 (DAD/LBD/LAD) */ | 
 | 	evxor		rW3,rW3,rW7 | 
 | 	LBD(rW0) | 
 | 	evxor		rW3,rW3,rW1 | 
 | 	DAD(rD0, 1) | 
 | 	evxor		rD3,rD3,rW3 | 
 | 	LBD(rW6) | 
 | 	evxor		rD3,rD3,rW5 | 
 | 	DAD(rD0, 0) | 
 | 	evmergehi	rD2,rD2,rD3 | 
 | 	LBD(rW3) | 
 | 	LAD(rW2, rD3, 0) | 
 | 	LAD(rW1, rD2, 0) | 
 | 	LAD(rW4, rD2, 1) | 
 | 	LAD(rW5, rD3, 1) | 
 | 	LAD(rW7, rD1, 1) | 
 | 	rlwimi		rW0,rW4,8,16,23	/* pack looked-up bytes into rW0-rW3: this group fills bits 16-23 */ | 
 | 	rlwimi		rW1,rW5,8,16,23 | 
 | 	LAD(rW4, rD3, 2) | 
 | 	LAD(rW5, rD0, 2) | 
 | 	rlwimi		rW2,rW6,8,16,23 | 
 | 	rlwimi		rW3,rW7,8,16,23 | 
 | 	LAD(rW6, rD1, 2) | 
 | 	LAD(rW7, rD2, 2) | 
 | 	rlwimi		rW0,rW4,16,8,15	/* this group fills bits 8-15 */ | 
 | 	rlwimi		rW1,rW5,16,8,15 | 
 | 	LAD(rW4, rD0, 3) | 
 | 	LAD(rW5, rD1, 3) | 
 | 	rlwimi		rW2,rW6,16,8,15 | 
 | 	lwz		rD0,32(rKP)	/* rD0-rD3 = key words at 32..44(rKP); presumably the operand of the caller's final xor */ | 
 | 	rlwimi		rW3,rW7,16,8,15 | 
 | 	lwz		rD1,36(rKP) | 
 | 	LAD(rW6, rD2, 3) | 
 | 	LAD(rW7, rD3, 3) | 
 | 	rlwimi		rW0,rW4,24,0,7	/* this group fills bits 0-7 */ | 
 | 	lwz		rD2,40(rKP) | 
 | 	rlwimi		rW1,rW5,24,0,7 | 
 | 	lwz		rD3,44(rKP) | 
 | 	rlwimi		rW2,rW6,24,0,7 | 
 | 	rlwimi		rW3,rW7,24,0,7 | 
 | 	blr	/* return to caller; results are in rW0-rW3 and rD0-rD3 */ | 