| #!/usr/bin/env perl | 
 | # SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause | 
 | # | 
 | # ==================================================================== | 
 | # Written by Andy Polyakov, @dot-asm, originally for the OpenSSL | 
 | # project. | 
 | # ==================================================================== | 
 |  | 
 | # Poly1305 hash for MIPS. | 
 | # | 
 | # May 2016 | 
 | # | 
 | # Numbers are cycles per processed byte with poly1305_blocks alone. | 
 | # | 
 | #		IALU/gcc | 
 | # R1x000	~5.5/+130%	(big-endian) | 
 | # Octeon II	2.50/+70%	(little-endian) | 
 | # | 
 | # March 2019 | 
 | # | 
 | # Add 32-bit code path. | 
 | # | 
 | # October 2019 | 
 | # | 
# Modulo-scheduling the reduction allows omitting the dependency chain
# at the end of the inner loop, which improves performance. Also optimize
# the MIPS32R2 code path for the MIPS 1004K core. Per René von Dorst's
# suggestions.
 | # | 
 | #		IALU/gcc | 
 | # R1x000	~9.8/?		(big-endian) | 
 | # Octeon II	3.65/+140%	(little-endian) | 
 | # MT7621/1004K	4.75/?		(little-endian) | 
 | # | 
 | ###################################################################### | 
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in an ABI-neutral
# manner. Therefore let's stick to the NUBI register layout:
 | # | 
 | ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); | 
 | ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | 
 | ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); | 
 | ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); | 
 | # | 
# The return value is placed in $a0. The following coding rules
# facilitate interoperability:
 | # | 
 | # - never ever touch $tp, "thread pointer", former $gp [o32 can be | 
 | #   excluded from the rule, because it's specified volatile]; | 
 | # - copy return value to $t0, former $v0 [or to $a0 if you're adapting | 
 | #   old code]; | 
 | # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; | 
 | # | 
 | # For reference here is register layout for N32/64 MIPS ABIs: | 
 | # | 
 | # ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); | 
 | # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | 
 | # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); | 
 | # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); | 
 | # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); | 
 | # | 
 | # <appro@openssl.org> | 
 | # | 
 | ###################################################################### | 
 |  | 
 | $flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64 | 
 |  | 
 | $v0 = ($flavour =~ /nubi/i) ? $a0 : $t0; | 
 |  | 
 | if ($flavour =~ /64|n32/i) {{{ | 
 | ###################################################################### | 
 | # 64-bit code path | 
 | # | 
 |  | 
 | my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); | 
 | my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1); | 
 |  | 
 | $code.=<<___; | 
 | #if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\ | 
 |      defined(_MIPS_ARCH_MIPS64R6)) \\ | 
 |      && !defined(_MIPS_ARCH_MIPS64R2) | 
 | # define _MIPS_ARCH_MIPS64R2 | 
 | #endif | 
 |  | 
 | #if defined(_MIPS_ARCH_MIPS64R6) | 
 | # define dmultu(rs,rt) | 
 | # define mflo(rd,rs,rt)	dmulu	rd,rs,rt | 
 | # define mfhi(rd,rs,rt)	dmuhu	rd,rs,rt | 
 | #else | 
 | # define dmultu(rs,rt)		dmultu	rs,rt | 
 | # define mflo(rd,rs,rt)	mflo	rd | 
 | # define mfhi(rd,rs,rt)	mfhi	rd | 
 | #endif | 
 |  | 
 | #ifdef	__KERNEL__ | 
 | # define poly1305_init   poly1305_init_mips | 
 | # define poly1305_blocks poly1305_blocks_mips | 
 | # define poly1305_emit   poly1305_emit_mips | 
 | #endif | 
 |  | 
 | #if defined(__MIPSEB__) && !defined(MIPSEB) | 
 | # define MIPSEB | 
 | #endif | 
 |  | 
 | #ifdef MIPSEB | 
 | # define MSB 0 | 
 | # define LSB 7 | 
 | #else | 
 | # define MSB 7 | 
 | # define LSB 0 | 
 | #endif | 
 |  | 
 | .text | 
 | .set	noat | 
 | .set	noreorder | 
 |  | 
 | .align	5 | 
 | .globl	poly1305_init | 
 | .ent	poly1305_init | 
 | poly1305_init: | 
 | 	.frame	$sp,0,$ra | 
 | 	.set	reorder | 
 |  | 
 | 	sd	$zero,0($ctx) | 
 | 	sd	$zero,8($ctx) | 
 | 	sd	$zero,16($ctx) | 
 |  | 
 | 	beqz	$inp,.Lno_key | 
 |  | 
 | #if defined(_MIPS_ARCH_MIPS64R6) | 
 | 	andi	$tmp0,$inp,7		# $inp % 8 | 
 | 	dsubu	$inp,$inp,$tmp0		# align $inp | 
 | 	sll	$tmp0,$tmp0,3		# byte to bit offset | 
 | 	ld	$in0,0($inp) | 
 | 	ld	$in1,8($inp) | 
 | 	beqz	$tmp0,.Laligned_key | 
 | 	ld	$tmp2,16($inp) | 
 |  | 
 | 	subu	$tmp1,$zero,$tmp0 | 
 | # ifdef	MIPSEB | 
 | 	dsllv	$in0,$in0,$tmp0 | 
 | 	dsrlv	$tmp3,$in1,$tmp1 | 
 | 	dsllv	$in1,$in1,$tmp0 | 
 | 	dsrlv	$tmp2,$tmp2,$tmp1 | 
 | # else | 
 | 	dsrlv	$in0,$in0,$tmp0 | 
 | 	dsllv	$tmp3,$in1,$tmp1 | 
 | 	dsrlv	$in1,$in1,$tmp0 | 
 | 	dsllv	$tmp2,$tmp2,$tmp1 | 
 | # endif | 
 | 	or	$in0,$in0,$tmp3 | 
 | 	or	$in1,$in1,$tmp2 | 
 | .Laligned_key: | 
 | #else | 
 | 	ldl	$in0,0+MSB($inp) | 
 | 	ldl	$in1,8+MSB($inp) | 
 | 	ldr	$in0,0+LSB($inp) | 
 | 	ldr	$in1,8+LSB($inp) | 
 | #endif | 
 | #ifdef	MIPSEB | 
 | # if defined(_MIPS_ARCH_MIPS64R2) | 
 | 	dsbh	$in0,$in0		# byte swap | 
 | 	 dsbh	$in1,$in1 | 
 | 	dshd	$in0,$in0 | 
 | 	 dshd	$in1,$in1 | 
 | # else | 
 | 	ori	$tmp0,$zero,0xFF | 
 | 	dsll	$tmp2,$tmp0,32 | 
 | 	or	$tmp0,$tmp2		# 0x000000FF000000FF | 
 |  | 
 | 	and	$tmp1,$in0,$tmp0	# byte swap | 
 | 	 and	$tmp3,$in1,$tmp0 | 
 | 	dsrl	$tmp2,$in0,24 | 
 | 	 dsrl	$tmp4,$in1,24 | 
 | 	dsll	$tmp1,24 | 
 | 	 dsll	$tmp3,24 | 
 | 	and	$tmp2,$tmp0 | 
 | 	 and	$tmp4,$tmp0 | 
 | 	dsll	$tmp0,8			# 0x0000FF000000FF00 | 
 | 	or	$tmp1,$tmp2 | 
 | 	 or	$tmp3,$tmp4 | 
 | 	and	$tmp2,$in0,$tmp0 | 
 | 	 and	$tmp4,$in1,$tmp0 | 
 | 	dsrl	$in0,8 | 
 | 	 dsrl	$in1,8 | 
 | 	dsll	$tmp2,8 | 
 | 	 dsll	$tmp4,8 | 
 | 	and	$in0,$tmp0 | 
 | 	 and	$in1,$tmp0 | 
 | 	or	$tmp1,$tmp2 | 
 | 	 or	$tmp3,$tmp4 | 
 | 	or	$in0,$tmp1 | 
 | 	 or	$in1,$tmp3 | 
 | 	dsrl	$tmp1,$in0,32 | 
 | 	 dsrl	$tmp3,$in1,32 | 
 | 	dsll	$in0,32 | 
 | 	 dsll	$in1,32 | 
 | 	or	$in0,$tmp1 | 
 | 	 or	$in1,$tmp3 | 
 | # endif | 
 | #endif | 
 | 	li	$tmp0,1 | 
 | 	dsll	$tmp0,32		# 0x0000000100000000 | 
 | 	daddiu	$tmp0,-63		# 0x00000000ffffffc1 | 
 | 	dsll	$tmp0,28		# 0x0ffffffc10000000 | 
 | 	daddiu	$tmp0,-1		# 0x0ffffffc0fffffff | 
 |  | 
 | 	and	$in0,$tmp0 | 
 | 	daddiu	$tmp0,-3		# 0x0ffffffc0ffffffc | 
 | 	and	$in1,$tmp0 | 
 |  | 
 | 	sd	$in0,24($ctx) | 
 | 	dsrl	$tmp0,$in1,2 | 
 | 	sd	$in1,32($ctx) | 
 | 	daddu	$tmp0,$in1		# s1 = r1 + (r1 >> 2) | 
 | 	sd	$tmp0,40($ctx) | 
 |  | 
 | .Lno_key: | 
 | 	li	$v0,0			# return 0 | 
 | 	jr	$ra | 
 | .end	poly1305_init | 
 | ___ | 
 | { | 
 | my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000"; | 
 |  | 
 | my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) = | 
 |    ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2); | 
 | my ($shr,$shl) = ($s6,$s7);		# used on R6 | 
 |  | 
# Templates for poly1305_blocks and the start of poly1305_blocks_internal.
# poly1305_blocks shifts $len right by 4 and returns straight away when the
# result is zero; otherwise it branches to poly1305_blocks_internal, whose
# prologue allocates the frame (8*8 bytes on MIPS64R6, 6*8 otherwise) and
# stores $s4/$s5, plus $s6/$s7 on MIPS64R6.
$code.=<<___;
.align	5
.globl	poly1305_blocks
.ent	poly1305_blocks
poly1305_blocks:
	.set	noreorder
	dsrl	$len,4			# number of complete blocks
	bnez	$len,poly1305_blocks_internal
	nop
	jr	$ra
	nop
.end	poly1305_blocks

.align	5
.ent	poly1305_blocks_internal
poly1305_blocks_internal:
	.set	noreorder
#if defined(_MIPS_ARCH_MIPS64R6)
	.frame	$sp,8*8,$ra
	.mask	$SAVED_REGS_MASK|0x000c0000,-8
	dsubu	$sp,8*8
	sd	$s7,56($sp)
	sd	$s6,48($sp)
#else
	.frame	$sp,6*8,$ra
	.mask	$SAVED_REGS_MASK,-8
	dsubu	$sp,6*8
#endif
	sd	$s5,40($sp)
	sd	$s4,32($sp)
___
 | $code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue | 
 | 	sd	$s3,24($sp) | 
 | 	sd	$s2,16($sp) | 
 | 	sd	$s1,8($sp) | 
 | 	sd	$s0,0($sp) | 
 | ___ | 
 | $code.=<<___; | 
 | 	.set	reorder | 
 |  | 
 | #if defined(_MIPS_ARCH_MIPS64R6) | 
 | 	andi	$shr,$inp,7 | 
 | 	dsubu	$inp,$inp,$shr		# align $inp | 
 | 	sll	$shr,$shr,3		# byte to bit offset | 
 | 	subu	$shl,$zero,$shr | 
 | #endif | 
 |  | 
 | 	ld	$h0,0($ctx)		# load hash value | 
 | 	ld	$h1,8($ctx) | 
 | 	ld	$h2,16($ctx) | 
 |  | 
 | 	ld	$r0,24($ctx)		# load key | 
 | 	ld	$r1,32($ctx) | 
 | 	ld	$rs1,40($ctx) | 
 |  | 
 | 	dsll	$len,4 | 
 | 	daddu	$len,$inp		# end of buffer | 
 | 	b	.Loop | 
 |  | 
 | .align	4 | 
 | .Loop: | 
 | #if defined(_MIPS_ARCH_MIPS64R6) | 
 | 	ld	$in0,0($inp)		# load input | 
 | 	ld	$in1,8($inp) | 
 | 	beqz	$shr,.Laligned_inp | 
 |  | 
 | 	ld	$tmp2,16($inp) | 
 | # ifdef	MIPSEB | 
 | 	dsllv	$in0,$in0,$shr | 
 | 	dsrlv	$tmp3,$in1,$shl | 
 | 	dsllv	$in1,$in1,$shr | 
 | 	dsrlv	$tmp2,$tmp2,$shl | 
 | # else | 
 | 	dsrlv	$in0,$in0,$shr | 
 | 	dsllv	$tmp3,$in1,$shl | 
 | 	dsrlv	$in1,$in1,$shr | 
 | 	dsllv	$tmp2,$tmp2,$shl | 
 | # endif | 
 | 	or	$in0,$in0,$tmp3 | 
 | 	or	$in1,$in1,$tmp2 | 
 | .Laligned_inp: | 
 | #else | 
 | 	ldl	$in0,0+MSB($inp)	# load input | 
 | 	ldl	$in1,8+MSB($inp) | 
 | 	ldr	$in0,0+LSB($inp) | 
 | 	ldr	$in1,8+LSB($inp) | 
 | #endif | 
 | 	daddiu	$inp,16 | 
 | #ifdef	MIPSEB | 
 | # if defined(_MIPS_ARCH_MIPS64R2) | 
 | 	dsbh	$in0,$in0		# byte swap | 
 | 	 dsbh	$in1,$in1 | 
 | 	dshd	$in0,$in0 | 
 | 	 dshd	$in1,$in1 | 
 | # else | 
 | 	ori	$tmp0,$zero,0xFF | 
 | 	dsll	$tmp2,$tmp0,32 | 
 | 	or	$tmp0,$tmp2		# 0x000000FF000000FF | 
 |  | 
 | 	and	$tmp1,$in0,$tmp0	# byte swap | 
 | 	 and	$tmp3,$in1,$tmp0 | 
 | 	dsrl	$tmp2,$in0,24 | 
 | 	 dsrl	$tmp4,$in1,24 | 
 | 	dsll	$tmp1,24 | 
 | 	 dsll	$tmp3,24 | 
 | 	and	$tmp2,$tmp0 | 
 | 	 and	$tmp4,$tmp0 | 
 | 	dsll	$tmp0,8			# 0x0000FF000000FF00 | 
 | 	or	$tmp1,$tmp2 | 
 | 	 or	$tmp3,$tmp4 | 
 | 	and	$tmp2,$in0,$tmp0 | 
 | 	 and	$tmp4,$in1,$tmp0 | 
 | 	dsrl	$in0,8 | 
 | 	 dsrl	$in1,8 | 
 | 	dsll	$tmp2,8 | 
 | 	 dsll	$tmp4,8 | 
 | 	and	$in0,$tmp0 | 
 | 	 and	$in1,$tmp0 | 
 | 	or	$tmp1,$tmp2 | 
 | 	 or	$tmp3,$tmp4 | 
 | 	or	$in0,$tmp1 | 
 | 	 or	$in1,$tmp3 | 
 | 	dsrl	$tmp1,$in0,32 | 
 | 	 dsrl	$tmp3,$in1,32 | 
 | 	dsll	$in0,32 | 
 | 	 dsll	$in1,32 | 
 | 	or	$in0,$tmp1 | 
 | 	 or	$in1,$tmp3 | 
 | # endif | 
 | #endif | 
 | 	dsrl	$tmp1,$h2,2		# modulo-scheduled reduction | 
 | 	andi	$h2,$h2,3 | 
 | 	dsll	$tmp0,$tmp1,2 | 
 |  | 
 | 	daddu	$d0,$h0,$in0		# accumulate input | 
 | 	 daddu	$tmp1,$tmp0 | 
 | 	sltu	$tmp0,$d0,$h0 | 
 | 	daddu	$d0,$d0,$tmp1		# ... and residue | 
 | 	sltu	$tmp1,$d0,$tmp1 | 
 | 	daddu	$d1,$h1,$in1 | 
 | 	daddu	$tmp0,$tmp1 | 
 | 	sltu	$tmp1,$d1,$h1 | 
 | 	daddu	$d1,$tmp0 | 
 |  | 
 | 	dmultu	($r0,$d0)		# h0*r0 | 
 | 	 daddu	$d2,$h2,$padbit | 
 | 	 sltu	$tmp0,$d1,$tmp0 | 
 | 	mflo	($h0,$r0,$d0) | 
 | 	mfhi	($h1,$r0,$d0) | 
 |  | 
 | 	dmultu	($rs1,$d1)		# h1*5*r1 | 
 | 	 daddu	$d2,$tmp1 | 
 | 	 daddu	$d2,$tmp0 | 
 | 	mflo	($tmp0,$rs1,$d1) | 
 | 	mfhi	($tmp1,$rs1,$d1) | 
 |  | 
 | 	dmultu	($r1,$d0)		# h0*r1 | 
 | 	mflo	($tmp2,$r1,$d0) | 
 | 	mfhi	($h2,$r1,$d0) | 
 | 	 daddu	$h0,$tmp0 | 
 | 	 daddu	$h1,$tmp1 | 
 | 	 sltu	$tmp0,$h0,$tmp0 | 
 |  | 
 | 	dmultu	($r0,$d1)		# h1*r0 | 
 | 	 daddu	$h1,$tmp0 | 
 | 	 daddu	$h1,$tmp2 | 
 | 	mflo	($tmp0,$r0,$d1) | 
 | 	mfhi	($tmp1,$r0,$d1) | 
 |  | 
 | 	dmultu	($rs1,$d2)		# h2*5*r1 | 
 | 	 sltu	$tmp2,$h1,$tmp2 | 
 | 	 daddu	$h2,$tmp2 | 
 | 	mflo	($tmp2,$rs1,$d2) | 
 |  | 
 | 	dmultu	($r0,$d2)		# h2*r0 | 
 | 	 daddu	$h1,$tmp0 | 
 | 	 daddu	$h2,$tmp1 | 
 | 	mflo	($tmp3,$r0,$d2) | 
 | 	 sltu	$tmp0,$h1,$tmp0 | 
 | 	 daddu	$h2,$tmp0 | 
 |  | 
 | 	daddu	$h1,$tmp2 | 
 | 	sltu	$tmp2,$h1,$tmp2 | 
 | 	daddu	$h2,$tmp2 | 
 | 	daddu	$h2,$tmp3 | 
 |  | 
 | 	bne	$inp,$len,.Loop | 
 |  | 
 | 	sd	$h0,0($ctx)		# store hash value | 
 | 	sd	$h1,8($ctx) | 
 | 	sd	$h2,16($ctx) | 
 |  | 
 | 	.set	noreorder | 
 | #if defined(_MIPS_ARCH_MIPS64R6) | 
 | 	ld	$s7,56($sp) | 
 | 	ld	$s6,48($sp) | 
 | #endif | 
 | 	ld	$s5,40($sp)		# epilogue | 
 | 	ld	$s4,32($sp) | 
 | ___ | 
 | $code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue | 
 | 	ld	$s3,24($sp) | 
 | 	ld	$s2,16($sp) | 
 | 	ld	$s1,8($sp) | 
 | 	ld	$s0,0($sp) | 
 | ___ | 
# Return from poly1305_blocks_internal.  The instruction emitted after jr
# releases the frame: 8*8 bytes on MIPS64R6, 6*8 bytes otherwise.
$code.=<<___;
	jr	$ra
#if defined(_MIPS_ARCH_MIPS64R6)
	daddu	$sp,8*8
#else
	daddu	$sp,6*8
#endif
.end	poly1305_blocks_internal
___
 | } | 
 | { | 
 | my ($ctx,$mac,$nonce) = ($a0,$a1,$a2); | 
 |  | 
 | $code.=<<___; | 
 | .align	5 | 
 | .globl	poly1305_emit | 
 | .ent	poly1305_emit | 
 | poly1305_emit: | 
 | 	.frame	$sp,0,$ra | 
 | 	.set	reorder | 
 |  | 
 | 	ld	$tmp2,16($ctx) | 
 | 	ld	$tmp0,0($ctx) | 
 | 	ld	$tmp1,8($ctx) | 
 |  | 
 | 	li	$in0,-4			# final reduction | 
 | 	dsrl	$in1,$tmp2,2 | 
 | 	and	$in0,$tmp2 | 
 | 	andi	$tmp2,$tmp2,3 | 
 | 	daddu	$in0,$in1 | 
 |  | 
 | 	daddu	$tmp0,$tmp0,$in0 | 
 | 	sltu	$in1,$tmp0,$in0 | 
 | 	 daddiu	$in0,$tmp0,5		# compare to modulus | 
 | 	daddu	$tmp1,$tmp1,$in1 | 
 | 	 sltiu	$tmp3,$in0,5 | 
 | 	sltu	$tmp4,$tmp1,$in1 | 
 | 	 daddu	$in1,$tmp1,$tmp3 | 
 | 	daddu	$tmp2,$tmp2,$tmp4 | 
 | 	 sltu	$tmp3,$in1,$tmp3 | 
 | 	 daddu	$tmp2,$tmp2,$tmp3 | 
 |  | 
 | 	dsrl	$tmp2,2			# see if it carried/borrowed | 
 | 	dsubu	$tmp2,$zero,$tmp2 | 
 |  | 
 | 	xor	$in0,$tmp0 | 
 | 	xor	$in1,$tmp1 | 
 | 	and	$in0,$tmp2 | 
 | 	and	$in1,$tmp2 | 
 | 	xor	$in0,$tmp0 | 
 | 	xor	$in1,$tmp1 | 
 |  | 
 | 	lwu	$tmp0,0($nonce)		# load nonce | 
 | 	lwu	$tmp1,4($nonce) | 
 | 	lwu	$tmp2,8($nonce) | 
 | 	lwu	$tmp3,12($nonce) | 
 | 	dsll	$tmp1,32 | 
 | 	dsll	$tmp3,32 | 
 | 	or	$tmp0,$tmp1 | 
 | 	or	$tmp2,$tmp3 | 
 |  | 
 | 	daddu	$in0,$tmp0		# accumulate nonce | 
 | 	daddu	$in1,$tmp2 | 
 | 	sltu	$tmp0,$in0,$tmp0 | 
 | 	daddu	$in1,$tmp0 | 
 |  | 
 | 	dsrl	$tmp0,$in0,8		# write mac value | 
 | 	dsrl	$tmp1,$in0,16 | 
 | 	dsrl	$tmp2,$in0,24 | 
 | 	sb	$in0,0($mac) | 
 | 	dsrl	$tmp3,$in0,32 | 
 | 	sb	$tmp0,1($mac) | 
 | 	dsrl	$tmp0,$in0,40 | 
 | 	sb	$tmp1,2($mac) | 
 | 	dsrl	$tmp1,$in0,48 | 
 | 	sb	$tmp2,3($mac) | 
 | 	dsrl	$tmp2,$in0,56 | 
 | 	sb	$tmp3,4($mac) | 
 | 	dsrl	$tmp3,$in1,8 | 
 | 	sb	$tmp0,5($mac) | 
 | 	dsrl	$tmp0,$in1,16 | 
 | 	sb	$tmp1,6($mac) | 
 | 	dsrl	$tmp1,$in1,24 | 
 | 	sb	$tmp2,7($mac) | 
 |  | 
 | 	sb	$in1,8($mac) | 
 | 	dsrl	$tmp2,$in1,32 | 
 | 	sb	$tmp3,9($mac) | 
 | 	dsrl	$tmp3,$in1,40 | 
 | 	sb	$tmp0,10($mac) | 
 | 	dsrl	$tmp0,$in1,48 | 
 | 	sb	$tmp1,11($mac) | 
 | 	dsrl	$tmp1,$in1,56 | 
 | 	sb	$tmp2,12($mac) | 
 | 	sb	$tmp3,13($mac) | 
 | 	sb	$tmp0,14($mac) | 
 | 	sb	$tmp1,15($mac) | 
 |  | 
 | 	jr	$ra | 
 | .end	poly1305_emit | 
 | .rdata | 
 | .asciiz	"Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm" | 
 | .align	2 | 
 | ___ | 
 | } | 
 | }}} else {{{ | 
 | ###################################################################### | 
 | # 32-bit code path | 
 | # | 
 |  | 
 | my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); | 
 | my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) = | 
 |    ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2); | 
 |  | 
 | $code.=<<___; | 
 | #if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\ | 
 |      defined(_MIPS_ARCH_MIPS32R6)) \\ | 
 |      && !defined(_MIPS_ARCH_MIPS32R2) | 
 | # define _MIPS_ARCH_MIPS32R2 | 
 | #endif | 
 |  | 
 | #if defined(_MIPS_ARCH_MIPS32R6) | 
 | # define multu(rs,rt) | 
 | # define mflo(rd,rs,rt)	mulu	rd,rs,rt | 
 | # define mfhi(rd,rs,rt)	muhu	rd,rs,rt | 
 | #else | 
 | # define multu(rs,rt)	multu	rs,rt | 
 | # define mflo(rd,rs,rt)	mflo	rd | 
 | # define mfhi(rd,rs,rt)	mfhi	rd | 
 | #endif | 
 |  | 
 | #ifdef	__KERNEL__ | 
 | # define poly1305_init   poly1305_init_mips | 
 | # define poly1305_blocks poly1305_blocks_mips | 
 | # define poly1305_emit   poly1305_emit_mips | 
 | #endif | 
 |  | 
 | #if defined(__MIPSEB__) && !defined(MIPSEB) | 
 | # define MIPSEB | 
 | #endif | 
 |  | 
 | #ifdef MIPSEB | 
 | # define MSB 0 | 
 | # define LSB 3 | 
 | #else | 
 | # define MSB 3 | 
 | # define LSB 0 | 
 | #endif | 
 |  | 
 | .text | 
 | .set	noat | 
 | .set	noreorder | 
 |  | 
 | .align	5 | 
 | .globl	poly1305_init | 
 | .ent	poly1305_init | 
 | poly1305_init: | 
 | 	.frame	$sp,0,$ra | 
 | 	.set	reorder | 
 |  | 
 | 	sw	$zero,0($ctx) | 
 | 	sw	$zero,4($ctx) | 
 | 	sw	$zero,8($ctx) | 
 | 	sw	$zero,12($ctx) | 
 | 	sw	$zero,16($ctx) | 
 |  | 
 | 	beqz	$inp,.Lno_key | 
 |  | 
 | #if defined(_MIPS_ARCH_MIPS32R6) | 
 | 	andi	$tmp0,$inp,3		# $inp % 4 | 
 | 	subu	$inp,$inp,$tmp0		# align $inp | 
 | 	sll	$tmp0,$tmp0,3		# byte to bit offset | 
 | 	lw	$in0,0($inp) | 
 | 	lw	$in1,4($inp) | 
 | 	lw	$in2,8($inp) | 
 | 	lw	$in3,12($inp) | 
 | 	beqz	$tmp0,.Laligned_key | 
 |  | 
 | 	lw	$tmp2,16($inp) | 
 | 	subu	$tmp1,$zero,$tmp0 | 
 | # ifdef	MIPSEB | 
 | 	sllv	$in0,$in0,$tmp0 | 
 | 	srlv	$tmp3,$in1,$tmp1 | 
 | 	sllv	$in1,$in1,$tmp0 | 
 | 	or	$in0,$in0,$tmp3 | 
 | 	srlv	$tmp3,$in2,$tmp1 | 
 | 	sllv	$in2,$in2,$tmp0 | 
 | 	or	$in1,$in1,$tmp3 | 
 | 	srlv	$tmp3,$in3,$tmp1 | 
 | 	sllv	$in3,$in3,$tmp0 | 
 | 	or	$in2,$in2,$tmp3 | 
 | 	srlv	$tmp2,$tmp2,$tmp1 | 
 | 	or	$in3,$in3,$tmp2 | 
 | # else | 
 | 	srlv	$in0,$in0,$tmp0 | 
 | 	sllv	$tmp3,$in1,$tmp1 | 
 | 	srlv	$in1,$in1,$tmp0 | 
 | 	or	$in0,$in0,$tmp3 | 
 | 	sllv	$tmp3,$in2,$tmp1 | 
 | 	srlv	$in2,$in2,$tmp0 | 
 | 	or	$in1,$in1,$tmp3 | 
 | 	sllv	$tmp3,$in3,$tmp1 | 
 | 	srlv	$in3,$in3,$tmp0 | 
 | 	or	$in2,$in2,$tmp3 | 
 | 	sllv	$tmp2,$tmp2,$tmp1 | 
 | 	or	$in3,$in3,$tmp2 | 
 | # endif | 
 | .Laligned_key: | 
 | #else | 
 | 	lwl	$in0,0+MSB($inp) | 
 | 	lwl	$in1,4+MSB($inp) | 
 | 	lwl	$in2,8+MSB($inp) | 
 | 	lwl	$in3,12+MSB($inp) | 
 | 	lwr	$in0,0+LSB($inp) | 
 | 	lwr	$in1,4+LSB($inp) | 
 | 	lwr	$in2,8+LSB($inp) | 
 | 	lwr	$in3,12+LSB($inp) | 
 | #endif | 
 | #ifdef	MIPSEB | 
 | # if defined(_MIPS_ARCH_MIPS32R2) | 
 | 	wsbh	$in0,$in0		# byte swap | 
 | 	wsbh	$in1,$in1 | 
 | 	wsbh	$in2,$in2 | 
 | 	wsbh	$in3,$in3 | 
 | 	rotr	$in0,$in0,16 | 
 | 	rotr	$in1,$in1,16 | 
 | 	rotr	$in2,$in2,16 | 
 | 	rotr	$in3,$in3,16 | 
 | # else | 
 | 	srl	$tmp0,$in0,24		# byte swap | 
 | 	srl	$tmp1,$in0,8 | 
 | 	andi	$tmp2,$in0,0xFF00 | 
 | 	sll	$in0,$in0,24 | 
 | 	andi	$tmp1,0xFF00 | 
 | 	sll	$tmp2,$tmp2,8 | 
 | 	or	$in0,$tmp0 | 
 | 	 srl	$tmp0,$in1,24 | 
 | 	or	$tmp1,$tmp2 | 
 | 	 srl	$tmp2,$in1,8 | 
 | 	or	$in0,$tmp1 | 
 | 	 andi	$tmp1,$in1,0xFF00 | 
 | 	 sll	$in1,$in1,24 | 
 | 	 andi	$tmp2,0xFF00 | 
 | 	 sll	$tmp1,$tmp1,8 | 
 | 	 or	$in1,$tmp0 | 
 | 	srl	$tmp0,$in2,24 | 
 | 	 or	$tmp2,$tmp1 | 
 | 	srl	$tmp1,$in2,8 | 
 | 	 or	$in1,$tmp2 | 
 | 	andi	$tmp2,$in2,0xFF00 | 
 | 	sll	$in2,$in2,24 | 
 | 	andi	$tmp1,0xFF00 | 
 | 	sll	$tmp2,$tmp2,8 | 
 | 	or	$in2,$tmp0 | 
 | 	 srl	$tmp0,$in3,24 | 
 | 	or	$tmp1,$tmp2 | 
 | 	 srl	$tmp2,$in3,8 | 
 | 	or	$in2,$tmp1 | 
 | 	 andi	$tmp1,$in3,0xFF00 | 
 | 	 sll	$in3,$in3,24 | 
 | 	 andi	$tmp2,0xFF00 | 
 | 	 sll	$tmp1,$tmp1,8 | 
 | 	 or	$in3,$tmp0 | 
 | 	 or	$tmp2,$tmp1 | 
 | 	 or	$in3,$tmp2 | 
 | # endif | 
 | #endif | 
 | 	lui	$tmp0,0x0fff | 
 | 	ori	$tmp0,0xffff		# 0x0fffffff | 
 | 	and	$in0,$in0,$tmp0 | 
 | 	subu	$tmp0,3			# 0x0ffffffc | 
 | 	and	$in1,$in1,$tmp0 | 
 | 	and	$in2,$in2,$tmp0 | 
 | 	and	$in3,$in3,$tmp0 | 
 |  | 
 | 	sw	$in0,20($ctx) | 
 | 	sw	$in1,24($ctx) | 
 | 	sw	$in2,28($ctx) | 
 | 	sw	$in3,32($ctx) | 
 |  | 
 | 	srl	$tmp1,$in1,2 | 
 | 	srl	$tmp2,$in2,2 | 
 | 	srl	$tmp3,$in3,2 | 
 | 	addu	$in1,$in1,$tmp1		# s1 = r1 + (r1 >> 2) | 
 | 	addu	$in2,$in2,$tmp2 | 
 | 	addu	$in3,$in3,$tmp3 | 
 | 	sw	$in1,36($ctx) | 
 | 	sw	$in2,40($ctx) | 
 | 	sw	$in3,44($ctx) | 
 | .Lno_key: | 
 | 	li	$v0,0 | 
 | 	jr	$ra | 
 | .end	poly1305_init | 
 | ___ | 
 | { | 
 | my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000"; | 
 |  | 
 | my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) = | 
 |    ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11); | 
 | my ($d0,$d1,$d2,$d3) = | 
 |    ($a4,$a5,$a6,$a7); | 
 | my $shr = $t2;		# used on R6 | 
 | my $one = $t2;		# used on R2 | 
 |  | 
 | $code.=<<___; | 
 | .globl	poly1305_blocks | 
 | .align	5 | 
 | .ent	poly1305_blocks | 
 | poly1305_blocks: | 
 | 	.frame	$sp,16*4,$ra | 
 | 	.mask	$SAVED_REGS_MASK,-4 | 
 | 	.set	noreorder | 
 | 	subu	$sp, $sp,4*12 | 
 | 	sw	$s11,4*11($sp) | 
 | 	sw	$s10,4*10($sp) | 
 | 	sw	$s9, 4*9($sp) | 
 | 	sw	$s8, 4*8($sp) | 
 | 	sw	$s7, 4*7($sp) | 
 | 	sw	$s6, 4*6($sp) | 
 | 	sw	$s5, 4*5($sp) | 
 | 	sw	$s4, 4*4($sp) | 
 | ___ | 
 | $code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue | 
 | 	sw	$s3, 4*3($sp) | 
 | 	sw	$s2, 4*2($sp) | 
 | 	sw	$s1, 4*1($sp) | 
 | 	sw	$s0, 4*0($sp) | 
 | ___ | 
 | $code.=<<___; | 
 | 	.set	reorder | 
 |  | 
 | 	srl	$len,4			# number of complete blocks | 
 | 	li	$one,1 | 
 | 	beqz	$len,.Labort | 
 |  | 
 | #if defined(_MIPS_ARCH_MIPS32R6) | 
 | 	andi	$shr,$inp,3 | 
 | 	subu	$inp,$inp,$shr		# align $inp | 
 | 	sll	$shr,$shr,3		# byte to bit offset | 
 | #endif | 
 |  | 
 | 	lw	$h0,0($ctx)		# load hash value | 
 | 	lw	$h1,4($ctx) | 
 | 	lw	$h2,8($ctx) | 
 | 	lw	$h3,12($ctx) | 
 | 	lw	$h4,16($ctx) | 
 |  | 
 | 	lw	$r0,20($ctx)		# load key | 
 | 	lw	$r1,24($ctx) | 
 | 	lw	$r2,28($ctx) | 
 | 	lw	$r3,32($ctx) | 
 | 	lw	$rs1,36($ctx) | 
 | 	lw	$rs2,40($ctx) | 
 | 	lw	$rs3,44($ctx) | 
 |  | 
 | 	sll	$len,4 | 
 | 	addu	$len,$len,$inp		# end of buffer | 
 | 	b	.Loop | 
 |  | 
 | .align	4 | 
 | .Loop: | 
 | #if defined(_MIPS_ARCH_MIPS32R6) | 
 | 	lw	$d0,0($inp)		# load input | 
 | 	lw	$d1,4($inp) | 
 | 	lw	$d2,8($inp) | 
 | 	lw	$d3,12($inp) | 
 | 	beqz	$shr,.Laligned_inp | 
 |  | 
 | 	lw	$t0,16($inp) | 
 | 	subu	$t1,$zero,$shr | 
 | # ifdef	MIPSEB | 
 | 	sllv	$d0,$d0,$shr | 
 | 	srlv	$at,$d1,$t1 | 
 | 	sllv	$d1,$d1,$shr | 
 | 	or	$d0,$d0,$at | 
 | 	srlv	$at,$d2,$t1 | 
 | 	sllv	$d2,$d2,$shr | 
 | 	or	$d1,$d1,$at | 
 | 	srlv	$at,$d3,$t1 | 
 | 	sllv	$d3,$d3,$shr | 
 | 	or	$d2,$d2,$at | 
 | 	srlv	$t0,$t0,$t1 | 
 | 	or	$d3,$d3,$t0 | 
 | # else | 
 | 	srlv	$d0,$d0,$shr | 
 | 	sllv	$at,$d1,$t1 | 
 | 	srlv	$d1,$d1,$shr | 
 | 	or	$d0,$d0,$at | 
 | 	sllv	$at,$d2,$t1 | 
 | 	srlv	$d2,$d2,$shr | 
 | 	or	$d1,$d1,$at | 
 | 	sllv	$at,$d3,$t1 | 
 | 	srlv	$d3,$d3,$shr | 
 | 	or	$d2,$d2,$at | 
 | 	sllv	$t0,$t0,$t1 | 
 | 	or	$d3,$d3,$t0 | 
 | # endif | 
 | .Laligned_inp: | 
 | #else | 
 | 	lwl	$d0,0+MSB($inp)		# load input | 
 | 	lwl	$d1,4+MSB($inp) | 
 | 	lwl	$d2,8+MSB($inp) | 
 | 	lwl	$d3,12+MSB($inp) | 
 | 	lwr	$d0,0+LSB($inp) | 
 | 	lwr	$d1,4+LSB($inp) | 
 | 	lwr	$d2,8+LSB($inp) | 
 | 	lwr	$d3,12+LSB($inp) | 
 | #endif | 
 | #ifdef	MIPSEB | 
 | # if defined(_MIPS_ARCH_MIPS32R2) | 
 | 	wsbh	$d0,$d0			# byte swap | 
 | 	wsbh	$d1,$d1 | 
 | 	wsbh	$d2,$d2 | 
 | 	wsbh	$d3,$d3 | 
 | 	rotr	$d0,$d0,16 | 
 | 	rotr	$d1,$d1,16 | 
 | 	rotr	$d2,$d2,16 | 
 | 	rotr	$d3,$d3,16 | 
 | # else | 
 | 	srl	$at,$d0,24		# byte swap | 
 | 	srl	$t0,$d0,8 | 
 | 	andi	$t1,$d0,0xFF00 | 
 | 	sll	$d0,$d0,24 | 
 | 	andi	$t0,0xFF00 | 
 | 	sll	$t1,$t1,8 | 
 | 	or	$d0,$at | 
 | 	 srl	$at,$d1,24 | 
 | 	or	$t0,$t1 | 
 | 	 srl	$t1,$d1,8 | 
 | 	or	$d0,$t0 | 
 | 	 andi	$t0,$d1,0xFF00 | 
 | 	 sll	$d1,$d1,24 | 
 | 	 andi	$t1,0xFF00 | 
 | 	 sll	$t0,$t0,8 | 
 | 	 or	$d1,$at | 
 | 	srl	$at,$d2,24 | 
 | 	 or	$t1,$t0 | 
 | 	srl	$t0,$d2,8 | 
 | 	 or	$d1,$t1 | 
 | 	andi	$t1,$d2,0xFF00 | 
 | 	sll	$d2,$d2,24 | 
 | 	andi	$t0,0xFF00 | 
 | 	sll	$t1,$t1,8 | 
 | 	or	$d2,$at | 
 | 	 srl	$at,$d3,24 | 
 | 	or	$t0,$t1 | 
 | 	 srl	$t1,$d3,8 | 
 | 	or	$d2,$t0 | 
 | 	 andi	$t0,$d3,0xFF00 | 
 | 	 sll	$d3,$d3,24 | 
 | 	 andi	$t1,0xFF00 | 
 | 	 sll	$t0,$t0,8 | 
 | 	 or	$d3,$at | 
 | 	 or	$t1,$t0 | 
 | 	 or	$d3,$t1 | 
 | # endif | 
 | #endif | 
 | 	srl	$t0,$h4,2		# modulo-scheduled reduction | 
 | 	andi	$h4,$h4,3 | 
 | 	sll	$at,$t0,2 | 
 |  | 
 | 	addu	$d0,$d0,$h0		# accumulate input | 
 | 	 addu	$t0,$t0,$at | 
 | 	sltu	$h0,$d0,$h0 | 
 | 	addu	$d0,$d0,$t0		# ... and residue | 
 | 	sltu	$at,$d0,$t0 | 
 |  | 
 | 	addu	$d1,$d1,$h1 | 
 | 	 addu	$h0,$h0,$at		# carry | 
 | 	sltu	$h1,$d1,$h1 | 
 | 	addu	$d1,$d1,$h0 | 
 | 	sltu	$h0,$d1,$h0 | 
 |  | 
 | 	addu	$d2,$d2,$h2 | 
 | 	 addu	$h1,$h1,$h0		# carry | 
 | 	sltu	$h2,$d2,$h2 | 
 | 	addu	$d2,$d2,$h1 | 
 | 	sltu	$h1,$d2,$h1 | 
 |  | 
 | 	addu	$d3,$d3,$h3 | 
 | 	 addu	$h2,$h2,$h1		# carry | 
 | 	sltu	$h3,$d3,$h3 | 
 | 	addu	$d3,$d3,$h2 | 
 |  | 
 | #if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6) | 
 | 	multu	$r0,$d0			# d0*r0 | 
 | 	 sltu	$h2,$d3,$h2 | 
 | 	maddu	$rs3,$d1		# d1*s3 | 
 | 	 addu	$h3,$h3,$h2		# carry | 
 | 	maddu	$rs2,$d2		# d2*s2 | 
 | 	 addu	$h4,$h4,$padbit | 
 | 	maddu	$rs1,$d3		# d3*s1 | 
 | 	 addu	$h4,$h4,$h3 | 
 | 	mfhi	$at | 
 | 	mflo	$h0 | 
 |  | 
 | 	multu	$r1,$d0			# d0*r1 | 
 | 	maddu	$r0,$d1			# d1*r0 | 
 | 	maddu	$rs3,$d2		# d2*s3 | 
 | 	maddu	$rs2,$d3		# d3*s2 | 
 | 	maddu	$rs1,$h4		# h4*s1 | 
 | 	maddu	$at,$one		# hi*1 | 
 | 	mfhi	$at | 
 | 	mflo	$h1 | 
 |  | 
 | 	multu	$r2,$d0			# d0*r2 | 
 | 	maddu	$r1,$d1			# d1*r1 | 
 | 	maddu	$r0,$d2			# d2*r0 | 
 | 	maddu	$rs3,$d3		# d3*s3 | 
 | 	maddu	$rs2,$h4		# h4*s2 | 
 | 	maddu	$at,$one		# hi*1 | 
 | 	mfhi	$at | 
 | 	mflo	$h2 | 
 |  | 
 | 	mul	$t0,$r0,$h4		# h4*r0 | 
 |  | 
 | 	multu	$r3,$d0			# d0*r3 | 
 | 	maddu	$r2,$d1			# d1*r2 | 
 | 	maddu	$r1,$d2			# d2*r1 | 
 | 	maddu	$r0,$d3			# d3*r0 | 
 | 	maddu	$rs3,$h4		# h4*s3 | 
 | 	maddu	$at,$one		# hi*1 | 
 | 	mfhi	$at | 
 | 	mflo	$h3 | 
 |  | 
 | 	 addiu	$inp,$inp,16 | 
 |  | 
 | 	addu	$h4,$t0,$at | 
 | #else | 
 | 	multu	($r0,$d0)		# d0*r0 | 
 | 	mflo	($h0,$r0,$d0) | 
 | 	mfhi	($h1,$r0,$d0) | 
 |  | 
 | 	 sltu	$h2,$d3,$h2 | 
 | 	 addu	$h3,$h3,$h2		# carry | 
 |  | 
 | 	multu	($rs3,$d1)		# d1*s3 | 
 | 	mflo	($at,$rs3,$d1) | 
 | 	mfhi	($t0,$rs3,$d1) | 
 |  | 
 | 	 addu	$h4,$h4,$padbit | 
 | 	 addiu	$inp,$inp,16 | 
 | 	 addu	$h4,$h4,$h3 | 
 |  | 
 | 	multu	($rs2,$d2)		# d2*s2 | 
 | 	mflo	($a3,$rs2,$d2) | 
 | 	mfhi	($t1,$rs2,$d2) | 
 | 	 addu	$h0,$h0,$at | 
 | 	 addu	$h1,$h1,$t0 | 
 | 	multu	($rs1,$d3)		# d3*s1 | 
 | 	 sltu	$at,$h0,$at | 
 | 	 addu	$h1,$h1,$at | 
 |  | 
 | 	mflo	($at,$rs1,$d3) | 
 | 	mfhi	($t0,$rs1,$d3) | 
 | 	 addu	$h0,$h0,$a3 | 
 | 	 addu	$h1,$h1,$t1 | 
 | 	multu	($r1,$d0)		# d0*r1 | 
 | 	 sltu	$a3,$h0,$a3 | 
 | 	 addu	$h1,$h1,$a3 | 
 |  | 
 |  | 
 | 	mflo	($a3,$r1,$d0) | 
 | 	mfhi	($h2,$r1,$d0) | 
 | 	 addu	$h0,$h0,$at | 
 | 	 addu	$h1,$h1,$t0 | 
 | 	multu	($r0,$d1)		# d1*r0 | 
 | 	 sltu	$at,$h0,$at | 
 | 	 addu	$h1,$h1,$at | 
 |  | 
 | 	mflo	($at,$r0,$d1) | 
 | 	mfhi	($t0,$r0,$d1) | 
 | 	 addu	$h1,$h1,$a3 | 
 | 	 sltu	$a3,$h1,$a3 | 
 | 	multu	($rs3,$d2)		# d2*s3 | 
 | 	 addu	$h2,$h2,$a3 | 
 |  | 
 | 	mflo	($a3,$rs3,$d2) | 
 | 	mfhi	($t1,$rs3,$d2) | 
 | 	 addu	$h1,$h1,$at | 
 | 	 addu	$h2,$h2,$t0 | 
 | 	multu	($rs2,$d3)		# d3*s2 | 
 | 	 sltu	$at,$h1,$at | 
 | 	 addu	$h2,$h2,$at | 
 |  | 
 | 	mflo	($at,$rs2,$d3) | 
 | 	mfhi	($t0,$rs2,$d3) | 
 | 	 addu	$h1,$h1,$a3 | 
 | 	 addu	$h2,$h2,$t1 | 
 | 	multu	($rs1,$h4)		# h4*s1 | 
 | 	 sltu	$a3,$h1,$a3 | 
 | 	 addu	$h2,$h2,$a3 | 
 |  | 
 | 	mflo	($a3,$rs1,$h4) | 
 | 	 addu	$h1,$h1,$at | 
 | 	 addu	$h2,$h2,$t0 | 
 | 	multu	($r2,$d0)		# d0*r2 | 
 | 	 sltu	$at,$h1,$at | 
 | 	 addu	$h2,$h2,$at | 
 |  | 
 |  | 
 | 	mflo	($at,$r2,$d0) | 
 | 	mfhi	($h3,$r2,$d0) | 
 | 	 addu	$h1,$h1,$a3 | 
 | 	 sltu	$a3,$h1,$a3 | 
 | 	multu	($r1,$d1)		# d1*r1 | 
 | 	 addu	$h2,$h2,$a3 | 
 |  | 
 | 	mflo	($a3,$r1,$d1) | 
 | 	mfhi	($t1,$r1,$d1) | 
 | 	 addu	$h2,$h2,$at | 
 | 	 sltu	$at,$h2,$at | 
 | 	multu	($r0,$d2)		# d2*r0 | 
 | 	 addu	$h3,$h3,$at | 
 |  | 
 | 	mflo	($at,$r0,$d2) | 
 | 	mfhi	($t0,$r0,$d2) | 
 | 	 addu	$h2,$h2,$a3 | 
 | 	 addu	$h3,$h3,$t1 | 
 | 	multu	($rs3,$d3)		# d3*s3 | 
 | 	 sltu	$a3,$h2,$a3 | 
 | 	 addu	$h3,$h3,$a3 | 
 |  | 
 | 	mflo	($a3,$rs3,$d3) | 
 | 	mfhi	($t1,$rs3,$d3) | 
 | 	 addu	$h2,$h2,$at | 
 | 	 addu	$h3,$h3,$t0 | 
 | 	multu	($rs2,$h4)		# h4*s2 | 
 | 	 sltu	$at,$h2,$at | 
 | 	 addu	$h3,$h3,$at | 
 |  | 
 | 	mflo	($at,$rs2,$h4) | 
 | 	 addu	$h2,$h2,$a3 | 
 | 	 addu	$h3,$h3,$t1 | 
 | 	multu	($r3,$d0)		# d0*r3 | 
 | 	 sltu	$a3,$h2,$a3 | 
 | 	 addu	$h3,$h3,$a3 | 
 |  | 
 |  | 
 | 	mflo	($a3,$r3,$d0) | 
 | 	mfhi	($t1,$r3,$d0) | 
 | 	 addu	$h2,$h2,$at | 
 | 	 sltu	$at,$h2,$at | 
 | 	multu	($r2,$d1)		# d1*r2 | 
 | 	 addu	$h3,$h3,$at | 
 |  | 
 | 	mflo	($at,$r2,$d1) | 
 | 	mfhi	($t0,$r2,$d1) | 
 | 	 addu	$h3,$h3,$a3 | 
 | 	 sltu	$a3,$h3,$a3 | 
 | 	multu	($r0,$d3)		# d3*r0 | 
 | 	 addu	$t1,$t1,$a3 | 
 |  | 
 | 	mflo	($a3,$r0,$d3) | 
 | 	mfhi	($d3,$r0,$d3) | 
 | 	 addu	$h3,$h3,$at | 
 | 	 addu	$t1,$t1,$t0 | 
 | 	multu	($r1,$d2)		# d2*r1 | 
 | 	 sltu	$at,$h3,$at | 
 | 	 addu	$t1,$t1,$at | 
 |  | 
 | 	mflo	($at,$r1,$d2) | 
 | 	mfhi	($t0,$r1,$d2) | 
 | 	 addu	$h3,$h3,$a3 | 
 | 	 addu	$t1,$t1,$d3 | 
 | 	multu	($rs3,$h4)		# h4*s3 | 
 | 	 sltu	$a3,$h3,$a3 | 
 | 	 addu	$t1,$t1,$a3 | 
 |  | 
 | 	mflo	($a3,$rs3,$h4) | 
 | 	 addu	$h3,$h3,$at | 
 | 	 addu	$t1,$t1,$t0 | 
 | 	multu	($r0,$h4)		# h4*r0 | 
 | 	 sltu	$at,$h3,$at | 
 | 	 addu	$t1,$t1,$at | 
 |  | 
 |  | 
 | 	mflo	($h4,$r0,$h4) | 
 | 	 addu	$h3,$h3,$a3 | 
 | 	 sltu	$a3,$h3,$a3 | 
 | 	 addu	$t1,$t1,$a3 | 
 | 	addu	$h4,$h4,$t1 | 
 |  | 
 | 	li	$padbit,1		# if we loop, padbit is 1 | 
 | #endif | 
 | 	bne	$inp,$len,.Loop | 
 |  | 
 | 	sw	$h0,0($ctx)		# store hash value | 
 | 	sw	$h1,4($ctx) | 
 | 	sw	$h2,8($ctx) | 
 | 	sw	$h3,12($ctx) | 
 | 	sw	$h4,16($ctx) | 
 |  | 
 | 	.set	noreorder | 
 | .Labort: | 
 | 	lw	$s11,4*11($sp) | 
 | 	lw	$s10,4*10($sp) | 
 | 	lw	$s9, 4*9($sp) | 
 | 	lw	$s8, 4*8($sp) | 
 | 	lw	$s7, 4*7($sp) | 
 | 	lw	$s6, 4*6($sp) | 
 | 	lw	$s5, 4*5($sp) | 
 | 	lw	$s4, 4*4($sp) | 
 | ___ | 
# NUBI flavours also reload $s0-$s3 from the bottom four words of the frame.
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
	lw	$s3, 4*3($sp)
	lw	$s2, 4*2($sp)
	lw	$s1, 4*1($sp)
	lw	$s0, 4*0($sp)
___
# Return from poly1305_blocks.  The instruction emitted after jr releases
# the 4*12-byte frame.
$code.=<<___;
	jr	$ra
	addu	$sp,$sp,4*12
.end	poly1305_blocks
___
 | } | 
 | { | 
 | my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3); | 
 |  | 
 | $code.=<<___; | 
 | .align	5 | 
 | .globl	poly1305_emit | 
 | .ent	poly1305_emit | 
 | poly1305_emit: | 
 | 	.frame	$sp,0,$ra | 
 | 	.set	reorder | 
 |  | 
 | 	lw	$tmp4,16($ctx) | 
 | 	lw	$tmp0,0($ctx) | 
 | 	lw	$tmp1,4($ctx) | 
 | 	lw	$tmp2,8($ctx) | 
 | 	lw	$tmp3,12($ctx) | 
 |  | 
 | 	li	$in0,-4			# final reduction | 
 | 	srl	$ctx,$tmp4,2 | 
 | 	and	$in0,$in0,$tmp4 | 
 | 	andi	$tmp4,$tmp4,3 | 
 | 	addu	$ctx,$ctx,$in0 | 
 |  | 
 | 	addu	$tmp0,$tmp0,$ctx | 
 | 	sltu	$ctx,$tmp0,$ctx | 
 | 	 addiu	$in0,$tmp0,5		# compare to modulus | 
 | 	addu	$tmp1,$tmp1,$ctx | 
 | 	 sltiu	$in1,$in0,5 | 
 | 	sltu	$ctx,$tmp1,$ctx | 
 | 	 addu	$in1,$in1,$tmp1 | 
 | 	addu	$tmp2,$tmp2,$ctx | 
 | 	 sltu	$in2,$in1,$tmp1 | 
 | 	sltu	$ctx,$tmp2,$ctx | 
 | 	 addu	$in2,$in2,$tmp2 | 
 | 	addu	$tmp3,$tmp3,$ctx | 
 | 	 sltu	$in3,$in2,$tmp2 | 
 | 	sltu	$ctx,$tmp3,$ctx | 
 | 	 addu	$in3,$in3,$tmp3 | 
 | 	addu	$tmp4,$tmp4,$ctx | 
 | 	 sltu	$ctx,$in3,$tmp3 | 
 | 	 addu	$ctx,$tmp4 | 
 |  | 
 | 	srl	$ctx,2			# see if it carried/borrowed | 
 | 	subu	$ctx,$zero,$ctx | 
 |  | 
 | 	xor	$in0,$tmp0 | 
 | 	xor	$in1,$tmp1 | 
 | 	xor	$in2,$tmp2 | 
 | 	xor	$in3,$tmp3 | 
 | 	and	$in0,$ctx | 
 | 	and	$in1,$ctx | 
 | 	and	$in2,$ctx | 
 | 	and	$in3,$ctx | 
 | 	xor	$in0,$tmp0 | 
 | 	xor	$in1,$tmp1 | 
 | 	xor	$in2,$tmp2 | 
 | 	xor	$in3,$tmp3 | 
 |  | 
 | 	lw	$tmp0,0($nonce)		# load nonce | 
 | 	lw	$tmp1,4($nonce) | 
 | 	lw	$tmp2,8($nonce) | 
 | 	lw	$tmp3,12($nonce) | 
 |  | 
 | 	addu	$in0,$tmp0		# accumulate nonce | 
 | 	sltu	$ctx,$in0,$tmp0 | 
 |  | 
 | 	addu	$in1,$tmp1 | 
 | 	sltu	$tmp1,$in1,$tmp1 | 
 | 	addu	$in1,$ctx | 
 | 	sltu	$ctx,$in1,$ctx | 
 | 	addu	$ctx,$tmp1 | 
 |  | 
 | 	addu	$in2,$tmp2 | 
 | 	sltu	$tmp2,$in2,$tmp2 | 
 | 	addu	$in2,$ctx | 
 | 	sltu	$ctx,$in2,$ctx | 
 | 	addu	$ctx,$tmp2 | 
 |  | 
 | 	addu	$in3,$tmp3 | 
 | 	addu	$in3,$ctx | 
 |  | 
 | 	srl	$tmp0,$in0,8		# write mac value | 
 | 	srl	$tmp1,$in0,16 | 
 | 	srl	$tmp2,$in0,24 | 
 | 	sb	$in0, 0($mac) | 
 | 	sb	$tmp0,1($mac) | 
 | 	srl	$tmp0,$in1,8 | 
 | 	sb	$tmp1,2($mac) | 
 | 	srl	$tmp1,$in1,16 | 
 | 	sb	$tmp2,3($mac) | 
 | 	srl	$tmp2,$in1,24 | 
 | 	sb	$in1, 4($mac) | 
 | 	sb	$tmp0,5($mac) | 
 | 	srl	$tmp0,$in2,8 | 
 | 	sb	$tmp1,6($mac) | 
 | 	srl	$tmp1,$in2,16 | 
 | 	sb	$tmp2,7($mac) | 
 | 	srl	$tmp2,$in2,24 | 
 | 	sb	$in2, 8($mac) | 
 | 	sb	$tmp0,9($mac) | 
 | 	srl	$tmp0,$in3,8 | 
 | 	sb	$tmp1,10($mac) | 
 | 	srl	$tmp1,$in3,16 | 
 | 	sb	$tmp2,11($mac) | 
 | 	srl	$tmp2,$in3,24 | 
 | 	sb	$in3, 12($mac) | 
 | 	sb	$tmp0,13($mac) | 
 | 	sb	$tmp1,14($mac) | 
 | 	sb	$tmp2,15($mac) | 
 |  | 
 | 	jr	$ra | 
 | .end	poly1305_emit | 
 | .rdata | 
 | .asciiz	"Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm" | 
 | .align	2 | 
 | ___ | 
 | } | 
 | }}} | 
 |  | 
 | $output=pop and open STDOUT,">$output"; | 
 | print $code; | 
 | close STDOUT; |