# 2015 Feb 20 version
# Baillie-PSW primality test
# Thanks to some long calculations, we know this works for all 64-bit
# numbers. This code stops seeking quadratic non-residues when the
# number of composites is low.

	.file	"isPrime8.s"
	.text

	.set	NUM,%rdi	# the parameter to isPrime8 comes here

##########
# macros #
##########
	.macro	addmod fromreg,toreg
	add	\fromreg,\toreg
	jnc	1f
	sub	NUM,\toreg
	jmp	2f
1:
	sub	NUM,\toreg
	jnc	2f
	add	NUM,\toreg
2:
	.endm

	.macro	submod fromreg,toreg
	sub	\fromreg,\toreg
	jnc	1f
	add	NUM,\toreg
1:
	.endm

###############
# subroutines #
###############

.Lcoprime:	# set Z flag iff GCD(RAX,NUM) == 1
		# uses RAX,RCX,RDX
	movq	NUM,%rdx
10:
	subq	%rax,%rdx
	je	12f
	jnc	11f
	negq	%rdx
	subq	%rdx,%rax
11:
	bsfq	%rdx,%rcx
	shrq	%cl,%rdx
	jmp	10b
12:
	cmpq	$1,%rax
	ret


###########################################################
# isPrime8: determines whether an 8-byte integer is prime,
#    returns a bool.
#	bool isPrime8(uint8 num);
###########################################################

.globl isPrime8
	.type	isPrime8,@function
isPrime8:

# First step: test numbers below 64 with a bitmask.

	cmpq	$64,NUM
	jae	100f
	xorl	%edx,%edx	# %rdx := 1 << NUM
	incl	%edx
	movb	%dil,%cl
	shlq	%cl,%rdx
	.set	.Lbm1,(1<<2)|(1<<3)|(1<<5)|(1<<7)|(1<<11)|(1<<13)
	.set	.Lbm2,(1<<17)|(1<<19)|(1<<23)|(1<<29)|(1<<31)|(1<<37)
	.set	.Lbm3,(1<<41)|(1<<43)|(1<<47)|(1<<53)|(1<<59)|(1<<61)
	movabs	$.Lbm1|.Lbm2|.Lbm3,%rax
	testq	%rax,%rdx
	jnz	.LreturnTrueA
.LreturnFalseA:
	xorl	%eax,%eax
	ret
.LreturnTrueA:
	xorl	%eax,%eax
	incl	%eax
	ret
100:


# Second step: trial division up to 5.
# There are eight mod-30 classes which might be prime.

	movq	NUM,%rax	# compute N mod 30
	xorl	%edx,%edx
	movl	$30,%ecx
	divq	%rcx
	xorl	%eax,%eax	# compute 1 << (N mod 30)
	incl	%eax
	movl	%edx,%ecx
	shll	%cl,%eax
	.set	.Lbm1,(1<<1)|(1<<11)|(1<<19)|(1<<29)
	.set	.Lbm2,(1<<7)|(1<<13)|(1<<17)|(1<<23)
	testl	$.Lbm1|.Lbm2,%eax
	jz	.LreturnFalseA
		# In the fifth step, we'll want to know whether
		# NUM is one of (7,13,17,23) mod 30.
	testl	$.Lbm2,%eax
	setnz	%r9b


# Third step: trial division up to 101.
# It's faster to do GCDs with products of primes.

	.set	.Ln1,7*11*13*17*19*23*29*31*37*41*43*47*53
	.set	.Ln2,59*61*67*71*73*79*83*89*97*101
	movabs	$.Ln1,%rax
	call	.Lcoprime
	jnz	.LreturnFalseA
	    # Because NUM is >= 64, it can't be a
	    # prime factor of the N1 product.

	    # Now, the number has no factor below 59.
	    # It is either prime, or is >= 59*59.
	cmpq	$59*59,NUM
	jc	.LreturnTrueA	# in case it's an N2 factor
	movabs	$.Ln2,%rax
	call	.Lcoprime
	jnz	.LreturnFalseA


# Fourth step: strong primality test with base 2.
# This exposes almost all composites.

	.set	NUMM1,%r8	# NUM-1
	leaq	-1(NUM),NUMM1

	# factor NUM-1 into an odd part and a power of two
	.set	ODD,%rsi
	movq	NUMM1,ODD
	bsfq	ODD,%rax	# exponent of twopower
	# shift the odd part to the left end
	bsrq	ODD,%rcx
	xorb	$63,%cl		# compute shift amount
	shl	%cl,ODD
	.set	EXPO,%cl	# put twopower here
	movb	%al,EXPO

	# compute Base^ODD modulo NUM
	movl	$2,%edx
	.set	POWER,%rdx	# use remainder register

	addq	ODD,ODD		# already did first bit
	jz	401f
400:
	movq	POWER,%rax	# square POWER
	mulq	POWER
	divq	NUM		# remainder goes to %rdx=POWER
	addq	ODD,ODD		# check next bit
	jnc	400b
	addmod	POWER,POWER	# multiply by Base=2
	testq	ODD,ODD		# loop
	jnz	400b
401:
	cmpq	$1,POWER
	je	403f		# POWER==1: maybe prime
	cmpq	NUMM1,POWER
	je	403f		# POWER==NUM-1: maybe prime

	# square EXPO-1 times
	decb	EXPO
	jz	.LreturnFalseA
402:
	movq	POWER,%rax
	mulq	POWER
	divq	NUM
	cmpq	POWER,NUMM1	# POWER==NUM-1: maybe prime
	je	403f
	cmpq	$1,POWER	# POWER==1: composite
	loopnz	402b
	jmp	.LreturnFalseA	# loop finished: composite
403:


# There are only 31_894_014 composite numbers (below 2^64) which pass
# the strong primality test. Of those, some have a small factor:
# 31_712_357 remain, from 42799 to 18446744066047760377. Of course, the
# insidious Wieferich-prime squares (1093^2 and 3511^2) are there.

# Fifth step: Lucas test with the right discriminant.
# We need a 4n+1 discriminant which is a quadratic non-residue modulo
# NUM. (Thus the Wieferich squares are insidious: for those NUMs, there
# is no such discriminant.)
# Exhaustive testing of the 32M show that one such test with the least
# good discriminant discriminates all composites except for those two.

# To find such a discriminant, use the Jacobi symbol, precomputed in the
# .LnonQRvectors, below.
# This works even when NUM is composite because we seek non-residues.

# One needn't seek a discriminant until the bitter end.
# Try the primes from 5 to 89.
# If none of those work, only thirteen composites remain to be checked
# individually, in the sixth step.

	# Long ago, we tested whether NUM == (7,13,17,23) mod 30.
	# If so, 5 is the right discriminant.
	.set	NqrPtr,%r8
	leaq	.LnonQRvectors(%rip),NqrPtr
	testb	%r9b,%r9b
	jnz	560f

	# Five is a wrong discriminant: find a good one.
551:
	addq	$.LNqrSize,NqrPtr
	movsxb	(NqrPtr),%rcx
	testb	%cl,%cl		# zero means we ran out
	jz	.LsixthStep
	jns	552f
	negq	%rcx		# absolute value
552:
	mov	NUM,%rax	# check bit-vector
	xorl	%edx,%edx
	divq	%rcx		# %rdx gets remainder, bit-index
	btw	%dx,1(NqrPtr)
	jnc	551b

560:	# make Lucas sequence coefficients: P=1, Q=(discr-P)/4
	.set	LUCQ,%rsi
	movsxb	(NqrPtr),LUCQ	# Q = (discr-P)/4
	decq	LUCQ
	sarq	$2,LUCQ
	jns	561f
	addq	NUM,LUCQ	# make Q positive mod NUM
561:
	# Compute U[NUM],U[NUM+1] mod NUM
	# (We need only U[NUM+1], but must compute both to get either.)
	.set	NUMBIT,%r8
	bsrq	NUM,%rcx	# put highest set bit into NUMBIT
	xorq	NUMBIT,NUMBIT
	incq	NUMBIT
	shlq	%cl,NUMBIT
	shrq	NUMBIT		# first bit is always 1: don't test it

	.set	U2Np1,%r9
	.set	LUCUlo,%r10
	.set	LUCUhi,%r11

	# make sequence values U[1] U[2]
	# (not U[0],U[1]: we ignore the first bit)
	xorq	LUCUlo,LUCUlo		# U[1]=1
	incq	LUCUlo
	movq	LUCUlo,LUCUhi		# U[2]=1

570:	# for each remaining bit in NUM

	# double index: U[x],U[x+1] -> U[2x],U[2x+1]
	movq	LUCUlo,%rax	# U[2n+1] = U[n+1]^2 + Q*U[n]^2
	mulq	LUCUlo
	divq	NUM
	movq	%rdx,%rax
	mulq	LUCQ
	divq	NUM
	movq	%rdx,U2Np1

	movq	LUCUhi,%rax
	mulq	LUCUhi
	divq	NUM
	addmod	%rdx,U2Np1

	movq	LUCUhi,%rax	# U[2n] = U[n]*(2*U[n+1]-U[n])
	addmod	%rax,%rax
	submod	LUCUlo,%rax
	mulq	LUCUlo
	divq	NUM
	movq	%rdx,LUCUlo
	movq	U2Np1,LUCUhi

	testq	NUMBIT,NUM	# check next bit
	jz	580f
571:
	# increment index: U[x],U[x+1] -> U[x+1],U[x+2]
	movq	LUCUlo,%rax	# U[n+2] = U[n+1] + Q*U[n]
	mulq	LUCQ
	addq	LUCUhi,%rax
	adcq	$0,%rdx
	divq	NUM
	movq	LUCUhi,LUCUlo
	movq	%rdx,LUCUhi

580:	# end loop
	shrq	NUMBIT
	jnz	570b

	test	LUCUhi,LUCUhi	# composite if non-zero
	jnz	.LreturnFalseB

.LreturnTrueB:
	xorl	%eax,%eax
	incl	%eax
	ret
.LreturnFalseB:
	xorl	%eax,%eax
	ret


# Sixth step: check for the 13 composites which aren't exposed in the
# other steps.
##	    #
##	1194649				1093*1093
##	    12327121			3511*3511
##    172683484287699841		239919073*719757217
##	    #
##	395009109077493751		314248751*1256995001
##	    693958851755989121		480956981*1442870941
## 1329453744705764701			208534901*6375209801
##	    2735096497083370501		1045963001*2614907501
##	6092870717272192309		281908189*21612961081
##	    6414442613241233671		895435831*7163486641
##    9921760091402570401		671557217*14774258753
##	    11708069165918597341	57267541*204445117801
##	15263286937772200441		1251185569*12199059289
##	    15398785001660949001	1801*12277801*696389401

.LsixthStep:
	leaq	.LsixthTable(%rip),%rax
	cmpq	8*5(%rax),NUM
	ja	602f
	je	.LreturnFalseB
	cmpq	8*2(%rax),NUM
	ja	601f
	je	.LreturnFalseB
	cmpq	8*0(%rax),NUM
	jb	.LreturnTrueB
	je	.LreturnFalseB
	cmpq	8*1(%rax),NUM
	jne	.LreturnTrueB
	jmp	.LreturnFalseB
601:
	cmpq	8*3(%rax),NUM
	jb	.LreturnTrueB
	je	.LreturnFalseB
	cmpq	8*4(%rax),NUM
	jne	.LreturnTrueB
	jmp	.LreturnFalseB
602:
	cmpq	8*9(%rax),NUM
	ja	604f
	je	.LreturnFalseB
	cmpq	8*7(%rax),NUM
	ja	603f
	je	.LreturnFalseB
	cmpq	8*6(%rax),NUM
	jne	.LreturnTrueB
	jmp	.LreturnFalseB
603:
	cmpq	8*8(%rax),NUM
	jne	.LreturnTrueB
	jmp	.LreturnFalseB
604:
	cmpq	8*11(%rax),NUM
	ja	605f
	je	.LreturnFalseB
	cmpq	8*10(%rax),NUM
	jne	.LreturnTrueB
	jmp	.LreturnFalseB
605:
	cmpq	8*12(%rax),NUM
	jne	.LreturnTrueB

	xorl	%eax,%eax	# return false
	ret

	.align	64,0
	# So that data and code aren't in the same cache line.
	# Do whatever works for your processor.

.LsixthTable:
	.quad	1194649
	.quad	12327121
	.quad	172683484287699841
	.quad	395009109077493751
	.quad	693958851755989121
	.quad	1329453744705764701
	.quad	2735096497083370501
	.quad	6092870717272192309
	.quad	6414442613241233671
	.quad	9921760091402570401
	.quad	11708069165918597341
	.quad	15263286937772200441
	.quad	15398785001660949001

# A row of this table has P (a 3-mod-4 prime) and a bit-vector of P
# values. The K'th bit-value indicates whether an _odd_ number congruent
# to K mod P is a quadratic non-residue mod P.
.LnonQRvectors:
	.byte	  5,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.Lnqr1: .set	.LNqrSize,.Lnqr1-.LnonQRvectors
	.byte	 -7,0x68,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	-11,0xc4,0x05,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	 13,0xe4,0x09,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	 17,0xe8,0x5c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	-19,0x0c,0xf5,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	-23,0xa0,0xcc,0x7a,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	 29,0x0c,0xdd,0x2e,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	-31,0x48,0xb8,0xe2,0x6d,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	 37,0x64,0xe1,0xde,0xa1,0x09,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	 41,0xc8,0xf8,0x4a,0x7d,0x4c,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	-43,0xac,0x11,0x5c,0x7c,0xa7,0x04,0x00,0x00,0x00,0x00,0x00
	.byte	-47,0x20,0xac,0xd8,0xe4,0xca,0x7b,0x00,0x00,0x00,0x00,0x00
	.byte	 53,0x2c,0x51,0xfc,0xcc,0x8f,0x22,0x0d,0x00,0x00,0x00,0x00
	.byte	-59,0x44,0x6d,0x84,0xc1,0xe7,0x9d,0xd4,0x05,0x00,0x00,0x00
	.byte	 61,0xc4,0x0d,0xa6,0xf5,0x6b,0x19,0xec,0x08,0x00,0x00,0x00
	.byte	-67,0xac,0x39,0x14,0xd8,0x45,0x7e,0x3d,0xa6,0x04,0x00,0x00
	.byte	-71,0x80,0x68,0xe2,0x94,0x8e,0xd6,0xb8,0xe9,0x7e,0x00,0x00
	.byte	 73,0xa0,0xec,0x72,0xf4,0x86,0xbd,0x38,0xdd,0x14,0x00,0x00
	.byte	-79,0xc8,0xd0,0x02,0x79,0xae,0x8a,0x61,0xbf,0xf4,0x6c,0x00
	.byte	-83,0x64,0xe1,0x5c,0x01,0x8d,0xec,0xf4,0x57,0x8c,0x97,0x05
	.byte	 89,0xc8,0xf0,0x88,0xfd,0x6a,0x4a,0x59,0xfd,0x46,0x3c,0x4c
	.byte	0 # end