Diffstat (limited to 'arch/arm64/crypto/crct10dif-ce-core.S')
-rw-r--r--  arch/arm64/crypto/crct10dif-ce-core.S  513
1 file changed, 234 insertions, 279 deletions
diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S
index 9e82e8e8ed05..e545b42e6a46 100644
--- a/arch/arm64/crypto/crct10dif-ce-core.S
+++ b/arch/arm64/crypto/crct10dif-ce-core.S
@@ -2,12 +2,14 @@
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+// Copyright (C) 2019 Google LLC <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//
+// Derived from the x86 version:
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
@@ -54,19 +56,11 @@
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
-// Function API:
-// UINT16 crc_t10dif_pcl(
-// UINT16 init_crc, //initial CRC value, 16 bits
-// const unsigned char *buf, //buffer pointer to calculate CRC on
-// UINT64 len //buffer length in bytes (64-bit data)
-// );
-//
// Reference paper titled "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
-//
#include <linux/linkage.h>
#include <asm/assembler.h>
@@ -74,14 +68,14 @@
.text
.cpu generic+crypto
- arg1_low32 .req w19
- arg2 .req x20
- arg3 .req x21
+ init_crc .req w19
+ buf .req x20
+ len .req x21
+ fold_consts_ptr .req x22
- vzr .req v13
+ fold_consts .req v10
ad .req v14
- bd .req v10
k00_16 .req v15
k32_48 .req v16
@@ -143,11 +137,11 @@ __pmull_p8_core:
ext t5.8b, ad.8b, ad.8b, #2 // A2
ext t6.8b, ad.8b, ad.8b, #3 // A3
- pmull t4.8h, t4.8b, bd.8b // F = A1*B
+ pmull t4.8h, t4.8b, fold_consts.8b // F = A1*B
pmull t8.8h, ad.8b, bd1.8b // E = A*B1
- pmull t5.8h, t5.8b, bd.8b // H = A2*B
+ pmull t5.8h, t5.8b, fold_consts.8b // H = A2*B
pmull t7.8h, ad.8b, bd2.8b // G = A*B2
- pmull t6.8h, t6.8b, bd.8b // J = A3*B
+ pmull t6.8h, t6.8b, fold_consts.8b // J = A3*B
pmull t9.8h, ad.8b, bd3.8b // I = A*B3
pmull t3.8h, ad.8b, bd4.8b // K = A*B4
b 0f
@@ -157,11 +151,11 @@ __pmull_p8_core:
tbl t5.16b, {ad.16b}, perm2.16b // A2
tbl t6.16b, {ad.16b}, perm3.16b // A3
- pmull2 t4.8h, t4.16b, bd.16b // F = A1*B
+ pmull2 t4.8h, t4.16b, fold_consts.16b // F = A1*B
pmull2 t8.8h, ad.16b, bd1.16b // E = A*B1
- pmull2 t5.8h, t5.16b, bd.16b // H = A2*B
+ pmull2 t5.8h, t5.16b, fold_consts.16b // H = A2*B
pmull2 t7.8h, ad.16b, bd2.16b // G = A*B2
- pmull2 t6.8h, t6.16b, bd.16b // J = A3*B
+ pmull2 t6.8h, t6.16b, fold_consts.16b // J = A3*B
pmull2 t9.8h, ad.16b, bd3.16b // I = A*B3
pmull2 t3.8h, ad.16b, bd4.16b // K = A*B4
@@ -203,14 +197,14 @@ __pmull_p8_core:
ENDPROC(__pmull_p8_core)
.macro __pmull_p8, rq, ad, bd, i
- .ifnc \bd, v10
+ .ifnc \bd, fold_consts
.err
.endif
mov ad.16b, \ad\().16b
.ifb \i
- pmull \rq\().8h, \ad\().8b, bd.8b // D = A*B
+ pmull \rq\().8h, \ad\().8b, \bd\().8b // D = A*B
.else
- pmull2 \rq\().8h, \ad\().16b, bd.16b // D = A*B
+ pmull2 \rq\().8h, \ad\().16b, \bd\().16b // D = A*B
.endif
bl .L__pmull_p8_core\i
@@ -219,17 +213,19 @@ ENDPROC(__pmull_p8_core)
eor \rq\().16b, \rq\().16b, t6.16b
.endm
- .macro fold64, p, reg1, reg2
- ldp q11, q12, [arg2], #0x20
+ // Fold reg1, reg2 into the next 32 data bytes, storing the result back
+ // into reg1, reg2.
+ .macro fold_32_bytes, p, reg1, reg2
+ ldp q11, q12, [buf], #0x20
- __pmull_\p v8, \reg1, v10, 2
- __pmull_\p \reg1, \reg1, v10
+ __pmull_\p v8, \reg1, fold_consts, 2
+ __pmull_\p \reg1, \reg1, fold_consts
CPU_LE( rev64 v11.16b, v11.16b )
CPU_LE( rev64 v12.16b, v12.16b )
- __pmull_\p v9, \reg2, v10, 2
- __pmull_\p \reg2, \reg2, v10
+ __pmull_\p v9, \reg2, fold_consts, 2
+ __pmull_\p \reg2, \reg2, fold_consts
CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
@@ -240,15 +236,16 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
eor \reg2\().16b, \reg2\().16b, v12.16b
.endm
- .macro fold16, p, reg, rk
- __pmull_\p v8, \reg, v10
- __pmull_\p \reg, \reg, v10, 2
- .ifnb \rk
- ldr_l q10, \rk, x8
- __pmull_pre_\p v10
+ // Fold src_reg into dst_reg, optionally loading the next fold constants
+ .macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts
+ __pmull_\p v8, \src_reg, fold_consts
+ __pmull_\p \src_reg, \src_reg, fold_consts, 2
+ .ifnb \load_next_consts
+ ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+ __pmull_pre_\p fold_consts
.endif
- eor v7.16b, v7.16b, v8.16b
- eor v7.16b, v7.16b, \reg\().16b
+ eor \dst_reg\().16b, \dst_reg\().16b, v8.16b
+ eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
.endm
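
The fold_32_bytes and fold_16_bytes macros above rely on the same GF(2) identity: for an accumulator A(x) = A_hi(x)*x^64 + A_lo(x), multiplying by x^(8*D) and XORing in the data block D bytes ahead keeps the running value congruent to the message modulo G(x). A minimal C model of one fold step (a sketch only: clmul64 and fold_step are illustrative names, and bit i of an integer stands for the coefficient of x^i, unlike the byte-reflected register layout the assembly uses):

#include <stdint.h>

struct poly128 { uint64_t lo, hi; };	/* bit i of lo/hi = coefficient of x^i / x^(64+i) */

/* Software carry-less multiply: 64x64 -> 128-bit polynomial product. */
static struct poly128 clmul64(uint64_t a, uint64_t b)
{
	struct poly128 r = { 0, 0 };

	for (int i = 0; i < 64; i++) {
		if (b & (1ULL << i)) {
			r.lo ^= a << i;
			if (i)
				r.hi ^= a >> (64 - i);
		}
	}
	return r;
}

/*
 * One fold step: returns a value congruent to acc * x^(8*D) + data (mod G(x)),
 * where k_lo = x^(8*D) mod G(x) and k_hi = x^(8*D + 64) mod G(x) are the two
 * halves of one constant pair from the .Lfold_across_*_bytes_consts table.
 */
static struct poly128 fold_step(struct poly128 acc, struct poly128 data,
				uint64_t k_hi, uint64_t k_lo)
{
	struct poly128 a = clmul64(acc.hi, k_hi);
	struct poly128 b = clmul64(acc.lo, k_lo);

	return (struct poly128){ a.lo ^ b.lo ^ data.lo, a.hi ^ b.hi ^ data.hi };
}

With the fold-across-16-bytes constant pair (k_lo = x^128 mod G(x), k_hi = x^(128+64) mod G(x)), fold_step corresponds to one iteration of the 16-byte folding loop below.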
.macro __pmull_p64, rd, rn, rm, n
@@ -260,40 +257,27 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
.endm
.macro crc_t10dif_pmull, p
- frame_push 3, 128
+ frame_push 4, 128
- mov arg1_low32, w0
- mov arg2, x1
- mov arg3, x2
-
- movi vzr.16b, #0 // init zero register
+ mov init_crc, w0
+ mov buf, x1
+ mov len, x2
__pmull_init_\p
- // adjust the 16-bit initial_crc value, scale it to 32 bits
- lsl arg1_low32, arg1_low32, #16
-
- // check if smaller than 256
- cmp arg3, #256
-
- // for sizes less than 128, we can't fold 64B at a time...
- b.lt .L_less_than_128_\@
+ // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
+ cmp len, #256
+ b.lt .Lless_than_256_bytes_\@
- // load the initial crc value
- // crc value does not need to be byte-reflected, but it needs
- // to be moved to the high part of the register.
- // because data will be byte-reflected and will align with
- // initial crc at correct place.
- movi v10.16b, #0
- mov v10.s[3], arg1_low32 // initial crc
-
- // receive the initial 64B data, xor the initial crc value
- ldp q0, q1, [arg2]
- ldp q2, q3, [arg2, #0x20]
- ldp q4, q5, [arg2, #0x40]
- ldp q6, q7, [arg2, #0x60]
- add arg2, arg2, #0x80
+ adr_l fold_consts_ptr, .Lfold_across_128_bytes_consts
+ // Load the first 128 data bytes. Byte swapping is necessary to make
+ // the bit order match the polynomial coefficient order.
+ ldp q0, q1, [buf]
+ ldp q2, q3, [buf, #0x20]
+ ldp q4, q5, [buf, #0x40]
+ ldp q6, q7, [buf, #0x60]
+ add buf, buf, #0x80
CPU_LE( rev64 v0.16b, v0.16b )
CPU_LE( rev64 v1.16b, v1.16b )
CPU_LE( rev64 v2.16b, v2.16b )
@@ -302,7 +286,6 @@ CPU_LE( rev64 v4.16b, v4.16b )
CPU_LE( rev64 v5.16b, v5.16b )
CPU_LE( rev64 v6.16b, v6.16b )
CPU_LE( rev64 v7.16b, v7.16b )
-
CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 )
@@ -312,36 +295,29 @@ CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 )
CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 )
CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
- // XOR the initial_crc value
- eor v0.16b, v0.16b, v10.16b
-
- ldr_l q10, rk3, x8 // xmm10 has rk3 and rk4
- // type of pmull instruction
- // will determine which constant to use
- __pmull_pre_\p v10
+ // XOR the first 16 data *bits* with the initial CRC value.
+ movi v8.16b, #0
+ mov v8.h[7], init_crc
+ eor v0.16b, v0.16b, v8.16b
- //
- // we subtract 256 instead of 128 to save one instruction from the loop
- //
- sub arg3, arg3, #256
+ // Load the constants for folding across 128 bytes.
+ ld1 {fold_consts.2d}, [fold_consts_ptr]
+ __pmull_pre_\p fold_consts
- // at this section of the code, there is 64*x+y (0<=y<64) bytes of
- // buffer. The _fold_64_B_loop will fold 64B at a time
- // until we have 64+y Bytes of buffer
+ // Subtract 128 for the 128 data bytes just consumed. Subtract another
+ // 128 to simplify the termination condition of the following loop.
+ sub len, len, #256
- // fold 64B at a time. This section of the code folds 4 vector
- // registers in parallel
-.L_fold_64_B_loop_\@:
+ // While >= 128 data bytes remain (not counting v0-v7), fold the 128
+ // bytes v0-v7 into them, storing the result back into v0-v7.
+.Lfold_128_bytes_loop_\@:
+ fold_32_bytes \p, v0, v1
+ fold_32_bytes \p, v2, v3
+ fold_32_bytes \p, v4, v5
+ fold_32_bytes \p, v6, v7
- fold64 \p, v0, v1
- fold64 \p, v2, v3
- fold64 \p, v4, v5
- fold64 \p, v6, v7
-
- subs arg3, arg3, #128
-
- // check if there is another 64B in the buffer to be able to fold
- b.lt .L_fold_64_B_end_\@
+ subs len, len, #128
+ b.lt .Lfold_128_bytes_loop_done_\@
if_will_cond_yield_neon
stp q0, q1, [sp, #.Lframe_local_offset]
@@ -353,228 +329,207 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
ldp q2, q3, [sp, #.Lframe_local_offset + 32]
ldp q4, q5, [sp, #.Lframe_local_offset + 64]
ldp q6, q7, [sp, #.Lframe_local_offset + 96]
- ldr_l q10, rk3, x8
- movi vzr.16b, #0 // init zero register
+ ld1 {fold_consts.2d}, [fold_consts_ptr]
__pmull_init_\p
- __pmull_pre_\p v10
+ __pmull_pre_\p fold_consts
endif_yield_neon
- b .L_fold_64_B_loop_\@
-
-.L_fold_64_B_end_\@:
- // at this point, the buffer pointer is pointing at the last y Bytes
- // of the buffer the 64B of folded data is in 4 of the vector
- // registers: v0, v1, v2, v3
-
- // fold the 8 vector registers to 1 vector register with different
- // constants
-
- ldr_l q10, rk9, x8
- __pmull_pre_\p v10
-
- fold16 \p, v0, rk11
- fold16 \p, v1, rk13
- fold16 \p, v2, rk15
- fold16 \p, v3, rk17
- fold16 \p, v4, rk19
- fold16 \p, v5, rk1
- fold16 \p, v6
-
- // instead of 64, we add 48 to the loop counter to save 1 instruction
- // from the loop instead of a cmp instruction, we use the negative
- // flag with the jl instruction
- adds arg3, arg3, #(128-16)
- b.lt .L_final_reduction_for_128_\@
-
- // now we have 16+y bytes left to reduce. 16 Bytes is in register v7
- // and the rest is in memory. We can fold 16 bytes at a time if y>=16
- // continue folding 16B at a time
-
-.L_16B_reduction_loop_\@:
- __pmull_\p v8, v7, v10
- __pmull_\p v7, v7, v10, 2
+ b .Lfold_128_bytes_loop_\@
+
+.Lfold_128_bytes_loop_done_\@:
+
+ // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
+
+ // Fold across 64 bytes.
+ add fold_consts_ptr, fold_consts_ptr, #16
+ ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+ __pmull_pre_\p fold_consts
+ fold_16_bytes \p, v0, v4
+ fold_16_bytes \p, v1, v5
+ fold_16_bytes \p, v2, v6
+ fold_16_bytes \p, v3, v7, 1
+ // Fold across 32 bytes.
+ fold_16_bytes \p, v4, v6
+ fold_16_bytes \p, v5, v7, 1
+ // Fold across 16 bytes.
+ fold_16_bytes \p, v6, v7
+
+ // Add 128 to get the correct number of data bytes remaining in 0...127
+ // (not counting v7), following the previous extra subtraction by 128.
+ // Then subtract 16 to simplify the termination condition of the
+ // following loop.
+ adds len, len, #(128-16)
+
+ // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
+ // into them, storing the result back into v7.
+ b.lt .Lfold_16_bytes_loop_done_\@
+.Lfold_16_bytes_loop_\@:
+ __pmull_\p v8, v7, fold_consts
+ __pmull_\p v7, v7, fold_consts, 2
eor v7.16b, v7.16b, v8.16b
-
- ldr q0, [arg2], #16
+ ldr q0, [buf], #16
CPU_LE( rev64 v0.16b, v0.16b )
CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
eor v7.16b, v7.16b, v0.16b
- subs arg3, arg3, #16
-
- // instead of a cmp instruction, we utilize the flags with the
- // jge instruction equivalent of: cmp arg3, 16-16
- // check if there is any more 16B in the buffer to be able to fold
- b.ge .L_16B_reduction_loop_\@
-
- // now we have 16+z bytes left to reduce, where 0<= z < 16.
- // first, we reduce the data in the xmm7 register
-
-.L_final_reduction_for_128_\@:
- // check if any more data to fold. If not, compute the CRC of
- // the final 128 bits
- adds arg3, arg3, #16
- b.eq .L_128_done_\@
-
- // here we are getting data that is less than 16 bytes.
- // since we know that there was data before the pointer, we can
- // offset the input pointer before the actual point, to receive
- // exactly 16 bytes. after that the registers need to be adjusted.
-.L_get_last_two_regs_\@:
- add arg2, arg2, arg3
- ldr q1, [arg2, #-16]
-CPU_LE( rev64 v1.16b, v1.16b )
-CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
-
- // get rid of the extra data that was loaded before
- // load the shift constant
- adr_l x4, tbl_shf_table + 16
- sub x4, x4, arg3
- ld1 {v0.16b}, [x4]
-
- // shift v2 to the left by arg3 bytes
- tbl v2.16b, {v7.16b}, v0.16b
-
- // shift v7 to the right by 16-arg3 bytes
- movi v9.16b, #0x80
- eor v0.16b, v0.16b, v9.16b
- tbl v7.16b, {v7.16b}, v0.16b
-
- // blend
- sshr v0.16b, v0.16b, #7 // convert to 8-bit mask
- bsl v0.16b, v2.16b, v1.16b
-
- // fold 16 Bytes
- __pmull_\p v8, v7, v10
- __pmull_\p v7, v7, v10, 2
- eor v7.16b, v7.16b, v8.16b
- eor v7.16b, v7.16b, v0.16b
+ subs len, len, #16
+ b.ge .Lfold_16_bytes_loop_\@
+
+.Lfold_16_bytes_loop_done_\@:
+ // Add 16 to get the correct number of data bytes remaining in 0...15
+ // (not counting v7), following the previous extra subtraction by 16.
+ adds len, len, #16
+ b.eq .Lreduce_final_16_bytes_\@
+
+.Lhandle_partial_segment_\@:
+ // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
+ // 16 bytes are in v7 and the rest are the remaining data in 'buf'. To
+ // do this without needing a fold constant for each possible 'len',
+ // redivide the bytes into a first chunk of 'len' bytes and a second
+ // chunk of 16 bytes, then fold the first chunk into the second.
+
+ // v0 = last 16 original data bytes
+ add buf, buf, len
+ ldr q0, [buf, #-16]
+CPU_LE( rev64 v0.16b, v0.16b )
+CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
-.L_128_done_\@:
- // compute crc of a 128-bit value
- ldr_l q10, rk5, x8 // rk5 and rk6 in xmm10
- __pmull_pre_\p v10
+ // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
+ adr_l x4, .Lbyteshift_table + 16
+ sub x4, x4, len
+ ld1 {v2.16b}, [x4]
+ tbl v1.16b, {v7.16b}, v2.16b
- // 64b fold
- ext v0.16b, vzr.16b, v7.16b, #8
- mov v7.d[0], v7.d[1]
- __pmull_\p v7, v7, v10
- eor v7.16b, v7.16b, v0.16b
+ // v3 = first chunk: v7 right-shifted by '16-len' bytes.
+ movi v3.16b, #0x80
+ eor v2.16b, v2.16b, v3.16b
+ tbl v3.16b, {v7.16b}, v2.16b
- // 32b fold
- ext v0.16b, v7.16b, vzr.16b, #4
- mov v7.s[3], vzr.s[0]
- __pmull_\p v0, v0, v10, 2
- eor v7.16b, v7.16b, v0.16b
+ // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
+ sshr v2.16b, v2.16b, #7
- // barrett reduction
- ldr_l q10, rk7, x8
- __pmull_pre_\p v10
- mov v0.d[0], v7.d[1]
+ // v2 = second chunk: 'len' bytes from v0 (low-order bytes),
+ // then '16-len' bytes from v1 (high-order bytes).
+ bsl v2.16b, v1.16b, v0.16b
- __pmull_\p v0, v0, v10
- ext v0.16b, vzr.16b, v0.16b, #12
- __pmull_\p v0, v0, v10, 2
- ext v0.16b, vzr.16b, v0.16b, #12
+ // Fold the first chunk into the second chunk, storing the result in v7.
+ __pmull_\p v0, v3, fold_consts
+ __pmull_\p v7, v3, fold_consts, 2
eor v7.16b, v7.16b, v0.16b
- mov w0, v7.s[1]
-
-.L_cleanup_\@:
- // scale the result back to 16 bits
- lsr x0, x0, #16
+ eor v7.16b, v7.16b, v2.16b
+
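
The redivision described in .Lhandle_partial_segment can also be viewed as plain byte-string surgery. A C sketch (redivide_partial_segment is an illustrative name; bytes are shown in message order, ignoring the in-register byte reversal the assembly performs):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/*
 * The remaining message bytes are acc16[0..15] (the accumulator) followed by
 * tail[0..n-1], 1 <= n <= 15.  Rather than keeping one fold constant per
 * possible n, re-split them into a first chunk of n bytes and a second chunk
 * of 16 bytes:
 *
 *   first  = acc16[0..n-1]                  (leading n bytes)
 *   second = acc16[n..15] || tail[0..n-1]   (trailing 16 bytes)
 *
 * 'first' then sits exactly 128 bits above 'second' in the message, so one
 * more fold with the across-16-bytes constants reduces it into 'second'.
 */
static void redivide_partial_segment(const uint8_t acc16[16],
				     const uint8_t *tail, size_t n,
				     uint8_t *first /* n bytes */,
				     uint8_t second[16])
{
	memcpy(first, acc16, n);
	memcpy(second, acc16 + n, 16 - n);
	memcpy(second + (16 - n), tail, n);
}
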
+.Lreduce_final_16_bytes_\@:
+ // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
+
+ movi v2.16b, #0 // init zero register
+
+ // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+ ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+ __pmull_pre_\p fold_consts
+
+ // Fold the high 64 bits into the low 64 bits, while also multiplying by
+ // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
+ // whose low 48 bits are 0.
+ ext v0.16b, v2.16b, v7.16b, #8
+ __pmull_\p v7, v7, fold_consts, 2 // high bits * x^48 * (x^80 mod G(x))
+ eor v0.16b, v0.16b, v7.16b // + low bits * x^64
+
+ // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
+ // value congruent to x^64 * M(x) and whose low 48 bits are 0.
+ ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits
+ mov v0.s[3], v2.s[0] // zero high 32 bits
+ __pmull_\p v1, v1, fold_consts // high 32 bits * x^48 * (x^48 mod G(x))
+ eor v0.16b, v0.16b, v1.16b // + low bits
+
+ // Load G(x) and floor(x^48 / G(x)).
+ ld1 {fold_consts.2d}, [fold_consts_ptr]
+ __pmull_pre_\p fold_consts
+
+ // Use Barrett reduction to compute the final CRC value.
+ __pmull_\p v1, v0, fold_consts, 2 // high 32 bits * floor(x^48 / G(x))
+ ushr v1.2d, v1.2d, #32 // /= x^32
+ __pmull_\p v1, v1, fold_consts // *= G(x)
+ ushr v0.2d, v0.2d, #48
+ eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits
+ // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
+
+ umov w0, v0.h[0]
frame_pop
ret
-.L_less_than_128_\@:
- cbz arg3, .L_cleanup_\@
+.Lless_than_256_bytes_\@:
+ // Checksumming a buffer of length 16...255 bytes
- movi v0.16b, #0
- mov v0.s[3], arg1_low32 // get the initial crc value
+ adr_l fold_consts_ptr, .Lfold_across_16_bytes_consts
- ldr q7, [arg2], #0x10
+ // Load the first 16 data bytes.
+ ldr q7, [buf], #0x10
CPU_LE( rev64 v7.16b, v7.16b )
CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
- eor v7.16b, v7.16b, v0.16b // xor the initial crc value
-
- cmp arg3, #16
- b.eq .L_128_done_\@ // exactly 16 left
- b.lt .L_less_than_16_left_\@
-
- ldr_l q10, rk1, x8 // rk1 and rk2 in xmm10
- __pmull_pre_\p v10
-
- // update the counter. subtract 32 instead of 16 to save one
- // instruction from the loop
- subs arg3, arg3, #32
- b.ge .L_16B_reduction_loop_\@
-
- add arg3, arg3, #16
- b .L_get_last_two_regs_\@
-
-.L_less_than_16_left_\@:
- // shl r9, 4
- adr_l x0, tbl_shf_table + 16
- sub x0, x0, arg3
- ld1 {v0.16b}, [x0]
- movi v9.16b, #0x80
- eor v0.16b, v0.16b, v9.16b
- tbl v7.16b, {v7.16b}, v0.16b
- b .L_128_done_\@
+
+ // XOR the first 16 data *bits* with the initial CRC value.
+ movi v0.16b, #0
+ mov v0.h[7], init_crc
+ eor v7.16b, v7.16b, v0.16b
+
+ // Load the fold-across-16-bytes constants.
+ ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+ __pmull_pre_\p fold_consts
+
+ cmp len, #16
+ b.eq .Lreduce_final_16_bytes_\@ // len == 16
+ subs len, len, #32
+ b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255
+ add len, len, #16
+ b .Lhandle_partial_segment_\@ // 17 <= len <= 31
.endm
+//
+// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
ENTRY(crc_t10dif_pmull_p8)
crc_t10dif_pmull p8
ENDPROC(crc_t10dif_pmull_p8)
.align 5
+//
+// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
ENTRY(crc_t10dif_pmull_p64)
crc_t10dif_pmull p64
ENDPROC(crc_t10dif_pmull_p64)
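
Both entry points assume len >= 16, so callers need a separate path for shorter buffers. For testing either routine, a self-contained bitwise reference is enough (a sketch, not kernel code; crc_t10dif_ref is an illustrative name). Since a zero-initialised, MSB-first CRC of a byte string S computes (S(x) * x^16) mod G(x), the same function applied to the 16 message-order bytes of M(x) gives the value that .Lreduce_final_16_bytes produces:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Plain bitwise CRC-T10DIF: polynomial 0x18bb7, MSB-first, init/xorout 0. */
static uint16_t crc_t10dif_ref(uint16_t crc, const uint8_t *buf, size_t len)
{
	for (size_t i = 0; i < len; i++) {
		crc ^= (uint16_t)buf[i] << 8;
		for (int b = 0; b < 8; b++)
			crc = (crc & 0x8000) ? (crc << 1) ^ 0x8bb7 : crc << 1;
	}
	return crc;
}

int main(void)
{
	/* Standard check value for this CRC: "123456789" -> 0xd0db. */
	printf("0x%04x\n", crc_t10dif_ref(0, (const uint8_t *)"123456789", 9));
	return 0;
}
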
-// precomputed constants
-// these constants are precomputed from the poly:
-// 0x8bb70000 (0x8bb7 scaled to 32 bits)
.section ".rodata", "a"
.align 4
-// Q = 0x18BB70000
-// rk1 = 2^(32*3) mod Q << 32
-// rk2 = 2^(32*5) mod Q << 32
-// rk3 = 2^(32*15) mod Q << 32
-// rk4 = 2^(32*17) mod Q << 32
-// rk5 = 2^(32*3) mod Q << 32
-// rk6 = 2^(32*2) mod Q << 32
-// rk7 = floor(2^64/Q)
-// rk8 = Q
-
-rk1: .octa 0x06df0000000000002d56000000000000
-rk3: .octa 0x7cf50000000000009d9d000000000000
-rk5: .octa 0x13680000000000002d56000000000000
-rk7: .octa 0x000000018bb7000000000001f65a57f8
-rk9: .octa 0xbfd6000000000000ceae000000000000
-rk11: .octa 0x713c0000000000001e16000000000000
-rk13: .octa 0x80a6000000000000f7f9000000000000
-rk15: .octa 0xe658000000000000044c000000000000
-rk17: .octa 0xa497000000000000ad18000000000000
-rk19: .octa 0xe7b50000000000006ee3000000000000
-
-tbl_shf_table:
-// use these values for shift constants for the tbl/tbx instruction
-// different alignments result in values as shown:
-// DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
-// DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
-// DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
-// DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
-// DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
-// DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
-// DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
-// DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
-// DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
-// DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
-// DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
-// DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
-// DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
-// DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
-// DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
+// Fold constants precomputed from the polynomial 0x18bb7
+// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+.Lfold_across_128_bytes_consts:
+ .quad 0x0000000000006123 // x^(8*128) mod G(x)
+ .quad 0x0000000000002295 // x^(8*128+64) mod G(x)
+// .Lfold_across_64_bytes_consts:
+ .quad 0x0000000000001069 // x^(4*128) mod G(x)
+ .quad 0x000000000000dd31 // x^(4*128+64) mod G(x)
+// .Lfold_across_32_bytes_consts:
+ .quad 0x000000000000857d // x^(2*128) mod G(x)
+ .quad 0x0000000000007acc // x^(2*128+64) mod G(x)
+.Lfold_across_16_bytes_consts:
+ .quad 0x000000000000a010 // x^(1*128) mod G(x)
+ .quad 0x0000000000001faa // x^(1*128+64) mod G(x)
+// .Lfinal_fold_consts:
+ .quad 0x1368000000000000 // x^48 * (x^48 mod G(x))
+ .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x))
+// .Lbarrett_reduction_consts:
+ .quad 0x0000000000018bb7 // G(x)
+ .quad 0x00000001f65a57f8 // floor(x^48 / G(x))
+
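
The x^N mod G(x) entries above can be regenerated (or cross-checked) with a few lines of C; a sketch, with xpow_mod_g as an illustrative helper:

#include <stdint.h>
#include <stdio.h>

/* x^n mod G(x) for G(x) = 0x18bb7 (degree 16), coefficients as integer bits. */
static uint32_t xpow_mod_g(unsigned int n)
{
	uint32_t r = 1;				/* the polynomial '1' */

	while (n--) {
		r <<= 1;			/* multiply by x */
		if (r & 0x10000)
			r ^= 0x18bb7;		/* subtract (XOR) G(x) */
	}
	return r;
}

int main(void)
{
	/* Expected per the table above: 0x6123, 0x2295, 0xa010, 0x1faa. */
	printf("x^(8*128)    mod G(x) = 0x%04x\n", xpow_mod_g(8 * 128));
	printf("x^(8*128+64) mod G(x) = 0x%04x\n", xpow_mod_g(8 * 128 + 64));
	printf("x^(1*128)    mod G(x) = 0x%04x\n", xpow_mod_g(1 * 128));
	printf("x^(1*128+64) mod G(x) = 0x%04x\n", xpow_mod_g(1 * 128 + 64));
	return 0;
}

Per the annotations above, shifting xpow_mod_g(48) and xpow_mod_g(80) left by 48 bits should likewise reproduce the .Lfinal_fold_consts pair.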
+// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
+// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
+// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
+.Lbyteshift_table:
.byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
.byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7