aboutsummaryrefslogtreecommitdiff
path: root/arch/arm64/crypto/sm4-ce-core.S
diff options
context:
space:
mode:
authorGravatar Tianjia Zhang <tianjia.zhang@linux.alibaba.com> 2022-10-27 14:55:01 +0800
committerGravatar Herbert Xu <herbert@gondor.apana.org.au> 2022-11-04 17:34:42 +0800
commit b1863fd0742f8da21f6f994e14e820db5831bd74 (patch)
tree 57d45c39d9b0b382b13b9e3fad36a7c9db4ef856 /arch/arm64/crypto/sm4-ce-core.S
parent crypto: arm64/sm4 - export reusable CE acceleration functions (diff)
downloadlinux-b1863fd0742f8da21f6f994e14e820db5831bd74.tar.gz
linux-b1863fd0742f8da21f6f994e14e820db5831bd74.tar.bz2
linux-b1863fd0742f8da21f6f994e14e820db5831bd74.zip
crypto: arm64/sm4 - add CE implementation for CTS-CBC mode
This patch is a CE-optimized assembly implementation for CTS-CBC mode.

Benchmarked on a T-Head Yitian-710 at 2.75 GHz. The data comes from mode 218 of tcrypt and compares the performance before and after this patch (the driver used before this patch is cts(cbc-sm4-ce)). The columns are blocks of different lengths. The data is tabulated and the unit is Mb/s:

Before:

cts(cbc-sm4-ce) |      16       64      128      256     1024     1420     4096
----------------+--------------------------------------------------------------
CTS-CBC enc     |  286.09   297.17   457.97   627.75   868.58   900.80   957.69
CTS-CBC dec     |  286.67   285.63   538.35   947.08  2241.03  2577.32  3391.14

After:

cts-cbc-sm4-ce  |      16       64      128      256     1024     1420     4096
----------------+--------------------------------------------------------------
CTS-CBC enc     |  288.19   428.80   593.57   741.04   911.73   931.80   950.00
CTS-CBC dec     |  292.22   468.99   838.23  1380.76  2741.17  3036.42  3409.62

Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/arm64/crypto/sm4-ce-core.S')
-rw-r--r-- arch/arm64/crypto/sm4-ce-core.S 102
1 files changed, 102 insertions, 0 deletions
diff --git a/arch/arm64/crypto/sm4-ce-core.S b/arch/arm64/crypto/sm4-ce-core.S
index 9e4b4f01cdf3..414d29f8110b 100644
--- a/arch/arm64/crypto/sm4-ce-core.S
+++ b/arch/arm64/crypto/sm4-ce-core.S
@@ -307,6 +307,100 @@ SYM_FUNC_START(sm4_ce_cbc_dec)
SYM_FUNC_END(sm4_ce_cbc_dec)
.align 3
+SYM_FUNC_START(sm4_ce_cbc_cts_enc)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * w4: nbytes
+ *
+ * CBC ciphertext-stealing encryption of the final full block plus the
+ * trailing partial block. With Ln = nbytes - 16, the code reads
+ * src[0..16) and src[Ln..Ln+16) and writes dst[0..16) and
+ * dst[Ln..Ln+16), i.e. exactly nbytes bytes on each side.
+ *
+ * NOTE(review): the permute-table window loads below only stay inside
+ * the 48-byte table when 0 <= Ln <= 32, and the CTS layout described
+ * in the comments assumes 16 < nbytes <= 32 - confirm the caller
+ * guarantees this.
+ *
+ * x3 is only read here; nothing is stored back to the iv buffer.
+ * Modifies x2, x5, x6, x7, v0, v1, v3, v4 and RIV, plus whatever the
+ * SM4_PREPARE / SM4_CRYPT_BLK macros (defined outside this hunk) use.
+ */
+ SM4_PREPARE(x0) /* macro defined outside this hunk; presumably sets up the round keys from x0 */
+
+ sub w5, w4, #16 /* w5 = Ln = nbytes - 16 */
+ uxtw x5, w5 /* zero-extend Ln so it can be used as a 64-bit offset */
+
+ ld1 {RIV.16b}, [x3] /* RIV = iv */
+
+ ld1 {v0.16b}, [x2] /* v0 = src[0..16), the last full plaintext block */
+ eor RIV.16b, RIV.16b, v0.16b /* CBC chaining: block ^ iv */
+ SM4_CRYPT_BLK(RIV) /* RIV becomes En-1 (the name used by the comments below) */
+
+ /* load permute table: two 16-byte windows selected by Ln */
+ adr_l x6, .Lcts_permute_table
+ add x7, x6, #32
+ add x6, x6, x5 /* x6 = table + Ln */
+ sub x7, x7, x5 /* x7 = table + 32 - Ln */
+ ld1 {v3.16b}, [x6] /* v3 = (16 - Ln) x 0xff, then indices 0 .. Ln-1 */
+ ld1 {v4.16b}, [x7] /* v4 = indices 16-Ln .. 15, then (16 - Ln) x 0xff */
+
+ /* overlapping loads: the 16 bytes ending at src + nbytes */
+ add x2, x2, x5
+ ld1 {v1.16b}, [x2] /* v1 = src[Ln..Ln+16); its last Ln bytes are Pn */
+
+ /* create Cn from En-1: the first Ln bytes of En-1 are moved to the
+ * last Ln lanes of v0; the 0xff indices yield zero lanes */
+ tbl v0.16b, {RIV.16b}, v3.16b
+ /* padding Pn with zeros: Pn is moved to the first Ln lanes of v1 and
+ * the remaining lanes become zero */
+ tbl v1.16b, {v1.16b}, v4.16b
+
+ eor v1.16b, v1.16b, RIV.16b /* (Pn || 0) ^ En-1 */
+ SM4_CRYPT_BLK(v1) /* v1 is the block stored at dst[0..16) */
+
+ /* overlapping stores: order matters. v0 goes to dst[Ln..Ln+16) first,
+ * so its zero lanes landing in dst[Ln..16) are then overwritten by
+ * the full block v1 at dst[0..16); Cn ends up in dst[16..16+Ln) */
+ add x5, x1, x5 /* x5 = dst + Ln */
+ st1 {v0.16b}, [x5]
+ st1 {v1.16b}, [x1]
+
+ ret
+SYM_FUNC_END(sm4_ce_cbc_cts_enc)
+
+.align 3
+SYM_FUNC_START(sm4_ce_cbc_cts_dec)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * w4: nbytes
+ *
+ * CBC ciphertext-stealing decryption of the final full block plus the
+ * trailing partial block. With Ln = nbytes - 16, the code reads
+ * src[0..16) and src[Ln..Ln+16) and writes dst[0..16) and
+ * dst[Ln..Ln+16), i.e. exactly nbytes bytes on each side.
+ *
+ * NOTE(review): the permute-table window loads below only stay inside
+ * the 48-byte table when 0 <= Ln <= 32, and the CTS layout described
+ * in the comments assumes 16 < nbytes <= 32 - confirm the caller
+ * guarantees this.
+ * NOTE(review): the same SM4_CRYPT_BLK macro is used as in the encrypt
+ * routine, so the direction presumably comes from the round keys
+ * passed in x0 - confirm against the caller.
+ *
+ * x3 is only read here; nothing is stored back to the iv buffer.
+ * Modifies x2, x5, x6, x7, v0, v1, v2, v3, v4 and RIV, plus whatever
+ * the SM4_PREPARE / SM4_CRYPT_BLK macros (defined outside this hunk)
+ * use.
+ */
+ SM4_PREPARE(x0) /* macro defined outside this hunk; presumably sets up the round keys from x0 */
+
+ sub w5, w4, #16 /* w5 = Ln = nbytes - 16 */
+ uxtw x5, w5 /* zero-extend Ln so it can be used as a 64-bit offset */
+
+ ld1 {RIV.16b}, [x3] /* RIV = iv */
+
+ /* load permute table: two 16-byte windows selected by Ln */
+ adr_l x6, .Lcts_permute_table
+ add x7, x6, #32
+ add x6, x6, x5 /* x6 = table + Ln */
+ sub x7, x7, x5 /* x7 = table + 32 - Ln */
+ ld1 {v3.16b}, [x6] /* v3 = (16 - Ln) x 0xff, then indices 0 .. Ln-1 */
+ ld1 {v4.16b}, [x7] /* v4 = indices 16-Ln .. 15, then (16 - Ln) x 0xff */
+
+ /* overlapping loads: src[0..16), then the 16 bytes ending at src + nbytes */
+ ld1 {v0.16b}, [x2], x5 /* v0 = first 16 bytes of src; post-increment x2 by Ln */
+ ld1 {v1.16b}, [x2] /* v1 = src[Ln..Ln+16); its last Ln bytes are Cn */
+
+ SM4_CRYPT_BLK(v0) /* v0 becomes Xn (the name used by the comments below) */
+ /* select the first Ln bytes of Xn to create Pn: they are moved to the
+ * last Ln lanes of v2 (0xff indices give zero lanes), then XORed with
+ * the Cn bytes held in the same lanes of v1 */
+ tbl v2.16b, {v0.16b}, v3.16b
+ eor v2.16b, v2.16b, v1.16b
+
+ /* overwrite the first Ln bytes with Cn to create En-1; tbx keeps the
+ * existing Xn byte in every lane whose index is 0xff */
+ tbx v0.16b, {v1.16b}, v4.16b
+ SM4_CRYPT_BLK(v0)
+ eor v0.16b, v0.16b, RIV.16b /* undo CBC chaining with the iv: v0 = Pn-1 */
+
+ /* overlapping stores: order matters. v2 goes to dst[Ln..Ln+16) first;
+ * only its last Ln lanes (Pn) are meaningful, and the leading lanes
+ * landing in dst[Ln..16) are then overwritten by v0 at dst[0..16) */
+ add x5, x1, x5 /* x5 = dst + Ln */
+ st1 {v2.16b}, [x5]
+ st1 {v0.16b}, [x1]
+
+ ret
+SYM_FUNC_END(sm4_ce_cbc_cts_dec)
+
+.align 3
SYM_FUNC_START(sm4_ce_cfb_enc)
/* input:
* x0: round key array, CTX
@@ -576,3 +670,11 @@ SYM_FUNC_END(sm4_ce_ctr_enc)
.Lbswap128_mask: /* 16 byte indices; its users are outside this hunk - presumably a TBL index vector, TODO confirm */
.byte 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b /* source bytes 12..15, then 8..11 */
.byte 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 /* source bytes 4..7, then 0..3: the four 32-bit words in reverse order, bytes within each word kept in order */
+
+.Lcts_permute_table: /* 48-byte index table for tbl/tbx in the CBC-CTS routines:
+ * 16 x 0xff, the identity indices 0..15, then 16 x 0xff.
+ * The routines load 16-byte windows at offsets Ln and 32 - Ln
+ * (Ln = nbytes - 16); sliding the window shifts the identity run so
+ * a tbl/tbx moves bytes by 16 - Ln lanes, and 0xff marks the lanes
+ * that are out of range for a single-register table. */
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff /* offsets 0..7: out-of-range filler */
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff /* offsets 8..15: out-of-range filler */
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 /* offsets 16..23: identity indices 0..7 */
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf /* offsets 24..31: identity indices 8..15 */
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff /* offsets 32..39: out-of-range filler */
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff /* offsets 40..47: out-of-range filler */