Skip to content

Commit 1c08a10

Browse files
zx2c4 authored and herbertx committed
crypto: poly1305 - add new 32 and 64-bit generic versions
These two C implementations from Zinc -- a 32x32 one and a 64x64 one, depending on the platform -- come from Andrew Moon's public domain poly1305-donna portable code, modified for usage in the kernel. The precomputation in the 32-bit version and the use of 64x64 multiplies in the 64-bit version make these perform better than the code it replaces. Moon's code is also very widespread and has received many eyeballs of scrutiny.

There's a bit of interference between the x86 implementation, which relies on internal details of the old scalar implementation. In the next commit, the x86 implementation will be replaced with a faster one that doesn't rely on this, so none of this matters much. But for now, to keep this passing the tests, we inline the bits of the old implementation that the x86 implementation relied on. Also, since we now support a slightly larger key space, via the union, some offsets had to be fixed up.

Nonce calculation was folded in with the emit function, to take advantage of 64x64 arithmetic. However, Adiantum appeared to rely on no nonce handling in emit, so this path was conditionalized. We also introduced a new struct, poly1305_core_key, to represent the precise amount of space that particular implementation uses.

Testing with kbench9000, depending on the CPU, the update function for the 32x32 version has been improved by 4%-7%, and for the 64x64 by 19%-30%. The 32x32 gains are small, but I think there's great value in having a parallel implementation to the 64x64 one so that the two can be compared side-by-side as nice stand-alone units.

Signed-off-by: Jason A. Donenfeld <[email protected]>
Signed-off-by: Herbert Xu <[email protected]>
1 parent e341942 commit 1c08a10

File tree

12 files changed

+675
-228
lines changed

12 files changed

+675
-228
lines changed

arch/x86/crypto/poly1305-avx2-x86_64.S

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -34,16 +34,16 @@ ORMASK: .octa 0x00000000010000000000000001000000
3434
#define u2 0x08(%r8)
3535
#define u3 0x0c(%r8)
3636
#define u4 0x10(%r8)
37-
#define w0 0x14(%r8)
38-
#define w1 0x18(%r8)
39-
#define w2 0x1c(%r8)
40-
#define w3 0x20(%r8)
41-
#define w4 0x24(%r8)
42-
#define y0 0x28(%r8)
43-
#define y1 0x2c(%r8)
44-
#define y2 0x30(%r8)
45-
#define y3 0x34(%r8)
46-
#define y4 0x38(%r8)
37+
#define w0 0x18(%r8)
38+
#define w1 0x1c(%r8)
39+
#define w2 0x20(%r8)
40+
#define w3 0x24(%r8)
41+
#define w4 0x28(%r8)
42+
#define y0 0x30(%r8)
43+
#define y1 0x34(%r8)
44+
#define y2 0x38(%r8)
45+
#define y3 0x3c(%r8)
46+
#define y4 0x40(%r8)
4747
#define m %rsi
4848
#define hc0 %ymm0
4949
#define hc1 %ymm1

arch/x86/crypto/poly1305_glue.c

Lines changed: 204 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,21 @@ asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r,
2525
static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd);
2626
static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
2727

28+
/*
 * Multiply a by b and return the u64 product. Callers in this file pass
 * u32 limbs; the u64 parameters widen them before the multiply.
 */
static inline u64 mlt(u64 a, u64 b)
29+
{
30+
return a * b;
31+
}
32+
33+
/*
 * Shift v right by n bits. The return type is u32, so only the low 32 bits
 * of the shifted value are returned.
 */
static inline u32 sr(u64 v, u_char n)
34+
{
35+
return v >> n;
36+
}
37+
38+
/*
 * Bitwise AND of v and mask. The parameter v is u32, so a u64 argument is
 * narrowed to its low 32 bits before the mask is applied.
 */
static inline u32 and(u32 v, u32 mask)
39+
{
40+
return v & mask;
41+
}
42+
2843
static void poly1305_simd_mult(u32 *a, const u32 *b)
2944
{
3045
u8 m[POLY1305_BLOCK_SIZE];
@@ -36,6 +51,168 @@ static void poly1305_simd_mult(u32 *a, const u32 *b)
3651
poly1305_block_sse2(a, m, b, 1);
3752
}
3853

54+
/*
 * Fill key->r[0..4] from the 16 bytes at raw_key.
 *
 * Five overlapping little-endian 32-bit loads at byte offsets 0, 3, 6, 9
 * and 12 are shifted right by 0, 2, 4, 6 and 8 bits and then masked, so
 * every r[i] holds at most 26 bits. The masks also clear the bits named by
 * the clamp constant in the comment below.
 *
 * key:     destination; only key->r[0..4] is written.
 * raw_key: source bytes; raw_key[0..15] are read, no alignment is assumed
 *          (loads go through get_unaligned_le32).
 */
static void poly1305_integer_setkey(struct poly1305_key *key, const u8 *raw_key)
55+
{
56+
/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
57+
key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff;
58+
key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03;
59+
key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff;
60+
key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff;
61+
key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
62+
}
63+
64+
/*
 * Absorb nblocks consecutive blocks of POLY1305_BLOCK_SIZE bytes from src
 * into the accumulator: for each block, h += block, then h *= r, followed
 * by a partial carry/reduction.
 *
 * state:   accumulator limbs state->h[0..4]; read on entry and written back
 *          once after the last block.
 * key:     multiplier limbs key->r[0..4]; only read.
 * src:     input bytes, read with unaligned little-endian loads. It is a
 *          const void *, so the byte offsets below rely on GNU C arithmetic
 *          on void pointers.
 * nblocks: number of blocks to process; when zero the function returns
 *          immediately and state is left untouched.
 * hibit:   OR-ed into the top limb at bit 24 for every block. Callers in
 *          this file pass 1 for full message blocks and 0 for the final
 *          block that was padded by hand.
 */
static void poly1305_integer_blocks(struct poly1305_state *state,
65+
const struct poly1305_key *key,
66+
const void *src,
67+
unsigned int nblocks, u32 hibit)
68+
{
69+
u32 r0, r1, r2, r3, r4;
70+
u32 s1, s2, s3, s4;
71+
u32 h0, h1, h2, h3, h4;
72+
u64 d0, d1, d2, d3, d4;
73+
74+
/* the do/while below always runs at least once, so reject zero here */
if (!nblocks)
75+
return;
76+
77+
r0 = key->r[0];
78+
r1 = key->r[1];
79+
r2 = key->r[2];
80+
r3 = key->r[3];
81+
r4 = key->r[4];
82+
83+
/* s[i] = 5 * r[i], used below for the product terms that wrap past limb 4 */
s1 = r1 * 5;
84+
s2 = r2 * 5;
85+
s3 = r3 * 5;
86+
s4 = r4 * 5;
87+
88+
h0 = state->h[0];
89+
h1 = state->h[1];
90+
h2 = state->h[2];
91+
h3 = state->h[3];
92+
h4 = state->h[4];
93+
94+
do {
95+
/* h += m[i] */
96+
h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff;
97+
h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff;
98+
h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff;
99+
h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff;
100+
/* the top limb takes the remaining 24 bits; hibit lands at bit 24 */
h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24);
101+
102+
/* h *= r */
103+
d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
104+
mlt(h3, s2) + mlt(h4, s1);
105+
d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) +
106+
mlt(h3, s3) + mlt(h4, s2);
107+
d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) +
108+
mlt(h3, s4) + mlt(h4, s3);
109+
d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) +
110+
mlt(h3, r0) + mlt(h4, s4);
111+
d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) +
112+
mlt(h3, r1) + mlt(h4, r0);
113+
114+
/* (partial) h %= p */
115+
/* each step pushes the bits above 26 into the next sum and keeps the low 26 */
d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff);
116+
d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff);
117+
d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff);
118+
d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff);
119+
/* the carry out of the top limb re-enters limb 0 multiplied by 5 */
h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
120+
h1 += h0 >> 26; h0 = h0 & 0x3ffffff;
121+
122+
src += POLY1305_BLOCK_SIZE;
123+
} while (--nblocks);
124+
125+
state->h[0] = h0;
126+
state->h[1] = h1;
127+
state->h[2] = h2;
128+
state->h[3] = h3;
129+
state->h[4] = h4;
130+
}
131+
132+
/*
 * Fully reduce the accumulator held in state->h[0..4] and write its low
 * 128 bits to dst as four little-endian 32-bit words (16 bytes).
 *
 * The function works on local copies; state is only read. No key-derived
 * value is added here: callers in this file add desc->s[] afterwards.
 *
 * state: accumulator limbs to serialise.
 * dst:   output buffer of at least 16 bytes, written with unaligned
 *        little-endian stores. It is a void *, so the byte offsets rely on
 *        GNU C arithmetic on void pointers.
 */
static void poly1305_integer_emit(const struct poly1305_state *state, void *dst)
133+
{
134+
u32 h0, h1, h2, h3, h4;
135+
u32 g0, g1, g2, g3, g4;
136+
u32 mask;
137+
138+
/* fully carry h */
139+
h0 = state->h[0];
140+
h1 = state->h[1];
141+
h2 = state->h[2];
142+
h3 = state->h[3];
143+
h4 = state->h[4];
144+
145+
h2 += (h1 >> 26); h1 = h1 & 0x3ffffff;
146+
h3 += (h2 >> 26); h2 = h2 & 0x3ffffff;
147+
h4 += (h3 >> 26); h3 = h3 & 0x3ffffff;
148+
/* the carry out of the top limb wraps into limb 0 multiplied by 5 */
h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
149+
h1 += (h0 >> 26); h0 = h0 & 0x3ffffff;
150+
151+
/* compute h + -p */
152+
/* g = h + 5 - 2^130: five 26-bit limbs, with 1 << 26 subtracted from the top limb */
g0 = h0 + 5;
153+
g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff;
154+
g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff;
155+
g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff;
156+
g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;
157+
158+
/* select h if h < p, or h + -p if h >= p */
159+
/*
 * The top bit of g4 is set when the subtraction wrapped. mask becomes 0
 * in that case (keep h) and all-ones otherwise (keep g). The selection
 * uses bit masks instead of a branch.
 */
mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
160+
g0 &= mask;
161+
g1 &= mask;
162+
g2 &= mask;
163+
g3 &= mask;
164+
g4 &= mask;
165+
mask = ~mask;
166+
h0 = (h0 & mask) | g0;
167+
h1 = (h1 & mask) | g1;
168+
h2 = (h2 & mask) | g2;
169+
h3 = (h3 & mask) | g3;
170+
h4 = (h4 & mask) | g4;
171+
172+
/* h = h % (2^128) */
173+
/* repack the 26-bit limbs into four 32-bit words; bits above 128 fall off */
put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0);
174+
put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4);
175+
put_unaligned_le32((h2 >> 12) | (h3 << 14), dst + 8);
176+
put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12);
177+
}
178+
179+
/*
 * Initialise desc from a 32-byte key.
 *
 * key[0..15] is converted by poly1305_integer_setkey() into desc->opaque_r.
 * key[16..31] is loaded as four little-endian 32-bit words into
 * desc->s[0..3]. The accumulator is reset through poly1305_core_init(),
 * the partial-block length is cleared, and both key halves are marked as
 * present (sset = true, rset = 1).
 *
 * desc: context to initialise; every field named above is overwritten.
 * key:  key bytes; no alignment is assumed.
 *
 * NOTE(review): this is exported with EXPORT_SYMBOL_GPL, while the version
 * it replaces and the sibling update/final entry points in this diff use
 * EXPORT_SYMBOL -- confirm the change of export class is intended.
 */
void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key)
180+
{
181+
poly1305_integer_setkey(desc->opaque_r, key);
182+
desc->s[0] = get_unaligned_le32(key + 16);
183+
desc->s[1] = get_unaligned_le32(key + 20);
184+
desc->s[2] = get_unaligned_le32(key + 24);
185+
desc->s[3] = get_unaligned_le32(key + 28);
186+
poly1305_core_init(&desc->h);
187+
desc->buflen = 0;
188+
desc->sset = true;
189+
desc->rset = 1;
190+
}
191+
EXPORT_SYMBOL_GPL(poly1305_init_arch);
192+
193+
/*
 * Consume key material from the front of the input while the descriptor's
 * key is not yet complete, and report how much input is left.
 *
 * Nothing happens once dctx->sset is true. Otherwise:
 *  - if r is not set and at least POLY1305_BLOCK_SIZE bytes are available,
 *    the first block is converted with poly1305_integer_setkey() and rset
 *    becomes 1;
 *  - if at least POLY1305_BLOCK_SIZE bytes then remain, the next block is
 *    loaded as four little-endian words into dctx->s[0..3] and sset
 *    becomes true.
 *
 * dctx:   descriptor whose r/s/rset/sset fields may be updated.
 * src:    input bytes; advanced locally only, so the caller must derive
 *         the new position from the return value.
 * srclen: number of bytes available at src.
 *
 * Returns the number of bytes remaining after any consumed key blocks.
 *
 * NOTE(review): this passes dctx->r while the other call sites in this
 * file pass opaque_r; the commit message mentions a union, so presumably
 * they alias -- confirm against the struct definition.
 */
static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
194+
const u8 *src, unsigned int srclen)
195+
{
196+
if (!dctx->sset) {
197+
if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
198+
poly1305_integer_setkey(dctx->r, src);
199+
src += POLY1305_BLOCK_SIZE;
200+
srclen -= POLY1305_BLOCK_SIZE;
201+
dctx->rset = 1;
202+
}
203+
if (srclen >= POLY1305_BLOCK_SIZE) {
204+
dctx->s[0] = get_unaligned_le32(src + 0);
205+
dctx->s[1] = get_unaligned_le32(src + 4);
206+
dctx->s[2] = get_unaligned_le32(src + 8);
207+
dctx->s[3] = get_unaligned_le32(src + 12);
208+
src += POLY1305_BLOCK_SIZE;
209+
srclen -= POLY1305_BLOCK_SIZE;
210+
dctx->sset = true;
211+
}
212+
}
213+
return srclen;
214+
}
215+
39216
static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx,
40217
const u8 *src, unsigned int srclen)
41218
{
@@ -47,8 +224,8 @@ static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx,
47224
srclen = datalen;
48225
}
49226
if (srclen >= POLY1305_BLOCK_SIZE) {
50-
poly1305_core_blocks(&dctx->h, dctx->r, src,
51-
srclen / POLY1305_BLOCK_SIZE, 1);
227+
poly1305_integer_blocks(&dctx->h, dctx->opaque_r, src,
228+
srclen / POLY1305_BLOCK_SIZE, 1);
52229
srclen %= POLY1305_BLOCK_SIZE;
53230
}
54231
return srclen;
@@ -105,12 +282,6 @@ static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
105282
return srclen;
106283
}
107284

108-
void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key)
109-
{
110-
poly1305_init_generic(desc, key);
111-
}
112-
EXPORT_SYMBOL(poly1305_init_arch);
113-
114285
void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
115286
unsigned int srclen)
116287
{
@@ -158,9 +329,31 @@ void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
158329
}
159330
EXPORT_SYMBOL(poly1305_update_arch);
160331

161-
void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *digest)
332+
void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *dst)
162333
{
163-
poly1305_final_generic(desc, digest);
334+
__le32 digest[4];
335+
u64 f = 0;
336+
337+
if (unlikely(desc->buflen)) {
338+
desc->buf[desc->buflen++] = 1;
339+
memset(desc->buf + desc->buflen, 0,
340+
POLY1305_BLOCK_SIZE - desc->buflen);
341+
poly1305_integer_blocks(&desc->h, desc->opaque_r, desc->buf, 1, 0);
342+
}
343+
344+
poly1305_integer_emit(&desc->h, digest);
345+
346+
/* mac = (h + s) % (2^128) */
347+
f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0];
348+
put_unaligned_le32(f, dst + 0);
349+
f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1];
350+
put_unaligned_le32(f, dst + 4);
351+
f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2];
352+
put_unaligned_le32(f, dst + 8);
353+
f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3];
354+
put_unaligned_le32(f, dst + 12);
355+
356+
*desc = (struct poly1305_desc_ctx){};
164357
}
165358
EXPORT_SYMBOL(poly1305_final_arch);
166359

@@ -183,7 +376,7 @@ static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
183376
if (unlikely(!dctx->sset))
184377
return -ENOKEY;
185378

186-
poly1305_final_generic(dctx, dst);
379+
poly1305_final_arch(dctx, dst);
187380
return 0;
188381
}
189382

crypto/adiantum.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ struct adiantum_tfm_ctx {
7070
struct crypto_skcipher *streamcipher;
7171
struct crypto_cipher *blockcipher;
7272
struct crypto_shash *hash;
73-
struct poly1305_key header_hash_key;
73+
struct poly1305_core_key header_hash_key;
7474
};
7575

7676
struct adiantum_request_ctx {
@@ -239,7 +239,7 @@ static void adiantum_hash_header(struct skcipher_request *req)
239239
poly1305_core_blocks(&state, &tctx->header_hash_key, req->iv,
240240
TWEAK_SIZE / POLY1305_BLOCK_SIZE, 1);
241241

242-
poly1305_core_emit(&state, &rctx->header_hash);
242+
poly1305_core_emit(&state, NULL, &rctx->header_hash);
243243
}
244244

245245
/* Hash the left-hand part (the "bulk") of the message using NHPoly1305 */

crypto/nhpoly1305.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ int crypto_nhpoly1305_final_helper(struct shash_desc *desc, u8 *dst, nh_t nh_fn)
210210
if (state->nh_remaining)
211211
process_nh_hash_value(state, key);
212212

213-
poly1305_core_emit(&state->poly_state, dst);
213+
poly1305_core_emit(&state->poly_state, NULL, dst);
214214
return 0;
215215
}
216216
EXPORT_SYMBOL(crypto_nhpoly1305_final_helper);

crypto/poly1305_generic.c

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,29 @@ static int crypto_poly1305_init(struct shash_desc *desc)
3131
return 0;
3232
}
3333

34+
/*
 * Consume key material from the front of the input while the descriptor's
 * key is not yet complete, and report how much input is left.
 *
 * Nothing happens once dctx->sset is true. Otherwise:
 *  - if r is not set and at least POLY1305_BLOCK_SIZE bytes are available,
 *    the first block is handed to poly1305_core_setkey() to fill
 *    dctx->core_r, and rset becomes 2;
 *  - if at least POLY1305_BLOCK_SIZE bytes then remain, the next block is
 *    loaded as four little-endian words into dctx->s[0..3] and sset
 *    becomes true.
 *
 * dctx:   descriptor whose core_r/s/rset/sset fields may be updated.
 * src:    input bytes; advanced locally only, so the caller must derive
 *         the new position from the return value.
 * srclen: number of bytes available at src.
 *
 * Returns the number of bytes remaining after any consumed key blocks.
 *
 * NOTE(review): rset is set to 2 here, whereas the x86 glue in this diff
 * sets it to 1; presumably the value records which key representation is
 * stored -- confirm against the struct definition.
 */
static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
35+
const u8 *src, unsigned int srclen)
36+
{
37+
if (!dctx->sset) {
38+
if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
39+
poly1305_core_setkey(&dctx->core_r, src);
40+
src += POLY1305_BLOCK_SIZE;
41+
srclen -= POLY1305_BLOCK_SIZE;
42+
dctx->rset = 2;
43+
}
44+
if (srclen >= POLY1305_BLOCK_SIZE) {
45+
dctx->s[0] = get_unaligned_le32(src + 0);
46+
dctx->s[1] = get_unaligned_le32(src + 4);
47+
dctx->s[2] = get_unaligned_le32(src + 8);
48+
dctx->s[3] = get_unaligned_le32(src + 12);
49+
src += POLY1305_BLOCK_SIZE;
50+
srclen -= POLY1305_BLOCK_SIZE;
51+
dctx->sset = true;
52+
}
53+
}
54+
return srclen;
55+
}
56+
3457
static void poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
3558
unsigned int srclen)
3659
{
@@ -42,7 +65,7 @@ static void poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
4265
srclen = datalen;
4366
}
4467

45-
poly1305_core_blocks(&dctx->h, dctx->r, src,
68+
poly1305_core_blocks(&dctx->h, &dctx->core_r, src,
4669
srclen / POLY1305_BLOCK_SIZE, 1);
4770
}
4871

0 commit comments

Comments
 (0)