/* Copyright 2023 The ChromiumOS Authors * Use of this source code is governed by a BSD-style license that can be * found in the LICENSE file. * * Authors: Muhammad Monir Hossain * Jeremy Compostella */ /* * The algorithm implemented below is described in Montgomery Multiplication * Using Vector Instructions document from Microsoft Research, August 20, 2013 * (cf. https://eprint.iacr.org/2013/519.pdf). * * This implementation leverages SSE2 instructions to perform arithmetic * operations in parallel. * * This algorithm uses the modulus positive inverse (1 / N mod 2^32) which can * be easily computed from the modulus negative inverse provided by the public * key data structure `n0inv' field. */ #include "2api.h" #include "2common.h" #include "2return_codes.h" #include "2rsa.h" typedef long long vb2_m128i __attribute__((__vector_size__(16), __may_alias__)); typedef int vb2_v4si __attribute__((__vector_size__(16))); typedef unsigned long long vb2_v2du __attribute__((__vector_size__(16))); static inline vb2_m128i __attribute__((__always_inline__)) vb2_set_epi32 (int q3, int q2, int q1, int q0) { return (vb2_m128i)(vb2_v4si){ q0, q1, q2, q3 }; } static inline vb2_m128i __attribute__((__always_inline__)) vb2_setzero_si128 (void) { return (vb2_m128i)(vb2_v4si){ 0, 0, 0, 0 }; } static inline vb2_m128i __attribute__((__always_inline__)) vb2_add_epi64 (vb2_m128i a, vb2_m128i b) { return (vb2_m128i)((vb2_v2du)a + (vb2_v2du)b); } static inline vb2_m128i __attribute__((__always_inline__)) vb2_srli_epi64 (vb2_m128i a, int b) { return (vb2_m128i)__builtin_ia32_psrlqi128(a, b); } static inline vb2_m128i __attribute__((__always_inline__)) vb2_mul_epu32 (vb2_m128i a, vb2_m128i b) { return (vb2_m128i)__builtin_ia32_pmuludq128((vb2_v4si)a, (vb2_v4si)b); } static inline vb2_m128i __attribute__((__always_inline__)) vb2_and_si128 (vb2_m128i a, vb2_m128i b) { return (vb2_m128i)((vb2_v2du)a & (vb2_v2du)b); } /** * Montgomery c[] = d[] - e[] if d[] > e[], c[] = d[] - e[] + mod[] otherwise. * * de[] has d[] in lower 64 bits (effectively lower 32 bits) and e[] in upper * 64 bits (effectively lower 32 bits) * de[] is used as a temporary buffer and therefore its content will be lost. */ static void sub_mod(const struct vb2_public_key *key, vb2_m128i *de, uint32_t *c) { uint32_t i, borrow = 0, carry = 0, d, e; uint64_t sum, *de_i; for (i = 0; i < key->arrsize; i++) { de_i = (uint64_t *)&de[i]; d = (uint32_t)de_i[1]; e = (uint32_t)de_i[0]; /* Use de_i[0] as temporary storage of d[] - e[]. */ de_i[0] = (uint32_t)d - e - borrow; borrow = d ^ ((d ^ e) | (d ^ (uint32_t)de_i[0])); borrow >>= 31; } /* To keep the code running in constant-time for side-channel * resistance, D − E + mod is systematically computed even if we do not * need it. */ for (i = 0; i < key->arrsize; i++) { de_i = (uint64_t *)&de[i]; sum = de_i[0] + key->n[i] + carry; carry = sum >> 32; /* Use de_i[1] as temporary storage. */ de_i[1] = (uint32_t)sum; } int index = borrow ? 1 : 0; for (i = 0; i < key->arrsize; i++) { de_i = (uint64_t *)&de[i]; c[i] = (uint32_t)de_i[index]; } } /** * Montgomery c[] = a[] * b[] / R % mod */ static void mont_mult(const struct vb2_public_key *key, uint32_t *c, const uint32_t *a, const uint32_t *b, const uint32_t mu, vb2_m128i *de, vb2_m128i *b_modulus) { const uint32_t mub0 = mu * b[0]; const vb2_m128i mask = vb2_set_epi32(0, 0xffffffff, 0, 0xffffffff); const uint64_t *de0 = (uint64_t *)de; uint32_t i, j, q, muc0; vb2_m128i p01, t01, mul; for (i = 0; i < key->arrsize; i++) { b_modulus[i] = vb2_set_epi32(0, b[i], 0, key->n[i]); de[i] = vb2_setzero_si128(); } for (j = 0; j < key->arrsize; j++) { c[0] = (uint32_t)de0[1] - de0[0]; muc0 = mu * c[0]; q = muc0 + mub0 * a[j]; mul = vb2_set_epi32(0, a[j], 0, q); p01 = vb2_add_epi64(de[0], vb2_mul_epu32(mul, b_modulus[0])); t01 = vb2_srli_epi64(p01, 32); for (i = 1; i < key->arrsize; i++) { p01 = vb2_add_epi64(vb2_add_epi64(t01, de[i]), vb2_mul_epu32(mul, b_modulus[i])); t01 = vb2_srli_epi64(p01, 32); de[i - 1] = vb2_and_si128(mask, p01); } de[key->arrsize - 1] = t01; } sub_mod(key, de, c); } static void swap_endianness(const uint32_t *in, uint32_t *out, size_t size) { size_t i; for (i = 0; i < size; i++) out[i] = __builtin_bswap32(in[size - 1 - i]); } vb2_error_t vb2ex_hwcrypto_modexp(const struct vb2_public_key *key, uint8_t *inout, void *workbuf, size_t workbuf_size, int exp) { const uint32_t mu = -key->n0inv; uint32_t *a = workbuf; uint32_t *aR = a + key->arrsize; uint32_t *aaR = aR + key->arrsize; uint32_t *aaa = aaR; /* Re-use location. */ vb2_m128i *de = (vb2_m128i *)(((uintptr_t)(aaa + key->arrsize) + 0xf) & ~0xf); vb2_m128i *b_modulus = de + key->arrsize; size_t i; if ((void *)&b_modulus[key->arrsize] - workbuf > workbuf_size) { VB2_DEBUG("ERROR - HW modexp work buffer too small!\n"); return VB2_ERROR_WORKBUF_SMALL; } /* Convert big endian to little endian. */ swap_endianness((uint32_t *)inout, a, key->arrsize); /* aR = a * RR / R mod M */ mont_mult(key, aR, a, key->rr, mu, de, b_modulus); if (exp == 3) { /* aaR = aR * aR / R mod M */ mont_mult(key, aaR, aR, aR, mu, de, b_modulus); /* a = aaR * aR / R mod M */ mont_mult(key, a, aaR, aR, mu, de, b_modulus); /* To multiply with 1, prepare aR with first element 1 and * others as 0. */ aR[0] = 1; for (i = 1; i < key->arrsize; i++) aR[i] = 0; /* aaa = a * aR / R mod M = a * 1 / R mod M*/ mont_mult(key, aaa, a, aR, mu, de, b_modulus); } else { /* Exponent 65537 */ for (i = 0; i < 16; i += 2) { /* aaR = aR * aR / R mod M */ mont_mult(key, aaR, aR, aR, mu, de, b_modulus); /* aR = aaR * aaR / R mod M */ mont_mult(key, aR, aaR, aaR, mu, de, b_modulus); } /* aaa = aR * a / R mod M */ mont_mult(key, aaa, aR, a, mu, de, b_modulus); } /* Convert little endian to big endian. */ swap_endianness(aaa, (uint32_t *)inout, key->arrsize); return VB2_SUCCESS; }