diff --git a/networking/tls.c b/networking/tls.c index 69c81b558..b0a4f7e75 100644 --- a/networking/tls.c +++ b/networking/tls.c @@ -1,7 +1,7 @@ /* - * Licensed under GPLv2, see file LICENSE in this source tree. - * * Copyright (C) 2017 Denys Vlasenko + * + * Licensed under GPLv2, see file LICENSE in this source tree. */ //config:config TLS //config: bool "tls (debugging)" @@ -10,6 +10,11 @@ //applet:IF_TLS(APPLET(tls, BB_DIR_USR_BIN, BB_SUID_DROP)) //kbuild:lib-$(CONFIG_TLS) += tls.o +//kbuild:lib-$(CONFIG_TLS) += tls_pstm.o +//kbuild:lib-$(CONFIG_TLS) += tls_pstm_montgomery_reduce.o +//kbuild:lib-$(CONFIG_TLS) += tls_pstm_mul_comba.o +//kbuild:lib-$(CONFIG_TLS) += tls_pstm_sqr_comba.o +//kbuild:lib-$(CONFIG_TLS) += tls_rsa.o ////kbuild:lib-$(CONFIG_TLS) += tls_ciphers.o ////kbuild:lib-$(CONFIG_TLS) += tls_aes.o ////kbuild:lib-$(CONFIG_TLS) += tls_aes_gcm.o @@ -18,9 +23,7 @@ //usage: "HOST[:PORT]" //usage:#define tls_full_usage "\n\n" -#include "libbb.h" -//#include "tls_cryptoapi.h" -//#include "tls_ciphers.h" +#include "tls.h" #if 1 # define dbg(...) fprintf(stderr, __VA_ARGS__) @@ -28,23 +31,26 @@ # define dbg(...) 
((void)0) #endif -#define RECORD_TYPE_CHANGE_CIPHER_SPEC 20 -#define RECORD_TYPE_ALERT 21 -#define RECORD_TYPE_HANDSHAKE 22 -#define RECORD_TYPE_APPLICATION_DATA 23 +#define RECORD_TYPE_CHANGE_CIPHER_SPEC 20 +#define RECORD_TYPE_ALERT 21 +#define RECORD_TYPE_HANDSHAKE 22 +#define RECORD_TYPE_APPLICATION_DATA 23 -#define HANDSHAKE_HELLO_REQUEST 0 -#define HANDSHAKE_CLIENT_HELLO 1 -#define HANDSHAKE_SERVER_HELLO 2 -#define HANDSHAKE_HELLO_VERIFY_REQUEST 3 -#define HANDSHAKE_NEW_SESSION_TICKET 4 -#define HANDSHAKE_CERTIFICATE 11 -#define HANDSHAKE_SERVER_KEY_EXCHANGE 12 -#define HANDSHAKE_CERTIFICATE_REQUEST 13 -#define HANDSHAKE_SERVER_HELLO_DONE 14 -#define HANDSHAKE_CERTIFICATE_VERIFY 15 -#define HANDSHAKE_CLIENT_KEY_EXCHANGE 16 -#define HANDSHAKE_FINISHED 20 +#define HANDSHAKE_HELLO_REQUEST 0 +#define HANDSHAKE_CLIENT_HELLO 1 +#define HANDSHAKE_SERVER_HELLO 2 +#define HANDSHAKE_HELLO_VERIFY_REQUEST 3 +#define HANDSHAKE_NEW_SESSION_TICKET 4 +#define HANDSHAKE_CERTIFICATE 11 +#define HANDSHAKE_SERVER_KEY_EXCHANGE 12 +#define HANDSHAKE_CERTIFICATE_REQUEST 13 +#define HANDSHAKE_SERVER_HELLO_DONE 14 +#define HANDSHAKE_CERTIFICATE_VERIFY 15 +#define HANDSHAKE_CLIENT_KEY_EXCHANGE 16 +#define HANDSHAKE_FINISHED 20 + +#define SSL_HS_RANDOM_SIZE 32 +#define SSL_HS_RSA_PREMASTER_SIZE 48 #define SSL_NULL_WITH_NULL_NULL 0x0000 #define SSL_RSA_WITH_NULL_MD5 0x0001 @@ -112,6 +118,7 @@ //TLS 1.2 #define TLS_MAJ 3 #define TLS_MIN 3 +//#define CIPHER_ID TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA // ok, recvs SERVER_KEY_EXCHANGE *** matrixssl uses this on my box //#define CIPHER_ID TLS_RSA_WITH_AES_256_CBC_SHA256 // ok, no SERVER_KEY_EXCHANGE // All GCMs: //#define CIPHER_ID TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384 // SSL_ALERT_HANDSHAKE_FAILURE @@ -123,9 +130,9 @@ //#define CIPHER_ID TLS_ECDH_RSA_WITH_AES_256_GCM_SHA384 //#define CIPHER_ID TLS_ECDH_RSA_WITH_AES_128_GCM_SHA256 // SSL_ALERT_HANDSHAKE_FAILURE //#define CIPHER_ID TLS_RSA_WITH_AES_256_GCM_SHA384 // ok, no SERVER_KEY_EXCHANGE 
-#define CIPHER_ID TLS_RSA_WITH_AES_128_GCM_SHA256 // ok, no SERVER_KEY_EXCHANGE +#define CIPHER_ID TLS_RSA_WITH_AES_128_GCM_SHA256 // ok, no SERVER_KEY_EXCHANGE *** select this? //#define CIPHER_ID TLS_DH_anon_WITH_AES_256_CBC_SHA // SSL_ALERT_HANDSHAKE_FAILURE -// (tested b/c this one doesn't req server certs... no luck) +//^^^^^^^^^^^^^^^^^^^^^^^ (tested b/c this one doesn't req server certs... no luck) //test TLS_RSA_WITH_AES_128_CBC_SHA, in tls 1.2 it's mandated to be always supported struct record_hdr { @@ -137,8 +144,7 @@ struct record_hdr { typedef struct tls_state { int fd; - uint8_t *pubkey; - int pubkey_len; + psRsaKey_t server_rsa_pub_key; // RFC 5246 // |6.2.1. Fragmentation @@ -170,6 +176,12 @@ typedef struct tls_state { uint8_t inbuf[18*1024]; } tls_state_t; +void tls_get_random(void *buf, unsigned len) +{ + if (len != open_read_close("/dev/urandom", buf, len)) + xfunc_die(); +} + static tls_state_t *new_tls_state(void) { @@ -286,7 +298,7 @@ static void send_client_hello(tls_state_t *tls) hello.len24_lo = (sizeof(hello) - sizeof(hello.xhdr) - 4); hello.proto_maj = TLS_MAJ; hello.proto_min = TLS_MIN; - open_read_close("/dev/urandom", hello.rand32, sizeof(hello.rand32)); + tls_get_random(hello.rand32, sizeof(hello.rand32)); //hello.session_id_len = 0; //hello.cipherid_len16_hi = 0; hello.cipherid_len16_lo = 2 * 1; @@ -407,7 +419,18 @@ static uint8_t *skip_der_item(uint8_t *der, uint8_t *end) return new_der; } -static void *find_key_in_der_cert(int *key_len, uint8_t *der, int len) +static void der_binary_to_pstm(pstm_int *pstm_n, uint8_t *der, uint8_t *end) +{ + uint8_t *bin_ptr; + unsigned len = get_der_len(&bin_ptr, der, end); + + dbg("binary bytes:%u, first:0x%02x\n", len, bin_ptr[0]); + pstm_init_for_read_unsigned_bin(/*pool:*/ NULL, pstm_n, len); + pstm_read_unsigned_bin(pstm_n, bin_ptr, len); + //return bin + len; +} + +static void find_key_in_der_cert(tls_state_t *tls, uint8_t *der, int len) { /* Certificate is a DER-encoded data structure. 
Each DER element has a length, * which makes it easy to skip over large compound elements of any complexity @@ -504,19 +527,43 @@ static void *find_key_in_der_cert(int *key_len, uint8_t *der, int len) der = skip_der_item(der, end); /* validity */ der = skip_der_item(der, end); /* subject */ - /* enter "subjectPublicKeyInfo" */ + /* enter subjectPublicKeyInfo */ der = enter_der_item(der, &end); - - /* skip "subjectPublicKeyInfo.algorithm" */ + { /* check subjectPublicKeyInfo.algorithm */ + static const uint8_t expected[] = { + 0x30,0x0d, // SEQ 13 bytes + 0x06,0x09, 0x2a,0x86,0x48,0x86,0xf7,0x0d,0x01,0x01,0x01, // OID RSA_KEY_ALG 42.134.72.134.247.13.1.1.1 + //0x05,0x00, // NULL + }; + if (memcmp(der, expected, sizeof(expected)) != 0) + bb_error_msg_and_die("not RSA key"); + } + /* skip subjectPublicKeyInfo.algorithm */ der = skip_der_item(der, end); - /* enter "subjectPublicKeyInfo.publicKey" */ + /* enter subjectPublicKeyInfo.publicKey */ // die_if_not_this_der_type(der, end, 0x03); /* must be BITSTRING */ der = enter_der_item(der, &end); - /* return a copy */ - *key_len = end - der; - dbg("copying key bytes:%u, first:0x%02x\n", *key_len, der[0]); - return xmemdup(der, *key_len); + /* parse RSA key: */ +//based on getAsnRsaPubKey(), pkcs1ParsePrivBin() is also of note + dbg("key bytes:%u, first:0x%02x\n", (int)(end - der), der[0]); + if (end - der < 14) xfunc_die(); + /* example format: + * ignore bits: 00 + * SEQ 0x018a/394 bytes: 3082018a + * INTEGER 0x0181/385 bytes (modulus): 02820181 XX...XXX + * INTEGER 3 bytes (exponent): 0203 010001 + */ + if (*der != 0) /* "ignore bits", should be 0 */ + xfunc_die(); + der++; + der = enter_der_item(der, &end); /* enter SEQ */ + //memset(tls->server_rsa_pub_key, 0, sizeof(tls->server_rsa_pub_key)); + der_binary_to_pstm(&tls->server_rsa_pub_key.N, der, end); /* modulus */ + der = skip_der_item(der, end); + der_binary_to_pstm(&tls->server_rsa_pub_key.e, der, end); /* exponent */ + tls->server_rsa_pub_key.size = 
/*
 * Send the ClientKeyExchange handshake message for an RSA key exchange.
 *
 * Modeled after matrixssl's "standard RSA suite" path: the premaster
 * secret is SSL_HS_RSA_PREMASTER_SIZE bytes, the first two being the
 * protocol version and the rest random, and it is encrypted with the
 * server's RSA public key (tls->server_rsa_pub_key, filled in from the
 * server certificate).
 *
 * The encrypted key is preceded by a 16-bit length. An earlier version
 * without that length field produced a record two bytes shorter than
 * what openssl sends (0x0184 vs 0x0186 for a 384-byte key).
 *
 * Dies on any failure.
 */
static void send_client_key_exchange(tls_state_t *tls)
{
	struct client_key_exchange {
		struct record_hdr xhdr;
		uint8_t type;
		uint8_t len24_hi, len24_mid, len24_lo;
		/* exist for RSA, but not for some other key types */
		uint8_t keylen16_hi, keylen16_lo;
		/* room for the largest supported modulus (3072 bits) */
		uint8_t key[384];
	};
	struct client_key_exchange record;
	uint8_t premaster[SSL_HS_RSA_PREMASTER_SIZE];
	/* RSA output is exactly as long as the modulus */
	unsigned keylen = tls->server_rsa_pub_key.size;
	unsigned len;

	if (keylen == 0 || keylen > sizeof(record.key))
		bb_error_msg_and_die("unsupported RSA key size: %u bytes", keylen);

	tls_get_random(premaster, sizeof(premaster));
	premaster[0] = TLS_MAJ;
	premaster[1] = TLS_MIN;

	memset(&record, 0, sizeof(record));
	/* a negative result means failure (same test the matrixssl caller uses) */
	if (psRsaEncryptPub(/*pool:*/ NULL,
			/* psRsaKey_t* */ &tls->server_rsa_pub_key,
			premaster, /*inlen:*/ sizeof(premaster),
			record.key, /*outlen:*/ keylen,
			data_param_ignored
		) < 0
	) {
		bb_error_msg_and_die("RSA encryption failed");
	}

	/* bytes following the record header: 4 + 2 + keylen */
	len = sizeof(record) - sizeof(record.key) - sizeof(record.xhdr) + keylen;
	record.xhdr.type = RECORD_TYPE_HANDSHAKE;
	record.xhdr.proto_maj = TLS_MAJ;
	record.xhdr.proto_min = TLS_MIN;
	record.xhdr.len16_hi = len >> 8;
	record.xhdr.len16_lo = len & 0xff;

	/* bytes following the handshake header: 2 + keylen */
	len -= 4;
	record.type = HANDSHAKE_CLIENT_KEY_EXCHANGE;
	//record.len24_hi = 0;
	record.len24_mid = len >> 8;
	record.len24_lo = len & 0xff;

	/* the encrypted premaster itself */
	len -= 2;
	record.keylen16_hi = len >> 8;
	record.keylen16_lo = len & 0xff;

	/* send only the used part of key[] */
	xwrite(tls->fd, &record, sizeof(record) - sizeof(record.key) + keylen);
}
03|00|17|41|04|87|94|2e|2f|68|d0|c9|f4|97|a8|2d|ef|ed|67|ea|c6|f3|b3|56|47|5d|27|b6|bd|ee|70|25|30|5e|b0|8e|f6|21|5a... //SvKey len=455^ + // with TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA: 461 bytes: + // 0c 00|01|c9 03|00|17|41|04|cd|9b|b4|29|1f|f6|b0|c2|84|82|7f|29|6a|47|4e|ec|87|0b|c1|9c|69|e1|f8|c6|d0|53|e9|27|90|a5|c8|02|15|75... dbg("got SERVER_KEY_EXCHANGE\n"); len = xread_tls_block(tls); break; @@ -624,6 +773,8 @@ static void tls_handshake(tls_state_t *tls) case HANDSHAKE_SERVER_HELLO_DONE: // 0e 000000 (len:0) dbg("got SERVER_HELLO_DONE\n"); + send_client_key_exchange(tls); + len = xread_tls_block(tls); break; default: tls_error_die(tls); diff --git a/networking/tls.h b/networking/tls.h new file mode 100644 index 000000000..20317ecc3 --- /dev/null +++ b/networking/tls.h @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2017 Denys Vlasenko + * + * Licensed under GPLv2, see file LICENSE in this source tree. + */ +#include "libbb.h" + +/* config tweaks */ +#define HAVE_NATIVE_INT64 1 +#undef DISABLE_PSTM +#undef USE_1024_KEY_SPEED_OPTIMIZATIONS +#undef USE_2048_KEY_SPEED_OPTIMIZATIONS +//TODO: enable to use asm: +//#if defined(__GNUC__) && defined(__i386__) -> #define PSTM_32BIT and PSTM_X86 +//#if defined(__GNUC__) && defined(__x86_64__) -> #define PSTM_64BIT and PSTM_X86_64 +//ARM and MIPS also have these + + +#define PS_SUCCESS 0 +#define PS_FAILURE -1 +#define PS_ARG_FAIL -6 /* Failure due to bad function param */ +#define PS_PLATFORM_FAIL -7 /* Failure as a result of system call error */ +#define PS_MEM_FAIL -8 /* Failure to allocate requested memory */ +#define PS_LIMIT_FAIL -9 /* Failure on sanity/limit tests */ + +#define PS_TRUE 1 +#define PS_FALSE 0 + +#if BB_BIG_ENDIAN +# define ENDIAN_BIG 1 +# undef ENDIAN_LITTLE +//#???? 
ENDIAN_32BITWORD +// controls only STORE32L, which we don't use +#else +# define ENDIAN_LITTLE 1 +# undef ENDIAN_BIG +#endif + +typedef uint64_t uint64; +typedef int64_t int64; +typedef uint32_t uint32; +typedef int32_t int32; +typedef uint16_t uint16; +typedef int16_t int16; + +//FIXME +typedef char psPool_t; + +//#ifdef PS_PUBKEY_OPTIMIZE_FOR_SMALLER_RAM +#define PS_EXPTMOD_WINSIZE 3 +//#ifdef PS_PUBKEY_OPTIMIZE_FOR_FASTER_SPEED +//#define PS_EXPTMOD_WINSIZE 5 + +#define PUBKEY_TYPE 0x01 +#define PRIVKEY_TYPE 0x02 + +void tls_get_random(void *buf, unsigned len); + +#define matrixCryptoGetPrngData(buf, len, userPtr) (tls_get_random(buf, len), PS_SUCCESS) + +#define psFree(p, pool) free(p) +#define psTraceCrypto(msg) bb_error_msg_and_die(msg) + +/* Secure zerofill */ +#define memset_s(A,B,C,D) memset((A),(C),(D)) +/* Constant time memory comparison */ +#define memcmpct(s1, s2, len) memcmp((s1), (s2), (len)) +#undef min +#define min(x, y) ((x) < (y) ? (x) : (y)) + + +#include "tls_pstm.h" +#include "tls_rsa.h" diff --git a/networking/tls_pstm.c b/networking/tls_pstm.c new file mode 100644 index 000000000..0d797f87f --- /dev/null +++ b/networking/tls_pstm.c @@ -0,0 +1,2254 @@ +/* + * Copyright (C) 2017 Denys Vlasenko + * + * Licensed under GPLv2, see file LICENSE in this source tree. + */ +#include "tls.h" + +/** + * @file pstm.c + * @version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master) + * + * Multiprecision number implementation. + */ +/* + * Copyright (c) 2013-2015 INSIDE Secure Corporation + * Copyright (c) PeerSec Networks, 2002-2011 + * All Rights Reserved + * + * The latest version of this code is available at http://www.matrixssl.org + * + * This software is open source; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This General Public License does NOT permit incorporating this software + * into proprietary programs. If you are unable to comply with the GPL, a + * commercial license for this software may be purchased from INSIDE at + * http://www.insidesecure.com/eng/Company/Locations + * + * This program is distributed in WITHOUT ANY WARRANTY; without even the + * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * http://www.gnu.org/copyleft/gpl.html + */ +/******************************************************************************/ + +///bbox +//#include "../cryptoApi.h" +#ifndef DISABLE_PSTM + +static int32 pstm_mul_2d(pstm_int *a, int16 b, pstm_int *c); + +/******************************************************************************/ +/* + init an pstm_int for a given size + */ +int32 pstm_init_size(psPool_t *pool, pstm_int * a, uint32 size) +{ +// uint16 x; + +/* + alloc mem + */ + a->dp = xzalloc(sizeof (pstm_digit) * size); + a->pool = pool; + a->used = 0; + a->alloc = (int16)size; + a->sign = PSTM_ZPOS; +/* + zero the digits + */ +///bbox +// for (x = 0; x < size; x++) { +// a->dp[x] = 0; +// } + return PSTM_OKAY; +} + +/******************************************************************************/ +/* + Init a new pstm_int. 
+*/ +int32 pstm_init(psPool_t *pool, pstm_int * a) +{ +// int32 i; +/* + allocate memory required and clear it + */ + a->dp = xzalloc(sizeof (pstm_digit) * PSTM_DEFAULT_INIT); +/* + set the digits to zero + */ +///bbox +// for (i = 0; i < PSTM_DEFAULT_INIT; i++) { +// a->dp[i] = 0; +// } +/* + set the used to zero, allocated digits to the default precision and sign + to positive + */ + a->pool = pool; + a->used = 0; + a->alloc = PSTM_DEFAULT_INIT; + a->sign = PSTM_ZPOS; + + return PSTM_OKAY; +} + +/******************************************************************************/ +/* + Grow as required + */ +int32 pstm_grow(pstm_int * a, int16 size) +{ + int16 i; + pstm_digit *tmp; + +/* + If the alloc size is smaller alloc more ram. + */ + if (a->alloc < size) { +/* + Reallocate the array a->dp + + We store the return in a temporary variable in case the operation + failed we don't want to overwrite the dp member of a. +*/ + tmp = xrealloc(a->dp, sizeof (pstm_digit) * size); +/* + reallocation succeeded so set a->dp + */ + a->dp = tmp; +/* + zero excess digits + */ + i = a->alloc; + a->alloc = size; + for (; i < a->alloc; i++) { + a->dp[i] = 0; + } + } + return PSTM_OKAY; +} + +/******************************************************************************/ +/* + copy, b = a (b must be pre-allocated) + */ +int32 pstm_copy(pstm_int * a, pstm_int * b) +{ + int32 res, n; + +/* + If dst == src do nothing + */ + if (a == b) { + return PSTM_OKAY; + } +/* + Grow dest + */ + if (b->alloc < a->used) { + if ((res = pstm_grow (b, a->used)) != PSTM_OKAY) { + return res; + } + } +/* + Zero b and copy the parameters over + */ + { + register pstm_digit *tmpa, *tmpb; + + /* pointer aliases */ + /* source */ + tmpa = a->dp; + + /* destination */ + tmpb = b->dp; + + /* copy all the digits */ + for (n = 0; n < a->used; n++) { + *tmpb++ = *tmpa++; + } + + /* clear high digits */ + for (; n < b->used; n++) { + *tmpb++ = 0; + } + } +/* + copy used count and sign + */ + b->used = a->used; + 
b->sign = a->sign; + return PSTM_OKAY; +} + +/******************************************************************************/ +/* + Trim unused digits + + This is used to ensure that leading zero digits are trimed and the + leading "used" digit will be non-zero. Typically very fast. Also fixes + the sign if there are no more leading digits +*/ +void pstm_clamp(pstm_int * a) +{ +/* decrease used while the most significant digit is zero. */ + while (a->used > 0 && a->dp[a->used - 1] == 0) { + --(a->used); + } +/* reset the sign flag if used == 0 */ + if (a->used == 0) { + a->sign = PSTM_ZPOS; + } +} + +/******************************************************************************/ +/* + clear one (frees). + */ +void pstm_clear(pstm_int * a) +{ + int32 i; +/* + only do anything if a hasn't been freed previously + */ + if (a != NULL && a->dp != NULL) { +/* + first zero the digits + */ + for (i = 0; i < a->used; i++) { + a->dp[i] = 0; + } + + psFree (a->dp, a->pool); +/* + reset members to make debugging easier + */ + a->dp = NULL; + a->alloc = a->used = 0; + a->sign = PSTM_ZPOS; + } +} + +/******************************************************************************/ +/* + clear many (frees). + */ +void pstm_clear_multi(pstm_int *mp0, pstm_int *mp1, pstm_int *mp2, + pstm_int *mp3, pstm_int *mp4, pstm_int *mp5, + pstm_int *mp6, pstm_int *mp7) +{ + int32 n; /* Number of ok inits */ + + pstm_int *tempArray[9]; + + tempArray[0] = mp0; + tempArray[1] = mp1; + tempArray[2] = mp2; + tempArray[3] = mp3; + tempArray[4] = mp4; + tempArray[5] = mp5; + tempArray[6] = mp6; + tempArray[7] = mp7; + tempArray[8] = NULL; + + for (n = 0; tempArray[n] != NULL; n++) { + if ((tempArray[n] != NULL) && (tempArray[n]->dp != NULL)) { + pstm_clear(tempArray[n]); + } + } +} + +/******************************************************************************/ +/* + Set to zero. 
+ */ +void pstm_zero(pstm_int * a) +{ + int32 n; + pstm_digit *tmp; + + a->sign = PSTM_ZPOS; + a->used = 0; + + tmp = a->dp; + for (n = 0; n < a->alloc; n++) { + *tmp++ = 0; + } +} + + +/******************************************************************************/ +/* + Compare maginitude of two ints (unsigned). + */ +int32 pstm_cmp_mag(pstm_int * a, pstm_int * b) +{ + int16 n; + pstm_digit *tmpa, *tmpb; + +/* + compare based on # of non-zero digits + */ + if (a->used > b->used) { + return PSTM_GT; + } + + if (a->used < b->used) { + return PSTM_LT; + } + + /* alias for a */ + tmpa = a->dp + (a->used - 1); + + /* alias for b */ + tmpb = b->dp + (a->used - 1); + +/* + compare based on digits + */ + for (n = 0; n < a->used; ++n, --tmpa, --tmpb) { + if (*tmpa > *tmpb) { + return PSTM_GT; + } + if (*tmpa < *tmpb) { + return PSTM_LT; + } + } + return PSTM_EQ; +} + +/******************************************************************************/ +/* + Compare two ints (signed) + */ +int32 pstm_cmp(pstm_int * a, pstm_int * b) +{ +/* + compare based on sign + */ + if (a->sign != b->sign) { + if (a->sign == PSTM_NEG) { + return PSTM_LT; + } else { + return PSTM_GT; + } + } +/* + compare digits + */ + if (a->sign == PSTM_NEG) { + /* if negative compare opposite direction */ + return pstm_cmp_mag(b, a); + } else { + return pstm_cmp_mag(a, b); + } +} + +/******************************************************************************/ +/* + pstm_ints can be initialized more precisely when they will populated + using pstm_read_unsigned_bin since the length of the byte stream is known +*/ +int32 pstm_init_for_read_unsigned_bin(psPool_t *pool, pstm_int *a, uint32 len) +{ + int32 size; +/* + Need to set this based on how many words max it will take to store the bin. 
+ The magic + 2: + 1 to round up for the remainder of this integer math + 1 for the initial carry of '1' bits that fall between DIGIT_BIT and 8 +*/ + size = (((len / sizeof(pstm_digit)) * (sizeof(pstm_digit) * CHAR_BIT)) + / DIGIT_BIT) + 2; + return pstm_init_size(pool, a, size); +} + + +/******************************************************************************/ +/* + Reads a unsigned char array into pstm_int format. User should have + called pstm_init_for_read_unsigned_bin first. There is some grow logic + here if the default pstm_init was used but we don't really want to hit it. +*/ +int32 pstm_read_unsigned_bin(pstm_int *a, unsigned char *b, int32 c) +{ + /* zero the int */ + pstm_zero (a); + +/* + If we know the endianness of this architecture, and we're using + 32-bit pstm_digits, we can optimize this +*/ +#if (defined(ENDIAN_LITTLE) || defined(ENDIAN_BIG)) && !defined(PSTM_64BIT) + /* But not for both simultaneously */ +#if defined(ENDIAN_LITTLE) && defined(ENDIAN_BIG) +#error Both ENDIAN_LITTLE and ENDIAN_BIG defined. +#endif + { + unsigned char *pd; + if ((unsigned)c > (PSTM_MAX_SIZE * sizeof(pstm_digit))) { + uint32 excess = c - (PSTM_MAX_SIZE * sizeof(pstm_digit)); + c -= excess; + b += excess; + } + a->used = (int16)((c + sizeof(pstm_digit) - 1)/sizeof(pstm_digit)); + if (a->alloc < a->used) { + if (pstm_grow(a, a->used) != PSTM_OKAY) { + return PSTM_MEM; + } + } + pd = (unsigned char *)a->dp; + /* read the bytes in */ +#ifdef ENDIAN_BIG + { + /* Use Duff's device to unroll the loop. */ + int32 idx = (c - 1) & ~3; + switch (c % 4) { + case 0: do { pd[idx+0] = *b++; + case 3: pd[idx+1] = *b++; + case 2: pd[idx+2] = *b++; + case 1: pd[idx+3] = *b++; + idx -= 4; + } while ((c -= 4) > 0); + } + } +#else + for (c -= 1; c >= 0; c -= 1) { + pd[c] = *b++; + } +#endif + } +#else + /* Big enough based on the len? 
*/ + a->used = (((c / sizeof(pstm_digit)) * (sizeof(pstm_digit) * CHAR_BIT)) + / DIGIT_BIT) + 2; + + if (a->alloc < a->used) { + if (pstm_grow(a, a->used) != PSTM_OKAY) { + return PSTM_MEM; + } + } + /* read the bytes in */ + for (; c > 0; c--) { + if (pstm_mul_2d (a, 8, a) != PSTM_OKAY) { + return PS_MEM_FAIL; + } + a->dp[0] |= *b++; + a->used += 1; + } +#endif + + pstm_clamp (a); + return PS_SUCCESS; +} + +/******************************************************************************/ +/* +*/ +int16 pstm_count_bits (pstm_int * a) +{ + int16 r; + pstm_digit q; + + if (a->used == 0) { + return 0; + } + + /* get number of digits and add that */ + r = (a->used - 1) * DIGIT_BIT; + + /* take the last digit and count the bits in it */ + q = a->dp[a->used - 1]; + while (q > ((pstm_digit) 0)) { + ++r; + q >>= ((pstm_digit) 1); + } + return r; +} + +/******************************************************************************/ +int32 pstm_unsigned_bin_size(pstm_int *a) +{ + int32 size = pstm_count_bits (a); + return (size / 8 + ((size & 7) != 0 ? 1 : 0)); +} + +/******************************************************************************/ +void pstm_set(pstm_int *a, pstm_digit b) +{ + pstm_zero(a); + a->dp[0] = b; + a->used = a->dp[0] ? 1 : 0; +} + +/******************************************************************************/ +/* + Right shift +*/ +void pstm_rshd(pstm_int *a, int16 x) +{ + int16 y; + + /* too many digits just zero and return */ + if (x >= a->used) { + pstm_zero(a); + return; + } + + /* shift */ + for (y = 0; y < a->used - x; y++) { + a->dp[y] = a->dp[y+x]; + } + + /* zero rest */ + for (; y < a->used; y++) { + a->dp[y] = 0; + } + + /* decrement count */ + a->used -= x; + pstm_clamp(a); +} + +/******************************************************************************/ +/* + Shift left a certain amount of digits. + */ +int32 pstm_lshd(pstm_int * a, int16 b) +{ + int16 x; + int32 res; + +/* + If its less than zero return. 
+ */ + if (b <= 0) { + return PSTM_OKAY; + } +/* + Grow to fit the new digits. + */ + if (a->alloc < a->used + b) { + if ((res = pstm_grow (a, a->used + b)) != PSTM_OKAY) { + return res; + } + } + + { + register pstm_digit *top, *bottom; +/* + Increment the used by the shift amount then copy upwards. + */ + a->used += b; + + /* top */ + top = a->dp + a->used - 1; + + /* base */ + bottom = a->dp + a->used - 1 - b; +/* + This is implemented using a sliding window except the window goes the + other way around. Copying from the bottom to the top. + */ + for (x = a->used - 1; x >= b; x--) { + *top-- = *bottom--; + } + + /* zero the lower digits */ + top = a->dp; + for (x = 0; x < b; x++) { + *top++ = 0; + } + } + return PSTM_OKAY; +} + +/******************************************************************************/ +/* + computes a = 2**b +*/ +int32 pstm_2expt(pstm_int *a, int16 b) +{ + int16 z; + + /* zero a as per default */ + pstm_zero (a); + + if (b < 0) { + return PSTM_OKAY; + } + + z = b / DIGIT_BIT; + if (z >= PSTM_MAX_SIZE) { + return PS_LIMIT_FAIL; + } + + /* set the used count of where the bit will go */ + a->used = z + 1; + + if (a->used > a->alloc) { + if (pstm_grow(a, a->used) != PSTM_OKAY) { + return PS_MEM_FAIL; + } + } + + /* put the single bit in its place */ + a->dp[z] = ((pstm_digit)1) << (b % DIGIT_BIT); + return PSTM_OKAY; +} + +/******************************************************************************/ +/* + +*/ +int32 pstm_mul_2(pstm_int * a, pstm_int * b) +{ + int32 res; + int16 x, oldused; + +/* + grow to accomodate result + */ + if (b->alloc < a->used + 1) { + if ((res = pstm_grow (b, a->used + 1)) != PSTM_OKAY) { + return res; + } + } + oldused = b->used; + b->used = a->used; + + { + register pstm_digit r, rr, *tmpa, *tmpb; + + /* alias for source */ + tmpa = a->dp; + + /* alias for dest */ + tmpb = b->dp; + + /* carry */ + r = 0; + for (x = 0; x < a->used; x++) { +/* + get what will be the *next* carry bit from the + MSB of the current 
digit +*/ + rr = *tmpa >> ((pstm_digit)(DIGIT_BIT - 1)); +/* + now shift up this digit, add in the carry [from the previous] +*/ + *tmpb++ = ((*tmpa++ << ((pstm_digit)1)) | r); +/* + copy the carry that would be from the source + digit into the next iteration +*/ + r = rr; + } + + /* new leading digit? */ + if (r != 0 && b->used != (PSTM_MAX_SIZE-1)) { + /* add a MSB which is always 1 at this point */ + *tmpb = 1; + ++(b->used); + } +/* + now zero any excess digits on the destination that we didn't write to +*/ + tmpb = b->dp + b->used; + for (x = b->used; x < oldused; x++) { + *tmpb++ = 0; + } + } + b->sign = a->sign; + return PSTM_OKAY; +} + +/******************************************************************************/ +/* + unsigned subtraction ||a|| >= ||b|| ALWAYS! +*/ +int32 s_pstm_sub(pstm_int *a, pstm_int *b, pstm_int *c) +{ + int16 oldbused, oldused; + int32 x; + pstm_word t; + + if (b->used > a->used) { + return PS_LIMIT_FAIL; + } + if (c->alloc < a->used) { + if ((x = pstm_grow (c, a->used)) != PSTM_OKAY) { + return x; + } + } + oldused = c->used; + oldbused = b->used; + c->used = a->used; + t = 0; + + for (x = 0; x < oldbused; x++) { + t = ((pstm_word)a->dp[x]) - (((pstm_word)b->dp[x]) + t); + c->dp[x] = (pstm_digit)t; + t = (t >> DIGIT_BIT)&1; + } + for (; x < a->used; x++) { + t = ((pstm_word)a->dp[x]) - t; + c->dp[x] = (pstm_digit)t; + t = (t >> DIGIT_BIT); + } + for (; x < oldused; x++) { + c->dp[x] = 0; + } + pstm_clamp(c); + return PSTM_OKAY; +} + +/******************************************************************************/ +/* + unsigned addition +*/ +static int32 s_pstm_add(pstm_int *a, pstm_int *b, pstm_int *c) +{ + int16 x, y, oldused; + register pstm_word t, adp, bdp; + + y = a->used; + if (b->used > y) { + y = b->used; + } + oldused = c->used; + c->used = y; + + if (c->used > c->alloc) { + if (pstm_grow(c, c->used) != PSTM_OKAY) { + return PS_MEM_FAIL; + } + } + + t = 0; + for (x = 0; x < y; x++) { + if (a->used < x) { + adp = 0; + } 
else { + adp = (pstm_word)a->dp[x]; + } + if (b->used < x) { + bdp = 0; + } else { + bdp = (pstm_word)b->dp[x]; + } + t += (adp) + (bdp); + c->dp[x] = (pstm_digit)t; + t >>= DIGIT_BIT; + } + if (t != 0 && x < PSTM_MAX_SIZE) { + if (c->used == c->alloc) { + if (pstm_grow(c, c->alloc + 1) != PSTM_OKAY) { + return PS_MEM_FAIL; + } + } + c->dp[c->used++] = (pstm_digit)t; + ++x; + } + + c->used = x; + for (; x < oldused; x++) { + c->dp[x] = 0; + } + pstm_clamp(c); + return PSTM_OKAY; +} + + +/******************************************************************************/ +/* + +*/ +int32 pstm_sub(pstm_int *a, pstm_int *b, pstm_int *c) +{ + int32 res; + int16 sa, sb; + + sa = a->sign; + sb = b->sign; + + if (sa != sb) { +/* + subtract a negative from a positive, OR a positive from a negative. + For both, ADD their magnitudes, and use the sign of the first number. + */ + c->sign = sa; + if ((res = s_pstm_add (a, b, c)) != PSTM_OKAY) { + return res; + } + } else { +/* + subtract a positive from a positive, OR a negative from a negative. + First, take the difference between their magnitudes, then... + */ + if (pstm_cmp_mag (a, b) != PSTM_LT) { + /* Copy the sign from the first */ + c->sign = sa; + /* The first has a larger or equal magnitude */ + if ((res = s_pstm_sub (a, b, c)) != PSTM_OKAY) { + return res; + } + } else { + /* The result has the _opposite_ sign from the first number. */ + c->sign = (sa == PSTM_ZPOS) ? 
PSTM_NEG : PSTM_ZPOS; + /* The second has a larger magnitude */ + if ((res = s_pstm_sub (b, a, c)) != PSTM_OKAY) { + return res; + } + } + } + return PS_SUCCESS; +} + +/******************************************************************************/ +/* + c = a - b +*/ +int32 pstm_sub_d(psPool_t *pool, pstm_int *a, pstm_digit b, pstm_int *c) +{ + pstm_int tmp; + int32 res; + + if (pstm_init_size(pool, &tmp, sizeof(pstm_digit)) != PSTM_OKAY) { + return PS_MEM_FAIL; + } + pstm_set(&tmp, b); + res = pstm_sub(a, &tmp, c); + pstm_clear(&tmp); + return res; +} + +/******************************************************************************/ +/* + setups the montgomery reduction +*/ +int32 pstm_montgomery_setup(pstm_int *a, pstm_digit *rho) +{ + pstm_digit x, b; + +/* + fast inversion mod 2**k + Based on the fact that + XA = 1 (mod 2**n) => (X(2-XA)) A = 1 (mod 2**2n) + => 2*X*A - X*X*A*A = 1 + => 2*(1) - (1) = 1 + */ + b = a->dp[0]; + + if ((b & 1) == 0) { + psTraceCrypto("pstm_montogomery_setup failure\n"); + return PS_ARG_FAIL; + } + + x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */ + x *= 2 - b * x; /* here x*a==1 mod 2**8 */ + x *= 2 - b * x; /* here x*a==1 mod 2**16 */ + x *= 2 - b * x; /* here x*a==1 mod 2**32 */ +#ifdef PSTM_64BIT + x *= 2 - b * x; /* here x*a==1 mod 2**64 */ +#endif + /* rho = -1/m mod b */ + *rho = (pstm_digit)(((pstm_word) 1 << ((pstm_word) DIGIT_BIT)) - + ((pstm_word)x)); + return PSTM_OKAY; +} + +/******************************************************************************/ +/* + * computes a = B**n mod b without division or multiplication useful for + * normalizing numbers in a Montgomery system. 
+ */ +int32 pstm_montgomery_calc_normalization(pstm_int *a, pstm_int *b) +{ + int32 x; + int16 bits; + + /* how many bits of last digit does b use */ + bits = pstm_count_bits (b) % DIGIT_BIT; + if (!bits) bits = DIGIT_BIT; + + /* compute A = B^(n-1) * 2^(bits-1) */ + if (b->used > 1) { + if ((x = pstm_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != + PSTM_OKAY) { + return x; + } + } else { + pstm_set(a, 1); + bits = 1; + } + + /* now compute C = A * B mod b */ + for (x = bits - 1; x < (int32)DIGIT_BIT; x++) { + if (pstm_mul_2 (a, a) != PSTM_OKAY) { + return PS_MEM_FAIL; + } + if (pstm_cmp_mag (a, b) != PSTM_LT) { + if (s_pstm_sub (a, b, a) != PSTM_OKAY) { + return PS_MEM_FAIL; + } + } + } + return PSTM_OKAY; +} + +/******************************************************************************/ +/* + c = a * 2**d +*/ +static int32 pstm_mul_2d(pstm_int *a, int16 b, pstm_int *c) +{ + pstm_digit carry, carrytmp, shift; + int16 x; + + /* copy it */ + if (pstm_copy(a, c) != PSTM_OKAY) { + return PS_MEM_FAIL; + } + + /* handle whole digits */ + if (b >= DIGIT_BIT) { + if (pstm_lshd(c, b/DIGIT_BIT) != PSTM_OKAY) { + return PS_MEM_FAIL; + } + } + b %= DIGIT_BIT; + + /* shift the digits */ + if (b != 0) { + carry = 0; + shift = DIGIT_BIT - b; + for (x = 0; x < c->used; x++) { + carrytmp = c->dp[x] >> shift; + c->dp[x] = (c->dp[x] << b) + carry; + carry = carrytmp; + } + /* store last carry if room */ + if (carry && x < PSTM_MAX_SIZE) { + if (c->used == c->alloc) { + if (pstm_grow(c, c->alloc + 1) != PSTM_OKAY) { + return PS_MEM_FAIL; + } + } + c->dp[c->used++] = carry; + } + } + pstm_clamp(c); + return PSTM_OKAY; +} + +/******************************************************************************/ +/* + c = a mod 2**d +*/ +static int32 pstm_mod_2d(pstm_int *a, int16 b, pstm_int *c) +{ + int16 x; + + /* zero if count less than or equal to zero */ + if (b <= 0) { + pstm_zero(c); + return PSTM_OKAY; + } + + /* get copy of input */ + if (pstm_copy(a, c) != PSTM_OKAY) { + 
return PS_MEM_FAIL; + } + + /* if 2**d is larger than we just return */ + if (b >= (DIGIT_BIT * a->used)) { + return PSTM_OKAY; + } + + /* zero digits above the last digit of the modulus */ + for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) + { + c->dp[x] = 0; + } + /* clear the digit that is not completely outside/inside the modulus */ + c->dp[b / DIGIT_BIT] &= ~((pstm_digit)0) >> (DIGIT_BIT - b); + pstm_clamp (c); + return PSTM_OKAY; +} + + +/******************************************************************************/ +/* + c = a * b +*/ +int32 pstm_mul_d(pstm_int *a, pstm_digit b, pstm_int *c) +{ + pstm_word w; + int32 res; + int16 x, oldused; + + if (c->alloc < a->used + 1) { + if ((res = pstm_grow (c, a->used + 1)) != PSTM_OKAY) { + return res; + } + } + oldused = c->used; + c->used = a->used; + c->sign = a->sign; + w = 0; + for (x = 0; x < a->used; x++) { + w = ((pstm_word)a->dp[x]) * ((pstm_word)b) + w; + c->dp[x] = (pstm_digit)w; + w = w >> DIGIT_BIT; + } + if (w != 0 && (a->used != PSTM_MAX_SIZE)) { + c->dp[c->used++] = (pstm_digit)w; + ++x; + } + for (; x < oldused; x++) { + c->dp[x] = 0; + } + pstm_clamp(c); + return PSTM_OKAY; +} + +/******************************************************************************/ +/* + c = a / 2**b +*/ +int32 pstm_div_2d(psPool_t *pool, pstm_int *a, int16 b, pstm_int *c, + pstm_int *d) +{ + pstm_digit D, r, rr; + int32 res; + int16 x; + pstm_int t; + + /* if the shift count is <= 0 then we do no work */ + if (b <= 0) { + if (pstm_copy (a, c) != PSTM_OKAY) { + return PS_MEM_FAIL; + } + if (d != NULL) { + pstm_zero (d); + } + return PSTM_OKAY; + } + + /* get the remainder */ + if (d != NULL) { + if (pstm_init(pool, &t) != PSTM_OKAY) { + return PS_MEM_FAIL; + } + if (pstm_mod_2d (a, b, &t) != PSTM_OKAY) { + res = PS_MEM_FAIL; + goto LBL_DONE; + } + } + + /* copy */ + if (pstm_copy(a, c) != PSTM_OKAY) { + res = PS_MEM_FAIL; + goto LBL_DONE; + } + + /* shift by as many digits in the bit count */ 
+ if (b >= (int32)DIGIT_BIT) { + pstm_rshd (c, b / DIGIT_BIT); + } + + /* shift any bit count < DIGIT_BIT */ + D = (pstm_digit) (b % DIGIT_BIT); + if (D != 0) { + register pstm_digit *tmpc, mask, shift; + + /* mask */ + mask = (((pstm_digit)1) << D) - 1; + + /* shift for lsb */ + shift = DIGIT_BIT - D; + + /* alias */ + tmpc = c->dp + (c->used - 1); + + /* carry */ + r = 0; + for (x = c->used - 1; x >= 0; x--) { + /* get the lower bits of this word in a temp */ + rr = *tmpc & mask; + + /* shift the current word and mix in the carry bits from previous */ + *tmpc = (*tmpc >> D) | (r << shift); + --tmpc; + + /* set the carry to the carry bits of the current word above */ + r = rr; + } + } + pstm_clamp (c); + + res = PSTM_OKAY; +LBL_DONE: + if (d != NULL) { + if (pstm_copy(&t, d) != PSTM_OKAY) { + res = PS_MEM_FAIL; + } + pstm_clear(&t); + } + return res; +} + +/******************************************************************************/ +/* + b = a/2 +*/ +int32 pstm_div_2(pstm_int * a, pstm_int * b) +{ + int16 x, oldused; + + if (b->alloc < a->used) { + if (pstm_grow(b, a->used) != PSTM_OKAY) { + return PS_MEM_FAIL; + } + } + oldused = b->used; + b->used = a->used; + { + register pstm_digit r, rr, *tmpa, *tmpb; + + /* source alias */ + tmpa = a->dp + b->used - 1; + + /* dest alias */ + tmpb = b->dp + b->used - 1; + + /* carry */ + r = 0; + for (x = b->used - 1; x >= 0; x--) { + /* get the carry for the next iteration */ + rr = *tmpa & 1; + + /* shift the current digit, add in carry and store */ + *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1)); + + /* forward carry to next iteration */ + r = rr; + } + + /* zero excess digits */ + tmpb = b->dp + b->used; + for (x = b->used; x < oldused; x++) { + *tmpb++ = 0; + } + } + b->sign = a->sign; + pstm_clamp (b); + return PSTM_OKAY; +} + +/******************************************************************************/ +/* + Creates "a" then copies b into it + */ +int32 pstm_init_copy(psPool_t *pool, pstm_int * a, pstm_int 
* b, int16 toSqr) +{ + int16 x; + int32 res; + + if (a == b) { + return PSTM_OKAY; + } + x = b->alloc; + + if (toSqr) { +/* + Smart-size: Increasing size of a if b->used is roughly half + of b->alloc because usage has shown that a lot of these copies + go on to be squared and need these extra digits +*/ + if ((b->used * 2) + 2 >= x) { + x = (b->used * 2) + 3; + } + } + if ((res = pstm_init_size(pool, a, x)) != PSTM_OKAY) { + return res; + } + return pstm_copy(b, a); +} + +/******************************************************************************/ +/* + With some compilers, we have seen issues linking with the builtin + 64 bit division routine. The issues with either manifest in a failure + to find 'udivdi3' at link time, or a runtime invalid instruction fault + during an RSA operation. + The routine below divides a 64 bit unsigned int by a 32 bit unsigned int + explicitly, rather than using the division operation + The 64 bit result is placed in the 'numerator' parameter + The 32 bit mod (remainder) of the division is the return parameter + Based on implementations by: + Copyright (C) 2003 Bernardo Innocenti + Copyright (C) 1999 Hewlett-Packard Co + Copyright (C) 1999 David Mosberger-Tang +*/ +#if defined(USE_MATRIX_DIV64) && defined(PSTM_32BIT) +static uint32 psDiv64(uint64 *numerator, uint32 denominator) +{ + uint64 rem = *numerator; + uint64 b = denominator; + uint64 res = 0; + uint64 d = 1; + uint32 high = rem >> 32; + + if (high >= denominator) { + high /= denominator; + res = (uint64) high << 32; + rem -= (uint64) (high * denominator) << 32; + } + while ((int64)b > 0 && b < rem) { + b = b+b; + d = d+d; + } + do { + if (rem >= b) { + rem -= b; + res += d; + } + b >>= 1; + d >>= 1; + } while (d); + *numerator = res; + return rem; +} +#endif /* USE_MATRIX_DIV64 */ + +#if defined(USE_MATRIX_DIV128) && defined(PSTM_64BIT) +typedef unsigned long uint128 __attribute__ ((mode(TI))); +static uint64 psDiv128(uint128 *numerator, uint64 denominator) +{ + uint128 rem 
= *numerator; + uint128 b = denominator; + uint128 res = 0; + uint128 d = 1; + uint64 high = rem >> 64; + + if (high >= denominator) { + high /= denominator; + res = (uint128) high << 64; + rem -= (uint128) (high * denominator) << 64; + } + while ((uint128)b > 0 && b < rem) { + b = b+b; + d = d+d; + } + do { + if (rem >= b) { + rem -= b; + res += d; + } + b >>= 1; + d >>= 1; + } while (d); + *numerator = res; + return rem; +} +#endif /* USE_MATRIX_DIV128 */ + +/******************************************************************************/ +/* + a/b => cb + d == a +*/ +int32 pstm_div(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c, + pstm_int *d) +{ + pstm_int q, x, y, t1, t2; + int32 res; + int16 n, t, i, norm, neg; + + /* is divisor zero ? */ + if (pstm_iszero (b) == 1) { + return PS_LIMIT_FAIL; + } + + /* if a < b then q=0, r = a */ + if (pstm_cmp_mag (a, b) == PSTM_LT) { + if (d != NULL) { + if (pstm_copy(a, d) != PSTM_OKAY) { + return PS_MEM_FAIL; + } + } + if (c != NULL) { + pstm_zero (c); + } + return PSTM_OKAY; + } +/* + Smart-size inits +*/ + if ((res = pstm_init_size(pool, &t1, a->alloc)) != PSTM_OKAY) { + return res; + } + if ((res = pstm_init_size(pool, &t2, 3)) != PSTM_OKAY) { + goto LBL_T1; + } + if ((res = pstm_init_copy(pool, &x, a, 0)) != PSTM_OKAY) { + goto LBL_T2; + } +/* + Used to be an init_copy on b but pstm_grow was always hit with triple size +*/ + if ((res = pstm_init_size(pool, &y, b->used * 3)) != PSTM_OKAY) { + goto LBL_X; + } + if ((res = pstm_copy(b, &y)) != PSTM_OKAY) { + goto LBL_Y; + } + + /* fix the sign */ + neg = (a->sign == b->sign) ? 
PSTM_ZPOS : PSTM_NEG; + x.sign = y.sign = PSTM_ZPOS; + + /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */ + norm = pstm_count_bits(&y) % DIGIT_BIT; + if (norm < (int32)(DIGIT_BIT-1)) { + norm = (DIGIT_BIT-1) - norm; + if ((res = pstm_mul_2d(&x, norm, &x)) != PSTM_OKAY) { + goto LBL_Y; + } + if ((res = pstm_mul_2d(&y, norm, &y)) != PSTM_OKAY) { + goto LBL_Y; + } + } else { + norm = 0; + } + + /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */ + n = x.used - 1; + t = y.used - 1; + + if ((res = pstm_init_size(pool, &q, n - t + 1)) != PSTM_OKAY) { + goto LBL_Y; + } + q.used = n - t + 1; + + /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */ + if ((res = pstm_lshd(&y, n - t)) != PSTM_OKAY) { /* y = y*b**{n-t} */ + goto LBL_Q; + } + + while (pstm_cmp (&x, &y) != PSTM_LT) { + ++(q.dp[n - t]); + if ((res = pstm_sub(&x, &y, &x)) != PSTM_OKAY) { + goto LBL_Q; + } + } + + /* reset y by shifting it back down */ + pstm_rshd (&y, n - t); + + /* step 3. for i from n down to (t + 1) */ + for (i = n; i >= (t + 1); i--) { + if (i > x.used) { + continue; + } + + /* step 3.1 if xi == yt then set q{i-t-1} to b-1, + * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */ + if (x.dp[i] == y.dp[t]) { + q.dp[i - t - 1] = (pstm_digit)((((pstm_word)1) << DIGIT_BIT) - 1); + } else { + pstm_word tmp; + tmp = ((pstm_word) x.dp[i]) << ((pstm_word) DIGIT_BIT); + tmp |= ((pstm_word) x.dp[i - 1]); +#if defined(USE_MATRIX_DIV64) && defined(PSTM_32BIT) + psDiv64(&tmp, y.dp[t]); +#elif defined(USE_MATRIX_DIV128) && defined(PSTM_64BIT) + psDiv128(&tmp, y.dp[t]); +#else + tmp /= ((pstm_word) y.dp[t]); +#endif /* USE_MATRIX_DIV64 */ + q.dp[i - t - 1] = (pstm_digit) (tmp); + } + + /* while (q{i-t-1} * (yt * b + y{t-1})) > + xi * b**2 + xi-1 * b + xi-2 + + do q{i-t-1} -= 1; + */ + q.dp[i - t - 1] = (q.dp[i - t - 1] + 1); + do { + q.dp[i - t - 1] = (q.dp[i - t - 1] - 1); + + /* find left hand */ + pstm_zero (&t1); + t1.dp[0] = (t - 1 < 0) ? 
0 : y.dp[t - 1]; + t1.dp[1] = y.dp[t]; + t1.used = 2; + if ((res = pstm_mul_d (&t1, q.dp[i - t - 1], &t1)) != PSTM_OKAY) { + goto LBL_Q; + } + + /* find right hand */ + t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2]; + t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1]; + t2.dp[2] = x.dp[i]; + t2.used = 3; + } while (pstm_cmp_mag(&t1, &t2) == PSTM_GT); + + /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */ + if ((res = pstm_mul_d(&y, q.dp[i - t - 1], &t1)) != PSTM_OKAY) { + goto LBL_Q; + } + + if ((res = pstm_lshd(&t1, i - t - 1)) != PSTM_OKAY) { + goto LBL_Q; + } + + if ((res = pstm_sub(&x, &t1, &x)) != PSTM_OKAY) { + goto LBL_Q; + } + + /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */ + if (x.sign == PSTM_NEG) { + if ((res = pstm_copy(&y, &t1)) != PSTM_OKAY) { + goto LBL_Q; + } + if ((res = pstm_lshd (&t1, i - t - 1)) != PSTM_OKAY) { + goto LBL_Q; + } + if ((res = pstm_add (&x, &t1, &x)) != PSTM_OKAY) { + goto LBL_Q; + } + q.dp[i - t - 1] = q.dp[i - t - 1] - 1; + } + } +/* + now q is the quotient and x is the remainder (which we have to normalize) +*/ + /* get sign before writing to c */ + x.sign = x.used == 0 ? 
PSTM_ZPOS : a->sign; + + if (c != NULL) { + pstm_clamp (&q); + if (pstm_copy (&q, c) != PSTM_OKAY) { + res = PS_MEM_FAIL; + goto LBL_Q; + } + c->sign = neg; + } + + if (d != NULL) { + if ((res = pstm_div_2d (pool, &x, norm, &x, NULL)) != PSTM_OKAY) { + goto LBL_Q; + } +/* + the following is a kludge, essentially we were seeing the right + remainder but with excess digits that should have been zero + */ + for (i = b->used; i < x.used; i++) { + x.dp[i] = 0; + } + pstm_clamp(&x); + if (pstm_copy (&x, d) != PSTM_OKAY) { + res = PS_MEM_FAIL; + goto LBL_Q; + } + } + + res = PSTM_OKAY; + +LBL_Q:pstm_clear (&q); +LBL_Y:pstm_clear (&y); +LBL_X:pstm_clear (&x); +LBL_T2:pstm_clear (&t2); +LBL_T1:pstm_clear (&t1); + + return res; +} + +/******************************************************************************/ +/* + Swap the elements of two integers, for cases where you can't simply swap + the pstm_int pointers around +*/ +void pstm_exch(pstm_int * a, pstm_int * b) +{ + pstm_int t; + + t = *a; + *a = *b; + *b = t; +} + +/******************************************************************************/ +/* + c = a mod b, 0 <= c < b +*/ +int32 pstm_mod(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c) +{ + pstm_int t; + int32 err; +/* + Smart-size +*/ + if ((err = pstm_init_size(pool, &t, b->alloc)) != PSTM_OKAY) { + return err; + } + if ((err = pstm_div(pool, a, b, NULL, &t)) != PSTM_OKAY) { + pstm_clear (&t); + return err; + } + if (t.sign != b->sign) { + err = pstm_add(&t, b, c); + } else { + pstm_exch (&t, c); + } + pstm_clear (&t); + return err; +} + +/******************************************************************************/ +/* + d = a * b (mod c) +*/ +int32 pstm_mulmod(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c, + pstm_int *d) +{ + int32 res; + int16 size; + pstm_int tmp; + +/* + Smart-size pstm_inits. 
d is an output that is influenced by this local 't' + so don't shrink 'd' if it wants to becuase this will lead to an pstm_grow + in RSA operations +*/ + size = a->used + b->used + 1; + if ((a == d) && (size < a->alloc)) { + size = a->alloc; + } + if ((res = pstm_init_size(pool, &tmp, size)) != PSTM_OKAY) { + return res; + } + if ((res = pstm_mul_comba(pool, a, b, &tmp, NULL, 0)) != PSTM_OKAY) { + pstm_clear(&tmp); + return res; + } + res = pstm_mod(pool, &tmp, c, d); + pstm_clear(&tmp); + return res; +} + +/******************************************************************************/ +/* + * y = g**x (mod b) + * Some restrictions... x must be positive and < b + */ +int32 pstm_exptmod(psPool_t *pool, pstm_int *G, pstm_int *X, pstm_int *P, + pstm_int *Y) +{ + pstm_int M[32], res; /* Keep this winsize based: (1 << max_winsize) */ + pstm_digit buf, mp; + pstm_digit *paD; + int32 err, bitbuf; + int16 bitcpy, bitcnt, mode, digidx, x, y, winsize; + uint32 paDlen; + + /* set window size from what user set as optimization */ + x = pstm_count_bits(X); + if (x < 50) { + winsize = 2; + } else { + winsize = PS_EXPTMOD_WINSIZE; + } + + /* now setup montgomery */ + if ((err = pstm_montgomery_setup (P, &mp)) != PSTM_OKAY) { + return err; + } + + /* setup result */ + if ((err = pstm_init_size(pool, &res, (P->used * 2) + 1)) != PSTM_OKAY) { + return err; + } +/* + create M table + The M table contains powers of the input base, e.g. 
M[x] = G^x mod P + The first half of the table is not computed though except for M[0] and M[1] + */ + /* now we need R mod m */ + if ((err = pstm_montgomery_calc_normalization (&res, P)) != PSTM_OKAY) { + goto LBL_RES; + } +/* + init M array + init first cell + */ + if ((err = pstm_init_size(pool, &M[1], res.used)) != PSTM_OKAY) { + goto LBL_RES; + } + + /* now set M[1] to G * R mod m */ + if (pstm_cmp_mag(P, G) != PSTM_GT) { + /* G > P so we reduce it first */ + if ((err = pstm_mod(pool, G, P, &M[1])) != PSTM_OKAY) { + goto LBL_M; + } + } else { + if ((err = pstm_copy(G, &M[1])) != PSTM_OKAY) { + goto LBL_M; + } + } + if ((err = pstm_mulmod (pool, &M[1], &res, P, &M[1])) != PSTM_OKAY) { + goto LBL_M; + } +/* + Pre-allocated digit. Used for mul, sqr, AND reduce +*/ + paDlen = ((M[1].used + 3) * 2) * sizeof(pstm_digit); + paD = xzalloc(paDlen); +/* + compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times + */ + if (pstm_init_copy(pool, &M[1 << (winsize - 1)], &M[1], 1) != PSTM_OKAY) { + err = PS_MEM_FAIL; + goto LBL_PAD; + } + for (x = 0; x < (winsize - 1); x++) { + if ((err = pstm_sqr_comba (pool, &M[1 << (winsize - 1)], + &M[1 << (winsize - 1)], paD, paDlen)) != PSTM_OKAY) { + goto LBL_PAD; + } + if ((err = pstm_montgomery_reduce(pool, &M[1 << (winsize - 1)], P, mp, + paD, paDlen)) != PSTM_OKAY) { + goto LBL_PAD; + } + } +/* + now init the second half of the array +*/ + for (x = (1<<(winsize-1)) + 1; x < (1 << winsize); x++) { + if ((err = pstm_init_size(pool, &M[x], M[1<<(winsize-1)].alloc + 1)) + != PSTM_OKAY) { + for (y = 1<<(winsize-1); y < x; y++) { + pstm_clear(&M[y]); + } + goto LBL_PAD; + } + } + + /* create upper table */ + for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) { + if ((err = pstm_mul_comba(pool, &M[x - 1], &M[1], &M[x], paD, paDlen)) + != PSTM_OKAY) { + goto LBL_MARRAY; + } + if ((err = pstm_montgomery_reduce(pool, &M[x], P, mp, paD, paDlen)) != + PSTM_OKAY) { + goto LBL_MARRAY; + } + } + + /* set initial mode and 
bit cnt */ + mode = 0; + bitcnt = 1; + buf = 0; + digidx = X->used - 1; + bitcpy = 0; + bitbuf = 0; + + for (;;) { + /* grab next digit as required */ + if (--bitcnt == 0) { + /* if digidx == -1 we are out of digits so break */ + if (digidx == -1) { + break; + } + /* read next digit and reset bitcnt */ + buf = X->dp[digidx--]; + bitcnt = (int32)DIGIT_BIT; + } + + /* grab the next msb from the exponent */ + y = (pstm_digit)(buf >> (DIGIT_BIT - 1)) & 1; + buf <<= (pstm_digit)1; +/* + If the bit is zero and mode == 0 then we ignore it. + These represent the leading zero bits before the first 1 bit + in the exponent. Technically this opt is not required but it + does lower the # of trivial squaring/reductions used +*/ + if (mode == 0 && y == 0) { + continue; + } + + /* if the bit is zero and mode == 1 then we square */ + if (mode == 1 && y == 0) { + if ((err = pstm_sqr_comba(pool, &res, &res, paD, paDlen)) != + PSTM_OKAY) { + goto LBL_MARRAY; + } + if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD, paDlen)) + != PSTM_OKAY) { + goto LBL_MARRAY; + } + continue; + } + + /* else we add it to the window */ + bitbuf |= (y << (winsize - ++bitcpy)); + mode = 2; + + if (bitcpy == winsize) { + /* ok window is filled so square as required and mul square first */ + for (x = 0; x < winsize; x++) { + if ((err = pstm_sqr_comba(pool, &res, &res, paD, paDlen)) != + PSTM_OKAY) { + goto LBL_MARRAY; + } + if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD, + paDlen)) != PSTM_OKAY) { + goto LBL_MARRAY; + } + } + + /* then multiply */ + if ((err = pstm_mul_comba(pool, &res, &M[bitbuf], &res, paD, + paDlen)) != PSTM_OKAY) { + goto LBL_MARRAY; + } + if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD, paDlen)) + != PSTM_OKAY) { + goto LBL_MARRAY; + } + + /* empty window and reset */ + bitcpy = 0; + bitbuf = 0; + mode = 1; + } + } + + /* if bits remain then square/multiply */ + if (mode == 2 && bitcpy > 0) { + /* square then multiply if the bit is set */ + for (x = 0; x < 
bitcpy; x++) { + if ((err = pstm_sqr_comba(pool, &res, &res, paD, paDlen)) != + PSTM_OKAY) { + goto LBL_MARRAY; + } + if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD, paDlen)) + != PSTM_OKAY) { + goto LBL_MARRAY; + } + + /* get next bit of the window */ + bitbuf <<= 1; + if ((bitbuf & (1 << winsize)) != 0) { + /* then multiply */ + if ((err = pstm_mul_comba(pool, &res, &M[1], &res, paD, paDlen)) + != PSTM_OKAY) { + goto LBL_MARRAY; + } + if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD, + paDlen)) != PSTM_OKAY) { + goto LBL_MARRAY; + } + } + } + } +/* + Fix up result if Montgomery reduction is used recall that any value in a + Montgomery system is actually multiplied by R mod n. So we have to reduce + one more time to cancel out the factor of R. +*/ + if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD, paDlen)) != + PSTM_OKAY) { + goto LBL_MARRAY; + } + /* swap res with Y */ + if ((err = pstm_copy (&res, Y)) != PSTM_OKAY) { + goto LBL_MARRAY; + } + err = PSTM_OKAY; +LBL_MARRAY: + for (x = 1<<(winsize-1); x < (1 << winsize); x++) { + pstm_clear(&M[x]); + } +LBL_PAD:psFree(paD, pool); +LBL_M: pstm_clear(&M[1]); +LBL_RES:pstm_clear(&res); + return err; +} + +/******************************************************************************/ +/* + +*/ +int32 pstm_add(pstm_int *a, pstm_int *b, pstm_int *c) +{ + int32 res; + int16 sa, sb; + + /* get sign of both inputs */ + sa = a->sign; + sb = b->sign; + + /* handle two cases, not four */ + if (sa == sb) { + /* both positive or both negative, add their mags, copy the sign */ + c->sign = sa; + if ((res = s_pstm_add (a, b, c)) != PSTM_OKAY) { + return res; + } + } else { +/* + one positive, the other negative + subtract the one with the greater magnitude from the one of the lesser + magnitude. The result gets the sign of the one with the greater mag. 
+ */ + if (pstm_cmp_mag (a, b) == PSTM_LT) { + c->sign = sb; + if ((res = s_pstm_sub (b, a, c)) != PSTM_OKAY) { + return res; + } + } else { + c->sign = sa; + if ((res = s_pstm_sub (a, b, c)) != PSTM_OKAY) { + return res; + } + } + } + return PS_SUCCESS; +} + +/******************************************************************************/ +/* + reverse an array, used for radix code +*/ +static void pstm_reverse (unsigned char *s, int16 len) +{ + int32 ix, iy; + unsigned char t; + + ix = 0; + iy = len - 1; + while (ix < iy) { + t = s[ix]; + s[ix] = s[iy]; + s[iy] = t; + ++ix; + --iy; + } +} +/******************************************************************************/ +/* + No reverse. Useful in some of the EIP-154 PKA stuff where special byte + order seems to come into play more often +*/ +int32 pstm_to_unsigned_bin_nr(psPool_t *pool, pstm_int *a, unsigned char *b) +{ + int32 res; + int16 x; + pstm_int t = { 0 }; + + if ((res = pstm_init_copy(pool, &t, a, 0)) != PSTM_OKAY) { + return res; + } + + x = 0; + while (pstm_iszero (&t) == 0) { + b[x++] = (unsigned char) (t.dp[0] & 255); + if ((res = pstm_div_2d (pool, &t, 8, &t, NULL)) != PSTM_OKAY) { + pstm_clear(&t); + return res; + } + } + pstm_clear(&t); + return PS_SUCCESS; +} +/******************************************************************************/ +/* + +*/ +int32 pstm_to_unsigned_bin(psPool_t *pool, pstm_int *a, unsigned char *b) +{ + int32 res; + int16 x; + pstm_int t = { 0 }; + + if ((res = pstm_init_copy(pool, &t, a, 0)) != PSTM_OKAY) { + return res; + } + + x = 0; + while (pstm_iszero (&t) == 0) { + b[x++] = (unsigned char) (t.dp[0] & 255); + if ((res = pstm_div_2d (pool, &t, 8, &t, NULL)) != PSTM_OKAY) { + pstm_clear(&t); + return res; + } + } + pstm_reverse (b, x); + pstm_clear(&t); + return PS_SUCCESS; +} + +/******************************************************************************/ +/* + compare against a single digit +*/ +int32 pstm_cmp_d(pstm_int *a, pstm_digit b) +{ + /* compare based 
on sign */ + if ((b && a->used == 0) || a->sign == PSTM_NEG) { + return PSTM_LT; + } + + /* compare based on magnitude */ + if (a->used > 1) { + return PSTM_GT; + } + + /* compare the only digit of a to b */ + if (a->dp[0] > b) { + return PSTM_GT; + } else if (a->dp[0] < b) { + return PSTM_LT; + } else { + return PSTM_EQ; + } +} + +/* + Need invmod for ECC and also private key loading for hardware crypto + in cases where dQ > dP. The values must be switched and a new qP must be + calculated using this function +*/ +static int32 pstm_invmod_slow(psPool_t *pool, pstm_int * a, pstm_int * b, + pstm_int * c) +{ + pstm_int x, y, u, v, A, B, C, D; + int32 res; + + /* b cannot be negative */ + if (b->sign == PSTM_NEG || pstm_iszero(b) == 1) { + return PS_LIMIT_FAIL; + } + + /* init temps */ + if (pstm_init_size(pool, &x, b->used) != PSTM_OKAY) { + return PS_MEM_FAIL; + } + + /* x = a, y = b */ + if ((res = pstm_mod(pool, a, b, &x)) != PSTM_OKAY) { + goto LBL_X; + } + + if (pstm_init_copy(pool, &y, b, 0) != PSTM_OKAY) { + goto LBL_X; + } + + /* 2. [modified] if x,y are both even then return an error! */ + if (pstm_iseven (&x) == 1 && pstm_iseven (&y) == 1) { + res = PS_FAILURE; + goto LBL_Y; + } + + /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */ + if ((res = pstm_init_copy(pool, &u, &x, 0)) != PSTM_OKAY) { + goto LBL_Y; + } + if ((res = pstm_init_copy(pool, &v, &y, 0)) != PSTM_OKAY) { + goto LBL_U; + } + + if ((res = pstm_init_size(pool, &A, sizeof(pstm_digit))) != PSTM_OKAY) { + goto LBL_V; + } + + if ((res = pstm_init_size(pool, &D, sizeof(pstm_digit))) != PSTM_OKAY) { + goto LBL_A; + } + pstm_set (&A, 1); + pstm_set (&D, 1); + + if ((res = pstm_init(pool, &B)) != PSTM_OKAY) { + goto LBL_D; + } + if ((res = pstm_init(pool, &C)) != PSTM_OKAY) { + goto LBL_B; + } + +top: + /* 4. 
while u is even do */ + while (pstm_iseven (&u) == 1) { + /* 4.1 u = u/2 */ + if ((res = pstm_div_2 (&u, &u)) != PSTM_OKAY) { + goto LBL_C; + } + + /* 4.2 if A or B is odd then */ + if (pstm_isodd (&A) == 1 || pstm_isodd (&B) == 1) { + /* A = (A+y)/2, B = (B-x)/2 */ + if ((res = pstm_add (&A, &y, &A)) != PSTM_OKAY) { + goto LBL_C; + } + if ((res = pstm_sub (&B, &x, &B)) != PSTM_OKAY) { + goto LBL_C; + } + } + /* A = A/2, B = B/2 */ + if ((res = pstm_div_2 (&A, &A)) != PSTM_OKAY) { + goto LBL_C; + } + if ((res = pstm_div_2 (&B, &B)) != PSTM_OKAY) { + goto LBL_C; + } + } + + /* 5. while v is even do */ + while (pstm_iseven (&v) == 1) { + /* 5.1 v = v/2 */ + if ((res = pstm_div_2 (&v, &v)) != PSTM_OKAY) { + goto LBL_C; + } + + /* 5.2 if C or D is odd then */ + if (pstm_isodd (&C) == 1 || pstm_isodd (&D) == 1) { + /* C = (C+y)/2, D = (D-x)/2 */ + if ((res = pstm_add (&C, &y, &C)) != PSTM_OKAY) { + goto LBL_C; + } + if ((res = pstm_sub (&D, &x, &D)) != PSTM_OKAY) { + goto LBL_C; + } + } + /* C = C/2, D = D/2 */ + if ((res = pstm_div_2 (&C, &C)) != PSTM_OKAY) { + goto LBL_C; + } + if ((res = pstm_div_2 (&D, &D)) != PSTM_OKAY) { + goto LBL_C; + } + } + + /* 6. 
if u >= v then */ + if (pstm_cmp (&u, &v) != PSTM_LT) { + /* u = u - v, A = A - C, B = B - D */ + if ((res = pstm_sub (&u, &v, &u)) != PSTM_OKAY) { + goto LBL_C; + } + if ((res = pstm_sub (&A, &C, &A)) != PSTM_OKAY) { + goto LBL_C; + } + if ((res = pstm_sub (&B, &D, &B)) != PSTM_OKAY) { + goto LBL_C; + } + } else { + /* v - v - u, C = C - A, D = D - B */ + if ((res = pstm_sub (&v, &u, &v)) != PSTM_OKAY) { + goto LBL_C; + } + if ((res = pstm_sub (&C, &A, &C)) != PSTM_OKAY) { + goto LBL_C; + } + if ((res = pstm_sub (&D, &B, &D)) != PSTM_OKAY) { + goto LBL_C; + } + } + + /* if not zero goto step 4 */ + if (pstm_iszero (&u) == 0) + goto top; + + /* now a = C, b = D, gcd == g*v */ + + /* if v != 1 then there is no inverse */ + if (pstm_cmp_d (&v, 1) != PSTM_EQ) { + res = PS_FAILURE; + goto LBL_C; + } + + /* if its too low */ + while (pstm_cmp_d(&C, 0) == PSTM_LT) { + if ((res = pstm_add(&C, b, &C)) != PSTM_OKAY) { + goto LBL_C; + } + } + + /* too big */ + while (pstm_cmp_mag(&C, b) != PSTM_LT) { + if ((res = pstm_sub(&C, b, &C)) != PSTM_OKAY) { + goto LBL_C; + } + } + + /* C is now the inverse */ + if ((res = pstm_copy(&C, c)) != PSTM_OKAY) { + goto LBL_C; + } + res = PSTM_OKAY; + +LBL_C: pstm_clear(&C); +LBL_D: pstm_clear(&D); +LBL_B: pstm_clear(&B); +LBL_A: pstm_clear(&A); +LBL_V: pstm_clear(&v); +LBL_U: pstm_clear(&u); +LBL_Y: pstm_clear(&y); +LBL_X: pstm_clear(&x); + + return res; +} + +/* c = 1/a (mod b) for odd b only */ +int32 pstm_invmod(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c) +{ + pstm_int x, y, u, v, B, D; + int32 res; + uint16 neg, sanity; + + /* 2. [modified] b must be odd */ + if (pstm_iseven (b) == 1) { + return pstm_invmod_slow(pool, a,b,c); + } + + /* x == modulus, y == value to invert */ + if ((res = pstm_init_copy(pool, &x, b, 0)) != PSTM_OKAY) { + return res; + } + + if ((res = pstm_init_size(pool, &y, a->alloc)) != PSTM_OKAY) { + goto LBL_X; + } + + /* we need y = |a| */ + pstm_abs(a, &y); + + /* 3. 
u=x, v=y, A=1, B=0, C=0,D=1 */ + if ((res = pstm_init_copy(pool, &u, &x, 0)) != PSTM_OKAY) { + goto LBL_Y; + } + if ((res = pstm_init_copy(pool, &v, &y, 0)) != PSTM_OKAY) { + goto LBL_U; + } + if ((res = pstm_init(pool, &B)) != PSTM_OKAY) { + goto LBL_V; + } + if ((res = pstm_init(pool, &D)) != PSTM_OKAY) { + goto LBL_B; + } + + pstm_set (&D, 1); + + sanity = 0; +top: + /* 4. while u is even do */ + while (pstm_iseven (&u) == 1) { + /* 4.1 u = u/2 */ + if ((res = pstm_div_2 (&u, &u)) != PSTM_OKAY) { + goto LBL_D; + } + + /* 4.2 if B is odd then */ + if (pstm_isodd (&B) == 1) { + if ((res = pstm_sub (&B, &x, &B)) != PSTM_OKAY) { + goto LBL_D; + } + } + /* B = B/2 */ + if ((res = pstm_div_2 (&B, &B)) != PSTM_OKAY) { + goto LBL_D; + } + } + + /* 5. while v is even do */ + while (pstm_iseven (&v) == 1) { + /* 5.1 v = v/2 */ + if ((res = pstm_div_2 (&v, &v)) != PSTM_OKAY) { + goto LBL_D; + } + /* 5.2 if D is odd then */ + if (pstm_isodd (&D) == 1) { + /* D = (D-x)/2 */ + if ((res = pstm_sub (&D, &x, &D)) != PSTM_OKAY) { + goto LBL_D; + } + } + /* D = D/2 */ + if ((res = pstm_div_2 (&D, &D)) != PSTM_OKAY) { + goto LBL_D; + } + } + + /* 6. 
if u >= v then */ + if (pstm_cmp (&u, &v) != PSTM_LT) { + /* u = u - v, B = B - D */ + if ((res = pstm_sub (&u, &v, &u)) != PSTM_OKAY) { + goto LBL_D; + } + if ((res = pstm_sub (&B, &D, &B)) != PSTM_OKAY) { + goto LBL_D; + } + } else { + /* v - v - u, D = D - B */ + if ((res = pstm_sub (&v, &u, &v)) != PSTM_OKAY) { + goto LBL_D; + } + if ((res = pstm_sub (&D, &B, &D)) != PSTM_OKAY) { + goto LBL_D; + } + } + + /* if not zero goto step 4 */ + if (sanity++ > 1000) { + res = PS_LIMIT_FAIL; + goto LBL_D; + } + if (pstm_iszero (&u) == 0) { + goto top; + } + + /* now a = C, b = D, gcd == g*v */ + + /* if v != 1 then there is no inverse */ + if (pstm_cmp_d (&v, 1) != PSTM_EQ) { + res = PS_FAILURE; + goto LBL_D; + } + + /* b is now the inverse */ + neg = a->sign; + while (D.sign == PSTM_NEG) { + if ((res = pstm_add (&D, b, &D)) != PSTM_OKAY) { + goto LBL_D; + } + } + if ((res = pstm_copy (&D, c)) != PSTM_OKAY) { + goto LBL_D; + } + c->sign = neg; + res = PSTM_OKAY; + +LBL_D: pstm_clear(&D); +LBL_B: pstm_clear(&B); +LBL_V: pstm_clear(&v); +LBL_U: pstm_clear(&u); +LBL_Y: pstm_clear(&y); +LBL_X: pstm_clear(&x); + return res; +} +#endif /* !DISABLE_PSTM */ +/******************************************************************************/ diff --git a/networking/tls_pstm.h b/networking/tls_pstm.h new file mode 100644 index 000000000..1affc1b69 --- /dev/null +++ b/networking/tls_pstm.h @@ -0,0 +1,238 @@ +/** + * @file pstm.h + * @version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master) + * + * multiple-precision integer library. 
+ */ +/* + * Copyright (c) 2013-2015 INSIDE Secure Corporation + * Copyright (c) PeerSec Networks, 2002-2011 + * All Rights Reserved + * + * The latest version of this code is available at http://www.matrixssl.org + * + * This software is open source; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This General Public License does NOT permit incorporating this software + * into proprietary programs. If you are unable to comply with the GPL, a + * commercial license for this software may be purchased from INSIDE at + * http://www.insidesecure.com/eng/Company/Locations + * + * This program is distributed in WITHOUT ANY WARRANTY; without even the + * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * http://www.gnu.org/copyleft/gpl.html + */ +/******************************************************************************/ + +#ifndef _h_PSTMATH +#define _h_PSTMATH +#ifndef DISABLE_PSTM + +/* Define this here to avoid including circular limits.h on some platforms */ +#ifndef CHAR_BIT +#define CHAR_BIT 8 +#endif + +/******************************************************************************/ +/* + If native 64 bit integers are not supported, we do not support 32x32->64 + in hardware, so we must set the 16 bit flag to produce 16x16->32 products. +*/ +#ifndef HAVE_NATIVE_INT64 + #define PSTM_16BIT +#endif /* ! HAVE_NATIVE_INT64 */ + +/******************************************************************************/ +/* + Some default configurations. 
+ + pstm_word should be the largest value the processor can hold as the product + of a multiplication. Most platforms support a 32x32->64 MAC instruction, + so 64bits is the default pstm_word size. + pstm_digit should be half the size of pstm_word + */ +#ifdef PSTM_8BIT +/* 8-bit digits, 16-bit word products */ + typedef unsigned char pstm_digit; + typedef unsigned short pstm_word; + #define DIGIT_BIT 8 + +#elif defined(PSTM_16BIT) +/* 16-bit digits, 32-bit word products */ + typedef unsigned short pstm_digit; + typedef unsigned long pstm_word; + #define DIGIT_BIT 16 + +#elif defined(PSTM_64BIT) +/* 64-bit digits, 128-bit word products */ + #ifndef __GNUC__ + #error "64bit digits requires GCC" + #endif + typedef unsigned long pstm_digit; + typedef unsigned long pstm_word __attribute__ ((mode(TI))); + #define DIGIT_BIT 64 + +#else +/* This is the default case, 32-bit digits, 64-bit word products */ + typedef uint32 pstm_digit; + typedef uint64 pstm_word; + #define DIGIT_BIT 32 + #define PSTM_32BIT +#endif /* digit and word size */ + /* PSTM_MASK is -1 converted to pstm_digit, i.e. the largest digit value when pstm_digit is unsigned. NOTE(review): the expansion is a bare cast expression without outer parentheses - confirm no use site depends on that. */ +#define PSTM_MASK (pstm_digit)(-1) +#define PSTM_DIGIT_MAX PSTM_MASK + +/******************************************************************************/ +/* + equalities: the values that results of pstm_cmp(), pstm_cmp_mag() and + pstm_cmp_d() are compared against + */ +#define PSTM_LT -1 /* less than. NOTE(review): negative literal is not parenthesized */ +#define PSTM_EQ 0 /* equal to */ +#define PSTM_GT 1 /* greater than */ + /* values stored in pstm_int.sign */ +#define PSTM_ZPOS 0 /* positive integer */ +#define PSTM_NEG 1 /* negative */ + /* pstm return codes are aliases of the generic PS_ codes */ +#define PSTM_OKAY PS_SUCCESS +#define PSTM_MEM PS_MEM_FAIL + +/******************************************************************************/ +/* + Various build options + */ +#define PSTM_DEFAULT_INIT 64 /* default (64) digits of allocation */ +#define PSTM_MAX_SIZE 4096 + /* A multiple-precision integer: a sign plus an array of pstm_digit. */ +typedef struct { + int16 used, alloc, sign; /* used: digits currently in use (0 means the value is zero, see pstm_iszero); alloc: digits available in dp; sign: PSTM_ZPOS or PSTM_NEG */ + pstm_digit *dp; /* digit array; dp[0] is the digit whose low bit pstm_iseven/pstm_isodd test */ + psPool_t *pool; /* presumably the pool dp was allocated from - TODO confirm against pstm_init */ +} pstm_int; + +/******************************************************************************/ +/* + Operations on large integers + */ +#define pstm_iszero(a) (((a)->used == 0) ?
PS_TRUE : PS_FALSE) +#define pstm_iseven(a) (((a)->used > 0 && (((a)->dp[0] & 1) == 0)) ? PS_TRUE : PS_FALSE) +#define pstm_isodd(a) (((a)->used > 0 && (((a)->dp[0] & 1) == 1)) ? PS_TRUE : PS_FALSE) +#define pstm_abs(a, b) { pstm_copy(a, b); (b)->sign = 0; } + /* NOTE(review): pstm_abs expands to a bare { } block (so "pstm_abs(a, b);" in an unbraced if/else would not parse) and it discards the int32 result of pstm_copy(). */ +extern void pstm_set(pstm_int *a, pstm_digit b); + +extern void pstm_zero(pstm_int * a); + +extern int32 pstm_init(psPool_t *pool, pstm_int * a); + +extern int32 pstm_init_size(psPool_t *pool, pstm_int * a, uint32 size); + +extern int32 pstm_init_copy(psPool_t *pool, pstm_int * a, pstm_int * b, + int16 toSqr); + +extern int16 pstm_count_bits (pstm_int * a); + +extern int32 pstm_init_for_read_unsigned_bin(psPool_t *pool, pstm_int *a, + uint32 len); + +extern int32 pstm_read_unsigned_bin(pstm_int *a, unsigned char *b, int32 c); + +extern int32 pstm_unsigned_bin_size(pstm_int *a); + +extern int32 pstm_copy(pstm_int * a, pstm_int * b); + +extern void pstm_exch(pstm_int * a, pstm_int * b); + +extern void pstm_clear(pstm_int * a); + +extern void pstm_clear_multi(pstm_int *mp0, pstm_int *mp1, pstm_int *mp2, + pstm_int *mp3, pstm_int *mp4, pstm_int *mp5, pstm_int *mp6, + pstm_int *mp7); + +extern int32 pstm_grow(pstm_int * a, int16 size); + +extern void pstm_clamp(pstm_int * a); + +extern int32 pstm_cmp(pstm_int * a, pstm_int * b); + +extern int32 pstm_cmp_mag(pstm_int * a, pstm_int * b); + +extern void pstm_rshd(pstm_int *a, int16 x); + +extern int32 pstm_lshd(pstm_int * a, int16 b); + +extern int32 pstm_div(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c, + pstm_int *d); + +extern int32 pstm_div_2d(psPool_t *pool, pstm_int *a, int16 b, pstm_int *c, + pstm_int *d); + +extern int32 pstm_div_2(pstm_int * a, pstm_int * b); + +extern int32 s_pstm_sub(pstm_int *a, pstm_int *b, pstm_int *c); + +extern int32 pstm_sub(pstm_int *a, pstm_int *b, pstm_int *c); + +extern int32 pstm_sub_d(psPool_t *pool, pstm_int *a, pstm_digit b, pstm_int *c); + +extern int32 pstm_mul_2(pstm_int * a, pstm_int * b); + +extern int32
pstm_mod(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c); + +extern int32 pstm_mulmod(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c, + pstm_int *d); + +extern int32 pstm_exptmod(psPool_t *pool, pstm_int *G, pstm_int *X, pstm_int *P, + pstm_int *Y); + +extern int32 pstm_2expt(pstm_int *a, int16 b); + +extern int32 pstm_add(pstm_int *a, pstm_int *b, pstm_int *c); + +extern int32 pstm_to_unsigned_bin(psPool_t *pool, pstm_int *a, + unsigned char *b); + +extern int32 pstm_to_unsigned_bin_nr(psPool_t *pool, pstm_int *a, + unsigned char *b); + +extern int32 pstm_montgomery_setup(pstm_int *a, pstm_digit *rho); + /* Each of the next three functions is preceded by a function-like macro of the same name that drops the leading pool argument. The macro is already in effect when the extern prototype after it is parsed, so the prototype (and every later definition or call written with a pool argument) is rewritten to the form without pool. Keep each #define directly before its prototype. */ +///bbox: pool unused +#define pstm_montgomery_reduce(pool, a, m, mp, paD, paDlen) \ + pstm_montgomery_reduce( a, m, mp, paD, paDlen) +extern int32 pstm_montgomery_reduce(psPool_t *pool, pstm_int *a, pstm_int *m, + pstm_digit mp, pstm_digit *paD, uint32 paDlen); + /* bbox: pool unused here too - same rewrite as the two neighbouring macros */ +#define pstm_mul_comba(pool, A, B, C, paD, paDlen) \ + pstm_mul_comba( A, B, C, paD, paDlen) +extern int32 pstm_mul_comba(psPool_t *pool, pstm_int *A, pstm_int *B, + pstm_int *C, pstm_digit *paD, uint32 paDlen); + +///bbox: pool unused +#define pstm_sqr_comba(pool, A, B, paD, paDlen) \ + pstm_sqr_comba( A, B, paD, paDlen) +extern int32 pstm_sqr_comba(psPool_t *pool, pstm_int *A, pstm_int *B, + pstm_digit *paD, uint32 paDlen); + +extern int32 pstm_cmp_d(pstm_int *a, pstm_digit b); + +extern int32 pstm_montgomery_calc_normalization(pstm_int *a, pstm_int *b); + +extern int32 pstm_mul_d(pstm_int *a, pstm_digit b, pstm_int *c); + +extern int32 pstm_invmod(psPool_t *pool, pstm_int * a, pstm_int * b, + pstm_int * c); + +#else /* DISABLE_PSTM */ + typedef int32 pstm_int; +#endif /* !DISABLE_PSTM */ +#endif /* _h_PSTMATH */ + diff --git a/networking/tls_pstm_montgomery_reduce.c b/networking/tls_pstm_montgomery_reduce.c new file mode 100644 index 000000000..c231c4ddf --- /dev/null +++ b/networking/tls_pstm_montgomery_reduce.c @@ -0,0 +1,423 @@ +/* + * Copyright (C) 2017 Denys Vlasenko + * + *
Licensed under GPLv2, see file LICENSE in this source tree. + */ +#include "tls.h" + +/** + * @file pstm_montgomery_reduce.c + * @version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master) + * + * Multiprecision Montgomery Reduction. + */ +/* + * Copyright (c) 2013-2015 INSIDE Secure Corporation + * Copyright (c) PeerSec Networks, 2002-2011 + * All Rights Reserved + * + * The latest version of this code is available at http://www.matrixssl.org + * + * This software is open source; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This General Public License does NOT permit incorporating this software + * into proprietary programs. If you are unable to comply with the GPL, a + * commercial license for this software may be purchased from INSIDE at + * http://www.insidesecure.com/eng/Company/Locations + * + * This program is distributed in WITHOUT ANY WARRANTY; without even the + * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * http://www.gnu.org/copyleft/gpl.html + */ +/******************************************************************************/ + +///bbox +//#include "../cryptoApi.h" +#ifndef DISABLE_PSTM + +/******************************************************************************/ + +#if defined(PSTM_X86) +/* x86-32 optimized for 32 bit platforms. 
For 64 bit mode use X86_64 instead */ +#if !defined(__GNUC__) || !defined(__i386__) || !defined(PSTM_32BIT) +#error "PSTM_X86 option requires GCC and 32 bit mode x86 processor" +#endif +//#pragma message ("Using 32 bit x86 Assembly Optimizations") + /* Hook macros expanded inside pstm_montgomery_reduce(). MONT_START, MONT_FINI and LOOP_END are empty in this variant; LOOP_START sets mu = c[x] * mp for the current outer-loop digit. They rely on the caller's locals c, x, mp, mu, _c, tmpm and cy. */ +#define MONT_START +#define MONT_FINI +#define LOOP_END +#define LOOP_START \ + mu = c[x] * mp + /* INNERMUL: edx:eax = *tmpm * mu, then cy is added to it; the low word is added into _c[LO] and the high word (including both carries) becomes the new cy. The operand expression post-increments tmpm. */ +#define INNERMUL \ +asm( \ + "movl %5,%%eax \n\t" \ + "mull %4 \n\t" \ + "addl %1,%%eax \n\t" \ + "adcl $0,%%edx \n\t" \ + "addl %%eax,%0 \n\t" \ + "adcl $0,%%edx \n\t" \ + "movl %%edx,%1 \n\t" \ +:"=g"(_c[LO]), "=r"(cy) \ +:"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++) \ +: "%eax", "%edx", "%cc") + /* PROPCARRY: _c[LO] += cy; cy becomes 1 if that addition carried out, else 0 (setb + movzbl). */ +#define PROPCARRY \ +asm( \ + "addl %1,%0 \n\t" \ + "setb %%al \n\t" \ + "movzbl %%al,%1 \n\t" \ +:"=g"(_c[LO]), "=r"(cy) \ +:"0"(_c[LO]), "1"(cy) \ +: "%eax", "%cc") + +/******************************************************************************/ +#elif defined(PSTM_X86_64) +/* x86-64 optimized */ +#if !defined(__GNUC__) || !defined(__x86_64__) || !defined(PSTM_64BIT) +#error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor" +#endif +//#pragma message ("Using 64 bit x86_64 Assembly Optimizations") + /* Same hooks as the 32-bit x86 variant, using 64-bit registers. */ +#define MONT_START +#define MONT_FINI +#define LOOP_END +#define LOOP_START \ +mu = c[x] * mp + /* INNERMUL: rdx:rax = *tmpm * mu plus cy; low word added into _c[LO], high word becomes cy; tmpm is post-incremented by the operand expression. */ +#define INNERMUL \ +asm( \ + "movq %5,%%rax \n\t" \ + "mulq %4 \n\t" \ + "addq %1,%%rax \n\t" \ + "adcq $0,%%rdx \n\t" \ + "addq %%rax,%0 \n\t" \ + "adcq $0,%%rdx \n\t" \ + "movq %%rdx,%1 \n\t" \ + :"=g"(_c[LO]), "=r"(cy) \ + :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \ + : "%rax", "%rdx", "cc") + /* INNERMUL8: eight INNERMUL steps unrolled into one asm statement. It takes _c and tmpm as pointers, reads tmpm[0..7] and _c[0..7] (byte offsets 0 to 0x38), writes _c[0..7] and leaves the last carry in cy. It does not advance either pointer; the caller adds 8 to both. NOTE(review): the statement stores through the pointer in operand 0 but lists neither a memory output nor a "memory" clobber - confirm the compiler cannot cache _c[] across it. */ +#define INNERMUL8 \ +asm( \ + "movq 0(%5),%%rax \n\t" \ + "movq 0(%2),%%r10 \n\t" \ + "movq 0x8(%5),%%r11 \n\t" \ + "mulq %4 \n\t" \ + "addq %%r10,%%rax \n\t" \ + "adcq $0,%%rdx \n\t" \ + "movq 0x8(%2),%%r10 \n\t" \ + "addq %3,%%rax \n\t" \ + "adcq $0,%%rdx \n\t" \ + "movq %%rax,0(%0) \n\t" \ + "movq %%rdx,%1 \n\t" \ + \ + "movq %%r11,%%rax \n\t" \ + "movq 0x10(%5),%%r11 \n\t" \ + "mulq %4 \n\t" \ + "addq
%%r10,%%rax \n\t" \ + "adcq $0,%%rdx \n\t" \ + "movq 0x10(%2),%%r10 \n\t" \ + "addq %3,%%rax \n\t" \ + "adcq $0,%%rdx \n\t" \ + "movq %%rax,0x8(%0) \n\t" \ + "movq %%rdx,%1 \n\t" \ + \ + "movq %%r11,%%rax \n\t" \ + "movq 0x18(%5),%%r11 \n\t" \ + "mulq %4 \n\t" \ + "addq %%r10,%%rax \n\t" \ + "adcq $0,%%rdx \n\t" \ + "movq 0x18(%2),%%r10 \n\t" \ + "addq %3,%%rax \n\t" \ + "adcq $0,%%rdx \n\t" \ + "movq %%rax,0x10(%0) \n\t" \ + "movq %%rdx,%1 \n\t" \ + \ + "movq %%r11,%%rax \n\t" \ + "movq 0x20(%5),%%r11 \n\t" \ + "mulq %4 \n\t" \ + "addq %%r10,%%rax \n\t" \ + "adcq $0,%%rdx \n\t" \ + "movq 0x20(%2),%%r10 \n\t" \ + "addq %3,%%rax \n\t" \ + "adcq $0,%%rdx \n\t" \ + "movq %%rax,0x18(%0) \n\t" \ + "movq %%rdx,%1 \n\t" \ + \ + "movq %%r11,%%rax \n\t" \ + "movq 0x28(%5),%%r11 \n\t" \ + "mulq %4 \n\t" \ + "addq %%r10,%%rax \n\t" \ + "adcq $0,%%rdx \n\t" \ + "movq 0x28(%2),%%r10 \n\t" \ + "addq %3,%%rax \n\t" \ + "adcq $0,%%rdx \n\t" \ + "movq %%rax,0x20(%0) \n\t" \ + "movq %%rdx,%1 \n\t" \ + \ + "movq %%r11,%%rax \n\t" \ + "movq 0x30(%5),%%r11 \n\t" \ + "mulq %4 \n\t" \ + "addq %%r10,%%rax \n\t" \ + "adcq $0,%%rdx \n\t" \ + "movq 0x30(%2),%%r10 \n\t" \ + "addq %3,%%rax \n\t" \ + "adcq $0,%%rdx \n\t" \ + "movq %%rax,0x28(%0) \n\t" \ + "movq %%rdx,%1 \n\t" \ + \ + "movq %%r11,%%rax \n\t" \ + "movq 0x38(%5),%%r11 \n\t" \ + "mulq %4 \n\t" \ + "addq %%r10,%%rax \n\t" \ + "adcq $0,%%rdx \n\t" \ + "movq 0x38(%2),%%r10 \n\t" \ + "addq %3,%%rax \n\t" \ + "adcq $0,%%rdx \n\t" \ + "movq %%rax,0x30(%0) \n\t" \ + "movq %%rdx,%1 \n\t" \ + \ + "movq %%r11,%%rax \n\t" \ + "mulq %4 \n\t" \ + "addq %%r10,%%rax \n\t" \ + "adcq $0,%%rdx \n\t" \ + "addq %3,%%rax \n\t" \ + "adcq $0,%%rdx \n\t" \ + "movq %%rax,0x38(%0) \n\t" \ + "movq %%rdx,%1 \n\t" \ + \ + :"=r"(_c), "=r"(cy) \ + : "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\ + : "%rax", "%rdx", "%r10", "%r11", "cc") + /* PROPCARRY: _c[LO] += cy; cy becomes the carry out of that addition. */ +#define PROPCARRY \ +asm( \ + "addq %1,%0 \n\t" \ + "setb %%al \n\t" \ + "movzbq %%al,%1 \n\t" \ + :"=g"(_c[LO]), "=r"(cy) \ +
:"0"(_c[LO]), "1"(cy) \ + : "%rax", "cc") + +/******************************************************************************/ +#elif defined(PSTM_ARM) + +#define MONT_START +#define MONT_FINI +#define LOOP_END +#define LOOP_START \ +mu = c[x] * mp + /* ARM INNERMUL: r0 = _c[0] + cy with cy replaced by the carry of that add, then UMLAL adds mu * *tmpm into the cy:r0 pair and r0 is stored back to _c[0]. ARM PROPCARRY: _c[0] += cy, cy = carry out. The Thumb-2 and non-Thumb2 versions differ only in the added ITE CS instruction. NOTE(review): unlike the x86 variants these four macros already end in ';', so "INNERMUL;" expands to an extra empty statement. */ +#ifdef __thumb2__ +//#pragma message ("Using 32 bit ARM Thumb2 Assembly Optimizations") +#define INNERMUL \ +asm( \ + " LDR r0,%1 \n\t" \ + " ADDS r0,r0,%0 \n\t" \ + " ITE CS \n\t" \ + " MOVCS %0,#1 \n\t" \ + " MOVCC %0,#0 \n\t" \ + " UMLAL r0,%0,%3,%4 \n\t" \ + " STR r0,%1 \n\t" \ + :"=r"(cy),"=m"(_c[0])\ + :"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0])\ + :"r0","%cc"); +#define PROPCARRY \ +asm( \ + " LDR r0,%1 \n\t" \ + " ADDS r0,r0,%0 \n\t" \ + " STR r0,%1 \n\t" \ + " ITE CS \n\t" \ + " MOVCS %0,#1 \n\t" \ + " MOVCC %0,#0 \n\t" \ + :"=r"(cy),"=m"(_c[0])\ + :"0"(cy),"m"(_c[0])\ + :"r0","%cc"); +#else /* Non-Thumb2 code */ +//#pragma message ("Using 32 bit ARM Assembly Optimizations") +#define INNERMUL \ +asm( \ + " LDR r0,%1 \n\t" \ + " ADDS r0,r0,%0 \n\t" \ + " MOVCS %0,#1 \n\t" \ + " MOVCC %0,#0 \n\t" \ + " UMLAL r0,%0,%3,%4 \n\t" \ + " STR r0,%1 \n\t" \ + :"=r"(cy),"=m"(_c[0])\ + :"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0])\ + :"r0","%cc"); +#define PROPCARRY \ +asm( \ + " LDR r0,%1 \n\t" \ + " ADDS r0,r0,%0 \n\t" \ + " STR r0,%1 \n\t" \ + " MOVCS %0,#1 \n\t" \ + " MOVCC %0,#0 \n\t" \ + :"=r"(cy),"=m"(_c[0])\ + :"0"(cy),"m"(_c[0])\ + :"r0","%cc"); +#endif /* __thumb2__ */ + + +/******************************************************************************/ +#elif defined(PSTM_MIPS) +/* MIPS32 */ +//#pragma message ("Using 32 bit MIPS Assembly Optimizations") +#define MONT_START +#define MONT_FINI +#define LOOP_END +#define LOOP_START \ +mu = c[x] * mp + /* MIPS INNERMUL: hi:lo = mu * tmpm[0]; cy and _c[0] are added to the low word with the carries recovered through sltu; the low word is stored to _c[0] and the high word plus carries becomes cy; tmpm is advanced by the trailing "++tmpm". NOTE(review): the asm reads %0 (cy) and %1 (_c[0]) although both are declared output-only ("=r" / "=m"), while the carry-in is passed as a separate, untied "r" input - presumably this relies on both landing in the same register; confirm, or tie the operands. */ +#define INNERMUL \ +asm( \ + " multu %3,%4 \n\t" \ + " mflo $12 \n\t" \ + " mfhi $13 \n\t" \ + " addu $12,$12,%0 \n\t" \ + " sltu $10,$12,%0 \n\t" \ + " addu $13,$13,$10 \n\t" \ + " lw $10,%1 \n\t" \ + " addu $12,$12,$10 \n\t" \ + " sltu $10,$12,$10 \n\t" \ + " addu
%0,$13,$10 \n\t" \ + " sw $12,%1 \n\t" \ + :"=r"(cy),"=m"(_c[0])\ + :"r"(cy),"r"(mu),"r"(tmpm[0]),"r"(_c[0])\ + :"$10","$12","$13")\ +; ++tmpm; + +#define PROPCARRY \ +asm( \ + " lw $10,%1 \n\t" \ + " addu $10,$10,%0 \n\t" \ + " sw $10,%1 \n\t" \ + " sltu %0,$10,%0 \n\t" \ + :"=r"(cy),"=m"(_c[0])\ + :"r"(cy),"r"(_c[0])\ + :"$10"); + + +/******************************************************************************/ +#else + +/* ISO C code */ +#define MONT_START +#define MONT_FINI +#define LOOP_END +#define LOOP_START \ + mu = c[x] * mp + +#define INNERMUL \ + do { pstm_word t; \ + t = ((pstm_word)_c[0] + (pstm_word)cy) + \ + (((pstm_word)mu) * ((pstm_word)*tmpm++)); \ + _c[0] = (pstm_digit)t; \ + cy = (pstm_digit)(t >> DIGIT_BIT); \ + } while (0) + +#define PROPCARRY \ + do { pstm_digit t = _c[0] += cy; cy = (t < cy); } while (0) + +#endif + +/******************************************************************************/ + +#define LO 0 + +/* computes x/R == x (mod N) via Montgomery Reduction */ +int32 pstm_montgomery_reduce(psPool_t *pool, pstm_int *a, pstm_int *m, + pstm_digit mp, pstm_digit *paD, uint32 paDlen) +{ + pstm_digit *c, *_c, *tmpm, mu; + int32 oldused, x, y; + int16 pa; + + pa = m->used; + if (pa > a->alloc) { + /* Sanity test for bad numbers. 
This will confirm no buffer overruns */ + return PS_LIMIT_FAIL; + } + + if (paD && paDlen >= (uint32)2*pa+1) { + c = paD; + memset(c, 0x0, paDlen); + } else { + c = xzalloc(2*pa+1); + } + /* copy the input */ + oldused = a->used; + for (x = 0; x < oldused; x++) { + c[x] = a->dp[x]; + } + + MONT_START; + + for (x = 0; x < pa; x++) { + pstm_digit cy = 0; + /* get Mu for this round */ + LOOP_START; + _c = c + x; + tmpm = m->dp; + y = 0; +#ifdef PSTM_X86_64 + for (; y < (pa & ~7); y += 8) { + INNERMUL8; + _c += 8; + tmpm += 8; + } +#endif /* PSTM_X86_64 */ + for (; y < pa; y++) { + INNERMUL; + ++_c; + } + LOOP_END; + while (cy) { + PROPCARRY; + ++_c; + } + } + + /* now copy out */ + _c = c + pa; + tmpm = a->dp; + for (x = 0; x < pa+1; x++) { + *tmpm++ = *_c++; + } + + for (; x < oldused; x++) { + *tmpm++ = 0; + } + + MONT_FINI; + + a->used = pa+1; + pstm_clamp(a); + + /* reuse x as return code */ + x = PSTM_OKAY; + + /* if A >= m then A = A - m */ + if (pstm_cmp_mag (a, m) != PSTM_LT) { + if (s_pstm_sub (a, m, a) != PSTM_OKAY) { + x = PS_MEM_FAIL; + } + } + if (paDlen < (uint32)2*pa+1) { + psFree(c, pool); + } + return x; +} + +#endif /* !DISABLE_PSTM */ +/******************************************************************************/ diff --git a/networking/tls_pstm_mul_comba.c b/networking/tls_pstm_mul_comba.c new file mode 100644 index 000000000..6e051baeb --- /dev/null +++ b/networking/tls_pstm_mul_comba.c @@ -0,0 +1,777 @@ +/* + * Copyright (C) 2017 Denys Vlasenko + * + * Licensed under GPLv2, see file LICENSE in this source tree. + */ +#include "tls.h" + +/** + * @file pstm_mul_comba.c + * @version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master) + * + * Multiprecision multiplication with Comba technique. 
+ */ +/* + * Copyright (c) 2013-2015 INSIDE Secure Corporation + * Copyright (c) PeerSec Networks, 2002-2011 + * All Rights Reserved + * + * The latest version of this code is available at http://www.matrixssl.org + * + * This software is open source; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This General Public License does NOT permit incorporating this software + * into proprietary programs. If you are unable to comply with the GPL, a + * commercial license for this software may be purchased from INSIDE at + * http://www.insidesecure.com/eng/Company/Locations + * + * This program is distributed in WITHOUT ANY WARRANTY; without even the + * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * http://www.gnu.org/copyleft/gpl.html + */ +/******************************************************************************/ + +///bbox +//#include "../cryptoApi.h" +#ifndef DISABLE_PSTM + +/******************************************************************************/ +#if defined(PSTM_X86) +/* x86-32 optimized for 32 bit platforms. 
For 64 bit mode use X86_64 instead */ +#if !defined(__GNUC__) || !defined(__i386__) || !defined(PSTM_32BIT) +#error "PSTM_X86 option requires GCC and 32 bit mode x86 processor" +#endif +//#pragma message ("Using 32 bit x86 Assembly Optimizations") + /* Comba helper macros. They operate on c0, c1 and c2, locals of the expanding function that together form a three-digit column accumulator with c0 the lowest digit. NOTE(review): COMBA_CLEAR, COMBA_FORWARD, COMBA_STORE, COMBA_STORE2 and MULADD already end in ';', so a use written "MACRO;" expands to an extra empty statement - keep uses inside braces. */ +/* anything you need at the start */ +#define COMBA_START + +/* clear the chaining variables */ +#define COMBA_CLEAR \ + c0 = c1 = c2 = 0; + +/* forward the carry to the next digit */ +#define COMBA_FORWARD \ + do { c0 = c1; c1 = c2; c2 = 0; } while (0); + +/* store the first sum */ +#define COMBA_STORE(x) \ + x = c0; + +/* store the second sum [carry] */ +#define COMBA_STORE2(x) \ + x = c1; + +/* anything you need at the end */ +#define COMBA_FINI + +/* multiply i by j and add the double-width product into c2:c1:c0 */ +#define MULADD(i, j) \ +asm( \ + "movl %6,%%eax \n\t" \ + "mull %7 \n\t" \ + "addl %%eax,%0 \n\t" \ + "adcl %%edx,%1 \n\t" \ + "adcl $0,%2 \n\t" \ + :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc"); + +/******************************************************************************/ +#elif defined(PSTM_X86_64) +/* x86-64 optimized */ +#if !defined(__GNUC__) || !defined(__x86_64__) || !defined(PSTM_64BIT) +#error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor" +#endif +//#pragma message ("Using 64 bit x86_64 Assembly Optimizations") + +/* anything you need at the start */ +#define COMBA_START + +/* clear the chaining variables */ +#define COMBA_CLEAR \ +c0 = c1 = c2 = 0; + +/* forward the carry to the next digit */ +#define COMBA_FORWARD \ +do { c0 = c1; c1 = c2; c2 = 0; } while (0); + +/* store the first sum */ +#define COMBA_STORE(x) \ +x = c0; + +/* store the second sum [carry] */ +#define COMBA_STORE2(x) \ +x = c1; + +/* anything you need at the end */ +#define COMBA_FINI + +/* multiply i by j and add the double-width product into c2:c1:c0 */ +#define MULADD(i, j) \ +asm ( \ + "movq %6,%%rax \n\t" \ + "mulq %7 \n\t" \ + "addq %%rax,%0 \n\t" \ + "adcq %%rdx,%1 \n\t" \ + "adcq $0,%2 \n\t" \ +
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc"); + +/******************************************************************************/ +#elif defined(PSTM_ARM) +/* ARM code */ +//#pragma message ("Using 32 bit ARM Assembly Optimizations") + /* Same macro set as the x86 variants: COMBA_START and COMBA_FINI are empty, COMBA_CLEAR zeroes c0..c2, COMBA_FORWARD shifts the accumulator down one digit, COMBA_STORE writes c0 and COMBA_STORE2 writes c1. */ +#define COMBA_START + +#define COMBA_CLEAR \ +c0 = c1 = c2 = 0; + +#define COMBA_FORWARD \ +do { c0 = c1; c1 = c2; c2 = 0; } while (0); + +#define COMBA_STORE(x) \ +x = c0; + +#define COMBA_STORE2(x) \ +x = c1; + +#define COMBA_FINI + /* MULADD: r1:r0 = i * j (UMULL), then c0 += r0, c1 += r1 + carry, c2 += carry */ +#define MULADD(i, j) \ +asm( \ + " UMULL r0,r1,%6,%7 \n\t" \ + " ADDS %0,%0,r0 \n\t" \ + " ADCS %1,%1,r1 \n\t" \ + " ADC %2,%2,#0 \n\t" \ + :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "%cc"); + +/******************************************************************************/ +#elif defined(PSTM_MIPS) +/* MIPS32 code */ +//#pragma message ("Using 32 bit MIPS Assembly Optimizations") + +#define COMBA_START + +#define COMBA_CLEAR \ +c0 = c1 = c2 = 0; + +#define COMBA_FORWARD \ +do { c0 = c1; c1 = c2; c2 = 0; } while (0); + +#define COMBA_STORE(x) \ +x = c0; + +#define COMBA_STORE2(x) \ +x = c1; + +#define COMBA_FINI + /* MULADD: hi:lo = i * j (multu); lo is added to c0 and hi to c1, and each carry is recovered with an sltu comparison and added into the next higher accumulator digit */ +#define MULADD(i, j) \ +asm( \ + " multu %6,%7 \n\t" \ + " mflo $12 \n\t" \ + " mfhi $13 \n\t" \ + " addu %0,%0,$12 \n\t" \ + " sltu $12,%0,$12 \n\t" \ + " addu %1,%1,$13 \n\t" \ + " sltu $13,%1,$13 \n\t" \ + " addu %1,%1,$12 \n\t" \ + " sltu $12,%1,$12 \n\t" \ + " addu %2,%2,$13 \n\t" \ + " addu %2,%2,$12 \n\t" \ + :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12","$13"); + +/******************************************************************************/ +#else + /* Portable ISO C versions of the same macro set */ +#define COMBA_START + +#define COMBA_CLEAR \ + c0 = c1 = c2 = 0; + +#define COMBA_FORWARD \ + do { c0 = c1; c1 = c2; c2 = 0; } while (0); + +#define COMBA_STORE(x) \ + x = c0; + +#define COMBA_STORE2(x) \ + x = c1; + +#define COMBA_FINI + /* MULADD: the same c2:c1:c0 += i * j, carried through a double-width pstm_word temporary */ +#define MULADD(i, j) \ + do { pstm_word t; \ + t = (pstm_word)c0 +
((pstm_word)i) * ((pstm_word)j); c0 = (pstm_digit)t; \ + t = (pstm_word)c1 + (t >> DIGIT_BIT); \ + c1 = (pstm_digit)t; c2 += (pstm_digit)(t >> DIGIT_BIT); \ + } while (0); + +#endif + +/******************************************************************************/ +/* generic PxQ multiplier */ +///bbox: pool unused +#define pstm_mul_comba_gen(pool, A, B, C, paD, paDlen) \ + pstm_mul_comba_gen( A, B, C, paD, paDlen) +static int32 pstm_mul_comba_gen(psPool_t *pool, pstm_int *A, pstm_int *B, + pstm_int *C, pstm_digit *paD, uint32 paDlen) +{ + int16 paDfail, pa; + int32 ix, iy, iz, tx, ty; + pstm_digit c0, c1, c2, *tmpx, *tmpy, *dst; + + COMBA_START; + COMBA_CLEAR; + + paDfail = 0; + /* get size of output and trim */ + pa = A->used + B->used; + +/* + If c is not large enough grow it and continue +*/ + if (C->alloc < pa) { + if (pstm_grow(C, pa) != PSTM_OKAY) { + return PS_MEM_FAIL; + } + } + if (paD != NULL) { + if (paDlen < (sizeof(pstm_digit) * pa)) { + paDfail = 1; /* have a paD but it's not large enough */ + dst = xzalloc(sizeof(pstm_digit) * pa); + } else { + dst = paD; + memset(dst, 0x0, paDlen); + } + } else { + dst = xzalloc(sizeof(pstm_digit) * pa); + } + + for (ix = 0; ix < pa; ix++) { + /* get offsets into the two bignums */ + ty = min(ix, B->used-1); + tx = ix - ty; + + /* setup temp aliases */ + tmpx = A->dp + tx; + tmpy = B->dp + ty; +/* + This is the number of times the loop will iterate, essentially it's + while (tx++ < a->used && ty-- >= 0) { ... 
} +*/ + iy = min(A->used-tx, ty+1); + + /* execute loop */ + COMBA_FORWARD; + for (iz = 0; iz < iy; ++iz) { + MULADD(*tmpx++, *tmpy--); + } + + /* store term */ + COMBA_STORE(dst[ix]); + } + COMBA_FINI; +/* + setup dest + */ + iy = C->used; + C->used = pa; + C->sign = A->sign ^ B->sign; + { + pstm_digit *tmpc; + tmpc = C->dp; + for (ix = 0; ix < pa; ix++) { + *tmpc++ = dst[ix]; + } +/* + clear unused digits [that existed in the old copy of c] + */ + for (; ix < iy; ix++) { + *tmpc++ = 0; + } + } + pstm_clamp(C); + + if ((paD == NULL) || (paDfail == 1)) { + psFree(dst, pool); + } + + return PS_SUCCESS; +} + +/******************************************************************************/ +#ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS +static int32 pstm_mul_comba16(pstm_int *A, pstm_int *B, pstm_int *C) +{ + pstm_digit c0, c1, c2, at[32]; + + if (C->alloc < 32) { + if (pstm_grow(C, 32) != PSTM_OKAY) { + return PS_MEM_FAIL; + } + } + memcpy(at, A->dp, 16 * sizeof(pstm_digit)); + memcpy(at+16, B->dp, 16 * sizeof(pstm_digit)); + + COMBA_START; + + COMBA_CLEAR; + /* 0 */ + MULADD(at[0], at[16]); + COMBA_STORE(C->dp[0]); + /* 1 */ + COMBA_FORWARD; + MULADD(at[0], at[17]); MULADD(at[1], at[16]); + COMBA_STORE(C->dp[1]); + /* 2 */ + COMBA_FORWARD; + MULADD(at[0], at[18]); MULADD(at[1], at[17]); MULADD(at[2], at[16]); + COMBA_STORE(C->dp[2]); + /* 3 */ + COMBA_FORWARD; + MULADD(at[0], at[19]); MULADD(at[1], at[18]); MULADD(at[2], at[17]); MULADD(at[3], at[16]); + COMBA_STORE(C->dp[3]); + /* 4 */ + COMBA_FORWARD; + MULADD(at[0], at[20]); MULADD(at[1], at[19]); MULADD(at[2], at[18]); MULADD(at[3], at[17]); MULADD(at[4], at[16]); + COMBA_STORE(C->dp[4]); + /* 5 */ + COMBA_FORWARD; + MULADD(at[0], at[21]); MULADD(at[1], at[20]); MULADD(at[2], at[19]); MULADD(at[3], at[18]); MULADD(at[4], at[17]); MULADD(at[5], at[16]); + COMBA_STORE(C->dp[5]); + /* 6 */ + COMBA_FORWARD; + MULADD(at[0], at[22]); MULADD(at[1], at[21]); MULADD(at[2], at[20]); MULADD(at[3], at[19]); MULADD(at[4], at[18]); 
MULADD(at[5], at[17]); MULADD(at[6], at[16]); + COMBA_STORE(C->dp[6]); + /* 7 */ + COMBA_FORWARD; + MULADD(at[0], at[23]); MULADD(at[1], at[22]); MULADD(at[2], at[21]); MULADD(at[3], at[20]); MULADD(at[4], at[19]); MULADD(at[5], at[18]); MULADD(at[6], at[17]); MULADD(at[7], at[16]); + COMBA_STORE(C->dp[7]); + /* 8 */ + COMBA_FORWARD; + MULADD(at[0], at[24]); MULADD(at[1], at[23]); MULADD(at[2], at[22]); MULADD(at[3], at[21]); MULADD(at[4], at[20]); MULADD(at[5], at[19]); MULADD(at[6], at[18]); MULADD(at[7], at[17]); MULADD(at[8], at[16]); + COMBA_STORE(C->dp[8]); + /* 9 */ + COMBA_FORWARD; + MULADD(at[0], at[25]); MULADD(at[1], at[24]); MULADD(at[2], at[23]); MULADD(at[3], at[22]); MULADD(at[4], at[21]); MULADD(at[5], at[20]); MULADD(at[6], at[19]); MULADD(at[7], at[18]); MULADD(at[8], at[17]); MULADD(at[9], at[16]); + COMBA_STORE(C->dp[9]); + /* 10 */ + COMBA_FORWARD; + MULADD(at[0], at[26]); MULADD(at[1], at[25]); MULADD(at[2], at[24]); MULADD(at[3], at[23]); MULADD(at[4], at[22]); MULADD(at[5], at[21]); MULADD(at[6], at[20]); MULADD(at[7], at[19]); MULADD(at[8], at[18]); MULADD(at[9], at[17]); MULADD(at[10], at[16]); + COMBA_STORE(C->dp[10]); + /* 11 */ + COMBA_FORWARD; + MULADD(at[0], at[27]); MULADD(at[1], at[26]); MULADD(at[2], at[25]); MULADD(at[3], at[24]); MULADD(at[4], at[23]); MULADD(at[5], at[22]); MULADD(at[6], at[21]); MULADD(at[7], at[20]); MULADD(at[8], at[19]); MULADD(at[9], at[18]); MULADD(at[10], at[17]); MULADD(at[11], at[16]); + COMBA_STORE(C->dp[11]); + /* 12 */ + COMBA_FORWARD; + MULADD(at[0], at[28]); MULADD(at[1], at[27]); MULADD(at[2], at[26]); MULADD(at[3], at[25]); MULADD(at[4], at[24]); MULADD(at[5], at[23]); MULADD(at[6], at[22]); MULADD(at[7], at[21]); MULADD(at[8], at[20]); MULADD(at[9], at[19]); MULADD(at[10], at[18]); MULADD(at[11], at[17]); MULADD(at[12], at[16]); + COMBA_STORE(C->dp[12]); + /* 13 */ + COMBA_FORWARD; + MULADD(at[0], at[29]); MULADD(at[1], at[28]); MULADD(at[2], at[27]); MULADD(at[3], at[26]); MULADD(at[4], 
at[25]); MULADD(at[5], at[24]); MULADD(at[6], at[23]); MULADD(at[7], at[22]); MULADD(at[8], at[21]); MULADD(at[9], at[20]); MULADD(at[10], at[19]); MULADD(at[11], at[18]); MULADD(at[12], at[17]); MULADD(at[13], at[16]); + COMBA_STORE(C->dp[13]); + /* 14 */ + COMBA_FORWARD; + MULADD(at[0], at[30]); MULADD(at[1], at[29]); MULADD(at[2], at[28]); MULADD(at[3], at[27]); MULADD(at[4], at[26]); MULADD(at[5], at[25]); MULADD(at[6], at[24]); MULADD(at[7], at[23]); MULADD(at[8], at[22]); MULADD(at[9], at[21]); MULADD(at[10], at[20]); MULADD(at[11], at[19]); MULADD(at[12], at[18]); MULADD(at[13], at[17]); MULADD(at[14], at[16]); + COMBA_STORE(C->dp[14]); + /* 15 */ + COMBA_FORWARD; + MULADD(at[0], at[31]); MULADD(at[1], at[30]); MULADD(at[2], at[29]); MULADD(at[3], at[28]); MULADD(at[4], at[27]); MULADD(at[5], at[26]); MULADD(at[6], at[25]); MULADD(at[7], at[24]); MULADD(at[8], at[23]); MULADD(at[9], at[22]); MULADD(at[10], at[21]); MULADD(at[11], at[20]); MULADD(at[12], at[19]); MULADD(at[13], at[18]); MULADD(at[14], at[17]); MULADD(at[15], at[16]); + COMBA_STORE(C->dp[15]); + /* 16 */ + COMBA_FORWARD; + MULADD(at[1], at[31]); MULADD(at[2], at[30]); MULADD(at[3], at[29]); MULADD(at[4], at[28]); MULADD(at[5], at[27]); MULADD(at[6], at[26]); MULADD(at[7], at[25]); MULADD(at[8], at[24]); MULADD(at[9], at[23]); MULADD(at[10], at[22]); MULADD(at[11], at[21]); MULADD(at[12], at[20]); MULADD(at[13], at[19]); MULADD(at[14], at[18]); MULADD(at[15], at[17]); + COMBA_STORE(C->dp[16]); + /* 17 */ + COMBA_FORWARD; + MULADD(at[2], at[31]); MULADD(at[3], at[30]); MULADD(at[4], at[29]); MULADD(at[5], at[28]); MULADD(at[6], at[27]); MULADD(at[7], at[26]); MULADD(at[8], at[25]); MULADD(at[9], at[24]); MULADD(at[10], at[23]); MULADD(at[11], at[22]); MULADD(at[12], at[21]); MULADD(at[13], at[20]); MULADD(at[14], at[19]); MULADD(at[15], at[18]); + COMBA_STORE(C->dp[17]); + /* 18 */ + COMBA_FORWARD; + MULADD(at[3], at[31]); MULADD(at[4], at[30]); MULADD(at[5], at[29]); MULADD(at[6], at[28]); 
MULADD(at[7], at[27]); MULADD(at[8], at[26]); MULADD(at[9], at[25]); MULADD(at[10], at[24]); MULADD(at[11], at[23]); MULADD(at[12], at[22]); MULADD(at[13], at[21]); MULADD(at[14], at[20]); MULADD(at[15], at[19]); + COMBA_STORE(C->dp[18]); + /* 19 */ + COMBA_FORWARD; + MULADD(at[4], at[31]); MULADD(at[5], at[30]); MULADD(at[6], at[29]); MULADD(at[7], at[28]); MULADD(at[8], at[27]); MULADD(at[9], at[26]); MULADD(at[10], at[25]); MULADD(at[11], at[24]); MULADD(at[12], at[23]); MULADD(at[13], at[22]); MULADD(at[14], at[21]); MULADD(at[15], at[20]); + COMBA_STORE(C->dp[19]); + /* 20 */ + COMBA_FORWARD; + MULADD(at[5], at[31]); MULADD(at[6], at[30]); MULADD(at[7], at[29]); MULADD(at[8], at[28]); MULADD(at[9], at[27]); MULADD(at[10], at[26]); MULADD(at[11], at[25]); MULADD(at[12], at[24]); MULADD(at[13], at[23]); MULADD(at[14], at[22]); MULADD(at[15], at[21]); + COMBA_STORE(C->dp[20]); + /* 21 */ + COMBA_FORWARD; + MULADD(at[6], at[31]); MULADD(at[7], at[30]); MULADD(at[8], at[29]); MULADD(at[9], at[28]); MULADD(at[10], at[27]); MULADD(at[11], at[26]); MULADD(at[12], at[25]); MULADD(at[13], at[24]); MULADD(at[14], at[23]); MULADD(at[15], at[22]); + COMBA_STORE(C->dp[21]); + /* 22 */ + COMBA_FORWARD; + MULADD(at[7], at[31]); MULADD(at[8], at[30]); MULADD(at[9], at[29]); MULADD(at[10], at[28]); MULADD(at[11], at[27]); MULADD(at[12], at[26]); MULADD(at[13], at[25]); MULADD(at[14], at[24]); MULADD(at[15], at[23]); + COMBA_STORE(C->dp[22]); + /* 23 */ + COMBA_FORWARD; + MULADD(at[8], at[31]); MULADD(at[9], at[30]); MULADD(at[10], at[29]); MULADD(at[11], at[28]); MULADD(at[12], at[27]); MULADD(at[13], at[26]); MULADD(at[14], at[25]); MULADD(at[15], at[24]); + COMBA_STORE(C->dp[23]); + /* 24 */ + COMBA_FORWARD; + MULADD(at[9], at[31]); MULADD(at[10], at[30]); MULADD(at[11], at[29]); MULADD(at[12], at[28]); MULADD(at[13], at[27]); MULADD(at[14], at[26]); MULADD(at[15], at[25]); + COMBA_STORE(C->dp[24]); + /* 25 */ + COMBA_FORWARD; + MULADD(at[10], at[31]); MULADD(at[11], at[30]); 
MULADD(at[12], at[29]); MULADD(at[13], at[28]); MULADD(at[14], at[27]); MULADD(at[15], at[26]); + COMBA_STORE(C->dp[25]); + /* 26 */ + COMBA_FORWARD; + MULADD(at[11], at[31]); MULADD(at[12], at[30]); MULADD(at[13], at[29]); MULADD(at[14], at[28]); MULADD(at[15], at[27]); + COMBA_STORE(C->dp[26]); + /* 27 */ + COMBA_FORWARD; + MULADD(at[12], at[31]); MULADD(at[13], at[30]); MULADD(at[14], at[29]); MULADD(at[15], at[28]); + COMBA_STORE(C->dp[27]); + /* 28 */ + COMBA_FORWARD; + MULADD(at[13], at[31]); MULADD(at[14], at[30]); MULADD(at[15], at[29]); + COMBA_STORE(C->dp[28]); + /* 29 */ + COMBA_FORWARD; + MULADD(at[14], at[31]); MULADD(at[15], at[30]); + COMBA_STORE(C->dp[29]); + /* 30 */ + COMBA_FORWARD; + MULADD(at[15], at[31]); + COMBA_STORE(C->dp[30]); + COMBA_STORE2(C->dp[31]); + C->used = 32; + C->sign = A->sign ^ B->sign; + pstm_clamp(C); + COMBA_FINI; + return PSTM_OKAY; +} +#endif /* USE_1024_KEY_SPEED_OPTIMIZATIONS */ + + +#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS +static int32 pstm_mul_comba32(pstm_int *A, pstm_int *B, pstm_int *C) +{ + pstm_digit c0, c1, c2, at[64]; + int32 out_size; + + if (C->alloc < 64) { + if (pstm_grow(C, 64) != PSTM_OKAY) { + return PS_MEM_FAIL; + } + } + + out_size = A->used + B->used; + memcpy(at, A->dp, 32 * sizeof(pstm_digit)); + memcpy(at+32, B->dp, 32 * sizeof(pstm_digit)); + COMBA_START; + + COMBA_CLEAR; + /* 0 */ + MULADD(at[0], at[32]); + COMBA_STORE(C->dp[0]); + /* 1 */ + COMBA_FORWARD; + MULADD(at[0], at[33]); MULADD(at[1], at[32]); + COMBA_STORE(C->dp[1]); + /* 2 */ + COMBA_FORWARD; + MULADD(at[0], at[34]); MULADD(at[1], at[33]); MULADD(at[2], at[32]); + COMBA_STORE(C->dp[2]); + /* 3 */ + COMBA_FORWARD; + MULADD(at[0], at[35]); MULADD(at[1], at[34]); MULADD(at[2], at[33]); MULADD(at[3], at[32]); + COMBA_STORE(C->dp[3]); + /* 4 */ + COMBA_FORWARD; + MULADD(at[0], at[36]); MULADD(at[1], at[35]); MULADD(at[2], at[34]); MULADD(at[3], at[33]); MULADD(at[4], at[32]); + COMBA_STORE(C->dp[4]); + /* 5 */ + COMBA_FORWARD; + 
MULADD(at[0], at[37]); MULADD(at[1], at[36]); MULADD(at[2], at[35]); MULADD(at[3], at[34]); MULADD(at[4], at[33]); MULADD(at[5], at[32]); + COMBA_STORE(C->dp[5]); + /* 6 */ + COMBA_FORWARD; + MULADD(at[0], at[38]); MULADD(at[1], at[37]); MULADD(at[2], at[36]); MULADD(at[3], at[35]); MULADD(at[4], at[34]); MULADD(at[5], at[33]); MULADD(at[6], at[32]); + COMBA_STORE(C->dp[6]); + /* 7 */ + COMBA_FORWARD; + MULADD(at[0], at[39]); MULADD(at[1], at[38]); MULADD(at[2], at[37]); MULADD(at[3], at[36]); MULADD(at[4], at[35]); MULADD(at[5], at[34]); MULADD(at[6], at[33]); MULADD(at[7], at[32]); + COMBA_STORE(C->dp[7]); + /* 8 */ + COMBA_FORWARD; + MULADD(at[0], at[40]); MULADD(at[1], at[39]); MULADD(at[2], at[38]); MULADD(at[3], at[37]); MULADD(at[4], at[36]); MULADD(at[5], at[35]); MULADD(at[6], at[34]); MULADD(at[7], at[33]); MULADD(at[8], at[32]); + COMBA_STORE(C->dp[8]); + /* 9 */ + COMBA_FORWARD; + MULADD(at[0], at[41]); MULADD(at[1], at[40]); MULADD(at[2], at[39]); MULADD(at[3], at[38]); MULADD(at[4], at[37]); MULADD(at[5], at[36]); MULADD(at[6], at[35]); MULADD(at[7], at[34]); MULADD(at[8], at[33]); MULADD(at[9], at[32]); + COMBA_STORE(C->dp[9]); + /* 10 */ + COMBA_FORWARD; + MULADD(at[0], at[42]); MULADD(at[1], at[41]); MULADD(at[2], at[40]); MULADD(at[3], at[39]); MULADD(at[4], at[38]); MULADD(at[5], at[37]); MULADD(at[6], at[36]); MULADD(at[7], at[35]); MULADD(at[8], at[34]); MULADD(at[9], at[33]); MULADD(at[10], at[32]); + COMBA_STORE(C->dp[10]); + /* 11 */ + COMBA_FORWARD; + MULADD(at[0], at[43]); MULADD(at[1], at[42]); MULADD(at[2], at[41]); MULADD(at[3], at[40]); MULADD(at[4], at[39]); MULADD(at[5], at[38]); MULADD(at[6], at[37]); MULADD(at[7], at[36]); MULADD(at[8], at[35]); MULADD(at[9], at[34]); MULADD(at[10], at[33]); MULADD(at[11], at[32]); + COMBA_STORE(C->dp[11]); + /* 12 */ + COMBA_FORWARD; + MULADD(at[0], at[44]); MULADD(at[1], at[43]); MULADD(at[2], at[42]); MULADD(at[3], at[41]); MULADD(at[4], at[40]); MULADD(at[5], at[39]); MULADD(at[6], at[38]); 
MULADD(at[7], at[37]); MULADD(at[8], at[36]); MULADD(at[9], at[35]); MULADD(at[10], at[34]); MULADD(at[11], at[33]); MULADD(at[12], at[32]); + COMBA_STORE(C->dp[12]); + /* 13 */ + COMBA_FORWARD; + MULADD(at[0], at[45]); MULADD(at[1], at[44]); MULADD(at[2], at[43]); MULADD(at[3], at[42]); MULADD(at[4], at[41]); MULADD(at[5], at[40]); MULADD(at[6], at[39]); MULADD(at[7], at[38]); MULADD(at[8], at[37]); MULADD(at[9], at[36]); MULADD(at[10], at[35]); MULADD(at[11], at[34]); MULADD(at[12], at[33]); MULADD(at[13], at[32]); + COMBA_STORE(C->dp[13]); + /* 14 */ + COMBA_FORWARD; + MULADD(at[0], at[46]); MULADD(at[1], at[45]); MULADD(at[2], at[44]); MULADD(at[3], at[43]); MULADD(at[4], at[42]); MULADD(at[5], at[41]); MULADD(at[6], at[40]); MULADD(at[7], at[39]); MULADD(at[8], at[38]); MULADD(at[9], at[37]); MULADD(at[10], at[36]); MULADD(at[11], at[35]); MULADD(at[12], at[34]); MULADD(at[13], at[33]); MULADD(at[14], at[32]); + COMBA_STORE(C->dp[14]); + /* 15 */ + COMBA_FORWARD; + MULADD(at[0], at[47]); MULADD(at[1], at[46]); MULADD(at[2], at[45]); MULADD(at[3], at[44]); MULADD(at[4], at[43]); MULADD(at[5], at[42]); MULADD(at[6], at[41]); MULADD(at[7], at[40]); MULADD(at[8], at[39]); MULADD(at[9], at[38]); MULADD(at[10], at[37]); MULADD(at[11], at[36]); MULADD(at[12], at[35]); MULADD(at[13], at[34]); MULADD(at[14], at[33]); MULADD(at[15], at[32]); + COMBA_STORE(C->dp[15]); + /* 16 */ + COMBA_FORWARD; + MULADD(at[0], at[48]); MULADD(at[1], at[47]); MULADD(at[2], at[46]); MULADD(at[3], at[45]); MULADD(at[4], at[44]); MULADD(at[5], at[43]); MULADD(at[6], at[42]); MULADD(at[7], at[41]); MULADD(at[8], at[40]); MULADD(at[9], at[39]); MULADD(at[10], at[38]); MULADD(at[11], at[37]); MULADD(at[12], at[36]); MULADD(at[13], at[35]); MULADD(at[14], at[34]); MULADD(at[15], at[33]); MULADD(at[16], at[32]); + COMBA_STORE(C->dp[16]); + /* 17 */ + COMBA_FORWARD; + MULADD(at[0], at[49]); MULADD(at[1], at[48]); MULADD(at[2], at[47]); MULADD(at[3], at[46]); MULADD(at[4], at[45]); MULADD(at[5], 
at[44]); MULADD(at[6], at[43]); MULADD(at[7], at[42]); MULADD(at[8], at[41]); MULADD(at[9], at[40]); MULADD(at[10], at[39]); MULADD(at[11], at[38]); MULADD(at[12], at[37]); MULADD(at[13], at[36]); MULADD(at[14], at[35]); MULADD(at[15], at[34]); MULADD(at[16], at[33]); MULADD(at[17], at[32]); + COMBA_STORE(C->dp[17]); + /* 18 */ + COMBA_FORWARD; + MULADD(at[0], at[50]); MULADD(at[1], at[49]); MULADD(at[2], at[48]); MULADD(at[3], at[47]); MULADD(at[4], at[46]); MULADD(at[5], at[45]); MULADD(at[6], at[44]); MULADD(at[7], at[43]); MULADD(at[8], at[42]); MULADD(at[9], at[41]); MULADD(at[10], at[40]); MULADD(at[11], at[39]); MULADD(at[12], at[38]); MULADD(at[13], at[37]); MULADD(at[14], at[36]); MULADD(at[15], at[35]); MULADD(at[16], at[34]); MULADD(at[17], at[33]); MULADD(at[18], at[32]); + COMBA_STORE(C->dp[18]); + /* 19 */ + COMBA_FORWARD; + MULADD(at[0], at[51]); MULADD(at[1], at[50]); MULADD(at[2], at[49]); MULADD(at[3], at[48]); MULADD(at[4], at[47]); MULADD(at[5], at[46]); MULADD(at[6], at[45]); MULADD(at[7], at[44]); MULADD(at[8], at[43]); MULADD(at[9], at[42]); MULADD(at[10], at[41]); MULADD(at[11], at[40]); MULADD(at[12], at[39]); MULADD(at[13], at[38]); MULADD(at[14], at[37]); MULADD(at[15], at[36]); MULADD(at[16], at[35]); MULADD(at[17], at[34]); MULADD(at[18], at[33]); MULADD(at[19], at[32]); + COMBA_STORE(C->dp[19]); + /* 20 */ + COMBA_FORWARD; + MULADD(at[0], at[52]); MULADD(at[1], at[51]); MULADD(at[2], at[50]); MULADD(at[3], at[49]); MULADD(at[4], at[48]); MULADD(at[5], at[47]); MULADD(at[6], at[46]); MULADD(at[7], at[45]); MULADD(at[8], at[44]); MULADD(at[9], at[43]); MULADD(at[10], at[42]); MULADD(at[11], at[41]); MULADD(at[12], at[40]); MULADD(at[13], at[39]); MULADD(at[14], at[38]); MULADD(at[15], at[37]); MULADD(at[16], at[36]); MULADD(at[17], at[35]); MULADD(at[18], at[34]); MULADD(at[19], at[33]); MULADD(at[20], at[32]); + COMBA_STORE(C->dp[20]); + /* 21 */ + COMBA_FORWARD; + MULADD(at[0], at[53]); MULADD(at[1], at[52]); MULADD(at[2], at[51]); 
MULADD(at[3], at[50]); MULADD(at[4], at[49]); MULADD(at[5], at[48]); MULADD(at[6], at[47]); MULADD(at[7], at[46]); MULADD(at[8], at[45]); MULADD(at[9], at[44]); MULADD(at[10], at[43]); MULADD(at[11], at[42]); MULADD(at[12], at[41]); MULADD(at[13], at[40]); MULADD(at[14], at[39]); MULADD(at[15], at[38]); MULADD(at[16], at[37]); MULADD(at[17], at[36]); MULADD(at[18], at[35]); MULADD(at[19], at[34]); MULADD(at[20], at[33]); MULADD(at[21], at[32]); + COMBA_STORE(C->dp[21]); + /* 22 */ + COMBA_FORWARD; + MULADD(at[0], at[54]); MULADD(at[1], at[53]); MULADD(at[2], at[52]); MULADD(at[3], at[51]); MULADD(at[4], at[50]); MULADD(at[5], at[49]); MULADD(at[6], at[48]); MULADD(at[7], at[47]); MULADD(at[8], at[46]); MULADD(at[9], at[45]); MULADD(at[10], at[44]); MULADD(at[11], at[43]); MULADD(at[12], at[42]); MULADD(at[13], at[41]); MULADD(at[14], at[40]); MULADD(at[15], at[39]); MULADD(at[16], at[38]); MULADD(at[17], at[37]); MULADD(at[18], at[36]); MULADD(at[19], at[35]); MULADD(at[20], at[34]); MULADD(at[21], at[33]); MULADD(at[22], at[32]); + COMBA_STORE(C->dp[22]); + /* 23 */ + COMBA_FORWARD; + MULADD(at[0], at[55]); MULADD(at[1], at[54]); MULADD(at[2], at[53]); MULADD(at[3], at[52]); MULADD(at[4], at[51]); MULADD(at[5], at[50]); MULADD(at[6], at[49]); MULADD(at[7], at[48]); MULADD(at[8], at[47]); MULADD(at[9], at[46]); MULADD(at[10], at[45]); MULADD(at[11], at[44]); MULADD(at[12], at[43]); MULADD(at[13], at[42]); MULADD(at[14], at[41]); MULADD(at[15], at[40]); MULADD(at[16], at[39]); MULADD(at[17], at[38]); MULADD(at[18], at[37]); MULADD(at[19], at[36]); MULADD(at[20], at[35]); MULADD(at[21], at[34]); MULADD(at[22], at[33]); MULADD(at[23], at[32]); + COMBA_STORE(C->dp[23]); + /* 24 */ + COMBA_FORWARD; + MULADD(at[0], at[56]); MULADD(at[1], at[55]); MULADD(at[2], at[54]); MULADD(at[3], at[53]); MULADD(at[4], at[52]); MULADD(at[5], at[51]); MULADD(at[6], at[50]); MULADD(at[7], at[49]); MULADD(at[8], at[48]); MULADD(at[9], at[47]); MULADD(at[10], at[46]); MULADD(at[11], 
at[45]); MULADD(at[12], at[44]); MULADD(at[13], at[43]); MULADD(at[14], at[42]); MULADD(at[15], at[41]); MULADD(at[16], at[40]); MULADD(at[17], at[39]); MULADD(at[18], at[38]); MULADD(at[19], at[37]); MULADD(at[20], at[36]); MULADD(at[21], at[35]); MULADD(at[22], at[34]); MULADD(at[23], at[33]); MULADD(at[24], at[32]); + COMBA_STORE(C->dp[24]); + /* 25 */ + COMBA_FORWARD; + MULADD(at[0], at[57]); MULADD(at[1], at[56]); MULADD(at[2], at[55]); MULADD(at[3], at[54]); MULADD(at[4], at[53]); MULADD(at[5], at[52]); MULADD(at[6], at[51]); MULADD(at[7], at[50]); MULADD(at[8], at[49]); MULADD(at[9], at[48]); MULADD(at[10], at[47]); MULADD(at[11], at[46]); MULADD(at[12], at[45]); MULADD(at[13], at[44]); MULADD(at[14], at[43]); MULADD(at[15], at[42]); MULADD(at[16], at[41]); MULADD(at[17], at[40]); MULADD(at[18], at[39]); MULADD(at[19], at[38]); MULADD(at[20], at[37]); MULADD(at[21], at[36]); MULADD(at[22], at[35]); MULADD(at[23], at[34]); MULADD(at[24], at[33]); MULADD(at[25], at[32]); + COMBA_STORE(C->dp[25]); + /* 26 */ + COMBA_FORWARD; + MULADD(at[0], at[58]); MULADD(at[1], at[57]); MULADD(at[2], at[56]); MULADD(at[3], at[55]); MULADD(at[4], at[54]); MULADD(at[5], at[53]); MULADD(at[6], at[52]); MULADD(at[7], at[51]); MULADD(at[8], at[50]); MULADD(at[9], at[49]); MULADD(at[10], at[48]); MULADD(at[11], at[47]); MULADD(at[12], at[46]); MULADD(at[13], at[45]); MULADD(at[14], at[44]); MULADD(at[15], at[43]); MULADD(at[16], at[42]); MULADD(at[17], at[41]); MULADD(at[18], at[40]); MULADD(at[19], at[39]); MULADD(at[20], at[38]); MULADD(at[21], at[37]); MULADD(at[22], at[36]); MULADD(at[23], at[35]); MULADD(at[24], at[34]); MULADD(at[25], at[33]); MULADD(at[26], at[32]); + COMBA_STORE(C->dp[26]); + /* 27 */ + COMBA_FORWARD; + MULADD(at[0], at[59]); MULADD(at[1], at[58]); MULADD(at[2], at[57]); MULADD(at[3], at[56]); MULADD(at[4], at[55]); MULADD(at[5], at[54]); MULADD(at[6], at[53]); MULADD(at[7], at[52]); MULADD(at[8], at[51]); MULADD(at[9], at[50]); MULADD(at[10], at[49]); 
MULADD(at[11], at[48]); MULADD(at[12], at[47]); MULADD(at[13], at[46]); MULADD(at[14], at[45]); MULADD(at[15], at[44]); MULADD(at[16], at[43]); MULADD(at[17], at[42]); MULADD(at[18], at[41]); MULADD(at[19], at[40]); MULADD(at[20], at[39]); MULADD(at[21], at[38]); MULADD(at[22], at[37]); MULADD(at[23], at[36]); MULADD(at[24], at[35]); MULADD(at[25], at[34]); MULADD(at[26], at[33]); MULADD(at[27], at[32]); + COMBA_STORE(C->dp[27]); + /* 28 */ + COMBA_FORWARD; + MULADD(at[0], at[60]); MULADD(at[1], at[59]); MULADD(at[2], at[58]); MULADD(at[3], at[57]); MULADD(at[4], at[56]); MULADD(at[5], at[55]); MULADD(at[6], at[54]); MULADD(at[7], at[53]); MULADD(at[8], at[52]); MULADD(at[9], at[51]); MULADD(at[10], at[50]); MULADD(at[11], at[49]); MULADD(at[12], at[48]); MULADD(at[13], at[47]); MULADD(at[14], at[46]); MULADD(at[15], at[45]); MULADD(at[16], at[44]); MULADD(at[17], at[43]); MULADD(at[18], at[42]); MULADD(at[19], at[41]); MULADD(at[20], at[40]); MULADD(at[21], at[39]); MULADD(at[22], at[38]); MULADD(at[23], at[37]); MULADD(at[24], at[36]); MULADD(at[25], at[35]); MULADD(at[26], at[34]); MULADD(at[27], at[33]); MULADD(at[28], at[32]); + COMBA_STORE(C->dp[28]); + /* 29 */ + COMBA_FORWARD; + MULADD(at[0], at[61]); MULADD(at[1], at[60]); MULADD(at[2], at[59]); MULADD(at[3], at[58]); MULADD(at[4], at[57]); MULADD(at[5], at[56]); MULADD(at[6], at[55]); MULADD(at[7], at[54]); MULADD(at[8], at[53]); MULADD(at[9], at[52]); MULADD(at[10], at[51]); MULADD(at[11], at[50]); MULADD(at[12], at[49]); MULADD(at[13], at[48]); MULADD(at[14], at[47]); MULADD(at[15], at[46]); MULADD(at[16], at[45]); MULADD(at[17], at[44]); MULADD(at[18], at[43]); MULADD(at[19], at[42]); MULADD(at[20], at[41]); MULADD(at[21], at[40]); MULADD(at[22], at[39]); MULADD(at[23], at[38]); MULADD(at[24], at[37]); MULADD(at[25], at[36]); MULADD(at[26], at[35]); MULADD(at[27], at[34]); MULADD(at[28], at[33]); MULADD(at[29], at[32]); + COMBA_STORE(C->dp[29]); + /* 30 */ + COMBA_FORWARD; + MULADD(at[0], at[62]); 
MULADD(at[1], at[61]); MULADD(at[2], at[60]); MULADD(at[3], at[59]); MULADD(at[4], at[58]); MULADD(at[5], at[57]); MULADD(at[6], at[56]); MULADD(at[7], at[55]); MULADD(at[8], at[54]); MULADD(at[9], at[53]); MULADD(at[10], at[52]); MULADD(at[11], at[51]); MULADD(at[12], at[50]); MULADD(at[13], at[49]); MULADD(at[14], at[48]); MULADD(at[15], at[47]); MULADD(at[16], at[46]); MULADD(at[17], at[45]); MULADD(at[18], at[44]); MULADD(at[19], at[43]); MULADD(at[20], at[42]); MULADD(at[21], at[41]); MULADD(at[22], at[40]); MULADD(at[23], at[39]); MULADD(at[24], at[38]); MULADD(at[25], at[37]); MULADD(at[26], at[36]); MULADD(at[27], at[35]); MULADD(at[28], at[34]); MULADD(at[29], at[33]); MULADD(at[30], at[32]); + COMBA_STORE(C->dp[30]); + /* 31 */ + COMBA_FORWARD; + MULADD(at[0], at[63]); MULADD(at[1], at[62]); MULADD(at[2], at[61]); MULADD(at[3], at[60]); MULADD(at[4], at[59]); MULADD(at[5], at[58]); MULADD(at[6], at[57]); MULADD(at[7], at[56]); MULADD(at[8], at[55]); MULADD(at[9], at[54]); MULADD(at[10], at[53]); MULADD(at[11], at[52]); MULADD(at[12], at[51]); MULADD(at[13], at[50]); MULADD(at[14], at[49]); MULADD(at[15], at[48]); MULADD(at[16], at[47]); MULADD(at[17], at[46]); MULADD(at[18], at[45]); MULADD(at[19], at[44]); MULADD(at[20], at[43]); MULADD(at[21], at[42]); MULADD(at[22], at[41]); MULADD(at[23], at[40]); MULADD(at[24], at[39]); MULADD(at[25], at[38]); MULADD(at[26], at[37]); MULADD(at[27], at[36]); MULADD(at[28], at[35]); MULADD(at[29], at[34]); MULADD(at[30], at[33]); MULADD(at[31], at[32]); + COMBA_STORE(C->dp[31]); + /* 32 */ + COMBA_FORWARD; + MULADD(at[1], at[63]); MULADD(at[2], at[62]); MULADD(at[3], at[61]); MULADD(at[4], at[60]); MULADD(at[5], at[59]); MULADD(at[6], at[58]); MULADD(at[7], at[57]); MULADD(at[8], at[56]); MULADD(at[9], at[55]); MULADD(at[10], at[54]); MULADD(at[11], at[53]); MULADD(at[12], at[52]); MULADD(at[13], at[51]); MULADD(at[14], at[50]); MULADD(at[15], at[49]); MULADD(at[16], at[48]); MULADD(at[17], at[47]); MULADD(at[18], 
at[46]); MULADD(at[19], at[45]); MULADD(at[20], at[44]); MULADD(at[21], at[43]); MULADD(at[22], at[42]); MULADD(at[23], at[41]); MULADD(at[24], at[40]); MULADD(at[25], at[39]); MULADD(at[26], at[38]); MULADD(at[27], at[37]); MULADD(at[28], at[36]); MULADD(at[29], at[35]); MULADD(at[30], at[34]); MULADD(at[31], at[33]); + COMBA_STORE(C->dp[32]); + /* 33 */ + COMBA_FORWARD; + MULADD(at[2], at[63]); MULADD(at[3], at[62]); MULADD(at[4], at[61]); MULADD(at[5], at[60]); MULADD(at[6], at[59]); MULADD(at[7], at[58]); MULADD(at[8], at[57]); MULADD(at[9], at[56]); MULADD(at[10], at[55]); MULADD(at[11], at[54]); MULADD(at[12], at[53]); MULADD(at[13], at[52]); MULADD(at[14], at[51]); MULADD(at[15], at[50]); MULADD(at[16], at[49]); MULADD(at[17], at[48]); MULADD(at[18], at[47]); MULADD(at[19], at[46]); MULADD(at[20], at[45]); MULADD(at[21], at[44]); MULADD(at[22], at[43]); MULADD(at[23], at[42]); MULADD(at[24], at[41]); MULADD(at[25], at[40]); MULADD(at[26], at[39]); MULADD(at[27], at[38]); MULADD(at[28], at[37]); MULADD(at[29], at[36]); MULADD(at[30], at[35]); MULADD(at[31], at[34]); + COMBA_STORE(C->dp[33]); + /* 34 */ + COMBA_FORWARD; + MULADD(at[3], at[63]); MULADD(at[4], at[62]); MULADD(at[5], at[61]); MULADD(at[6], at[60]); MULADD(at[7], at[59]); MULADD(at[8], at[58]); MULADD(at[9], at[57]); MULADD(at[10], at[56]); MULADD(at[11], at[55]); MULADD(at[12], at[54]); MULADD(at[13], at[53]); MULADD(at[14], at[52]); MULADD(at[15], at[51]); MULADD(at[16], at[50]); MULADD(at[17], at[49]); MULADD(at[18], at[48]); MULADD(at[19], at[47]); MULADD(at[20], at[46]); MULADD(at[21], at[45]); MULADD(at[22], at[44]); MULADD(at[23], at[43]); MULADD(at[24], at[42]); MULADD(at[25], at[41]); MULADD(at[26], at[40]); MULADD(at[27], at[39]); MULADD(at[28], at[38]); MULADD(at[29], at[37]); MULADD(at[30], at[36]); MULADD(at[31], at[35]); + COMBA_STORE(C->dp[34]); + /* 35 */ + COMBA_FORWARD; + MULADD(at[4], at[63]); MULADD(at[5], at[62]); MULADD(at[6], at[61]); MULADD(at[7], at[60]); MULADD(at[8], 
at[59]); MULADD(at[9], at[58]); MULADD(at[10], at[57]); MULADD(at[11], at[56]); MULADD(at[12], at[55]); MULADD(at[13], at[54]); MULADD(at[14], at[53]); MULADD(at[15], at[52]); MULADD(at[16], at[51]); MULADD(at[17], at[50]); MULADD(at[18], at[49]); MULADD(at[19], at[48]); MULADD(at[20], at[47]); MULADD(at[21], at[46]); MULADD(at[22], at[45]); MULADD(at[23], at[44]); MULADD(at[24], at[43]); MULADD(at[25], at[42]); MULADD(at[26], at[41]); MULADD(at[27], at[40]); MULADD(at[28], at[39]); MULADD(at[29], at[38]); MULADD(at[30], at[37]); MULADD(at[31], at[36]); + COMBA_STORE(C->dp[35]); + /* 36 */ + COMBA_FORWARD; + MULADD(at[5], at[63]); MULADD(at[6], at[62]); MULADD(at[7], at[61]); MULADD(at[8], at[60]); MULADD(at[9], at[59]); MULADD(at[10], at[58]); MULADD(at[11], at[57]); MULADD(at[12], at[56]); MULADD(at[13], at[55]); MULADD(at[14], at[54]); MULADD(at[15], at[53]); MULADD(at[16], at[52]); MULADD(at[17], at[51]); MULADD(at[18], at[50]); MULADD(at[19], at[49]); MULADD(at[20], at[48]); MULADD(at[21], at[47]); MULADD(at[22], at[46]); MULADD(at[23], at[45]); MULADD(at[24], at[44]); MULADD(at[25], at[43]); MULADD(at[26], at[42]); MULADD(at[27], at[41]); MULADD(at[28], at[40]); MULADD(at[29], at[39]); MULADD(at[30], at[38]); MULADD(at[31], at[37]); + COMBA_STORE(C->dp[36]); + /* 37 */ + COMBA_FORWARD; + MULADD(at[6], at[63]); MULADD(at[7], at[62]); MULADD(at[8], at[61]); MULADD(at[9], at[60]); MULADD(at[10], at[59]); MULADD(at[11], at[58]); MULADD(at[12], at[57]); MULADD(at[13], at[56]); MULADD(at[14], at[55]); MULADD(at[15], at[54]); MULADD(at[16], at[53]); MULADD(at[17], at[52]); MULADD(at[18], at[51]); MULADD(at[19], at[50]); MULADD(at[20], at[49]); MULADD(at[21], at[48]); MULADD(at[22], at[47]); MULADD(at[23], at[46]); MULADD(at[24], at[45]); MULADD(at[25], at[44]); MULADD(at[26], at[43]); MULADD(at[27], at[42]); MULADD(at[28], at[41]); MULADD(at[29], at[40]); MULADD(at[30], at[39]); MULADD(at[31], at[38]); + COMBA_STORE(C->dp[37]); + /* 38 */ + COMBA_FORWARD; + 
MULADD(at[7], at[63]); MULADD(at[8], at[62]); MULADD(at[9], at[61]); MULADD(at[10], at[60]); MULADD(at[11], at[59]); MULADD(at[12], at[58]); MULADD(at[13], at[57]); MULADD(at[14], at[56]); MULADD(at[15], at[55]); MULADD(at[16], at[54]); MULADD(at[17], at[53]); MULADD(at[18], at[52]); MULADD(at[19], at[51]); MULADD(at[20], at[50]); MULADD(at[21], at[49]); MULADD(at[22], at[48]); MULADD(at[23], at[47]); MULADD(at[24], at[46]); MULADD(at[25], at[45]); MULADD(at[26], at[44]); MULADD(at[27], at[43]); MULADD(at[28], at[42]); MULADD(at[29], at[41]); MULADD(at[30], at[40]); MULADD(at[31], at[39]); + COMBA_STORE(C->dp[38]); + + /* early out at 40 digits, 40*32==1280, or two 640 bit operands */ + if (out_size <= 40) { COMBA_STORE2(C->dp[39]); C->used = 40; C->sign = A->sign ^ B->sign; pstm_clamp(C); COMBA_FINI; return PSTM_OKAY; } + + /* 39 */ + COMBA_FORWARD; + MULADD(at[8], at[63]); MULADD(at[9], at[62]); MULADD(at[10], at[61]); MULADD(at[11], at[60]); MULADD(at[12], at[59]); MULADD(at[13], at[58]); MULADD(at[14], at[57]); MULADD(at[15], at[56]); MULADD(at[16], at[55]); MULADD(at[17], at[54]); MULADD(at[18], at[53]); MULADD(at[19], at[52]); MULADD(at[20], at[51]); MULADD(at[21], at[50]); MULADD(at[22], at[49]); MULADD(at[23], at[48]); MULADD(at[24], at[47]); MULADD(at[25], at[46]); MULADD(at[26], at[45]); MULADD(at[27], at[44]); MULADD(at[28], at[43]); MULADD(at[29], at[42]); MULADD(at[30], at[41]); MULADD(at[31], at[40]); + COMBA_STORE(C->dp[39]); + /* 40 */ + COMBA_FORWARD; + MULADD(at[9], at[63]); MULADD(at[10], at[62]); MULADD(at[11], at[61]); MULADD(at[12], at[60]); MULADD(at[13], at[59]); MULADD(at[14], at[58]); MULADD(at[15], at[57]); MULADD(at[16], at[56]); MULADD(at[17], at[55]); MULADD(at[18], at[54]); MULADD(at[19], at[53]); MULADD(at[20], at[52]); MULADD(at[21], at[51]); MULADD(at[22], at[50]); MULADD(at[23], at[49]); MULADD(at[24], at[48]); MULADD(at[25], at[47]); MULADD(at[26], at[46]); MULADD(at[27], at[45]); MULADD(at[28], at[44]); MULADD(at[29], at[43]); 
MULADD(at[30], at[42]); MULADD(at[31], at[41]); + COMBA_STORE(C->dp[40]); + /* 41 */ + COMBA_FORWARD; + MULADD(at[10], at[63]); MULADD(at[11], at[62]); MULADD(at[12], at[61]); MULADD(at[13], at[60]); MULADD(at[14], at[59]); MULADD(at[15], at[58]); MULADD(at[16], at[57]); MULADD(at[17], at[56]); MULADD(at[18], at[55]); MULADD(at[19], at[54]); MULADD(at[20], at[53]); MULADD(at[21], at[52]); MULADD(at[22], at[51]); MULADD(at[23], at[50]); MULADD(at[24], at[49]); MULADD(at[25], at[48]); MULADD(at[26], at[47]); MULADD(at[27], at[46]); MULADD(at[28], at[45]); MULADD(at[29], at[44]); MULADD(at[30], at[43]); MULADD(at[31], at[42]); + COMBA_STORE(C->dp[41]); + /* 42 */ + COMBA_FORWARD; + MULADD(at[11], at[63]); MULADD(at[12], at[62]); MULADD(at[13], at[61]); MULADD(at[14], at[60]); MULADD(at[15], at[59]); MULADD(at[16], at[58]); MULADD(at[17], at[57]); MULADD(at[18], at[56]); MULADD(at[19], at[55]); MULADD(at[20], at[54]); MULADD(at[21], at[53]); MULADD(at[22], at[52]); MULADD(at[23], at[51]); MULADD(at[24], at[50]); MULADD(at[25], at[49]); MULADD(at[26], at[48]); MULADD(at[27], at[47]); MULADD(at[28], at[46]); MULADD(at[29], at[45]); MULADD(at[30], at[44]); MULADD(at[31], at[43]); + COMBA_STORE(C->dp[42]); + /* 43 */ + COMBA_FORWARD; + MULADD(at[12], at[63]); MULADD(at[13], at[62]); MULADD(at[14], at[61]); MULADD(at[15], at[60]); MULADD(at[16], at[59]); MULADD(at[17], at[58]); MULADD(at[18], at[57]); MULADD(at[19], at[56]); MULADD(at[20], at[55]); MULADD(at[21], at[54]); MULADD(at[22], at[53]); MULADD(at[23], at[52]); MULADD(at[24], at[51]); MULADD(at[25], at[50]); MULADD(at[26], at[49]); MULADD(at[27], at[48]); MULADD(at[28], at[47]); MULADD(at[29], at[46]); MULADD(at[30], at[45]); MULADD(at[31], at[44]); + COMBA_STORE(C->dp[43]); + /* 44 */ + COMBA_FORWARD; + MULADD(at[13], at[63]); MULADD(at[14], at[62]); MULADD(at[15], at[61]); MULADD(at[16], at[60]); MULADD(at[17], at[59]); MULADD(at[18], at[58]); MULADD(at[19], at[57]); MULADD(at[20], at[56]); MULADD(at[21], at[55]); 
MULADD(at[22], at[54]); MULADD(at[23], at[53]); MULADD(at[24], at[52]); MULADD(at[25], at[51]); MULADD(at[26], at[50]); MULADD(at[27], at[49]); MULADD(at[28], at[48]); MULADD(at[29], at[47]); MULADD(at[30], at[46]); MULADD(at[31], at[45]); + COMBA_STORE(C->dp[44]); + /* 45 */ + COMBA_FORWARD; + MULADD(at[14], at[63]); MULADD(at[15], at[62]); MULADD(at[16], at[61]); MULADD(at[17], at[60]); MULADD(at[18], at[59]); MULADD(at[19], at[58]); MULADD(at[20], at[57]); MULADD(at[21], at[56]); MULADD(at[22], at[55]); MULADD(at[23], at[54]); MULADD(at[24], at[53]); MULADD(at[25], at[52]); MULADD(at[26], at[51]); MULADD(at[27], at[50]); MULADD(at[28], at[49]); MULADD(at[29], at[48]); MULADD(at[30], at[47]); MULADD(at[31], at[46]); + COMBA_STORE(C->dp[45]); + /* 46 */ + COMBA_FORWARD; + MULADD(at[15], at[63]); MULADD(at[16], at[62]); MULADD(at[17], at[61]); MULADD(at[18], at[60]); MULADD(at[19], at[59]); MULADD(at[20], at[58]); MULADD(at[21], at[57]); MULADD(at[22], at[56]); MULADD(at[23], at[55]); MULADD(at[24], at[54]); MULADD(at[25], at[53]); MULADD(at[26], at[52]); MULADD(at[27], at[51]); MULADD(at[28], at[50]); MULADD(at[29], at[49]); MULADD(at[30], at[48]); MULADD(at[31], at[47]); + COMBA_STORE(C->dp[46]); + + /* early out at 48 digits, 48*32==1536, or two 768 bit operands */ + if (out_size <= 48) { COMBA_STORE2(C->dp[47]); C->used = 48; C->sign = A->sign ^ B->sign; pstm_clamp(C); COMBA_FINI; return PSTM_OKAY; } + + /* 47 */ + COMBA_FORWARD; + MULADD(at[16], at[63]); MULADD(at[17], at[62]); MULADD(at[18], at[61]); MULADD(at[19], at[60]); MULADD(at[20], at[59]); MULADD(at[21], at[58]); MULADD(at[22], at[57]); MULADD(at[23], at[56]); MULADD(at[24], at[55]); MULADD(at[25], at[54]); MULADD(at[26], at[53]); MULADD(at[27], at[52]); MULADD(at[28], at[51]); MULADD(at[29], at[50]); MULADD(at[30], at[49]); MULADD(at[31], at[48]); + COMBA_STORE(C->dp[47]); + /* 48 */ + COMBA_FORWARD; + MULADD(at[17], at[63]); MULADD(at[18], at[62]); MULADD(at[19], at[61]); MULADD(at[20], at[60]); 
MULADD(at[21], at[59]); MULADD(at[22], at[58]); MULADD(at[23], at[57]); MULADD(at[24], at[56]); MULADD(at[25], at[55]); MULADD(at[26], at[54]); MULADD(at[27], at[53]); MULADD(at[28], at[52]); MULADD(at[29], at[51]); MULADD(at[30], at[50]); MULADD(at[31], at[49]); + COMBA_STORE(C->dp[48]); + /* 49 */ + COMBA_FORWARD; + MULADD(at[18], at[63]); MULADD(at[19], at[62]); MULADD(at[20], at[61]); MULADD(at[21], at[60]); MULADD(at[22], at[59]); MULADD(at[23], at[58]); MULADD(at[24], at[57]); MULADD(at[25], at[56]); MULADD(at[26], at[55]); MULADD(at[27], at[54]); MULADD(at[28], at[53]); MULADD(at[29], at[52]); MULADD(at[30], at[51]); MULADD(at[31], at[50]); + COMBA_STORE(C->dp[49]); + /* 50 */ + COMBA_FORWARD; + MULADD(at[19], at[63]); MULADD(at[20], at[62]); MULADD(at[21], at[61]); MULADD(at[22], at[60]); MULADD(at[23], at[59]); MULADD(at[24], at[58]); MULADD(at[25], at[57]); MULADD(at[26], at[56]); MULADD(at[27], at[55]); MULADD(at[28], at[54]); MULADD(at[29], at[53]); MULADD(at[30], at[52]); MULADD(at[31], at[51]); + COMBA_STORE(C->dp[50]); + /* 51 */ + COMBA_FORWARD; + MULADD(at[20], at[63]); MULADD(at[21], at[62]); MULADD(at[22], at[61]); MULADD(at[23], at[60]); MULADD(at[24], at[59]); MULADD(at[25], at[58]); MULADD(at[26], at[57]); MULADD(at[27], at[56]); MULADD(at[28], at[55]); MULADD(at[29], at[54]); MULADD(at[30], at[53]); MULADD(at[31], at[52]); + COMBA_STORE(C->dp[51]); + /* 52 */ + COMBA_FORWARD; + MULADD(at[21], at[63]); MULADD(at[22], at[62]); MULADD(at[23], at[61]); MULADD(at[24], at[60]); MULADD(at[25], at[59]); MULADD(at[26], at[58]); MULADD(at[27], at[57]); MULADD(at[28], at[56]); MULADD(at[29], at[55]); MULADD(at[30], at[54]); MULADD(at[31], at[53]); + COMBA_STORE(C->dp[52]); + /* 53 */ + COMBA_FORWARD; + MULADD(at[22], at[63]); MULADD(at[23], at[62]); MULADD(at[24], at[61]); MULADD(at[25], at[60]); MULADD(at[26], at[59]); MULADD(at[27], at[58]); MULADD(at[28], at[57]); MULADD(at[29], at[56]); MULADD(at[30], at[55]); MULADD(at[31], at[54]); + 
COMBA_STORE(C->dp[53]); + /* 54 */ + COMBA_FORWARD; + MULADD(at[23], at[63]); MULADD(at[24], at[62]); MULADD(at[25], at[61]); MULADD(at[26], at[60]); MULADD(at[27], at[59]); MULADD(at[28], at[58]); MULADD(at[29], at[57]); MULADD(at[30], at[56]); MULADD(at[31], at[55]); + COMBA_STORE(C->dp[54]); + + /* early out at 56 digits, 56*32==1792, or two 896 bit operands */ + if (out_size <= 56) { COMBA_STORE2(C->dp[55]); C->used = 56; C->sign = A->sign ^ B->sign; pstm_clamp(C); COMBA_FINI; return PSTM_OKAY; } + + /* 55 */ + COMBA_FORWARD; + MULADD(at[24], at[63]); MULADD(at[25], at[62]); MULADD(at[26], at[61]); MULADD(at[27], at[60]); MULADD(at[28], at[59]); MULADD(at[29], at[58]); MULADD(at[30], at[57]); MULADD(at[31], at[56]); + COMBA_STORE(C->dp[55]); + /* 56 */ + COMBA_FORWARD; + MULADD(at[25], at[63]); MULADD(at[26], at[62]); MULADD(at[27], at[61]); MULADD(at[28], at[60]); MULADD(at[29], at[59]); MULADD(at[30], at[58]); MULADD(at[31], at[57]); + COMBA_STORE(C->dp[56]); + /* 57 */ + COMBA_FORWARD; + MULADD(at[26], at[63]); MULADD(at[27], at[62]); MULADD(at[28], at[61]); MULADD(at[29], at[60]); MULADD(at[30], at[59]); MULADD(at[31], at[58]); + COMBA_STORE(C->dp[57]); + /* 58 */ + COMBA_FORWARD; + MULADD(at[27], at[63]); MULADD(at[28], at[62]); MULADD(at[29], at[61]); MULADD(at[30], at[60]); MULADD(at[31], at[59]); + COMBA_STORE(C->dp[58]); + /* 59 */ + COMBA_FORWARD; + MULADD(at[28], at[63]); MULADD(at[29], at[62]); MULADD(at[30], at[61]); MULADD(at[31], at[60]); + COMBA_STORE(C->dp[59]); + /* 60 */ + COMBA_FORWARD; + MULADD(at[29], at[63]); MULADD(at[30], at[62]); MULADD(at[31], at[61]); + COMBA_STORE(C->dp[60]); + /* 61 */ + COMBA_FORWARD; + MULADD(at[30], at[63]); MULADD(at[31], at[62]); + COMBA_STORE(C->dp[61]); + /* 62 */ + COMBA_FORWARD; + MULADD(at[31], at[63]); + COMBA_STORE(C->dp[62]); + COMBA_STORE2(C->dp[63]); + C->used = 64; + C->sign = A->sign ^ B->sign; + pstm_clamp(C); + COMBA_FINI; + return PSTM_OKAY; +} +#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */ + 
+/******************************************************************************/ + +int32 pstm_mul_comba(psPool_t *pool, pstm_int *A, pstm_int *B, pstm_int *C, + pstm_digit *paD, uint32 paDlen) +{ +#ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS + if (A->used == 16 && B->used == 16) { + return pstm_mul_comba16(A, B, C); + } else { +#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS + if (A->used == 32 && B->used == 32) { + return pstm_mul_comba32(A, B, C); + } +#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */ + return pstm_mul_comba_gen(pool, A, B, C, paD, paDlen); + } +#else +#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS + if (A->used == 32 && B->used == 32) { + return pstm_mul_comba32(A, B, C); + } +#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */ + return pstm_mul_comba_gen(pool, A, B, C, paD, paDlen); +#endif +} + +#endif /* !DISABLE_PSTM */ +/******************************************************************************/ diff --git a/networking/tls_pstm_sqr_comba.c b/networking/tls_pstm_sqr_comba.c new file mode 100644 index 000000000..98186d31f --- /dev/null +++ b/networking/tls_pstm_sqr_comba.c @@ -0,0 +1,1107 @@ +/* + * Copyright (C) 2017 Denys Vlasenko + * + * Licensed under GPLv2, see file LICENSE in this source tree. + */ +#include "tls.h" + +/** + * @file pstm_sqr_comba.c + * @version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master) + * + * Multiprecision Squaring with Comba technique. + */ +/* + * Copyright (c) 2013-2015 INSIDE Secure Corporation + * Copyright (c) PeerSec Networks, 2002-2011 + * All Rights Reserved + * + * The latest version of this code is available at http://www.matrixssl.org + * + * This software is open source; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This General Public License does NOT permit incorporating this software + * into proprietary programs. If you are unable to comply with the GPL, a + * commercial license for this software may be purchased from INSIDE at + * http://www.insidesecure.com/eng/Company/Locations + * + * This program is distributed in WITHOUT ANY WARRANTY; without even the + * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * http://www.gnu.org/copyleft/gpl.html + */ +/******************************************************************************/ + +///bbox +//#include "../cryptoApi.h" +#ifndef DISABLE_PSTM + +/******************************************************************************/ +#if defined(PSTM_X86) +/* x86-32 optimized for 32 bit platforms. 
For 64 bit mode use X86_64 instead */ +#if !defined(__GNUC__) || !defined(__i386__) +#error "PSTM_X86 option requires GCC and 32 bit mode x86 processor" +#endif +//#pragma message ("Using 32 bit x86 Assembly Optimizations") + +#define COMBA_START + +#define CLEAR_CARRY \ + c0 = c1 = c2 = 0; + +#define COMBA_STORE(x) \ + x = c0; + +#define COMBA_STORE2(x) \ + x = c1; + +#define CARRY_FORWARD \ + do { c0 = c1; c1 = c2; c2 = 0; } while (0); + +#define COMBA_FINI + +#define SQRADD(i, j) \ +asm( \ + "movl %6,%%eax \n\t" \ + "mull %%eax \n\t" \ + "addl %%eax,%0 \n\t" \ + "adcl %%edx,%1 \n\t" \ + "adcl $0,%2 \n\t" \ + :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc"); + +#define SQRADD2(i, j) \ +asm( \ + "movl %6,%%eax \n\t" \ + "mull %7 \n\t" \ + "addl %%eax,%0 \n\t" \ + "adcl %%edx,%1 \n\t" \ + "adcl $0,%2 \n\t" \ + "addl %%eax,%0 \n\t" \ + "adcl %%edx,%1 \n\t" \ + "adcl $0,%2 \n\t" \ + :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc"); + +#define SQRADDSC(i, j) \ +asm( \ + "movl %6,%%eax \n\t" \ + "mull %7 \n\t" \ + "movl %%eax,%0 \n\t" \ + "movl %%edx,%1 \n\t" \ + "xorl %2,%2 \n\t" \ + :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc"); + +#define SQRADDAC(i, j) \ +asm( \ + "movl %6,%%eax \n\t" \ + "mull %7 \n\t" \ + "addl %%eax,%0 \n\t" \ + "adcl %%edx,%1 \n\t" \ + "adcl $0,%2 \n\t" \ + :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc"); + +#define SQRADDDB \ +asm( \ + "addl %6,%0 \n\t" \ + "adcl %7,%1 \n\t" \ + "adcl %8,%2 \n\t" \ + "addl %6,%0 \n\t" \ + "adcl %7,%1 \n\t" \ + "adcl %8,%2 \n\t" \ + :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "%cc"); + +/******************************************************************************/ +#elif defined(PSTM_X86_64) +/* x86-64 optimized */ +#if !defined(__GNUC__) || !defined(__x86_64__) || 
!defined(PSTM_64BIT) +#error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor" +#endif +//#pragma message ("Using 64 bit x86_64 Assembly Optimizations") + +#define COMBA_START + +#define CLEAR_CARRY \ +c0 = c1 = c2 = 0; + +#define COMBA_STORE(x) \ +x = c0; + +#define COMBA_STORE2(x) \ +x = c1; + +#define CARRY_FORWARD \ +do { c0 = c1; c1 = c2; c2 = 0; } while (0); + +#define COMBA_FINI + +#define SQRADD(i, j) \ +asm( \ + "movq %6,%%rax \n\t" \ + "mulq %%rax \n\t" \ + "addq %%rax,%0 \n\t" \ + "adcq %%rdx,%1 \n\t" \ + "adcq $0,%2 \n\t" \ + :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","cc"); + +#define SQRADD2(i, j) \ +asm( \ + "movq %6,%%rax \n\t" \ + "mulq %7 \n\t" \ + "addq %%rax,%0 \n\t" \ + "adcq %%rdx,%1 \n\t" \ + "adcq $0,%2 \n\t" \ + "addq %%rax,%0 \n\t" \ + "adcq %%rdx,%1 \n\t" \ + "adcq $0,%2 \n\t" \ + :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc"); + +#define SQRADDSC(i, j) \ +asm( \ + "movq %6,%%rax \n\t" \ + "mulq %7 \n\t" \ + "movq %%rax,%0 \n\t" \ + "movq %%rdx,%1 \n\t" \ + "xorq %2,%2 \n\t" \ + :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc"); + +#define SQRADDAC(i, j) \ +asm( \ + "movq %6,%%rax \n\t" \ + "mulq %7 \n\t" \ + "addq %%rax,%0 \n\t" \ + "adcq %%rdx,%1 \n\t" \ + "adcq $0,%2 \n\t" \ + :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc"); + +#define SQRADDDB \ +asm( \ + "addq %6,%0 \n\t" \ + "adcq %7,%1 \n\t" \ + "adcq %8,%2 \n\t" \ + "addq %6,%0 \n\t" \ + "adcq %7,%1 \n\t" \ + "adcq %8,%2 \n\t" \ + :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc"); + +/******************************************************************************/ +#elif defined(PSTM_ARM) +/* ARM code */ +//#pragma message ("Using 32 bit ARM Assembly Optimizations") + +#define COMBA_START + +#define CLEAR_CARRY \ +c0 = c1 = c2 = 
0; + +#define COMBA_STORE(x) \ +x = c0; + +#define COMBA_STORE2(x) \ +x = c1; + +#define CARRY_FORWARD \ +do { c0 = c1; c1 = c2; c2 = 0; } while (0); + +#define COMBA_FINI + +/* multiplies point i and j, updates carry "c1" and digit c2 */ +#define SQRADD(i, j) \ +asm( \ +" UMULL r0,r1,%6,%6 \n\t" \ +" ADDS %0,%0,r0 \n\t" \ +" ADCS %1,%1,r1 \n\t" \ +" ADC %2,%2,#0 \n\t" \ +:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "%cc"); + +/* for squaring some of the terms are doubled... */ +#define SQRADD2(i, j) \ +asm( \ +" UMULL r0,r1,%6,%7 \n\t" \ +" ADDS %0,%0,r0 \n\t" \ +" ADCS %1,%1,r1 \n\t" \ +" ADC %2,%2,#0 \n\t" \ +" ADDS %0,%0,r0 \n\t" \ +" ADCS %1,%1,r1 \n\t" \ +" ADC %2,%2,#0 \n\t" \ +:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "%cc"); + +#define SQRADDSC(i, j) \ +asm( \ +" UMULL %0,%1,%6,%7 \n\t" \ +" SUB %2,%2,%2 \n\t" \ +:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "%cc"); + +#define SQRADDAC(i, j) \ +asm( \ +" UMULL r0,r1,%6,%7 \n\t" \ +" ADDS %0,%0,r0 \n\t" \ +" ADCS %1,%1,r1 \n\t" \ +" ADC %2,%2,#0 \n\t" \ +:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "%cc"); + +#define SQRADDDB \ +asm( \ +" ADDS %0,%0,%3 \n\t" \ +" ADCS %1,%1,%4 \n\t" \ +" ADC %2,%2,%5 \n\t" \ +" ADDS %0,%0,%3 \n\t" \ +" ADCS %1,%1,%4 \n\t" \ +" ADC %2,%2,%5 \n\t" \ +:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "%cc"); + +/******************************************************************************/ +#elif defined(PSTM_MIPS) +/* MIPS32 */ +//#pragma message ("Using 32 bit MIPS Assembly Optimizations") + +#define COMBA_START + +#define CLEAR_CARRY \ +c0 = c1 = c2 = 0; + +#define COMBA_STORE(x) \ +x = c0; + +#define COMBA_STORE2(x) \ +x = c1; + +#define CARRY_FORWARD \ +do { c0 = c1; c1 = c2; c2 = 0; } while (0); + +#define COMBA_FINI + +/* multiplies point i and j, updates carry "c1" 
and digit c2 */ +#define SQRADD(i, j) \ +asm( \ + " multu %6,%6 \n\t" \ + " mflo $12 \n\t" \ + " mfhi $13 \n\t" \ + " addu %0,%0,$12 \n\t" \ + " sltu $12,%0,$12 \n\t" \ + " addu %1,%1,$13 \n\t" \ + " sltu $13,%1,$13 \n\t" \ + " addu %1,%1,$12 \n\t" \ + " sltu $12,%1,$12 \n\t" \ + " addu %2,%2,$13 \n\t" \ + " addu %2,%2,$12 \n\t" \ + :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"$12","$13"); + +/* for squaring some of the terms are doubled... */ +#define SQRADD2(i, j) \ +asm( \ + " multu %6,%7 \n\t" \ + " mflo $12 \n\t" \ + " mfhi $13 \n\t" \ + \ + " addu %0,%0,$12 \n\t" \ + " sltu $14,%0,$12 \n\t" \ + " addu %1,%1,$13 \n\t" \ + " sltu $15,%1,$13 \n\t" \ + " addu %1,%1,$14 \n\t" \ + " sltu $14,%1,$14 \n\t" \ + " addu %2,%2,$15 \n\t" \ + " addu %2,%2,$14 \n\t" \ + \ + " addu %0,%0,$12 \n\t" \ + " sltu $14,%0,$12 \n\t" \ + " addu %1,%1,$13 \n\t" \ + " sltu $15,%1,$13 \n\t" \ + " addu %1,%1,$14 \n\t" \ + " sltu $14,%1,$14 \n\t" \ + " addu %2,%2,$15 \n\t" \ + " addu %2,%2,$14 \n\t" \ + :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12", "$13", "$14", "$15"); + +#define SQRADDSC(i, j) \ +asm( \ + " multu %6,%7 \n\t" \ + " mflo %0 \n\t" \ + " mfhi %1 \n\t" \ + " xor %2,%2,%2 \n\t" \ + :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "%cc"); + +#define SQRADDAC(i, j) \ +asm( \ + " multu %6,%7 \n\t" \ + " mflo $12 \n\t" \ + " mfhi $13 \n\t" \ + " addu %0,%0,$12 \n\t" \ + " sltu $12,%0,$12 \n\t" \ + " addu %1,%1,$13 \n\t" \ + " sltu $13,%1,$13 \n\t" \ + " addu %1,%1,$12 \n\t" \ + " sltu $12,%1,$12 \n\t" \ + " addu %2,%2,$13 \n\t" \ + " addu %2,%2,$12 \n\t" \ + :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"$12", "$13", "$14"); + +#define SQRADDDB \ +asm( \ + " addu %0,%0,%3 \n\t" \ + " sltu $10,%0,%3 \n\t" \ + " addu %1,%1,$10 \n\t" \ + " sltu $10,%1,$10 \n\t" \ + " addu %1,%1,%4 \n\t" \ + " sltu $11,%1,%4 \n\t" \ + " addu %2,%2,$10 \n\t" \ + " addu %2,%2,$11 \n\t" \ + 
" addu %2,%2,%5 \n\t" \ + \ + " addu %0,%0,%3 \n\t" \ + " sltu $10,%0,%3 \n\t" \ + " addu %1,%1,$10 \n\t" \ + " sltu $10,%1,$10 \n\t" \ + " addu %1,%1,%4 \n\t" \ + " sltu $11,%1,%4 \n\t" \ + " addu %2,%2,$10 \n\t" \ + " addu %2,%2,$11 \n\t" \ + " addu %2,%2,%5 \n\t" \ + :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "$10", "$11"); + +#else +/******************************************************************************/ +#define PSTM_ISO +/* ISO C portable code */ + +#define COMBA_START + +#define CLEAR_CARRY \ + c0 = c1 = c2 = 0; + +#define COMBA_STORE(x) \ + x = c0; + +#define COMBA_STORE2(x) \ + x = c1; + +#define CARRY_FORWARD \ + do { c0 = c1; c1 = c2; c2 = 0; } while (0); + +#define COMBA_FINI + +/* multiplies point i and j, updates carry "c1" and digit c2 */ +#define SQRADD(i, j) \ + do { pstm_word t; \ + t = c0 + ((pstm_word)i) * ((pstm_word)j); c0 = (pstm_digit)t; \ + t = c1 + (t >> DIGIT_BIT); \ + c1 = (pstm_digit)t; c2 += (pstm_digit)(t >> DIGIT_BIT); \ + } while (0); + + +/* for squaring some of the terms are doubled... 
*/ +#define SQRADD2(i, j) \ + do { pstm_word t; \ + t = ((pstm_word)i) * ((pstm_word)j); \ + tt = (pstm_word)c0 + t; c0 = (pstm_digit)tt; \ + tt = (pstm_word)c1 + (tt >> DIGIT_BIT); \ + c1 = (pstm_digit)tt; c2 += (pstm_digit)(tt >> DIGIT_BIT); \ + tt = (pstm_word)c0 + t; c0 = (pstm_digit)tt; \ + tt = (pstm_word)c1 + (tt >> DIGIT_BIT); \ + c1 = (pstm_digit)tt; c2 += (pstm_digit)(tt >> DIGIT_BIT); \ + } while (0); + +#define SQRADDSC(i, j) \ + do { pstm_word t; \ + t = ((pstm_word)i) * ((pstm_word)j); \ + sc0 = (pstm_digit)t; sc1 = (pstm_digit)(t >> DIGIT_BIT); sc2 = 0; \ + } while (0); + +#define SQRADDAC(i, j) \ + do { pstm_word t; \ + t = ((pstm_word)sc0) + ((pstm_word)i) * ((pstm_word)j); \ + sc0 = (pstm_digit)t; \ + t = ((pstm_word)sc1) + (t >> DIGIT_BIT); sc1 = (pstm_digit)t; \ + sc2 += (pstm_digit)(t >> DIGIT_BIT); \ + } while (0); + +#define SQRADDDB \ + do { pstm_word t; \ + t = ((pstm_word)sc0) + ((pstm_word)sc0) + ((pstm_word)c0); \ + c0 = (pstm_digit)t; \ + t = ((pstm_word)sc1) + ((pstm_word)sc1) + c1 + (t >> DIGIT_BIT); \ + c1 = (pstm_digit)t; \ + c2 = c2 + sc2 + sc2 + (pstm_digit)(t >> DIGIT_BIT); \ + } while (0); + +#endif /* ISO_C */ + +/******************************************************************************/ +/* + Non-unrolled comba squarer + */ +///bbox: pool unused +#define pstm_sqr_comba_gen(pool, A, B, paD, paDlen) \ + pstm_sqr_comba_gen( A, B, paD, paDlen) +static int32 pstm_sqr_comba_gen(psPool_t *pool, pstm_int *A, pstm_int *B, + pstm_digit *paD, uint32 paDlen) +{ + int16 paDfail, pa; + int32 ix, iz; + pstm_digit c0, c1, c2, *dst; +#ifdef PSTM_ISO + pstm_word tt; +#endif + + paDfail = 0; + /* get size of output and trim */ + pa = A->used + A->used; + + /* number of output digits to produce */ + COMBA_START; + CLEAR_CARRY; +/* + If b is not large enough grow it and continue +*/ + if (B->alloc < pa) { + if (pstm_grow(B, pa) != PSTM_OKAY) { + return PS_MEM_FAIL; + } + } + if (paD != NULL) { + if (paDlen < (sizeof(pstm_digit) * pa)) { + 
paDfail = 1; /* have a paD, but it's not big enough */ + dst = xzalloc(sizeof(pstm_digit) * pa); + } else { + dst = paD; + memset(dst, 0x0, paDlen); + } + } else { + dst = xzalloc(sizeof(pstm_digit) * pa); + } + + for (ix = 0; ix < pa; ix++) { + int32 tx, ty, iy; + pstm_digit *tmpy, *tmpx; + + /* get offsets into the two bignums */ + ty = min(A->used-1, ix); + tx = ix - ty; + + /* setup temp aliases */ + tmpx = A->dp + tx; + tmpy = A->dp + ty; + +/* + This is the number of times the loop will iterate, + while (tx++ < a->used && ty-- >= 0) { ... } +*/ + iy = min(A->used-tx, ty+1); + +/* + now for squaring tx can never equal ty. We halve the distance since + they approach at a rate of 2x and we have to round because odd cases + need to be executed +*/ + iy = min(iy, (ty-tx+1)>>1); + + /* forward carries */ + CARRY_FORWARD; + + /* execute loop */ + for (iz = 0; iz < iy; iz++) { + SQRADD2(*tmpx++, *tmpy--); + } + + /* even columns have the square term in them */ + if ((ix&1) == 0) { + SQRADD(A->dp[ix>>1], A->dp[ix>>1]); + } + + /* store it */ + COMBA_STORE(dst[ix]); + } + + COMBA_FINI; +/* + setup dest + */ + iz = B->used; + B->used = pa; + { + pstm_digit *tmpc; + tmpc = B->dp; + for (ix = 0; ix < pa; ix++) { + *tmpc++ = dst[ix]; + } + /* clear unused digits (that existed in the old copy of c) */ + for (; ix < iz; ix++) { + *tmpc++ = 0; + } + } + pstm_clamp(B); + + if ((paD == NULL) || paDfail == 1) { + psFree(dst, pool); + } + return PS_SUCCESS; +} + +/******************************************************************************/ +/* + Unrolled Comba loop for 1024 bit keys + */ +#ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS +static int32 pstm_sqr_comba16(pstm_int *A, pstm_int *B) +{ + pstm_digit *a, b[32], c0, c1, c2, sc0, sc1, sc2; +#ifdef PSTM_ISO + pstm_word tt; +#endif + + if (B->alloc < 32) { + if (pstm_grow(B, 32) != PSTM_OKAY) { + return PS_MEM_FAIL; + } + } + a = A->dp; + sc0 = sc1 = sc2 = 0; + + COMBA_START; + + /* clear carries */ + CLEAR_CARRY; + + /* output 0 
*/ + SQRADD(a[0],a[0]); + COMBA_STORE(b[0]); + + /* output 1 */ + CARRY_FORWARD; + SQRADD2(a[0], a[1]); + COMBA_STORE(b[1]); + + /* output 2 */ + CARRY_FORWARD; + SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]); + COMBA_STORE(b[2]); + + /* output 3 */ + CARRY_FORWARD; + SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]); + COMBA_STORE(b[3]); + + /* output 4 */ + CARRY_FORWARD; + SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]); + COMBA_STORE(b[4]); + + /* output 5 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; + COMBA_STORE(b[5]); + + /* output 6 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]); + COMBA_STORE(b[6]); + + /* output 7 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB; + COMBA_STORE(b[7]); + + /* output 8 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); + COMBA_STORE(b[8]); + + /* output 9 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB; + COMBA_STORE(b[9]); + + /* output 10 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]); + COMBA_STORE(b[10]); + + /* output 11 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB; + COMBA_STORE(b[11]); + + /* output 12 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]); + COMBA_STORE(b[12]); + + /* output 13 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); 
SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB; + COMBA_STORE(b[13]); + + /* output 14 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]); + COMBA_STORE(b[14]); + + /* output 15 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB; + COMBA_STORE(b[15]); + + /* output 16 */ + CARRY_FORWARD; + SQRADDSC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]); + COMBA_STORE(b[16]); + + /* output 17 */ + CARRY_FORWARD; + SQRADDSC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB; + COMBA_STORE(b[17]); + + /* output 18 */ + CARRY_FORWARD; + SQRADDSC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]); + COMBA_STORE(b[18]); + + /* output 19 */ + CARRY_FORWARD; + SQRADDSC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB; + COMBA_STORE(b[19]); + + /* output 20 */ + CARRY_FORWARD; + SQRADDSC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]); + COMBA_STORE(b[20]); + + /* output 21 */ + CARRY_FORWARD; + SQRADDSC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB; + COMBA_STORE(b[21]); + + /* output 22 */ + CARRY_FORWARD; + SQRADDSC(a[7], 
a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]); + COMBA_STORE(b[22]); + + /* output 23 */ + CARRY_FORWARD; + SQRADDSC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB; + COMBA_STORE(b[23]); + + /* output 24 */ + CARRY_FORWARD; + SQRADDSC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]); + COMBA_STORE(b[24]); + + /* output 25 */ + CARRY_FORWARD; + SQRADDSC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB; + COMBA_STORE(b[25]); + + /* output 26 */ + CARRY_FORWARD; + SQRADD2(a[11], a[15]); SQRADD2(a[12], a[14]); SQRADD(a[13], a[13]); + COMBA_STORE(b[26]); + + /* output 27 */ + CARRY_FORWARD; + SQRADD2(a[12], a[15]); SQRADD2(a[13], a[14]); + COMBA_STORE(b[27]); + + /* output 28 */ + CARRY_FORWARD; + SQRADD2(a[13], a[15]); SQRADD(a[14], a[14]); + COMBA_STORE(b[28]); + + /* output 29 */ + CARRY_FORWARD; + SQRADD2(a[14], a[15]); + COMBA_STORE(b[29]); + + /* output 30 */ + CARRY_FORWARD; + SQRADD(a[15], a[15]); + COMBA_STORE(b[30]); + COMBA_STORE2(b[31]); + COMBA_FINI; + + B->used = 32; + B->sign = PSTM_ZPOS; + memcpy(B->dp, b, 32 * sizeof(pstm_digit)); + pstm_clamp(B); + return PSTM_OKAY; +} +#endif /* USE_1024_KEY_SPEED_OPTIMIZATIONS */ + + +#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS +static int32 pstm_sqr_comba32(pstm_int *A, pstm_int *B) +{ + pstm_digit *a, b[64], c0, c1, c2, sc0, sc1, sc2; +#ifdef PSTM_ISO + pstm_word tt; +#endif + + if (B->alloc < 64) { + if (pstm_grow(B, 64) != PSTM_OKAY) { + return PS_MEM_FAIL; + } + } + sc0 = sc1 = sc2 = 0; + a = A->dp; + COMBA_START; + + /* clear carries */ + CLEAR_CARRY; + + /* output 0 */ + SQRADD(a[0],a[0]); + COMBA_STORE(b[0]); + + /* output 1 */ + CARRY_FORWARD; + SQRADD2(a[0], a[1]); + COMBA_STORE(b[1]); + + /* output 2 */ + CARRY_FORWARD; + SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]); + COMBA_STORE(b[2]); + + /* output 3 */ + CARRY_FORWARD; + SQRADD2(a[0], 
a[3]); SQRADD2(a[1], a[2]); + COMBA_STORE(b[3]); + + /* output 4 */ + CARRY_FORWARD; + SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]); + COMBA_STORE(b[4]); + + /* output 5 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; + COMBA_STORE(b[5]); + + /* output 6 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]); + COMBA_STORE(b[6]); + + /* output 7 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB; + COMBA_STORE(b[7]); + + /* output 8 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); + COMBA_STORE(b[8]); + + /* output 9 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB; + COMBA_STORE(b[9]); + + /* output 10 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]); + COMBA_STORE(b[10]); + + /* output 11 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB; + COMBA_STORE(b[11]); + + /* output 12 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]); + COMBA_STORE(b[12]); + + /* output 13 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB; + COMBA_STORE(b[13]); + + /* output 14 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); 
SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]); + COMBA_STORE(b[14]); + + /* output 15 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB; + COMBA_STORE(b[15]); + + /* output 16 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[16]); SQRADDAC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]); + COMBA_STORE(b[16]); + + /* output 17 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[17]); SQRADDAC(a[1], a[16]); SQRADDAC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB; + COMBA_STORE(b[17]); + + /* output 18 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[18]); SQRADDAC(a[1], a[17]); SQRADDAC(a[2], a[16]); SQRADDAC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]); + COMBA_STORE(b[18]); + + /* output 19 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[19]); SQRADDAC(a[1], a[18]); SQRADDAC(a[2], a[17]); SQRADDAC(a[3], a[16]); SQRADDAC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB; + COMBA_STORE(b[19]); + + /* output 20 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[20]); SQRADDAC(a[1], a[19]); SQRADDAC(a[2], a[18]); SQRADDAC(a[3], a[17]); SQRADDAC(a[4], a[16]); SQRADDAC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]); + COMBA_STORE(b[20]); + + /* output 21 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[21]); SQRADDAC(a[1], a[20]); SQRADDAC(a[2], a[19]); SQRADDAC(a[3], a[18]); SQRADDAC(a[4], a[17]); SQRADDAC(a[5], a[16]); 
SQRADDAC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB; + COMBA_STORE(b[21]); + + /* output 22 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[22]); SQRADDAC(a[1], a[21]); SQRADDAC(a[2], a[20]); SQRADDAC(a[3], a[19]); SQRADDAC(a[4], a[18]); SQRADDAC(a[5], a[17]); SQRADDAC(a[6], a[16]); SQRADDAC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]); + COMBA_STORE(b[22]); + + /* output 23 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[23]); SQRADDAC(a[1], a[22]); SQRADDAC(a[2], a[21]); SQRADDAC(a[3], a[20]); SQRADDAC(a[4], a[19]); SQRADDAC(a[5], a[18]); SQRADDAC(a[6], a[17]); SQRADDAC(a[7], a[16]); SQRADDAC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB; + COMBA_STORE(b[23]); + + /* output 24 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[24]); SQRADDAC(a[1], a[23]); SQRADDAC(a[2], a[22]); SQRADDAC(a[3], a[21]); SQRADDAC(a[4], a[20]); SQRADDAC(a[5], a[19]); SQRADDAC(a[6], a[18]); SQRADDAC(a[7], a[17]); SQRADDAC(a[8], a[16]); SQRADDAC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]); + COMBA_STORE(b[24]); + + /* output 25 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[25]); SQRADDAC(a[1], a[24]); SQRADDAC(a[2], a[23]); SQRADDAC(a[3], a[22]); SQRADDAC(a[4], a[21]); SQRADDAC(a[5], a[20]); SQRADDAC(a[6], a[19]); SQRADDAC(a[7], a[18]); SQRADDAC(a[8], a[17]); SQRADDAC(a[9], a[16]); SQRADDAC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB; + COMBA_STORE(b[25]); + + /* output 26 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[26]); SQRADDAC(a[1], a[25]); SQRADDAC(a[2], a[24]); SQRADDAC(a[3], a[23]); SQRADDAC(a[4], a[22]); SQRADDAC(a[5], a[21]); SQRADDAC(a[6], a[20]); SQRADDAC(a[7], a[19]); SQRADDAC(a[8], a[18]); SQRADDAC(a[9], a[17]); SQRADDAC(a[10], a[16]); SQRADDAC(a[11], a[15]); SQRADDAC(a[12], a[14]); SQRADDDB; SQRADD(a[13], a[13]); + COMBA_STORE(b[26]); + + /* output 27 
*/ + CARRY_FORWARD; + SQRADDSC(a[0], a[27]); SQRADDAC(a[1], a[26]); SQRADDAC(a[2], a[25]); SQRADDAC(a[3], a[24]); SQRADDAC(a[4], a[23]); SQRADDAC(a[5], a[22]); SQRADDAC(a[6], a[21]); SQRADDAC(a[7], a[20]); SQRADDAC(a[8], a[19]); SQRADDAC(a[9], a[18]); SQRADDAC(a[10], a[17]); SQRADDAC(a[11], a[16]); SQRADDAC(a[12], a[15]); SQRADDAC(a[13], a[14]); SQRADDDB; + COMBA_STORE(b[27]); + + /* output 28 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[28]); SQRADDAC(a[1], a[27]); SQRADDAC(a[2], a[26]); SQRADDAC(a[3], a[25]); SQRADDAC(a[4], a[24]); SQRADDAC(a[5], a[23]); SQRADDAC(a[6], a[22]); SQRADDAC(a[7], a[21]); SQRADDAC(a[8], a[20]); SQRADDAC(a[9], a[19]); SQRADDAC(a[10], a[18]); SQRADDAC(a[11], a[17]); SQRADDAC(a[12], a[16]); SQRADDAC(a[13], a[15]); SQRADDDB; SQRADD(a[14], a[14]); + COMBA_STORE(b[28]); + + /* output 29 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[29]); SQRADDAC(a[1], a[28]); SQRADDAC(a[2], a[27]); SQRADDAC(a[3], a[26]); SQRADDAC(a[4], a[25]); SQRADDAC(a[5], a[24]); SQRADDAC(a[6], a[23]); SQRADDAC(a[7], a[22]); SQRADDAC(a[8], a[21]); SQRADDAC(a[9], a[20]); SQRADDAC(a[10], a[19]); SQRADDAC(a[11], a[18]); SQRADDAC(a[12], a[17]); SQRADDAC(a[13], a[16]); SQRADDAC(a[14], a[15]); SQRADDDB; + COMBA_STORE(b[29]); + + /* output 30 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[30]); SQRADDAC(a[1], a[29]); SQRADDAC(a[2], a[28]); SQRADDAC(a[3], a[27]); SQRADDAC(a[4], a[26]); SQRADDAC(a[5], a[25]); SQRADDAC(a[6], a[24]); SQRADDAC(a[7], a[23]); SQRADDAC(a[8], a[22]); SQRADDAC(a[9], a[21]); SQRADDAC(a[10], a[20]); SQRADDAC(a[11], a[19]); SQRADDAC(a[12], a[18]); SQRADDAC(a[13], a[17]); SQRADDAC(a[14], a[16]); SQRADDDB; SQRADD(a[15], a[15]); + COMBA_STORE(b[30]); + + /* output 31 */ + CARRY_FORWARD; + SQRADDSC(a[0], a[31]); SQRADDAC(a[1], a[30]); SQRADDAC(a[2], a[29]); SQRADDAC(a[3], a[28]); SQRADDAC(a[4], a[27]); SQRADDAC(a[5], a[26]); SQRADDAC(a[6], a[25]); SQRADDAC(a[7], a[24]); SQRADDAC(a[8], a[23]); SQRADDAC(a[9], a[22]); SQRADDAC(a[10], a[21]); SQRADDAC(a[11], a[20]); SQRADDAC(a[12], 
a[19]); SQRADDAC(a[13], a[18]); SQRADDAC(a[14], a[17]); SQRADDAC(a[15], a[16]); SQRADDDB; + COMBA_STORE(b[31]); + + /* output 32 */ + CARRY_FORWARD; + SQRADDSC(a[1], a[31]); SQRADDAC(a[2], a[30]); SQRADDAC(a[3], a[29]); SQRADDAC(a[4], a[28]); SQRADDAC(a[5], a[27]); SQRADDAC(a[6], a[26]); SQRADDAC(a[7], a[25]); SQRADDAC(a[8], a[24]); SQRADDAC(a[9], a[23]); SQRADDAC(a[10], a[22]); SQRADDAC(a[11], a[21]); SQRADDAC(a[12], a[20]); SQRADDAC(a[13], a[19]); SQRADDAC(a[14], a[18]); SQRADDAC(a[15], a[17]); SQRADDDB; SQRADD(a[16], a[16]); + COMBA_STORE(b[32]); + + /* output 33 */ + CARRY_FORWARD; + SQRADDSC(a[2], a[31]); SQRADDAC(a[3], a[30]); SQRADDAC(a[4], a[29]); SQRADDAC(a[5], a[28]); SQRADDAC(a[6], a[27]); SQRADDAC(a[7], a[26]); SQRADDAC(a[8], a[25]); SQRADDAC(a[9], a[24]); SQRADDAC(a[10], a[23]); SQRADDAC(a[11], a[22]); SQRADDAC(a[12], a[21]); SQRADDAC(a[13], a[20]); SQRADDAC(a[14], a[19]); SQRADDAC(a[15], a[18]); SQRADDAC(a[16], a[17]); SQRADDDB; + COMBA_STORE(b[33]); + + /* output 34 */ + CARRY_FORWARD; + SQRADDSC(a[3], a[31]); SQRADDAC(a[4], a[30]); SQRADDAC(a[5], a[29]); SQRADDAC(a[6], a[28]); SQRADDAC(a[7], a[27]); SQRADDAC(a[8], a[26]); SQRADDAC(a[9], a[25]); SQRADDAC(a[10], a[24]); SQRADDAC(a[11], a[23]); SQRADDAC(a[12], a[22]); SQRADDAC(a[13], a[21]); SQRADDAC(a[14], a[20]); SQRADDAC(a[15], a[19]); SQRADDAC(a[16], a[18]); SQRADDDB; SQRADD(a[17], a[17]); + COMBA_STORE(b[34]); + + /* output 35 */ + CARRY_FORWARD; + SQRADDSC(a[4], a[31]); SQRADDAC(a[5], a[30]); SQRADDAC(a[6], a[29]); SQRADDAC(a[7], a[28]); SQRADDAC(a[8], a[27]); SQRADDAC(a[9], a[26]); SQRADDAC(a[10], a[25]); SQRADDAC(a[11], a[24]); SQRADDAC(a[12], a[23]); SQRADDAC(a[13], a[22]); SQRADDAC(a[14], a[21]); SQRADDAC(a[15], a[20]); SQRADDAC(a[16], a[19]); SQRADDAC(a[17], a[18]); SQRADDDB; + COMBA_STORE(b[35]); + + /* output 36 */ + CARRY_FORWARD; + SQRADDSC(a[5], a[31]); SQRADDAC(a[6], a[30]); SQRADDAC(a[7], a[29]); SQRADDAC(a[8], a[28]); SQRADDAC(a[9], a[27]); SQRADDAC(a[10], a[26]); SQRADDAC(a[11], 
a[25]); SQRADDAC(a[12], a[24]); SQRADDAC(a[13], a[23]); SQRADDAC(a[14], a[22]); SQRADDAC(a[15], a[21]); SQRADDAC(a[16], a[20]); SQRADDAC(a[17], a[19]); SQRADDDB; SQRADD(a[18], a[18]); + COMBA_STORE(b[36]); + + /* output 37 */ + CARRY_FORWARD; + SQRADDSC(a[6], a[31]); SQRADDAC(a[7], a[30]); SQRADDAC(a[8], a[29]); SQRADDAC(a[9], a[28]); SQRADDAC(a[10], a[27]); SQRADDAC(a[11], a[26]); SQRADDAC(a[12], a[25]); SQRADDAC(a[13], a[24]); SQRADDAC(a[14], a[23]); SQRADDAC(a[15], a[22]); SQRADDAC(a[16], a[21]); SQRADDAC(a[17], a[20]); SQRADDAC(a[18], a[19]); SQRADDDB; + COMBA_STORE(b[37]); + + /* output 38 */ + CARRY_FORWARD; + SQRADDSC(a[7], a[31]); SQRADDAC(a[8], a[30]); SQRADDAC(a[9], a[29]); SQRADDAC(a[10], a[28]); SQRADDAC(a[11], a[27]); SQRADDAC(a[12], a[26]); SQRADDAC(a[13], a[25]); SQRADDAC(a[14], a[24]); SQRADDAC(a[15], a[23]); SQRADDAC(a[16], a[22]); SQRADDAC(a[17], a[21]); SQRADDAC(a[18], a[20]); SQRADDDB; SQRADD(a[19], a[19]); + COMBA_STORE(b[38]); + + /* output 39 */ + CARRY_FORWARD; + SQRADDSC(a[8], a[31]); SQRADDAC(a[9], a[30]); SQRADDAC(a[10], a[29]); SQRADDAC(a[11], a[28]); SQRADDAC(a[12], a[27]); SQRADDAC(a[13], a[26]); SQRADDAC(a[14], a[25]); SQRADDAC(a[15], a[24]); SQRADDAC(a[16], a[23]); SQRADDAC(a[17], a[22]); SQRADDAC(a[18], a[21]); SQRADDAC(a[19], a[20]); SQRADDDB; + COMBA_STORE(b[39]); + + /* output 40 */ + CARRY_FORWARD; + SQRADDSC(a[9], a[31]); SQRADDAC(a[10], a[30]); SQRADDAC(a[11], a[29]); SQRADDAC(a[12], a[28]); SQRADDAC(a[13], a[27]); SQRADDAC(a[14], a[26]); SQRADDAC(a[15], a[25]); SQRADDAC(a[16], a[24]); SQRADDAC(a[17], a[23]); SQRADDAC(a[18], a[22]); SQRADDAC(a[19], a[21]); SQRADDDB; SQRADD(a[20], a[20]); + COMBA_STORE(b[40]); + + /* output 41 */ + CARRY_FORWARD; + SQRADDSC(a[10], a[31]); SQRADDAC(a[11], a[30]); SQRADDAC(a[12], a[29]); SQRADDAC(a[13], a[28]); SQRADDAC(a[14], a[27]); SQRADDAC(a[15], a[26]); SQRADDAC(a[16], a[25]); SQRADDAC(a[17], a[24]); SQRADDAC(a[18], a[23]); SQRADDAC(a[19], a[22]); SQRADDAC(a[20], a[21]); SQRADDDB; + 
COMBA_STORE(b[41]); + + /* output 42 */ + CARRY_FORWARD; + SQRADDSC(a[11], a[31]); SQRADDAC(a[12], a[30]); SQRADDAC(a[13], a[29]); SQRADDAC(a[14], a[28]); SQRADDAC(a[15], a[27]); SQRADDAC(a[16], a[26]); SQRADDAC(a[17], a[25]); SQRADDAC(a[18], a[24]); SQRADDAC(a[19], a[23]); SQRADDAC(a[20], a[22]); SQRADDDB; SQRADD(a[21], a[21]); + COMBA_STORE(b[42]); + + /* output 43 */ + CARRY_FORWARD; + SQRADDSC(a[12], a[31]); SQRADDAC(a[13], a[30]); SQRADDAC(a[14], a[29]); SQRADDAC(a[15], a[28]); SQRADDAC(a[16], a[27]); SQRADDAC(a[17], a[26]); SQRADDAC(a[18], a[25]); SQRADDAC(a[19], a[24]); SQRADDAC(a[20], a[23]); SQRADDAC(a[21], a[22]); SQRADDDB; + COMBA_STORE(b[43]); + + /* output 44 */ + CARRY_FORWARD; + SQRADDSC(a[13], a[31]); SQRADDAC(a[14], a[30]); SQRADDAC(a[15], a[29]); SQRADDAC(a[16], a[28]); SQRADDAC(a[17], a[27]); SQRADDAC(a[18], a[26]); SQRADDAC(a[19], a[25]); SQRADDAC(a[20], a[24]); SQRADDAC(a[21], a[23]); SQRADDDB; SQRADD(a[22], a[22]); + COMBA_STORE(b[44]); + + /* output 45 */ + CARRY_FORWARD; + SQRADDSC(a[14], a[31]); SQRADDAC(a[15], a[30]); SQRADDAC(a[16], a[29]); SQRADDAC(a[17], a[28]); SQRADDAC(a[18], a[27]); SQRADDAC(a[19], a[26]); SQRADDAC(a[20], a[25]); SQRADDAC(a[21], a[24]); SQRADDAC(a[22], a[23]); SQRADDDB; + COMBA_STORE(b[45]); + + /* output 46 */ + CARRY_FORWARD; + SQRADDSC(a[15], a[31]); SQRADDAC(a[16], a[30]); SQRADDAC(a[17], a[29]); SQRADDAC(a[18], a[28]); SQRADDAC(a[19], a[27]); SQRADDAC(a[20], a[26]); SQRADDAC(a[21], a[25]); SQRADDAC(a[22], a[24]); SQRADDDB; SQRADD(a[23], a[23]); + COMBA_STORE(b[46]); + + /* output 47 */ + CARRY_FORWARD; + SQRADDSC(a[16], a[31]); SQRADDAC(a[17], a[30]); SQRADDAC(a[18], a[29]); SQRADDAC(a[19], a[28]); SQRADDAC(a[20], a[27]); SQRADDAC(a[21], a[26]); SQRADDAC(a[22], a[25]); SQRADDAC(a[23], a[24]); SQRADDDB; + COMBA_STORE(b[47]); + + /* output 48 */ + CARRY_FORWARD; + SQRADDSC(a[17], a[31]); SQRADDAC(a[18], a[30]); SQRADDAC(a[19], a[29]); SQRADDAC(a[20], a[28]); SQRADDAC(a[21], a[27]); SQRADDAC(a[22], a[26]); 
SQRADDAC(a[23], a[25]); SQRADDDB; SQRADD(a[24], a[24]); + COMBA_STORE(b[48]); + + /* output 49 */ + CARRY_FORWARD; + SQRADDSC(a[18], a[31]); SQRADDAC(a[19], a[30]); SQRADDAC(a[20], a[29]); SQRADDAC(a[21], a[28]); SQRADDAC(a[22], a[27]); SQRADDAC(a[23], a[26]); SQRADDAC(a[24], a[25]); SQRADDDB; + COMBA_STORE(b[49]); + + /* output 50 */ + CARRY_FORWARD; + SQRADDSC(a[19], a[31]); SQRADDAC(a[20], a[30]); SQRADDAC(a[21], a[29]); SQRADDAC(a[22], a[28]); SQRADDAC(a[23], a[27]); SQRADDAC(a[24], a[26]); SQRADDDB; SQRADD(a[25], a[25]); + COMBA_STORE(b[50]); + + /* output 51 */ + CARRY_FORWARD; + SQRADDSC(a[20], a[31]); SQRADDAC(a[21], a[30]); SQRADDAC(a[22], a[29]); SQRADDAC(a[23], a[28]); SQRADDAC(a[24], a[27]); SQRADDAC(a[25], a[26]); SQRADDDB; + COMBA_STORE(b[51]); + + /* output 52 */ + CARRY_FORWARD; + SQRADDSC(a[21], a[31]); SQRADDAC(a[22], a[30]); SQRADDAC(a[23], a[29]); SQRADDAC(a[24], a[28]); SQRADDAC(a[25], a[27]); SQRADDDB; SQRADD(a[26], a[26]); + COMBA_STORE(b[52]); + + /* output 53 */ + CARRY_FORWARD; + SQRADDSC(a[22], a[31]); SQRADDAC(a[23], a[30]); SQRADDAC(a[24], a[29]); SQRADDAC(a[25], a[28]); SQRADDAC(a[26], a[27]); SQRADDDB; + COMBA_STORE(b[53]); + + /* output 54 */ + CARRY_FORWARD; + SQRADDSC(a[23], a[31]); SQRADDAC(a[24], a[30]); SQRADDAC(a[25], a[29]); SQRADDAC(a[26], a[28]); SQRADDDB; SQRADD(a[27], a[27]); + COMBA_STORE(b[54]); + + /* output 55 */ + CARRY_FORWARD; + SQRADDSC(a[24], a[31]); SQRADDAC(a[25], a[30]); SQRADDAC(a[26], a[29]); SQRADDAC(a[27], a[28]); SQRADDDB; + COMBA_STORE(b[55]); + + /* output 56 */ + CARRY_FORWARD; + SQRADDSC(a[25], a[31]); SQRADDAC(a[26], a[30]); SQRADDAC(a[27], a[29]); SQRADDDB; SQRADD(a[28], a[28]); + COMBA_STORE(b[56]); + + /* output 57 */ + CARRY_FORWARD; + SQRADDSC(a[26], a[31]); SQRADDAC(a[27], a[30]); SQRADDAC(a[28], a[29]); SQRADDDB; + COMBA_STORE(b[57]); + + /* output 58 */ + CARRY_FORWARD; + SQRADD2(a[27], a[31]); SQRADD2(a[28], a[30]); SQRADD(a[29], a[29]); + COMBA_STORE(b[58]); + + /* output 59 */ + 
CARRY_FORWARD; + SQRADD2(a[28], a[31]); SQRADD2(a[29], a[30]); + COMBA_STORE(b[59]); + + /* output 60 */ + CARRY_FORWARD; + SQRADD2(a[29], a[31]); SQRADD(a[30], a[30]); + COMBA_STORE(b[60]); + + /* output 61 */ + CARRY_FORWARD; + SQRADD2(a[30], a[31]); + COMBA_STORE(b[61]); + + /* output 62 */ + CARRY_FORWARD; + SQRADD(a[31], a[31]); + COMBA_STORE(b[62]); + COMBA_STORE2(b[63]); + COMBA_FINI; + + B->used = 64; + B->sign = PSTM_ZPOS; + memcpy(B->dp, b, 64 * sizeof(pstm_digit)); + pstm_clamp(B); + return PSTM_OKAY; +} +#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */ + +/******************************************************************************/ +/* + */ +int32 pstm_sqr_comba(psPool_t *pool, pstm_int *A, pstm_int *B, pstm_digit *paD, + uint32 paDlen) +{ +#ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS + if (A->used == 16) { + return pstm_sqr_comba16(A, B); + } else { +#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS + if (A->used == 32) { + return pstm_sqr_comba32(A, B); + } +#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */ + return pstm_sqr_comba_gen(pool, A, B, paD, paDlen); + } +#else +#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS + if (A->used == 32) { + return pstm_sqr_comba32(A, B); + } +#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */ + return pstm_sqr_comba_gen(pool, A, B, paD, paDlen); +#endif +} + +#endif /* DISABLE_PSTM */ +/******************************************************************************/ diff --git a/networking/tls_rsa.c b/networking/tls_rsa.c new file mode 100644 index 000000000..058b09cee --- /dev/null +++ b/networking/tls_rsa.c @@ -0,0 +1,203 @@ +/* + * Copyright (C) 2017 Denys Vlasenko + * + * Licensed under GPLv2, see file LICENSE in this source tree. 
+ */ +#include "tls.h" + +#define pkcs1Pad(in, inlen, out, outlen, cryptType, userPtr) \ + pkcs1Pad(in, inlen, out, outlen, cryptType) +static ///bbox +int32 pkcs1Pad(unsigned char *in, uint32 inlen, unsigned char *out, + uint32 outlen, int32 cryptType, void *userPtr) +{ + unsigned char *c; + int32 randomLen; + + randomLen = outlen - 3 - inlen; + if (randomLen < 8) { + psTraceCrypto("pkcs1Pad failure\n"); + return PS_LIMIT_FAIL; + } + c = out; + *c = 0x00; + c++; + *c = (unsigned char)cryptType; + c++; + if (cryptType == PUBKEY_TYPE) { + while (randomLen-- > 0) { + *c++ = 0xFF; + } + } else { + if (matrixCryptoGetPrngData(c, (uint32)randomLen, userPtr) < 0) { + return PS_PLATFORM_FAIL; + } +/* + SECURITY: Read through the random data and change all 0x0 to 0x01. + This is per spec that no random bytes should be 0 +*/ + while (randomLen-- > 0) { + if (*c == 0x0) { + *c = 0x01; + } + c++; + } + } + *c = 0x00; + c++; + memcpy(c, in, inlen); + + return outlen; +} + +#define psRsaCrypt(pool, in, inlen, out, outlen, key, type, data) \ + psRsaCrypt(pool, in, inlen, out, outlen, key, type) +static ///bbox +int32 psRsaCrypt(psPool_t *pool, const unsigned char *in, uint32 inlen, + unsigned char *out, uint32 *outlen, psRsaKey_t *key, int32 type, + void *data) +{ + pstm_int tmp, tmpa, tmpb; + int32 res; + uint32 x; + + if (in == NULL || out == NULL || outlen == NULL || key == NULL) { + psTraceCrypto("NULL parameter error in psRsaCrypt\n"); + return PS_ARG_FAIL; + } + + tmp.dp = tmpa.dp = tmpb.dp = NULL; + + /* Init and copy into tmp */ + if (pstm_init_for_read_unsigned_bin(pool, &tmp, inlen + sizeof(pstm_digit)) + != PS_SUCCESS) { + return PS_FAILURE; + } + if (pstm_read_unsigned_bin(&tmp, (unsigned char *)in, inlen) != PS_SUCCESS){ + pstm_clear(&tmp); + return PS_FAILURE; + } + /* Sanity check on the input */ + if (pstm_cmp(&key->N, &tmp) == PSTM_LT) { + res = PS_LIMIT_FAIL; + goto done; + } + if (type == PRIVKEY_TYPE) { + if (key->optimized) { + if (pstm_init_size(pool, 
&tmpa, key->p.alloc) != PS_SUCCESS) { + res = PS_FAILURE; + goto done; + } + if (pstm_init_size(pool, &tmpb, key->q.alloc) != PS_SUCCESS) { + pstm_clear(&tmpa); + res = PS_FAILURE; + goto done; + } + if (pstm_exptmod(pool, &tmp, &key->dP, &key->p, &tmpa) != + PS_SUCCESS) { + psTraceCrypto("decrypt error: pstm_exptmod dP, p\n"); + goto error; + } + if (pstm_exptmod(pool, &tmp, &key->dQ, &key->q, &tmpb) != + PS_SUCCESS) { + psTraceCrypto("decrypt error: pstm_exptmod dQ, q\n"); + goto error; + } + if (pstm_sub(&tmpa, &tmpb, &tmp) != PS_SUCCESS) { + psTraceCrypto("decrypt error: sub tmpb, tmp\n"); + goto error; + } + if (pstm_mulmod(pool, &tmp, &key->qP, &key->p, &tmp) != PS_SUCCESS) { + psTraceCrypto("decrypt error: pstm_mulmod qP, p\n"); + goto error; + } + if (pstm_mul_comba(pool, &tmp, &key->q, &tmp, NULL, 0) + != PS_SUCCESS){ + psTraceCrypto("decrypt error: pstm_mul q \n"); + goto error; + } + if (pstm_add(&tmp, &tmpb, &tmp) != PS_SUCCESS) { + psTraceCrypto("decrypt error: pstm_add tmp \n"); + goto error; + } + } else { + if (pstm_exptmod(pool, &tmp, &key->d, &key->N, &tmp) != + PS_SUCCESS) { + psTraceCrypto("psRsaCrypt error: pstm_exptmod\n"); + goto error; + } + } + } else if (type == PUBKEY_TYPE) { + if (pstm_exptmod(pool, &tmp, &key->e, &key->N, &tmp) != PS_SUCCESS) { + psTraceCrypto("psRsaCrypt error: pstm_exptmod\n"); + goto error; + } + } else { + psTraceCrypto("psRsaCrypt error: invalid type param\n"); + goto error; + } + /* Read it back */ + x = pstm_unsigned_bin_size(&key->N); + + if ((uint32)x > *outlen) { + res = -1; + psTraceCrypto("psRsaCrypt error: pstm_unsigned_bin_size\n"); + goto done; + } + /* We want the encrypted value to always be the key size. 
Pad with 0x0 */ + while ((uint32)x < (unsigned long)key->size) { + *out++ = 0x0; + x++; + } + + *outlen = x; + /* Convert it */ + memset(out, 0x0, x); + + if (pstm_to_unsigned_bin(pool, &tmp, out+(x-pstm_unsigned_bin_size(&tmp))) + != PS_SUCCESS) { + psTraceCrypto("psRsaCrypt error: pstm_to_unsigned_bin\n"); + goto error; + } + /* Clean up and return */ + res = PS_SUCCESS; + goto done; +error: + res = PS_FAILURE; +done: + if (type == PRIVKEY_TYPE && key->optimized) { + pstm_clear_multi(&tmpa, &tmpb, NULL, NULL, NULL, NULL, NULL, NULL); + } + pstm_clear(&tmp); + return res; +} + +int32 psRsaEncryptPub(psPool_t *pool, psRsaKey_t *key, + unsigned char *in, uint32 inlen, + unsigned char *out, uint32 outlen, void *data) +{ + int32 err; + uint32 size; + + size = key->size; + if (outlen < size) { + psTraceCrypto("Error on bad outlen parameter to psRsaEncryptPub\n"); + return PS_ARG_FAIL; + } + + if ((err = pkcs1Pad(in, inlen, out, size, PRIVKEY_TYPE, data)) + < PS_SUCCESS) { + psTraceCrypto("Error padding psRsaEncryptPub. Likely data too long\n"); + return err; + } + if ((err = psRsaCrypt(pool, out, size, out, (uint32*)&outlen, key, + PUBKEY_TYPE, data)) < PS_SUCCESS) { + psTraceCrypto("Error performing psRsaEncryptPub\n"); + return err; + } + if (outlen != size) { + psTraceCrypto("Encrypted size error in psRsaEncryptPub\n"); + return PS_FAILURE; + } + return size; +} diff --git a/networking/tls_rsa.h b/networking/tls_rsa.h new file mode 100644 index 000000000..3281087c7 --- /dev/null +++ b/networking/tls_rsa.h @@ -0,0 +1,18 @@ +/* + * Copyright (C) 2017 Denys Vlasenko + * + * Licensed under GPLv2, see file LICENSE in this source tree. 
+ */ + +typedef struct { + pstm_int e, d, N, qP, dP, dQ, p, q; + uint32 size; /* Size of the key in bytes */ + int32 optimized; /* 1 for optimized */ + psPool_t *pool; +} psRsaKey_t; + +#define psRsaEncryptPub(pool, key, in, inlen, out, outlen, data) \ + psRsaEncryptPub(pool, key, in, inlen, out, outlen) +int32 psRsaEncryptPub(psPool_t *pool, psRsaKey_t *key, + unsigned char *in, uint32 inlen, + unsigned char *out, uint32 outlen, void *data);