/* salsa20.c - Bernstein's Salsa20 cipher * Copyright (C) 2012 Simon Josefsson, Niels Möller * Copyright (C) 2013 g10 Code GmbH * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser general Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . * * For a description of the algorithm, see: * http://cr.yp.to/snuffle/spec.pdf * http://cr.yp.to/snuffle/design.pdf */ /* The code is based on the code in Nettle (git commit id 9d2d8ddaee35b91a4e1a32ae77cba04bea3480e7) which in turn is based on salsa20-ref.c version 20051118 D. J. Bernstein Public domain. */ #include #include #include #include #include "types.h" #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" /* USE_AMD64 indicates whether to compile with AMD64 code. */ #undef USE_AMD64 #if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AMD64 1 #endif /* USE_ARM_NEON_ASM indicates whether to enable ARM NEON assembly code. */ #undef USE_ARM_NEON_ASM #ifdef ENABLE_NEON_SUPPORT # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_NEON) # define USE_ARM_NEON_ASM 1 # endif #endif /*ENABLE_NEON_SUPPORT*/ #define SALSA20_MIN_KEY_SIZE 16 /* Bytes. */ #define SALSA20_MAX_KEY_SIZE 32 /* Bytes. */ #define SALSA20_BLOCK_SIZE 64 /* Bytes. */ #define SALSA20_IV_SIZE 8 /* Bytes. */ #define SALSA20_INPUT_LENGTH 16 /* Bytes. */ /* Number of rounds. The standard uses 20 rounds. In any case the number of rounds must be even. */ #define SALSA20_ROUNDS 20 #define SALSA20R12_ROUNDS 12 struct SALSA20_context_s; typedef unsigned int (*salsa20_core_t) (u32 *dst, struct SALSA20_context_s *ctx, unsigned int rounds); typedef void (* salsa20_keysetup_t)(struct SALSA20_context_s *ctx, const byte *key, int keylen); typedef void (* salsa20_ivsetup_t)(struct SALSA20_context_s *ctx, const byte *iv); typedef struct SALSA20_context_s { /* Indices 1-4 and 11-14 holds the key (two identical copies for the shorter key size), indices 0, 5, 10, 15 are constant, indices 6, 7 are the IV, and indices 8, 9 are the block counter: C K K K K C I I B B C K K K K C */ u32 input[SALSA20_INPUT_LENGTH]; u32 pad[SALSA20_INPUT_LENGTH]; unsigned int unused; /* bytes in the pad. */ #ifdef USE_ARM_NEON_ASM int use_neon; #endif salsa20_keysetup_t keysetup; salsa20_ivsetup_t ivsetup; salsa20_core_t core; } SALSA20_context_t; /* The masking of the right shift is needed to allow n == 0 (using just 32 - n and 64 - n results in undefined behaviour). Most uses of these macros use a constant and non-zero rotation count. */ #define ROTL32(n,x) (((x)<<(n)) | ((x)>>((-(n)&31)))) #define LE_SWAP32(v) le_bswap32(v) #define LE_READ_UINT32(p) buf_get_le32(p) static void salsa20_setiv (void *context, const byte *iv, size_t ivlen); static const char *selftest (void); #ifdef USE_AMD64 /* Assembly implementations use SystemV ABI, ABI conversion and additional * stack to store XMM6-XMM15 needed on Win64. */ #ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS # define ASM_FUNC_ABI __attribute__((sysv_abi)) # define ASM_EXTRA_STACK (10 * 16) #else # define ASM_FUNC_ABI # define ASM_EXTRA_STACK 0 #endif /* AMD64 assembly implementations of Salsa20. */ void _gcry_salsa20_amd64_keysetup(u32 *ctxinput, const void *key, int keybits) ASM_FUNC_ABI; void _gcry_salsa20_amd64_ivsetup(u32 *ctxinput, const void *iv) ASM_FUNC_ABI; unsigned int _gcry_salsa20_amd64_encrypt_blocks(u32 *ctxinput, const void *src, void *dst, size_t len, int rounds) ASM_FUNC_ABI; static void salsa20_keysetup(SALSA20_context_t *ctx, const byte *key, int keylen) { _gcry_salsa20_amd64_keysetup(ctx->input, key, keylen * 8); } static void salsa20_ivsetup(SALSA20_context_t *ctx, const byte *iv) { _gcry_salsa20_amd64_ivsetup(ctx->input, iv); } static unsigned int salsa20_core (u32 *dst, SALSA20_context_t *ctx, unsigned int rounds) { memset(dst, 0, SALSA20_BLOCK_SIZE); return _gcry_salsa20_amd64_encrypt_blocks(ctx->input, dst, dst, 1, rounds) + ASM_EXTRA_STACK; } #else /* USE_AMD64 */ #if 0 # define SALSA20_CORE_DEBUG(i) do { \ unsigned debug_j; \ for (debug_j = 0; debug_j < 16; debug_j++) \ { \ if (debug_j == 0) \ fprintf(stderr, "%2d:", (i)); \ else if (debug_j % 4 == 0) \ fprintf(stderr, "\n "); \ fprintf(stderr, " %8x", pad[debug_j]); \ } \ fprintf(stderr, "\n"); \ } while (0) #else # define SALSA20_CORE_DEBUG(i) #endif #define QROUND(x0, x1, x2, x3) \ do { \ x1 ^= ROTL32 ( 7, x0 + x3); \ x2 ^= ROTL32 ( 9, x1 + x0); \ x3 ^= ROTL32 (13, x2 + x1); \ x0 ^= ROTL32 (18, x3 + x2); \ } while(0) static unsigned int salsa20_core (u32 *dst, SALSA20_context_t *ctx, unsigned rounds) { u32 pad[SALSA20_INPUT_LENGTH], *src = ctx->input; unsigned int i; memcpy (pad, src, sizeof(pad)); for (i = 0; i < rounds; i += 2) { SALSA20_CORE_DEBUG (i); QROUND (pad[0], pad[4], pad[8], pad[12]); QROUND (pad[5], pad[9], pad[13], pad[1] ); QROUND (pad[10], pad[14], pad[2], pad[6] ); QROUND (pad[15], pad[3], pad[7], pad[11]); SALSA20_CORE_DEBUG (i+1); QROUND (pad[0], pad[1], pad[2], pad[3] ); QROUND (pad[5], pad[6], pad[7], pad[4] ); QROUND (pad[10], pad[11], pad[8], pad[9] ); QROUND (pad[15], pad[12], pad[13], pad[14]); } SALSA20_CORE_DEBUG (i); for (i = 0; i < SALSA20_INPUT_LENGTH; i++) { u32 t = pad[i] + src[i]; dst[i] = LE_SWAP32 (t); } /* Update counter. */ if (!++src[8]) src[9]++; /* burn_stack */ return ( 3*sizeof (void*) \ + 2*sizeof (void*) \ + 64 \ + sizeof (unsigned int) \ + sizeof (u32) ); } #undef QROUND #undef SALSA20_CORE_DEBUG static void salsa20_keysetup(SALSA20_context_t *ctx, const byte *key, int keylen) { /* These constants are the little endian encoding of the string "expand 32-byte k". For the 128 bit variant, the "32" in that string will be fixed up to "16". */ ctx->input[0] = 0x61707865; /* "apxe" */ ctx->input[5] = 0x3320646e; /* "3 dn" */ ctx->input[10] = 0x79622d32; /* "yb-2" */ ctx->input[15] = 0x6b206574; /* "k et" */ ctx->input[1] = LE_READ_UINT32(key + 0); ctx->input[2] = LE_READ_UINT32(key + 4); ctx->input[3] = LE_READ_UINT32(key + 8); ctx->input[4] = LE_READ_UINT32(key + 12); if (keylen == SALSA20_MAX_KEY_SIZE) /* 256 bits */ { ctx->input[11] = LE_READ_UINT32(key + 16); ctx->input[12] = LE_READ_UINT32(key + 20); ctx->input[13] = LE_READ_UINT32(key + 24); ctx->input[14] = LE_READ_UINT32(key + 28); } else /* 128 bits */ { ctx->input[11] = ctx->input[1]; ctx->input[12] = ctx->input[2]; ctx->input[13] = ctx->input[3]; ctx->input[14] = ctx->input[4]; ctx->input[5] -= 0x02000000; /* Change to "1 dn". */ ctx->input[10] += 0x00000004; /* Change to "yb-6". */ } } static void salsa20_ivsetup(SALSA20_context_t *ctx, const byte *iv) { ctx->input[6] = LE_READ_UINT32(iv + 0); ctx->input[7] = LE_READ_UINT32(iv + 4); /* Reset the block counter. */ ctx->input[8] = 0; ctx->input[9] = 0; } #endif /*!USE_AMD64*/ #ifdef USE_ARM_NEON_ASM /* ARM NEON implementation of Salsa20. */ unsigned int _gcry_arm_neon_salsa20_encrypt(void *c, const void *m, unsigned int nblks, void *k, unsigned int rounds); static unsigned int salsa20_core_neon (u32 *dst, SALSA20_context_t *ctx, unsigned int rounds) { return _gcry_arm_neon_salsa20_encrypt(dst, NULL, 1, ctx->input, rounds); } static void salsa20_ivsetup_neon(SALSA20_context_t *ctx, const byte *iv) { memcpy(ctx->input + 8, iv, 8); /* Reset the block counter. */ memset(ctx->input + 10, 0, 8); } static void salsa20_keysetup_neon(SALSA20_context_t *ctx, const byte *key, int klen) { static const unsigned char sigma32[16] = "expand 32-byte k"; static const unsigned char sigma16[16] = "expand 16-byte k"; if (klen == 16) { memcpy (ctx->input, key, 16); memcpy (ctx->input + 4, key, 16); /* Duplicate 128-bit key. */ memcpy (ctx->input + 12, sigma16, 16); } else { /* 32-byte key */ memcpy (ctx->input, key, 32); memcpy (ctx->input + 12, sigma32, 16); } } #endif /*USE_ARM_NEON_ASM*/ static gcry_err_code_t salsa20_do_setkey (SALSA20_context_t *ctx, const byte *key, unsigned int keylen) { static int initialized; static const char *selftest_failed; if (!initialized ) { initialized = 1; selftest_failed = selftest (); if (selftest_failed) log_error ("SALSA20 selftest failed (%s)\n", selftest_failed ); } if (selftest_failed) return GPG_ERR_SELFTEST_FAILED; if (keylen != SALSA20_MIN_KEY_SIZE && keylen != SALSA20_MAX_KEY_SIZE) return GPG_ERR_INV_KEYLEN; /* Default ops. */ ctx->keysetup = salsa20_keysetup; ctx->ivsetup = salsa20_ivsetup; ctx->core = salsa20_core; #ifdef USE_ARM_NEON_ASM ctx->use_neon = (_gcry_get_hw_features () & HWF_ARM_NEON) != 0; if (ctx->use_neon) { /* Use ARM NEON ops instead. */ ctx->keysetup = salsa20_keysetup_neon; ctx->ivsetup = salsa20_ivsetup_neon; ctx->core = salsa20_core_neon; } #endif ctx->keysetup (ctx, key, keylen); /* We default to a zero nonce. */ salsa20_setiv (ctx, NULL, 0); return 0; } static gcry_err_code_t salsa20_setkey (void *context, const byte *key, unsigned int keylen) { SALSA20_context_t *ctx = (SALSA20_context_t *)context; gcry_err_code_t rc = salsa20_do_setkey (ctx, key, keylen); _gcry_burn_stack (4 + sizeof (void *) + 4 * sizeof (void *)); return rc; } static void salsa20_setiv (void *context, const byte *iv, size_t ivlen) { SALSA20_context_t *ctx = (SALSA20_context_t *)context; byte tmp[SALSA20_IV_SIZE]; if (iv && ivlen != SALSA20_IV_SIZE) log_info ("WARNING: salsa20_setiv: bad ivlen=%u\n", (u32)ivlen); if (!iv || ivlen != SALSA20_IV_SIZE) memset (tmp, 0, sizeof(tmp)); else memcpy (tmp, iv, SALSA20_IV_SIZE); ctx->ivsetup (ctx, tmp); /* Reset the unused pad bytes counter. */ ctx->unused = 0; wipememory (tmp, sizeof(tmp)); } /* Note: This function requires LENGTH > 0. */ static void salsa20_do_encrypt_stream (SALSA20_context_t *ctx, byte *outbuf, const byte *inbuf, size_t length, unsigned rounds) { unsigned int nburn, burn = 0; if (ctx->unused) { unsigned char *p = (void*)ctx->pad; size_t n; gcry_assert (ctx->unused < SALSA20_BLOCK_SIZE); n = ctx->unused; if (n > length) n = length; buf_xor (outbuf, inbuf, p + SALSA20_BLOCK_SIZE - ctx->unused, n); length -= n; outbuf += n; inbuf += n; ctx->unused -= n; if (!length) return; gcry_assert (!ctx->unused); } #ifdef USE_AMD64 if (length >= SALSA20_BLOCK_SIZE) { size_t nblocks = length / SALSA20_BLOCK_SIZE; burn = _gcry_salsa20_amd64_encrypt_blocks(ctx->input, inbuf, outbuf, nblocks, rounds); burn += ASM_EXTRA_STACK; length -= SALSA20_BLOCK_SIZE * nblocks; outbuf += SALSA20_BLOCK_SIZE * nblocks; inbuf += SALSA20_BLOCK_SIZE * nblocks; } #endif #ifdef USE_ARM_NEON_ASM if (ctx->use_neon && length >= SALSA20_BLOCK_SIZE) { unsigned int nblocks = length / SALSA20_BLOCK_SIZE; _gcry_arm_neon_salsa20_encrypt (outbuf, inbuf, nblocks, ctx->input, rounds); length -= SALSA20_BLOCK_SIZE * nblocks; outbuf += SALSA20_BLOCK_SIZE * nblocks; inbuf += SALSA20_BLOCK_SIZE * nblocks; } #endif while (length > 0) { /* Create the next pad and bump the block counter. Note that it is the user's duty to change to another nonce not later than after 2^70 processed bytes. */ nburn = ctx->core (ctx->pad, ctx, rounds); burn = nburn > burn ? nburn : burn; if (length <= SALSA20_BLOCK_SIZE) { buf_xor (outbuf, inbuf, ctx->pad, length); ctx->unused = SALSA20_BLOCK_SIZE - length; break; } buf_xor (outbuf, inbuf, ctx->pad, SALSA20_BLOCK_SIZE); length -= SALSA20_BLOCK_SIZE; outbuf += SALSA20_BLOCK_SIZE; inbuf += SALSA20_BLOCK_SIZE; } _gcry_burn_stack (burn); } static void salsa20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf, size_t length) { SALSA20_context_t *ctx = (SALSA20_context_t *)context; if (length) salsa20_do_encrypt_stream (ctx, outbuf, inbuf, length, SALSA20_ROUNDS); } static void salsa20r12_encrypt_stream (void *context, byte *outbuf, const byte *inbuf, size_t length) { SALSA20_context_t *ctx = (SALSA20_context_t *)context; if (length) salsa20_do_encrypt_stream (ctx, outbuf, inbuf, length, SALSA20R12_ROUNDS); } static const char* selftest (void) { byte ctxbuf[sizeof(SALSA20_context_t) + 15]; SALSA20_context_t *ctx; byte scratch[8+1]; byte buf[256+64+4]; int i; static byte key_1[] = { 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; static const byte nonce_1[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; static const byte plaintext_1[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; static const byte ciphertext_1[] = { 0xE3, 0xBE, 0x8F, 0xDD, 0x8B, 0xEC, 0xA2, 0xE3}; /* 16-byte alignment required for amd64 implementation. */ ctx = (SALSA20_context_t *)((uintptr_t)(ctxbuf + 15) & ~(uintptr_t)15); salsa20_setkey (ctx, key_1, sizeof key_1); salsa20_setiv (ctx, nonce_1, sizeof nonce_1); scratch[8] = 0; salsa20_encrypt_stream (ctx, scratch, plaintext_1, sizeof plaintext_1); if (memcmp (scratch, ciphertext_1, sizeof ciphertext_1)) return "Salsa20 encryption test 1 failed."; if (scratch[8]) return "Salsa20 wrote too much."; salsa20_setkey( ctx, key_1, sizeof(key_1)); salsa20_setiv (ctx, nonce_1, sizeof nonce_1); salsa20_encrypt_stream (ctx, scratch, scratch, sizeof plaintext_1); if (memcmp (scratch, plaintext_1, sizeof plaintext_1)) return "Salsa20 decryption test 1 failed."; for (i = 0; i < sizeof buf; i++) buf[i] = i; salsa20_setkey (ctx, key_1, sizeof key_1); salsa20_setiv (ctx, nonce_1, sizeof nonce_1); /*encrypt*/ salsa20_encrypt_stream (ctx, buf, buf, sizeof buf); /*decrypt*/ salsa20_setkey (ctx, key_1, sizeof key_1); salsa20_setiv (ctx, nonce_1, sizeof nonce_1); salsa20_encrypt_stream (ctx, buf, buf, 1); salsa20_encrypt_stream (ctx, buf+1, buf+1, (sizeof buf)-1-1); salsa20_encrypt_stream (ctx, buf+(sizeof buf)-1, buf+(sizeof buf)-1, 1); for (i = 0; i < sizeof buf; i++) if (buf[i] != (byte)i) return "Salsa20 encryption test 2 failed."; return NULL; } gcry_cipher_spec_t _gcry_cipher_spec_salsa20 = { GCRY_CIPHER_SALSA20, {0, 0}, /* flags */ "SALSA20", /* name */ NULL, /* aliases */ NULL, /* oids */ 1, /* blocksize in bytes. */ SALSA20_MAX_KEY_SIZE*8, /* standard key length in bits. */ sizeof (SALSA20_context_t), salsa20_setkey, NULL, NULL, salsa20_encrypt_stream, salsa20_encrypt_stream, NULL, NULL, salsa20_setiv }; gcry_cipher_spec_t _gcry_cipher_spec_salsa20r12 = { GCRY_CIPHER_SALSA20R12, {0, 0}, /* flags */ "SALSA20R12", /* name */ NULL, /* aliases */ NULL, /* oids */ 1, /* blocksize in bytes. */ SALSA20_MAX_KEY_SIZE*8, /* standard key length in bits. */ sizeof (SALSA20_context_t), salsa20_setkey, NULL, NULL, salsa20r12_encrypt_stream, salsa20r12_encrypt_stream, NULL, NULL, salsa20_setiv };