/*-
 * Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
 * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * Copyright (c) 2012 John-Mark Gurney <jmg@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/9/sys/crypto/aesni/aesni_wrap.c 231979 2012-02-21 20:56:03Z kib $");

#include <sys/types.h>
#include <xmmintrin.h>
#include <crypto/cryptodev.h>
#include <strings.h>

void aesni_enc8(int rounds, const uint8_t *key_schedule, __m128i a,
    __m128i b, __m128i c, __m128i d, __m128i e, __m128i f, __m128i g,
    __m128i h, uint8_t to[AES_BLOCK_LEN * 8]);
void aesni_dec8(int rounds, const uint8_t *key_schedule, __m128i a,
    __m128i b, __m128i c, __m128i d, __m128i e, __m128i f, __m128i g,
    __m128i h, uint8_t to[AES_BLOCK_LEN * 8]);
__m128i aesni_enc(int rounds, const uint8_t *key_schedule, const __m128i from);
__m128i aesni_dec(int rounds, const uint8_t *key_schedule, const __m128i from);

void
aesni_encrypt_cbc(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to, const uint8_t iv[AES_BLOCK_LEN])
{
	__m128i tot, ivreg;
	size_t i;

	len /= AES_BLOCK_LEN;
	ivreg = _mm_loadu_si128((__m128i *)iv);
	for (i = 0; i < len; i++) {
		tot = aesni_enc(rounds - 1, key_schedule,
		    _mm_loadu_si128((__m128i *)from) ^ ivreg);
		ivreg = tot;
		_mm_storeu_si128((__m128i *)to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}

void
aesni_encrypt_ecb(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to)
{
	__m128i tot;
	__m128i *blocks;
	size_t i, cnt;

	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		blocks = (__m128i *)from;
		aesni_enc8(rounds - 1, key_schedule, blocks[0], blocks[1],
		    blocks[2], blocks[3], blocks[4], blocks[5], blocks[6],
		    blocks[7], to);
		from += AES_BLOCK_LEN * 8;
		to += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		tot = aesni_enc(rounds - 1, key_schedule,
		    _mm_loadu_si128((__m128i *)from));
		_mm_storeu_si128((__m128i *)to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}

void
aesni_decrypt_ecb(int rounds, const void *key_schedule, size_t len,
    const uint8_t from[AES_BLOCK_LEN], uint8_t to[AES_BLOCK_LEN])
{
	__m128i tot;
	__m128i *blocks;
	size_t i, cnt;

	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		blocks = (__m128i *)from;
		aesni_dec8(rounds - 1, key_schedule, blocks[0], blocks[1],
		    blocks[2], blocks[3], blocks[4], blocks[5], blocks[6],
		    blocks[7], to);
		from += AES_BLOCK_LEN * 8;
		to += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		tot = aesni_dec(rounds - 1, key_schedule,
		    _mm_loadu_si128((__m128i *)from));
		_mm_storeu_si128((__m128i *)to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}

#define	AES_XTS_BLOCKSIZE	16
#define	AES_XTS_IVSIZE		8
#define	AES_XTS_ALPHA		0x87	/* GF(2^128) generator polynomial */

static inline __m128i
xts_crank_lfsr(__m128i inp)
{
	const __m128i alphamask = _mm_set_epi32(1, 1, 1, AES_XTS_ALPHA);
	__m128i xtweak, ret;

	/* set up xor mask */
	xtweak = _mm_shuffle_epi32(inp, 0x93);
	xtweak = _mm_srai_epi32(xtweak, 31);
	xtweak &= alphamask;

	/* next term */
	ret = _mm_slli_epi32(inp, 1);
	ret ^= xtweak;

	return ret;
}

static void
aesni_crypt_xts_block(int rounds, const void *key_schedule, __m128i *tweak,
    const __m128i *from, __m128i *to, int do_encrypt)
{
	__m128i block;

	block = *from ^ *tweak;

	if (do_encrypt)
		block = aesni_enc(rounds - 1, key_schedule, block);
	else
		block = aesni_dec(rounds - 1, key_schedule, block);

	*to = block ^ *tweak;

	*tweak = xts_crank_lfsr(*tweak);
}

static void
aesni_crypt_xts_block8(int rounds, const void *key_schedule, __m128i *tweak,
    const __m128i *from, __m128i *to, int do_encrypt)
{
	__m128i tmptweak;
	__m128i a, b, c, d, e, f, g, h;
	__m128i tweaks[8];
	int i;

	tmptweak = *tweak;

	/*
	 * unroll the loop.  This lets gcc put values directly in the
	 * register and saves memory accesses.
	 */
#define PREPINP(v, pos) 					\
		do {						\
			tweaks[(pos)] = tmptweak;		\
			(v) = from[(pos)] ^ tmptweak;		\
			tmptweak = xts_crank_lfsr(tmptweak);	\
		} while (0)
	PREPINP(a, 0);
	PREPINP(b, 1);
	PREPINP(c, 2);
	PREPINP(d, 3);
	PREPINP(e, 4);
	PREPINP(f, 5);
	PREPINP(g, 6);
	PREPINP(h, 7);
	*tweak = tmptweak;

	if (do_encrypt)
		aesni_enc8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h,
		    (uint8_t *)to);
	else
		aesni_dec8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h,
		    (uint8_t *)to);

	for (i = 0; i < 8; i++)
		to[i] ^= tweaks[i];
}

void
aesni_crypt_xts(int rounds, const void *data_schedule,
    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
    const uint8_t iv[AES_BLOCK_LEN], int do_encrypt)
{
	__m128i tweakreg;
	uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16);
	size_t i, cnt;

	/*
	 * Prepare tweak as E_k2(IV). IV is specified as LE representation
	 * of a 64-bit block number which we allow to be passed in directly.
	 */
#if BYTE_ORDER == LITTLE_ENDIAN
	bcopy(iv, tweak, AES_XTS_IVSIZE);
	/* Last 64 bits of IV are always zero. */
	bzero(tweak + AES_XTS_IVSIZE, AES_XTS_IVSIZE);
#else
#error Only LITTLE_ENDIAN architectures are supported.
#endif
	tweakreg = _mm_loadu_si128((__m128i *)&tweak[0]);
	tweakreg = aesni_enc(rounds - 1, tweak_schedule, tweakreg);

	cnt = len / AES_XTS_BLOCKSIZE / 8;
	for (i = 0; i < cnt; i++) {
		aesni_crypt_xts_block8(rounds, data_schedule, &tweakreg,
		    (const __m128i *)from, (__m128i *)to, do_encrypt);
		from += AES_XTS_BLOCKSIZE * 8;
		to += AES_XTS_BLOCKSIZE * 8;
	}
	i *= 8;
	cnt = len / AES_XTS_BLOCKSIZE;
	for (; i < cnt; i++) {
		aesni_crypt_xts_block(rounds, data_schedule, &tweakreg,
		    (const __m128i *)from, (__m128i *)to, do_encrypt);
		from += AES_XTS_BLOCKSIZE;
		to += AES_XTS_BLOCKSIZE;
	}
}
