/* Cleanroom reference implementation of Helix */

/* See Ferguson et al., Fast Software Encryption 2003, or
 * http://www.macfergus.com/helix/helixPreproc.pdf
 */

#include <string.h>

/* Helix works on 32-bit words.  "unsigned long" is guaranteed to hold at
 * least 32 bits but is 64 bits wide on LP64 systems, so the macros below
 * never assume that an H_WORD holds *exactly* 32 bits.
 */
typedef unsigned long H_WORD;
typedef unsigned char H_UCHAR;

/* some useful macros -- little-endian */

/* byte number i of word x; byte 0 is the least significant */
#define B(x,i) ((H_UCHAR)(((x) >> (8*(i))) & 0xFF))

/* build a 32-bit word from the four bytes at b, least significant first */
#define BYTE2WORD(b) ( \
	(((H_WORD)(b)[3] & 0xFF)<<24) | \
	(((H_WORD)(b)[2] & 0xFF)<<16) | \
	(((H_WORD)(b)[1] & 0xFF)<<8) | \
	(((H_WORD)(b)[0] & 0xFF)) \
)

/* store the low 32 bits of w into the four bytes at b, least significant
 * first.  Wrapped in do/while(0) so that "WORD2BYTE(w, b);" is exactly one
 * statement, including inside an unbraced if/else.
 */
#define WORD2BYTE(w, b) do { \
	(b)[3] = B(w,3); \
	(b)[2] = B(w,2); \
	(b)[1] = B(w,1); \
	(b)[0] = B(w,0); \
} while (0)

/* xor the low 32 bits of w into the four bytes at b, least significant first */
#define XORWORD(w, b) do { \
	(b)[3] ^= B(w,3); \
	(b)[2] ^= B(w,2); \
	(b)[1] ^= B(w,1); \
	(b)[0] ^= B(w,0); \
} while (0)

/* rotate the low 32 bits of w left by x bits, 0 < x < 32.
 * Bits above bit 31 of w are ignored (they are masked off before the right
 * shift) and the result never has bits above bit 31 set, so the rotation is
 * a true 32-bit rotation even when H_WORD is a 64-bit type.
 */
#define ROTL(w,x) \
	(((((H_WORD)(w)) << (x)) | \
	  ((((H_WORD)(w)) & 0xFFFFFFFFUL) >> (32 - (x)))) & 0xFFFFFFFFUL)


/* HELIX variables -- file-scope state shared by every routine below */

/* Block counter, kept in two parts: element 0 is bumped once per h_block()
 * call and wraps to zero on reaching 2^31, at which point element 1 is
 * bumped instead.
 */
H_WORD	h_iplus8[2];

/* expanded (working) key, filled in by h_key() */
H_WORD	K[8];

/* expanded nonce, filled in by h_nonce() */
H_WORD	N[8];

/* length of the user key as recorded by h_key() */
int	l_u;

/* the five state words, Z_0..Z_4 in the paper */
H_WORD	A;
H_WORD	B;
H_WORD	C;
H_WORD	D;
H_WORD	E;

/* Rotate the low 32 bits of w left by n bits (0 < n < 32).
 * Anything above bit 31 in w is discarded first and the result is confined
 * to 32 bits, so this is a true 32-bit rotation even if H_WORD is wider.
 */
static H_WORD
h_rotl32(H_WORD w, int n)
{
    w &= 0xFFFFFFFFUL;
    return ((w << n) | (w >> (32 - n))) & 0xFFFFFFFFUL;
}

/* 3.2, figure 2, block function.
 *
 * Mixes the two key words X_i0 and X_i1 and the plaintext word P_i into the
 * global state A..E, then advances the two-part block counter h_iplus8.
 *
 * Returns the value A had on entry; the callers xor that word with the
 * data to encrypt or decrypt it.
 *
 * All arithmetic is modulo 2^32.  H_WORD may be wider than 32 bits, so the
 * additions are allowed to carry into the upper bits temporarily; that is
 * harmless because add and xor never move information downwards.  Every
 * rotation goes through h_rotl32(), which discards the upper bits, and the
 * state is reduced to 32 bits again before returning.
 */
H_WORD
h_block(H_WORD X_i0, H_WORD P_i, H_WORD X_i1)
{
    H_WORD    r;

    r = A & 0xFFFFFFFFUL; /* for returning later */

    /* first half: key word X_i0 is folded in */
    A += D;         D = h_rotl32(D, 15);
    B += E;         E = h_rotl32(E, 25);
    C ^= A;         A = h_rotl32(A, 9);
    D ^= B;         B = h_rotl32(B, 10);
    E += C;         C = h_rotl32(C, 17);
    A ^= D + X_i0;  D = h_rotl32(D, 30);
    B ^= E;         E = h_rotl32(E, 13);
    C += A;         A = h_rotl32(A, 20);
    D += B;         B = h_rotl32(B, 11);
    E ^= C;         C = h_rotl32(C, 5);

    /* second half: plaintext word, then key word X_i1 */
    A += D ^ P_i;   D = h_rotl32(D, 15);
    B += E;         E = h_rotl32(E, 25);
    C ^= A;         A = h_rotl32(A, 9);
    D ^= B;         B = h_rotl32(B, 10);
    E += C;         C = h_rotl32(C, 17);
    A ^= D + X_i1;  D = h_rotl32(D, 30);
    B ^= E;         E = h_rotl32(E, 13);
    C += A;         A = h_rotl32(A, 20);
    D += B;         B = h_rotl32(B, 11);
    E ^= C;         C = h_rotl32(C, 5);

    /* leave a clean 32-bit state behind for the callers */
    A &= 0xFFFFFFFFUL;
    B &= 0xFFFFFFFFUL;
    C &= 0xFFFFFFFFUL;
    D &= 0xFFFFFFFFUL;
    E &= 0xFFFFFFFFUL;

    /* increment i in funny way: the low part counts up to 2^31 and then
     * wraps to zero, carrying into the high part */
    if (++h_iplus8[0] == 0x80000000lu) {
        ++h_iplus8[1];
        h_iplus8[0] = 0;
    }
    return r;
}

/* 3.7 Key schedule.
 * Could do feistel in place, but this follows description in paper.
 */
void
h_key(unsigned char *U, int U_len)
{
    H_WORD    k[40]; /* room for key schedule */
    int	    i;

    if (U_len > 32)
	U_len = 32; /* limit size of key */
    memset((void *)k, 0, sizeof k);
    memcpy((void *)&k[32], U, U_len);
    for (i = 32; i < 40; ++i)
	k[i] = BYTE2WORD(((unsigned char *)&k[i])); /* convert to words */
    for (i = 7; i >= 0; --i) {
	A = k[4*i+4];
	B = k[4*i+5];
	C = k[4*i+6];
	D = k[4*i+7];
	E = U_len + 64;
	(void)h_block(0, 0, 0);
	k[4*i+0] = A ^ k[4*i+8];
	k[4*i+1] = B ^ k[4*i+9];
	k[4*i+2] = C ^ k[4*i+10];
	k[4*i+3] = D ^ k[4*i+11];
    }
    /* copy into K */
    for (i = 0; i < 8; ++i)
	K[i] = k[i];
    /* remember length of key */
    l_u = U_len;
}

/* 3.3, nonce setup.
 *
 * nonce  16 bytes, read as four little-endian 32-bit words
 *
 * Sets N[0..3] to those four words and N[4..7] to (i - N[i]) for i = 0..3.
 * The subtraction is reduced modulo 2^32 explicitly, because H_WORD may be
 * wider than 32 bits and would otherwise keep the borrow in its upper bits.
 */
void
h_nonce(H_UCHAR nonce[16])
{
    int     i;

    for (i = 0; i < 4; ++i) {
        N[i] = BYTE2WORD(&nonce[4*i]);
        N[i + 4] = ((H_WORD)i - N[i]) & 0xFFFFFFFFUL;
    }
}

/* 3.3, X_i functions.
 *
 * Produces a key word for the current block, derived from the low part of
 * the block counter.  Called with zero it yields the word h_block() takes
 * as X_i0, which is just a working-key word.  Called with non-zero it
 * yields the word taken as X_i1, which also mixes in a nonce word, the
 * counter itself and, on every fourth block, either the high part of the
 * counter or four times the key length.
 */
H_WORD
X(int one)
{
    H_WORD    ctr = h_iplus8[0];
    H_WORD    sum;

    if (!one)
        return K[ctr & 0x07];

    sum = K[(ctr + 4) & 0x07] + N[ctr & 0x07] + ctr;
    switch (ctr & 0x03) {
    case 3:
        sum += h_iplus8[1];     /* high part of the block counter */
        break;
    case 1:
        sum += l_u << 2;        /* 4 * key length */
        break;
    default:
        break;
    }
    return sum;
}

/* 3.4 initialisation.
 *
 * Resets the block counter, loads the state words from the working key and
 * the first four nonce words, then runs eight blocks with a zero plaintext
 * word to mix the state.  h_key() and h_nonce() must have filled K and N
 * beforehand, since both are read here.
 */
void
h_init()
{
    int	    remaining = 8;

    h_iplus8[1] = 0;
    h_iplus8[0] = 0;

    E = K[7];
    D = K[6] ^ N[3];
    C = K[5] ^ N[2];
    B = K[4] ^ N[1];
    A = K[3] ^ N[0];

    while (remaining-- > 0)
	(void) h_block(X(0), 0, X(1));
}

/* 3.5 encryption, and 3.6 compute MAC.
 *
 * buf     n bytes of plaintext, replaced in place by the ciphertext
 * n       number of bytes in buf
 * macbuf  receives the 16-byte MAC
 *
 * The cipher is re-initialised from the current K and N on every call, so
 * each call processes one complete message.
 */
void
h_encrypt(H_UCHAR *buf, int n, H_UCHAR macbuf[16])
{
    H_UCHAR   tail[4];
    H_WORD    ks;
    int	      j;

    h_init();

    /* whole words: feed the plaintext word in, xor the returned word on */
    for (; n >= 4; n -= 4, buf += 4) {
	ks = h_block(X(0), BYTE2WORD(buf), X(1));
	XORWORD(ks, buf);
    }

    if (n != 0) {
	/* handle an odd bit at the end: pad it with zeros to a full word,
	 * but only write back as many bytes as were supplied */
	for (j = 0; j < 4; ++j)
	    tail[j] = (j < n) ? buf[j] : 0;
	ks = h_block(X(0), BYTE2WORD(tail), X(1));
	for (j = 0; j < n; ++j)
	    buf[j] = tail[j] ^ B(ks, j);
    }

    /* now compute MAC. Note that "n" is currently l(P) mod 4.
     * Twelve blocks are run with n as the plaintext word; the first eight
     * results are discarded and the last four become the MAC. */
    A ^= 0x912d94f1;
    for (j = 0; j < 12; ++j) {
	ks = h_block(X(0), n, X(1));
	if (j >= 8) {
	    WORD2BYTE(ks, &macbuf[(j - 8)*4]);
	}
    }
}

/* 3.8 decryption, and 3.6 compute MAC.
 *
 * buf     n bytes of ciphertext, replaced in place by the plaintext
 * n       number of bytes in buf
 * macbuf  receives the 16-byte MAC computed over the recovered plaintext
 *
 * The MAC is only computed here, never compared with anything: checking it
 * against the MAC that accompanied the message is left to the caller.
 * The cipher is re-initialised from the current K and N on every call.
 */
void
h_decrypt(H_UCHAR *buf, int n, H_UCHAR macbuf[16])
{
    H_UCHAR   b[4];
    H_WORD    w;
    int       i;

    h_init();
    while (n >= 4) {
        /* rather than muck with h_block, we use knowledge of A: the word
         * h_block() is about to return is the current A, so the plaintext
         * can be recovered before the call */
        w = BYTE2WORD(buf) ^ A; /* plaintext */
        w = h_block(X(0), w, X(1));
        XORWORD(w, buf);
        buf += 4;
        n -= 4;
    }
    if (n != 0) {
        /* handle an odd bit at the end.  Only the n bytes actually present
         * are decrypted; the rest of the word is zero padding.  Each byte
         * of b is written exactly once, so nothing uninitialised is read. */
        for (i = 0; i < 4; ++i)
            b[i] = (i < n) ? (H_UCHAR)(buf[i] ^ B(A, i)) : 0;
        w = BYTE2WORD(b);
        (void) h_block(X(0), w, X(1)); /* note decryption already done */
        for (i = 0; i < n; ++i)
            buf[i] = b[i];
    }
    /* now compute MAC. Note that "n" is currently l(P) mod 4. */
    A ^= 0x912d94f1;
    for (i = 0; i < 8; ++i)
        (void) h_block(X(0), n, X(1));
    for (i = 0; i < 4; ++i) {
        w = h_block(X(0), n, X(1));
        WORD2BYTE(w, &macbuf[i*4]);
    }
}

//#define TEST 0
#ifdef TEST
/*--------------------------------------------------------------------------*/
/* test harness                                                             */
/*--------------------------------------------------------------------------*/

#include "hexlib.h"
#include <stdlib.h>
#include <stdio.h>
#include <time.h>

/* self test
 *
 * Runs three key/nonce/plaintext vectors through h_key(), h_nonce(),
 * h_encrypt() and h_decrypt(), printing each intermediate value and
 * comparing the working key, the ciphertext and both MACs with the expected
 * hex strings.  The printing and checking helpers come from hexlib.h.
 *
 * The "quick" argument is not used by this function.
 */
void
test_helix(int quick)
{
    /* declared but not referenced anywhere in this function */
    extern int	keylen;
    H_UCHAR	key[32], nonce[16], buf[32], mac[16];

    /* basic test: zero-length key, all-zero nonce, ten zero bytes */
    printf("Test Vector set 1:\n");
    /* key[] is still uninitialised here; a length of 0 is passed both to
     * hexprint and to h_key, which copies U_len bytes and so reads none */
    hexprint("Initial Key", key, 0);
    memset((void *)nonce, 0, 16);
    hexprint("Nonce", nonce, 16);
    h_key(key, 0);
    h_nonce(nonce);
    hexwprint("Working Key", K, 32);
    hexwcheck(K, "a9 3b 6e 32 bc 23 4f 6c 32 6c 0f 82 74 ff a2 41"
		 "e3 da 57 7d ef 7c 1b 64 af 78 7c 38 dc ef e3 de", 32);
    hexwprint("Working N", N, 32);
    memset(buf, 0, 10);
    hexprint("Plaintext", buf, 10);
    h_encrypt(buf, 10, mac);
    hexprint("Ciphertext", buf, 10);
    hexcheck(buf, "70 44 c9 be 48 ae 89 22 66 e4", 10);
    hexprint("MAC", mac, 16);
    hexcheck(mac, "65 be 7a 60 fd 3b 8a 5e 31 61 80 80 56 32 d8 10", 16);
    /* decrypt in place; the recovered plaintext is printed but only the
     * MAC is compared with an expected value */
    h_decrypt(buf, 10, mac);
    hexprint("decrypted", buf, 10);
    hexprint("MAC", mac, 16);
    hexcheck(mac, "65 be 7a 60 fd 3b 8a 5e 31 61 80 80 56 32 d8 10", 16);

    /* second vector: full 32-byte key, 32 bytes of plaintext */
    printf("\nTest Vector set 2:\n");
    hexread(key, "00 00 00 00 01 00 00 00 02 00 00 00 03 00 00 00"
                 "04 00 00 00 05 00 00 00 06 00 00 00 07 00 00 00", 32);
    hexprint("Initial Key", key, 32);
    hexread(nonce, "00 00 00 00 01 00 00 00 02 00 00 00 03 00 00 00", 16);
    hexprint("Nonce", nonce, 16);
    h_key(key, 32);
    h_nonce(nonce);
    hexwprint("Working Key", K, 32);
    hexwcheck(K, "6e e9 a7 6c bd 0b f6 20 a6 d9 b7 59 49 d3 39 95"
		 "04 f8 4a d6 83 12 f9 06 ed d1 a6 98 9e c8 9d 45", 32);
    hexread(buf, "00 00 00 00 01 00 00 00 02 00 00 00 03 00 00 00"
                 "04 00 00 00 05 00 00 00 06 00 00 00 07 00 00 00", 32);
    hexprint("Plaintext", buf, 32);
    h_encrypt(buf, 32, mac);
    hexprint("Ciphertext", buf, 32);
    hexcheck(buf, "7a 72 a7 5b 62 50 38 0b 69 75 1c d1 28 30 8d 9a"
		  "0c 74 46 a3 bf 3f 99 e6 65 56 b9 c1 18 ca 7d 87", 32);
    hexprint("MAC", mac, 16);
    hexcheck(mac, "e4 e5 49 01 c5 0b 34 e7 80 c0 9c 39 b1 09 a1 17", 16);
    h_decrypt(buf, 32, mac);
    hexprint("decrypted", buf, 32);
    hexprint("MAC", mac, 16);
    hexcheck(mac, "e4 e5 49 01 c5 0b 34 e7 80 c0 9c 39 b1 09 a1 17", 16);

    /* third vector: 5-byte key and a 13-byte message, so the final
     * partial-word path of encrypt and decrypt is exercised */
    printf("\nTest Vector set 3:\n");
    hexread(key, "48 65 6c 69 78", 5);
    hexprint("Initial Key", key, 5);
    hexread(nonce, "30 31 32 33 34 35 36 37 38 39 61 62 63 64 65 66", 16);
    hexprint("Nonce", nonce, 16);
    h_key(key, 5);
    h_nonce(nonce);
    hexwprint("Working Key", K, 32);
    hexwcheck(K, "6c 1e d7 7a cb a3 a1 d2 8f 1c d6 20 6d f1 15 da"
		 "f4 03 28 4a 73 9b b6 9f 35 7a 85 f5 51 32 11 39", 32);
    hexread(buf, "48 65 6c 6c 6f 2c 20 77 6f 72 6c 64 21", 13);
    hexprint("Plaintext", buf, 13);
    h_encrypt(buf, 13, mac);
    hexprint("Ciphertext", buf, 13);
    hexcheck(buf, "6c 4c 27 b9 7a 82 a0 c5 80 2c 23 f2 0d", 13);
    hexprint("MAC", mac, 16);
    hexcheck(mac, "6c 82 d1 aa 3b 90 5f 12 f1 44 3f a7 f6 a1 01 d2", 16);
    h_decrypt(buf, 13, mac);
    hexprint("decrypted", buf, 13);
    hexprint("MAC", mac, 16);
    hexcheck(mac, "6c 82 d1 aa 3b 90 5f 12 f1 44 3f a7 f6 a1 01 d2", 16);
}

#define BLOCKSIZE	1600	/* for MAC-style tests */
#define MACSIZE		16
char	*testkey = "test key 128bits";
H_UCHAR	testIV[16];
H_UCHAR	testframe[BLOCKSIZE];
H_UCHAR	testmac[16];

/* Perform various timing tests
 */
void
time_helix(void)
{
    long	i;
    clock_t	t;
    H_WORD	k[4] = { 0, 0, 0, 0 };

    /*test_helix(1);*/
    h_key(testkey, strlen((char *)testkey));
    h_nonce(testIV);

    /* test packet encryption speed */
    t = clock();
    for (i = 0; i < 200000000; ) {
	h_nonce(testIV);
	h_encrypt(testframe, sizeof testframe, testmac);
	i += BLOCKSIZE;
    }
    t = clock() - t;
    printf("%f Mbyte per second encrypt/MAC %d-byte blocks\n",
	(((double)i/((double)t / (double)CLOCKS_PER_SEC))) / 1000000.0,
	BLOCKSIZE, MACSIZE*8);

    /* test packet decryption speed */
    t = clock();
    for (i = 0; i < 200000000; ) {
	h_nonce(testIV);
	h_decrypt(testframe, sizeof testframe, testmac);
	i += BLOCKSIZE;
    }
    t = clock() - t;
    printf("%f Mbyte per second decrypt/MAC %d-byte blocks\n",
	(((double)i/((double)t / (double)CLOCKS_PER_SEC))) / 1000000.0,
	BLOCKSIZE, MACSIZE*8);

    /* test key setup time */
    t = clock();
    for (i = 0; i < 1000000; ++i) {
	k[3] = i;
	h_key((H_UCHAR *)k, 16);
    }
    t = clock() - t;
    printf("%f million 128-bit keys per second\n",
	(((double)i/((double)t / (double)CLOCKS_PER_SEC))) / 1000000.0);
}

H_UCHAR	bigbuf[1024*1024];
H_UCHAR	macbuf[16];

/* Command line driver.
 *
 *   -test                      run the self test, exit with nerrors
 *   -time                      run the timing tests
 *   [-verbose] [key [IV [n]]]  encrypt n zero bytes and dump them with
 *                              hexbulk()
 *
 * key and IV are given as hex strings.  The key defaults to eight zero
 * bytes, the IV to sixteen zero bytes and n to 1000000.  The key may be at
 * most 32 bytes and the IV must be exactly 16 bytes; both lengths are
 * checked before anything is written into the fixed-size buffers.
 *
 * Returns 0 on success and 1 on a usage error.
 */
int
main(int ac, char **av)
{
    int         n, i;
    int         vflag = 0;
    H_UCHAR     key[32], IV[16];
    int         keysz;

    if (ac == 2 && strcmp(av[1], "-test") == 0) {
        test_helix(0);
        return nerrors;
    }

    if (ac == 2 && strcmp(av[1], "-time") == 0) {
        time_helix();
        return 0;
    }

    if (ac >= 2 && strcmp(av[1], "-verbose") == 0) {
        vflag = 1;
        ++av, --ac;
    }
    (void)vflag; /* the option is accepted but nothing reads the flag */

    if (ac >= 2) {
        size_t  len = strlen(av[1]) / 2;

        /* refuse a key that would not fit instead of overrunning key[] */
        if (len > sizeof key) {
            fprintf(stderr, "key must be at most 32 bytes\n");
            return 1;
        }
        keysz = (int)len;
        hexread(key, av[1], keysz);
    }
    else
        hexread(key, "0000000000000000", keysz = 8);

    /* a missing IV means an all-zero IV, not whatever is on the stack */
    memset(IV, 0, sizeof IV);
    if (ac >= 3) {
        /* validate the length before reading, so IV[] cannot overflow */
        if (strlen(av[2]) / 2 != sizeof IV) {
            fprintf(stderr, "IV must be 16 bytes\n");
            return 1;
        }
        hexread(IV, av[2], (int)sizeof IV);
    }

    n = 1000000;
    if (ac >= 4 && sscanf(av[3], "%d", &n) != 1) {
        fprintf(stderr, "byte count must be a decimal number\n");
        return 1;
    }

    h_key(key, keysz);
    h_nonce(IV);
    /* NOTE(review): h_encrypt() re-initialises the cipher on every call and
     * the nonce is not changed here, so each bigbuf-sized chunk is produced
     * from the same starting state -- confirm that this is intended. */
    while (n > 0) {
        i = n > (int)sizeof bigbuf ? (int)sizeof bigbuf : n;
        memset(bigbuf, 0, (size_t)i);
        h_encrypt(bigbuf, i, macbuf);
        hexbulk(bigbuf, i);
        n -= i;
    }
    return 0;
}
#endif /* TEST */
