/*
 * Performance test for gf_vect_mul().
 *
 * Builds the multiply table for the constant a = 2, runs two untimed calls
 * over zero-filled TEST_LEN-byte buffers, then times TEST_LOOPS rounds of
 * table initialization plus vector multiply and reports the result through
 * perf_print().
 *
 * Returns 0 on success, 1 if the test buffers cannot be allocated.
 */
int main(int argc, char *argv[])
{
	int i;
	u8 *buff1, *buff2, gf_const_tbl[64], a = 2;
	struct perf start, stop;

	printf("gf_vect_mul_perf:\n");
	mk_gf_field();
	gf_vect_mul_init(a, gf_const_tbl);

	// Allocate large mem region
	buff1 = (u8*) malloc(TEST_LEN);
	buff2 = (u8*) malloc(TEST_LEN);
	if (NULL == buff1 || NULL == buff2){
		printf("Failed to allocate %dB\n", TEST_LEN);
		// One allocation may have succeeded; free(NULL) is a no-op
		free(buff1);
		free(buff2);
		return 1;
	}

	memset(buff1, 0, TEST_LEN);
	memset(buff2, 0, TEST_LEN);

	// Untimed call before the measurement starts
	gf_vect_mul(TEST_LEN, gf_const_tbl, buff1, buff2);

	printf("Start timed tests\n");
	fflush(0);

	gf_vect_mul(TEST_LEN, gf_const_tbl, buff1, buff2);
	perf_start(&start);
	for(i=0; i<TEST_LOOPS; i++){
		// Table setup is deliberately inside the timed region
		gf_vect_mul_init(a, gf_const_tbl);
		gf_vect_mul(TEST_LEN, gf_const_tbl, buff1, buff2);
	}
	perf_stop(&stop);
	printf("gf_vect_mul" TEST_TYPE_STR ": ");
	// i == TEST_LOOPS here, so the third argument is the total bytes processed
	perf_print(stop,start,(long long)TEST_LEN*i);

	free(buff1);
	free(buff2);
	return 0;
}
/*
 * Performance test for gf_3vect_dot_prod_sse().
 *
 * Allocates TEST_SOURCES source buffers plus three destination and three
 * reference buffers (64-byte aligned, TEST_LEN bytes each), fills the sources
 * and the three coefficient vectors with rand() data, computes reference
 * results with gf_vect_dot_prod_base(), times gf_3vect_dot_prod_sse() over
 * TEST_LOOPS iterations, and finally compares each output with its reference.
 *
 * g_tbls holds three consecutive groups of TEST_SOURCES 32-byte tables, one
 * group per destination, starting at offsets 0, 32*TEST_SOURCES and
 * 64*TEST_SOURCES.
 *
 * Returns 0 when all three outputs match, -1 on allocation failure or
 * mismatch.
 */
int main(int argc, char *argv[])
{
	int i,j;
	void *buf;
	u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
	u8 g_tbls[3*TEST_SOURCES*32], *dest_ptrs[3], *buffs[TEST_SOURCES];
	u8 *dest1, *dest2, *dest3, *dest_ref1, *dest_ref2, *dest_ref3;
	struct perf start, stop;

	printf("gf_3vect_dot_prod_sse: %dx%d\n", TEST_SOURCES, TEST_LEN);

	mk_gf_field();


	// Allocate the arrays
	for(i=0; i<TEST_SOURCES; i++){
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		buffs[i] = buf;
	}

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest1 = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest2 = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest3 = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest_ref1 = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest_ref2 = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest_ref3 = buf;

	dest_ptrs[0] = dest1;
	dest_ptrs[1] = dest2;
	dest_ptrs[2] = dest3;


	// Performance test
	for(i=0; i<TEST_SOURCES; i++)
		for(j=0; j<TEST_LEN; j++)
			buffs[i][j] = rand();

	// Zero every destination and reference buffer, including the third pair,
	// so no comparison or dump ever reads uninitialized memory
	memset(dest1, 0, TEST_LEN);
	memset(dest2, 0, TEST_LEN);
	memset(dest3, 0, TEST_LEN);
	memset(dest_ref1, 0, TEST_LEN);
	memset(dest_ref2, 0, TEST_LEN);
	memset(dest_ref3, 0, TEST_LEN);

	for (i=0; i<TEST_SOURCES; i++){
		g1[i] = rand();
		g2[i] = rand();
		g3[i] = rand();
	}

	for(j=0; j<TEST_SOURCES; j++){
		gf_vect_mul_init(g1[j], &g_tbls[j*32]);
		gf_vect_mul_init(g2[j], &g_tbls[(32*TEST_SOURCES) + (j*32)]);
		gf_vect_mul_init(g3[j], &g_tbls[(64*TEST_SOURCES) + (j*32)]);
	}

	// Reference results, one single-destination call per coefficient vector
	gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
	gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32*TEST_SOURCES], buffs, dest_ref2);
	gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64*TEST_SOURCES], buffs, dest_ref3);

#ifdef DO_REF_PERF
	// Optional timing of the reference path, with 1/100th of the iterations
	perf_start(&start);
	for (i=0; i<TEST_LOOPS/100; i++){
		for (j=0; j<TEST_SOURCES; j++){
			gf_vect_mul_init(g1[j], &g_tbls[j*32]);
			gf_vect_mul_init(g2[j], &g_tbls[(32*TEST_SOURCES) + (j*32)]);
			gf_vect_mul_init(g3[j], &g_tbls[(64*TEST_SOURCES) + (j*32)]);
		}

		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32*TEST_SOURCES], buffs, dest_ref2);
		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64*TEST_SOURCES], buffs, dest_ref3);
	}
	perf_stop(&stop);
	printf("gf_3vect_dot_prod_base" TEST_TYPE_STR ": ");
	perf_print(stop,start,(long long)TEST_LEN*(TEST_SOURCES+3)*i);
#endif

	// Untimed call before the measurement starts
	gf_3vect_dot_prod_sse(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);

	perf_start(&start);
	for (i=0; i<TEST_LOOPS; i++) {
		// Table setup is deliberately inside the timed region
		for (j=0; j<TEST_SOURCES; j++){
			gf_vect_mul_init(g1[j], &g_tbls[j*32]);
			gf_vect_mul_init(g2[j], &g_tbls[(32*TEST_SOURCES) + (j*32)]);
			gf_vect_mul_init(g3[j], &g_tbls[(64*TEST_SOURCES) + (j*32)]);
		}

		gf_3vect_dot_prod_sse(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
	}
	perf_stop(&stop);
	printf("gf_3vect_dot_prod_sse" TEST_TYPE_STR ": ");
	// i == TEST_LOOPS here; bytes counted are sources plus three outputs per loop
	perf_print(stop,start, (long long)TEST_LEN*(TEST_SOURCES+3)*i);

	// Verify each output against its reference
	if (0 != memcmp(dest_ref1, dest1, TEST_LEN)){
		printf("Fail perf vect_dot_prod_sse test1\n");
		dump_matrix(buffs, 5, TEST_SOURCES);
		printf("dprod_base:");
		dump(dest_ref1, 25);
		printf("dprod_sse:");
		dump(dest1, 25);
		return -1;
	}
	if (0 != memcmp(dest_ref2, dest2, TEST_LEN)){
		printf("Fail perf vect_dot_prod_sse test2\n");
		dump_matrix(buffs, 5, TEST_SOURCES);
		printf("dprod_base:");
		dump(dest_ref2, 25);
		printf("dprod_sse:");
		dump(dest2, 25);
		return -1;
	}
	if (0 != memcmp(dest_ref3, dest3, TEST_LEN)){
		printf("Fail perf vect_dot_prod_sse test3\n");
		dump_matrix(buffs, 5, TEST_SOURCES);
		printf("dprod_base:");
		dump(dest_ref3, 25);
		printf("dprod_sse:");
		dump(dest3, 25);
		return -1;
	}

	printf("pass perf check\n");
	return 0;

}
int main(int argc, char *argv[])
{
	int i; 
	u8 *buff1, *buff2, *buff3, gf_const_tbl[64], a = 2;
	int align, size;
	unsigned char *efence_buff1;
	unsigned char *efence_buff2;
	unsigned char *efence_buff3;

	printf("gf_vect_mul_test:\n");
	mk_gf_field();
	gf_vect_mul_init(a, gf_const_tbl);


	buff1 = (u8*) malloc(TEST_SIZE);
	buff2 = (u8*) malloc(TEST_SIZE);
	buff3 = (u8*) malloc(TEST_SIZE);

	if (NULL == buff1 || NULL == buff2 || NULL == buff3){
		printf("buffer alloc error\n");
		return -1;
	}

	// Fill with rand data
	for(i=0; i<TEST_SIZE; i++)
		buff1[i] = rand();


	gf_vect_mul(TEST_SIZE, gf_const_tbl, buff1, buff2);

	for (i=0; i<TEST_SIZE; i++)
		if (gf_mul(a, buff1[i]) != buff2[i]) {
			printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n",i, buff1[i], buff2[i], gf_mul(2, buff1[i]));
			return 1;
		}


	gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff3);

	// Check reference function
	for (i=0; i<TEST_SIZE; i++)
		if (buff2[i] != buff3[i]) {
			printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
				i, a, buff1[i], buff2[i], gf_mul(a, buff1[i]));
			return 1;
		}




	for(i=0; i<TEST_SIZE; i++)
		buff1[i] = rand();

	// Check each possible constant
	printf("Random tests ");
	for(a=0; a!=255; a++){
		gf_vect_mul_init(a, gf_const_tbl);
		gf_vect_mul(TEST_SIZE, gf_const_tbl, buff1, buff2);

		for (i=0; i<TEST_SIZE; i++) {
			if (gf_mul(a, buff1[i]) != buff2[i]) {
				printf("fail at %d, 0x%x x %d = 0x%x (0x%x)\n",
					i, a, buff1[i], buff2[i], gf_mul(2, buff1[i]));
				return 1;
			}
		}
		putchar('.');
	}

	// Run tests at end of buffer for Electric Fence
	align = 32;
	a = 2;
	mk_gf_field();
	gf_vect_mul_init(a, gf_const_tbl);
	for(size=0; size<TEST_SIZE; size+=align){
		// Line up TEST_SIZE from end
		efence_buff1 = buff1 + size;
		efence_buff2 = buff2 + size;
		efence_buff3 = buff3 + size;

		gf_vect_mul(TEST_SIZE-size, gf_const_tbl, efence_buff1, efence_buff2);

		for (i=0; i<TEST_SIZE-size; i++)
			if (gf_mul(a, efence_buff1[i]) != efence_buff2[i]) {
				printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n",
					i, efence_buff1[i], efence_buff2[i], gf_mul(2, efence_buff1[i]));
				return 1;
			}

		gf_vect_mul_base(TEST_SIZE-size, gf_const_tbl, efence_buff1, efence_buff3);

		// Check reference function
		for (i=0; i<TEST_SIZE-size; i++)
			if (efence_buff2[i] != efence_buff3[i]) {
				printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
					i, a, efence_buff2[i], efence_buff3[i], gf_mul(2, efence_buff1[i]));
				return 1;
			}

		putchar('.');
	}

	printf(" done: Pass\n");
	return 0;
}
/*
 * Benchmark of two dot-product implementations over the same random input.
 *
 * First times gf_vect_dot_prod_ref() (reported under the "2tbl" label), then
 * times an inline loop that looks each product up in the single table
 * gf_mul_table, indexed by vec[source] * 256 + data byte (reported under the
 * "1tbl" label). The two outputs are compared at the end.
 *
 * Returns 0 when both outputs match, -1 on allocation failure or mismatch.
 */
int main(void)
{
	struct perf start, stop;
	u8 *matrix[TEST_SOURCES];
	u8 vec[TEST_SOURCES], dest1[TEST_LEN], dest2[TEST_LEN];
	u8 acc;
	int src, pos, loop;

	mk_gf_field();
	mk_gf_mul_table(gf_mul_table);

	// One random coefficient and one random TEST_LEN-byte row per source
	for (src = 0; src < TEST_SOURCES; src++) {
		vec[src] = rand();

		matrix[src] = malloc(TEST_LEN);
		if (matrix[src] == NULL) {
			fprintf(stderr, "Error failure\n\n");
			return -1;
		}

		for (pos = 0; pos < TEST_LEN; pos++)
			matrix[src][pos] = rand();
	}

	// Untimed call first, then the timed runs of the reference function
	gf_vect_dot_prod_ref(TEST_LEN, TEST_SOURCES, vec, matrix, dest1);

	perf_start(&start);
	loop = 0;
	while (loop < TEST_LOOPS) {
		gf_vect_dot_prod_ref(TEST_LEN, TEST_SOURCES, vec, matrix, dest1);
		loop++;
	}
	perf_stop(&stop);

	printf("gf_vect_dot_prod_2tbl" TEST_TYPE_STR ": ");
	// loop == TEST_LOOPS here
	perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * loop);

	// Warm up mult tables with one untimed pass of the single-table loop
	for (pos = 0; pos < TEST_LEN; pos++) {
		acc = 0;
		for (src = 0; src < TEST_SOURCES; src++)
			acc ^= gf_mul_table[vec[src] * 256 + matrix[src][pos]];
		dest2[pos] = acc;
	}

	// Timed runs of the single-table loop
	perf_start(&start);
	for (loop = 0; loop < TEST_LOOPS; loop++) {
		for (pos = 0; pos < TEST_LEN; pos++) {
			acc = 0;
			for (src = 0; src < TEST_SOURCES; src++)
				acc ^= gf_mul_table[vec[src] * 256 + matrix[src][pos]];
			dest2[pos] = acc;
		}
	}
	perf_stop(&stop);

	printf("gf_vect_dot_prod_1tbl" TEST_TYPE_STR ": ");
	perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * loop);

	// Both methods must have produced identical output
	if (memcmp(dest1, dest2, TEST_LEN) != 0) {
		printf("Error, different results!\n\n");
		return -1;
	}

	printf("Pass functional test\n");
	return 0;
}
/*
 * Functional test for gf_vect_dot_prod_sse().
 *
 * Compares gf_vect_dot_prod_sse() with gf_vect_dot_prod_base() on all-zero
 * input, on random input, with a varying number of sources, inside an
 * erasure-code encode/recover round trip, at the end of the buffers (for
 * Electric Fence), with randomly offset pointers, and over a range of
 * lengths.
 *
 * Returns 0 when every check passes and -1 on allocation failure, on a
 * mismatch, or when the decode matrix cannot be inverted.
 */
int main(int argc, char *argv[])
{
	int i, j, rtest, srcs, m, k, nerrs, r, err;
	void *buf;
	u8 g[TEST_SOURCES], g_tbls[TEST_SOURCES*32], src_in_err[TEST_SOURCES];
	u8 *dest, *dest_ref, *temp_buff, *buffs[TEST_SOURCES];
	u8 a[MMAX*KMAX], b[MMAX*KMAX], d[MMAX*KMAX];
	u8  src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];

	int align, size;
	unsigned char *efence_buffs[TEST_SOURCES];
	unsigned int offset;
	u8 *ubuffs[TEST_SOURCES];
	u8 *udest_ptr;

	printf("gf_vect_dot_prod_sse: %dx%d ", TEST_SOURCES, TEST_LEN);

	mk_gf_field();


	// Allocate the arrays
	for(i=0; i<TEST_SOURCES; i++){
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		buffs[i] = buf;
	}

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest_ref = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	temp_buff = buf;


	// Test of all zeros
	for(i=0; i<TEST_SOURCES; i++)
		memset(buffs[i], 0, TEST_LEN);

	memset(dest, 0, TEST_LEN);
	memset(temp_buff, 0, TEST_LEN);
	memset(dest_ref, 0, TEST_LEN);
	memset(g, 0, TEST_SOURCES);


	for(i=0; i<TEST_SOURCES; i++)
		gf_vect_mul_init(g[i], &g_tbls[i*32]);

	gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);

	gf_vect_dot_prod_sse(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);

	if (0 != memcmp(dest_ref, dest, TEST_LEN)){
		printf("Fail zero vect_dot_prod_sse test\n");
		dump_matrix(buffs, 5, TEST_SOURCES);
		printf("dprod_base:");
		dump(dest_ref, 25);
		printf("dprod_sse:");
		dump(dest, 25);
		return -1;
	}
	else
		putchar('.');

	// Rand data test
	for(rtest=0; rtest<RANDOMS; rtest++){
		for(i=0; i<TEST_SOURCES; i++)
			for(j=0; j<TEST_LEN; j++)
				buffs[i][j] = rand();

		for (i=0; i<TEST_SOURCES; i++)
			g[i] = rand();

		for(i=0; i<TEST_SOURCES; i++)
			gf_vect_mul_init(g[i], &g_tbls[i*32]);

		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
		gf_vect_dot_prod_sse(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);

		if (0 != memcmp(dest_ref, dest, TEST_LEN)){
			printf("Fail rand vect_dot_prod_sse test 1\n");
			dump_matrix(buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref, 25);
			printf("dprod_sse:");
			dump(dest, 25);
			return -1;
		}

		putchar('.');
	}

	// Rand data test with varied parameters
	for(rtest=0; rtest < RANDOMS; rtest++){
		for (srcs = TEST_SOURCES; srcs > 0; srcs--){
			for(i=0; i<srcs; i++)
				for(j=0; j<TEST_LEN; j++)
					buffs[i][j] = rand();

			for (i=0; i<srcs; i++)
				g[i] = rand();

			for(i=0; i<srcs; i++)
				gf_vect_mul_init(g[i], &g_tbls[i*32]);

			gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref);
			gf_vect_dot_prod_sse(TEST_LEN, srcs, g_tbls, buffs, dest);

			if (0 != memcmp(dest_ref, dest, TEST_LEN)){
				printf("Fail rand vect_dot_prod_sse test 2\n");
				dump_matrix(buffs, 5, srcs);
				printf("dprod_base:");
				dump(dest_ref, 5);
				printf("dprod_sse:");
				dump(dest, 5);
				return -1;
			}

			putchar('.');
		}
	}




	// Test erasure code using gf_vect_dot_prod

	// Pick a first test
	m = 9;
	k = 5;
	if (m > MMAX || k > KMAX)
		return -1;

	gf_gen_rs_matrix(a, m, k);

	// Make random data
	for(i=0; i<k; i++)
		for(j=0; j<TEST_LEN; j++)
			buffs[i][j] = rand();

	// Make parity vects
	for (i=k; i<m; i++) {
		for (j=0; j<k; j++)
			gf_vect_mul_init(a[k*i+j], &g_tbls[j*32]);
#ifndef USEREF
		gf_vect_dot_prod_sse(TEST_LEN,
				k, g_tbls, buffs, buffs[i]);
#else
		gf_vect_dot_prod_base(TEST_LEN,
				k, &g_tbls[0], buffs, buffs[i]);
#endif
	}


	// Random buffers in erasure
	memset(src_in_err, 0, TEST_SOURCES);
	for (i=0, nerrs=0; i<k && nerrs<m-k; i++){
		err = 1 & rand();
		src_in_err[i] = err;
		if (err)
			src_err_list[nerrs++] = i;
	}

	// construct b by removing error rows
	for(i=0, r=0; i<k; i++, r++){
		while (src_in_err[r])
			r++;
		for(j=0; j<k; j++)
			b[k*i+j] = a[k*r+j];
	}

	// Without a valid inverse the recovery below is meaningless, so stop here
	if (gf_invert_matrix((u8*)b, (u8*)d, k) < 0) {
		printf("BAD MATRIX\n");
		return -1;
	}


	// Collect the k surviving buffers, in the same order as the rows of b
	for(i=0, r=0; i<k; i++, r++){
		while (src_in_err[r])
			r++;
		recov[i] = buffs[r];
	}

	// Recover data
	for(i=0; i<nerrs; i++){
		for (j=0; j<k; j++)
			gf_vect_mul_init(d[k*src_err_list[i]+j], &g_tbls[j*32]);
#ifndef USEREF
		gf_vect_dot_prod_sse(TEST_LEN,
				k, g_tbls, recov, temp_buff);
#else
		gf_vect_dot_prod_base(TEST_LEN,
				k, &g_tbls[0], recov, temp_buff);
#endif

		if (0 != memcmp(temp_buff, buffs[src_err_list[i]],
					TEST_LEN)){
			printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
			printf("recov %d:",src_err_list[i]);
			dump(temp_buff, 25);
			printf("orig   :");
			dump(buffs[src_err_list[i]],25);
			return -1;
		}
	}


	// Do more random tests

	for (rtest = 0; rtest < RANDOMS; rtest++){
		while ((m = (rand() % MMAX)) < 2);
		while ((k = (rand() % KMAX)) >= m || k < 1);

		if (m>MMAX || k>KMAX)
			continue;

		gf_gen_rs_matrix(a, m, k);

		// Make random data
		for(i=0; i<k; i++)
			for(j=0; j<TEST_LEN; j++)
				buffs[i][j] = rand();

		// Make parity vects
		for (i=k; i<m; i++) {
			for (j=0; j<k; j++)
				gf_vect_mul_init(a[k*i+j], &g_tbls[j*32]);
#ifndef USEREF
			gf_vect_dot_prod_sse(TEST_LEN, k, g_tbls, buffs, buffs[i]);
#else
			gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], buffs, buffs[i]);
#endif
		}

		// Random errors
		memset(src_in_err, 0, TEST_SOURCES);
		for (i=0, nerrs=0; i<k && nerrs<m-k; i++){
			err = 1 & rand();
			src_in_err[i] = err;
			if (err)
				src_err_list[nerrs++] = i;
		}
		if (nerrs == 0){  // should have at least one error
			while ((err = (rand() % KMAX)) >= k) ;
			src_err_list[nerrs++] = err;
			src_in_err[err] = 1;
		}

		// construct b by removing error rows
		for(i=0, r=0; i<k; i++, r++){
			while (src_in_err[r])
				r++;
			for(j=0; j<k; j++)
				b[k*i+j] = a[k*r+j];
		}

		// Without a valid inverse the recovery below is meaningless
		if (gf_invert_matrix((u8*)b, (u8*)d, k) < 0) {
			printf("BAD MATRIX\n");
			return -1;
		}

		for(i=0, r=0; i<k; i++, r++){
			while (src_in_err[r])
				r++;
			recov[i] = buffs[r];
		}

		// Recover data
		for(i=0; i<nerrs; i++){
			for (j=0; j<k; j++)
				gf_vect_mul_init(d[k*src_err_list[i]+j], &g_tbls[j*32]);
#ifndef USEREF
			gf_vect_dot_prod_sse(TEST_LEN, k, g_tbls, recov, temp_buff);
#else
			gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], recov, temp_buff);
#endif
			if (0 != memcmp(temp_buff, buffs[src_err_list[i]],
						TEST_LEN)){
				printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
				printf(" - erase list = ");
				// Use j here: i still identifies the failing erasure
				// and is needed for the dumps below
				for (j=0; j<nerrs; j++)
					printf(" %d", src_err_list[j]);
				printf("\na:\n");
				dump_u8xu8((u8*)a, m, k);
				printf("inv b:\n");
				dump_u8xu8((u8*)d, k, k);
				printf("orig data:\n");
				dump_matrix(buffs, m, 25);
				printf("orig   :");
				dump(buffs[src_err_list[i]],25);
				printf("recov %d:",src_err_list[i]);
				dump(temp_buff, 25);
				return -1;
			}
		}
		putchar('.');
	}

	// Run tests at end of buffer for Electric Fence
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
	for(size=EFENCE_TEST_MIN_SIZE; size<=TEST_SIZE; size+=align){
		for(i=0; i<TEST_SOURCES; i++)
			for(j=0; j<TEST_LEN; j++)
				buffs[i][j] = rand();

		for(i=0; i<TEST_SOURCES; i++) // Line up TEST_SIZE from end
			efence_buffs[i] = buffs[i] + TEST_LEN - size;

		for (i=0; i<TEST_SOURCES; i++)
			g[i] = rand();

		for(i=0; i<TEST_SOURCES; i++)
			gf_vect_mul_init(g[i], &g_tbls[i*32]);

		gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref);
		gf_vect_dot_prod_sse(size, TEST_SOURCES, g_tbls, efence_buffs, dest);

		if (0 != memcmp(dest_ref, dest, size)){
			printf("Fail rand vect_dot_prod_sse test 3\n");
			dump_matrix(efence_buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref, align);
			printf("dprod_sse:");
			dump(dest, align);
			return -1;
		}

		putchar('.');
	}

	// Test rand ptr alignment if available

	for(rtest=0; rtest<RANDOMS; rtest++){
		size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~15;
		srcs = rand() % TEST_SOURCES;
		if (srcs == 0)
			continue;

		// The offset mask below is PTR_ALIGN_CHK_B-1 when PTR_ALIGN_CHK_B
		// is nonzero and 0 (no offset) otherwise
		offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
		// Add random offsets
		for(i=0; i<srcs; i++)
			ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));

		udest_ptr = dest + (rand() & (PTR_ALIGN_CHK_B - offset));

		memset(dest, 0, TEST_LEN);  // zero pad to check write-over

		for(i=0; i<srcs; i++)
			for(j=0; j<size; j++)
				ubuffs[i][j] = rand();

		for (i=0; i<srcs; i++)
			g[i] = rand();

		for(i=0; i<srcs; i++)
			gf_vect_mul_init(g[i], &g_tbls[i*32]);

		gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref);

		gf_vect_dot_prod_sse(size, srcs, g_tbls, ubuffs, udest_ptr);

		if (memcmp(dest_ref, udest_ptr, size)){
			printf("Fail rand vect_dot_prod_sse test ualign srcs=%d\n", srcs);
			dump_matrix(ubuffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref, 25);
			printf("dprod_sse:");
			dump(udest_ptr, 25);
			return -1;
		}

		// Confirm that padding around dests is unchanged
		memset(dest_ref, 0, PTR_ALIGN_CHK_B);  // Make reference zero buff
		offset = udest_ptr - dest;

		if (memcmp(dest, dest_ref, offset)){
			printf("Fail rand ualign pad start\n");
			return -1;
		}
		if (memcmp(dest + offset + size, dest_ref, PTR_ALIGN_CHK_B - offset)){
			printf("Fail rand ualign pad end\n");
			return -1;
		}

		putchar('.');
	}


	// Test all size alignment
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;

	for(size=TEST_LEN; size>15; size-=align){
		srcs = TEST_SOURCES;

		for(i=0; i<srcs; i++)
			for(j=0; j<size; j++)
				buffs[i][j] = rand();

		for (i=0; i<srcs; i++)
			g[i] = rand();

		for(i=0; i<srcs; i++)
			gf_vect_mul_init(g[i], &g_tbls[i*32]);

		gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref);

		gf_vect_dot_prod_sse(size, srcs, g_tbls, buffs, dest);

		if (memcmp(dest_ref, dest, size)){
			printf("Fail rand vect_dot_prod_sse test ualign len=%d\n", size);
			dump_matrix(buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref, 25);
			printf("dprod_sse:");
			dump(dest, 25);
			return -1;
		}
	}

	printf("done all: Pass\n");
	return 0;
}
/*
 * Functional test for gf_2vect_dot_prod_sse().
 *
 * Compares the two outputs of gf_2vect_dot_prod_sse() with two separate
 * gf_vect_dot_prod_base() calls on all-zero input, on random input, with a
 * varying number of sources, at the end of the buffers (for Electric Fence),
 * with randomly offset pointers, and over a range of lengths.
 *
 * g_tbls holds two consecutive groups of 32-byte tables: the first group
 * starts at offset 0 and the second at 32 * (number of sources in use).
 *
 * Returns 0 when every check passes, -1 on allocation failure or mismatch.
 */
int main(int argc, char *argv[])
{
	int i,j, rtest, srcs;
	void *buf;
	u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g_tbls[2*TEST_SOURCES*32];
	u8 *dest1, *dest2,  *dest_ref1,  *dest_ref2, *dest_ptrs[2];
	u8 *buffs[TEST_SOURCES];

	int align, size;
	unsigned char *efence_buffs[TEST_SOURCES];
	unsigned int offset;
	u8 *ubuffs[TEST_SOURCES];
	u8 *udest_ptrs[2];

	printf("gf_2vect_dot_prod_sse: %dx%d ", TEST_SOURCES, TEST_LEN);

	mk_gf_field();

	// Allocate the arrays
	for(i=0; i<TEST_SOURCES; i++){
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		buffs[i] = buf;
	}

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest1 = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest2 = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest_ref1 = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest_ref2 = buf;

	dest_ptrs[0] = dest1;
	dest_ptrs[1] = dest2;

	// Test of all zeros
	for(i=0; i<TEST_SOURCES; i++)
		memset(buffs[i], 0, TEST_LEN);

	memset(dest1, 0, TEST_LEN);
	memset(dest2, 0, TEST_LEN);
	memset(dest_ref1, 0, TEST_LEN);
	memset(dest_ref2, 0, TEST_LEN);
	memset(g1, 2, TEST_SOURCES);
	memset(g2, 1, TEST_SOURCES);


	for(i=0; i<TEST_SOURCES; i++){
		gf_vect_mul_init(g1[i], &g_tbls[i*32]);
		gf_vect_mul_init(g2[i], &g_tbls[32*TEST_SOURCES + i*32]);
	}

	gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
	gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32*TEST_SOURCES], buffs, dest_ref2);

	gf_2vect_dot_prod_sse(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);

	if (0 != memcmp(dest_ref1, dest1, TEST_LEN)){
		printf("Fail zero vect_dot_prod_sse test1\n");
		dump_matrix(buffs, 5, TEST_SOURCES);
		printf("dprod_base:");
		dump(dest_ref1, 25);
		printf("dprod_sse:");
		dump(dest1, 25);
		return -1;
	}
	if (0 != memcmp(dest_ref2, dest2, TEST_LEN)){
		printf("Fail zero vect_dot_prod_sse test2\n");
		dump_matrix(buffs, 5, TEST_SOURCES);
		printf("dprod_base:");
		dump(dest_ref2, 25);
		printf("dprod_sse:");
		dump(dest2, 25);
		return -1;
	}


	putchar('.');


	// Rand data test

	for(rtest=0; rtest<RANDOMS; rtest++){
		for(i=0; i<TEST_SOURCES; i++)
			for(j=0; j<TEST_LEN; j++)
				buffs[i][j] = rand();

		for (i=0; i<TEST_SOURCES; i++){
			g1[i] = rand();
			g2[i] = rand();
		}

		for(i=0; i<TEST_SOURCES; i++){
			gf_vect_mul_init(g1[i], &g_tbls[i*32]);
			gf_vect_mul_init(g2[i], &g_tbls[(32*TEST_SOURCES) + (i*32)]);
		}

		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32*TEST_SOURCES], buffs, dest_ref2);

		gf_2vect_dot_prod_sse(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);

		if (0 != memcmp(dest_ref1, dest1, TEST_LEN)){
			printf("Fail rand 2vect_dot_prod_sse test1 %d\n", rtest);
			dump_matrix(buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref1, 25);
			printf("dprod_sse:");
			dump(dest1, 25);
			return -1;
		}
		if (0 != memcmp(dest_ref2, dest2, TEST_LEN)){
			printf("Fail rand 2vect_dot_prod_sse test2 %d\n", rtest);
			dump_matrix(buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref2, 25);
			printf("dprod_sse:");
			dump(dest2, 25);
			return -1;
		}

		putchar('.');
	}





	// Rand data test with varied parameters
	for(rtest=0; rtest<RANDOMS; rtest++){
		for (srcs = TEST_SOURCES; srcs > 0; srcs--){
			for(i=0; i<srcs; i++)
				for(j=0; j<TEST_LEN; j++)
					buffs[i][j] = rand();


			for (i=0; i<srcs; i++){
				g1[i] = rand();
				g2[i] = rand();
			}

			// The second table group starts right after the srcs tables
			// of the first group
			for(i=0; i<srcs; i++){
				gf_vect_mul_init(g1[i], &g_tbls[i*32]);
				gf_vect_mul_init(g2[i], &g_tbls[(32*srcs) + (i*32)]);
			}

			gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1);
			gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32*srcs], buffs, dest_ref2);

			gf_2vect_dot_prod_sse(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs);


			if (0 != memcmp(dest_ref1, dest1, TEST_LEN)){
				printf("Fail rand 2vect_dot_prod_sse test1 srcs=%d\n", srcs);
				dump_matrix(buffs, 5, TEST_SOURCES);
				printf("dprod_base:");
				dump(dest_ref1, 25);
				printf("dprod_sse:");
				dump(dest1, 25);
				return -1;
			}
			if (0 != memcmp(dest_ref2, dest2, TEST_LEN)){
				printf("Fail rand 2vect_dot_prod_sse test2 srcs=%d\n", srcs);
				dump_matrix(buffs, 5, TEST_SOURCES);
				printf("dprod_base:");
				dump(dest_ref2, 25);
				printf("dprod_sse:");
				dump(dest2, 25);
				return -1;
			}


			putchar('.');
		}
	}


	// Run tests at end of buffer for Electric Fence
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
	for(size=EFENCE_TEST_MIN_SIZE; size<=TEST_SIZE; size+=align){
		for(i=0; i<TEST_SOURCES; i++)
			for(j=0; j<TEST_LEN; j++)
				buffs[i][j] = rand();

		for(i=0; i<TEST_SOURCES; i++) // Line up TEST_SIZE from end
			efence_buffs[i] = buffs[i] + TEST_LEN - size;

		for (i=0; i<TEST_SOURCES; i++){
			g1[i] = rand();
			g2[i] = rand();
		}

		for(i=0; i<TEST_SOURCES; i++){
			gf_vect_mul_init(g1[i], &g_tbls[i*32]);
			gf_vect_mul_init(g2[i], &g_tbls[(32*TEST_SOURCES) + (i*32)]);
		}

		gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1);
		gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32*TEST_SOURCES], efence_buffs, dest_ref2);

		gf_2vect_dot_prod_sse(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs);

		// Report the size under test; rtest is left over from the
		// previous loop and says nothing about this failure
		if (0 != memcmp(dest_ref1, dest1, size)){
			printf("Fail rand 2vect_dot_prod_sse efence test1 size=%d\n", size);
			dump_matrix(efence_buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref1, align);
			printf("dprod_sse:");
			dump(dest1, align);
			return -1;
		}

		if (0 != memcmp(dest_ref2, dest2, size)){
			printf("Fail rand 2vect_dot_prod_sse efence test2 size=%d\n", size);
			dump_matrix(efence_buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref2, align);
			printf("dprod_sse:");
			dump(dest2, align);
			return -1;
		}

		putchar('.');
	}

	// Test rand ptr alignment if available

	for(rtest=0; rtest<RANDOMS; rtest++){
		size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~15;
		srcs = rand() % TEST_SOURCES;
		if (srcs == 0)
			continue;

		// The offset mask below is PTR_ALIGN_CHK_B-1 when PTR_ALIGN_CHK_B
		// is nonzero and 0 (no offset) otherwise
		offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
		// Add random offsets
		for(i=0; i<srcs; i++)
			ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));

		udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset));
		udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset));

		memset(dest1, 0, TEST_LEN);  // zero pad to check write-over
		memset(dest2, 0, TEST_LEN);

		for(i=0; i<srcs; i++)
			for(j=0; j<size; j++)
				ubuffs[i][j] = rand();

		for (i=0; i<srcs; i++){
			g1[i] = rand();
			g2[i] = rand();
		}

		for(i=0; i<srcs; i++){
			gf_vect_mul_init(g1[i], &g_tbls[i*32]);
			gf_vect_mul_init(g2[i], &g_tbls[(32*srcs) + (i*32)]);
		}

		gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1);
		gf_vect_dot_prod_base(size, srcs, &g_tbls[32*srcs], ubuffs, dest_ref2);

		gf_2vect_dot_prod_sse(size, srcs, g_tbls, ubuffs, udest_ptrs);

		if (memcmp(dest_ref1, udest_ptrs[0], size)){
			printf("Fail rand 2vect_dot_prod_sse test ualign srcs=%d\n", srcs);
			dump_matrix(ubuffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref1, 25);
			printf("dprod_sse:");
			dump(udest_ptrs[0], 25);
			return -1;
		}
		if (memcmp(dest_ref2, udest_ptrs[1], size)){
			printf("Fail rand 2vect_dot_prod_sse test ualign srcs=%d\n", srcs);
			dump_matrix(ubuffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref2, 25);
			printf("dprod_sse:");
			dump(udest_ptrs[1], 25);
			return -1;
		}

		// Confirm that padding around dests is unchanged
		memset(dest_ref1, 0, PTR_ALIGN_CHK_B);  // Make reference zero buff
		offset = udest_ptrs[0] - dest1;

		if (memcmp(dest1, dest_ref1, offset)){
			printf("Fail rand ualign pad1 start\n");
			return -1;
		}
		if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)){
			printf("Fail rand ualign pad1 end\n");
			return -1;
		}

		// dest_ref1 also serves as the zero reference for the second output
		offset = udest_ptrs[1] - dest2;
		if (memcmp(dest2, dest_ref1, offset)){
			printf("Fail rand ualign pad2 start\n");
			return -1;
		}
		if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)){
			printf("Fail rand ualign pad2 end\n");
			return -1;
		}

		putchar('.');
	}


	// Test all size alignment
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;

	for(size=TEST_LEN; size>15; size-=align){
		srcs = TEST_SOURCES;

		for(i=0; i<srcs; i++)
			for(j=0; j<size; j++)
				buffs[i][j] = rand();

		for (i=0; i<srcs; i++){
			g1[i] = rand();
			g2[i] = rand();
		}

		for(i=0; i<srcs; i++){
			gf_vect_mul_init(g1[i], &g_tbls[i*32]);
			gf_vect_mul_init(g2[i], &g_tbls[(32*srcs) + (i*32)]);
		}

		gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1);
		gf_vect_dot_prod_base(size, srcs, &g_tbls[32*srcs], buffs, dest_ref2);

		gf_2vect_dot_prod_sse(size, srcs, g_tbls, buffs, dest_ptrs);

		if (memcmp(dest_ref1, dest_ptrs[0], size)){
			printf("Fail rand 2vect_dot_prod_sse test ualign len=%d\n", size);
			dump_matrix(buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref1, 25);
			printf("dprod_sse:");
			dump(dest_ptrs[0], 25);
			return -1;
		}
		if (memcmp(dest_ref2, dest_ptrs[1], size)){
			printf("Fail rand 2vect_dot_prod_sse test ualign len=%d\n", size);
			dump_matrix(buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref2, 25);
			printf("dprod_sse:");
			dump(dest_ptrs[1], 25);
			return -1;
		}
	}


	printf("Pass\n");
	return 0;

}