/*
 * Performance test for the single-destination dot-product routine selected
 * by FUNCTION_UNDER_TEST.
 *
 * Fills TEST_SOURCES buffers of TEST_LEN bytes with rand() data, computes the
 * expected output once with gf_vect_dot_prod_base(), times TEST_LOOPS
 * iterations of table setup plus FUNCTION_UNDER_TEST, and finally compares the
 * timed routine's output with the reference output.
 *
 * When DO_REF_PERF is defined the reference routine is timed the same way
 * first.
 *
 * Returns 0 when the outputs match, -1 on allocation failure or mismatch.
 * The buffers are never freed; they are reclaimed when the process exits.
 */
int main(int argc, char *argv[])
{
	int i, j;
	void *buf;
	u8 g[TEST_SOURCES], g_tbls[TEST_SOURCES * 32], *dest, *dest_ref;
	u8 *temp_buff, *buffs[TEST_SOURCES];
	struct perf start, stop;

	printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);

	// Allocate the source buffers (64-byte aligned, TEST_LEN bytes each)
	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		buffs[i] = buf;
	}

	// Output of the function under test
	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest = buf;

	// Output of the reference routine
	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest_ref = buf;

	// Extra buffer: allocated and cleared below, not otherwise used here
	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	temp_buff = buf;

	// Performance test: random source data, cleared outputs
	for (i = 0; i < TEST_SOURCES; i++)
		for (j = 0; j < TEST_LEN; j++)
			buffs[i][j] = rand();

	memset(dest, 0, TEST_LEN);
	memset(temp_buff, 0, TEST_LEN);
	memset(dest_ref, 0, TEST_LEN);

	// One random coefficient per source; every element is assigned here,
	// so g needs no separate clearing beforehand.
	for (i = 0; i < TEST_SOURCES; i++)
		g[i] = rand();

	// Expand each coefficient into its 32-byte slot of g_tbls
	for (j = 0; j < TEST_SOURCES; j++)
		gf_vect_mul_init(g[j], &g_tbls[j * 32]);

	// Expected output, computed once by the reference routine
	gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);

#ifdef DO_REF_PERF
	// Time the reference routine, table setup included
	perf_start(&start);
	for (i = 0; i < TEST_LOOPS; i++) {
		for (j = 0; j < TEST_SOURCES; j++)
			gf_vect_mul_init(g[j], &g_tbls[j * 32]);

		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
	}
	perf_stop(&stop);
	printf("gf_vect_dot_prod_base" TEST_TYPE_STR ": ");
	// i equals the iteration count once the loop has finished
	perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * i);
#endif

	// One untimed call before the measured loop
	FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);

	// Time the function under test, table setup included
	perf_start(&start);
	for (i = 0; i < TEST_LOOPS; i++) {
		for (j = 0; j < TEST_SOURCES; j++)
			gf_vect_mul_init(g[j], &g_tbls[j * 32]);

		FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
	}
	perf_stop(&stop);
	printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
	// i equals the iteration count once the loop has finished
	perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * i);

	// The timed routine must still reproduce the reference output
	if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
		printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test\n");
		dump_matrix(buffs, 5, TEST_SOURCES);
		printf("dprod_base:");
		dump(dest_ref, 25);
		printf("dprod:");
		dump(dest, 25);
		return -1;
	}

	printf("pass perf check\n");
	return 0;
}
/*
 * Unit test for the three-destination dot-product routine selected by
 * FUNCTION_UNDER_TEST.
 *
 * Every case builds three coefficient tables in g_tbls, computes the expected
 * outputs with three gf_vect_dot_prod_base() calls (one per table) and
 * compares them byte-for-byte with what FUNCTION_UNDER_TEST wrote.
 *
 * Cases, in order:
 *   1. all-zero source data with fixed coefficients,
 *   2. random data,
 *   3. random data with the source count stepping from TEST_SOURCES down to 1,
 *   4. sources lined up against the end of their allocations,
 *   5. randomly offset source and destination pointers, including a check
 *      that the bytes around each destination were left untouched,
 *   6. every length from TEST_LEN down to TEST_MIN_SIZE.
 *
 * Returns 0 when every case passes, -1 on allocation failure or mismatch.
 * The buffers are never freed; they are reclaimed when the process exits.
 */
int main(int argc, char *argv[])
{
	int i, j, k, rtest, srcs;
	void *buf;
	u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
	u8 g_tbls[3 * TEST_SOURCES * 32], *dest_ptrs[3], *buffs[TEST_SOURCES];
	u8 *dest_ref[3];

	int align, size;
	unsigned char *efence_buffs[TEST_SOURCES];
	unsigned int offset;
	u8 *ubuffs[TEST_SOURCES];
	u8 *udest_ptrs[3];
	printf(xstr(FUNCTION_UNDER_TEST) "_test: %dx%d ", TEST_SOURCES, TEST_LEN);

	// Allocate the source buffers (64-byte aligned, TEST_LEN bytes each)
	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		buffs[i] = buf;
	}

	// Outputs written by the function under test
	for (k = 0; k < 3; k++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		dest_ptrs[k] = buf;
	}

	// Outputs written by the reference routine
	for (k = 0; k < 3; k++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		dest_ref[k] = buf;
	}

	// Test of all zeros
	for (i = 0; i < TEST_SOURCES; i++)
		memset(buffs[i], 0, TEST_LEN);

	for (k = 0; k < 3; k++) {
		memset(dest_ptrs[k], 0, TEST_LEN);
		memset(dest_ref[k], 0, TEST_LEN);
	}
	memset(g1, 2, TEST_SOURCES);
	memset(g2, 1, TEST_SOURCES);
	memset(g3, 7, TEST_SOURCES);

	// Table k occupies g_tbls[k * 32 * TEST_SOURCES ...], 32 bytes per source
	for (i = 0; i < TEST_SOURCES; i++) {
		gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
		gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]);
		gf_vect_mul_init(g3[i], &g_tbls[64 * TEST_SOURCES + i * 32]);
	}

	for (k = 0; k < 3; k++)
		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES,
				      &g_tbls[k * 32 * TEST_SOURCES], buffs, dest_ref[k]);

	FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);

	for (k = 0; k < 3; k++) {
		if (0 != memcmp(dest_ref[k], dest_ptrs[k], TEST_LEN)) {
			printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test%d\n", k + 1);
			dump_matrix(buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref[k], 25);
			printf("dprod_dut:");
			dump(dest_ptrs[k], 25);
			return -1;
		}
	}

	putchar('.');

	// Rand data test

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		for (i = 0; i < TEST_SOURCES; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (i = 0; i < TEST_SOURCES; i++) {
			g1[i] = rand();
			g2[i] = rand();
			g3[i] = rand();
		}

		for (i = 0; i < TEST_SOURCES; i++) {
			gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
			gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
			gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
		}

		for (k = 0; k < 3; k++)
			gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES,
					      &g_tbls[k * 32 * TEST_SOURCES], buffs,
					      dest_ref[k]);

		FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);

		for (k = 0; k < 3; k++) {
			if (0 != memcmp(dest_ref[k], dest_ptrs[k], TEST_LEN)) {
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test%d %d\n",
				       k + 1, rtest);
				dump_matrix(buffs, 5, TEST_SOURCES);
				printf("dprod_base:");
				dump(dest_ref[k], 25);
				printf("dprod_dut:");
				dump(dest_ptrs[k], 25);
				return -1;
			}
		}

		putchar('.');
	}

	// Rand data test with varied parameters
	for (rtest = 0; rtest < RANDOMS; rtest++) {
		for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
			for (i = 0; i < srcs; i++)
				for (j = 0; j < TEST_LEN; j++)
					buffs[i][j] = rand();

			for (i = 0; i < srcs; i++) {
				g1[i] = rand();
				g2[i] = rand();
				g3[i] = rand();
			}

			// With fewer sources the three tables are packed closer together
			for (i = 0; i < srcs; i++) {
				gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
				gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
				gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
			}

			for (k = 0; k < 3; k++)
				gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[k * 32 * srcs],
						      buffs, dest_ref[k]);

			FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs);

			for (k = 0; k < 3; k++) {
				if (0 != memcmp(dest_ref[k], dest_ptrs[k], TEST_LEN)) {
					printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
					       " test%d srcs=%d\n", k + 1, srcs);
					dump_matrix(buffs, 5, TEST_SOURCES);
					printf("dprod_base:");
					dump(dest_ref[k], 25);
					printf("dprod_dut:");
					dump(dest_ptrs[k], 25);
					return -1;
				}
			}

			putchar('.');
		}
	}

	// Run tests at end of buffer for Electric Fence
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
	for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
		for (i = 0; i < TEST_SOURCES; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (i = 0; i < TEST_SOURCES; i++)	// Line up TEST_SIZE from end
			efence_buffs[i] = buffs[i] + TEST_LEN - size;

		for (i = 0; i < TEST_SOURCES; i++) {
			g1[i] = rand();
			g2[i] = rand();
			g3[i] = rand();
		}

		for (i = 0; i < TEST_SOURCES; i++) {
			gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
			gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
			gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
		}

		for (k = 0; k < 3; k++)
			gf_vect_dot_prod_base(size, TEST_SOURCES,
					      &g_tbls[k * 32 * TEST_SOURCES], efence_buffs,
					      dest_ref[k]);

		FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs);

		for (k = 0; k < 3; k++) {
			if (0 != memcmp(dest_ref[k], dest_ptrs[k], size)) {
				// Report the failing length; rtest is stale in this loop
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
				       " test%d size=%d\n", k + 1, size);
				dump_matrix(efence_buffs, 5, TEST_SOURCES);
				printf("dprod_base:");
				dump(dest_ref[k], align);
				printf("dprod_dut:");
				dump(dest_ptrs[k], align);
				return -1;
			}
		}

		putchar('.');
	}

	// Test rand ptr alignment if available

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
		srcs = rand() % TEST_SOURCES;
		if (srcs == 0)
			continue;

		// Mask is PTR_ALIGN_CHK_B - 1 when pointer alignment is checked, else 0
		offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
		// Add random offsets
		for (i = 0; i < srcs; i++)
			ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));

		for (k = 0; k < 3; k++)
			udest_ptrs[k] = dest_ptrs[k] + (rand() & (PTR_ALIGN_CHK_B - offset));

		for (k = 0; k < 3; k++)
			memset(dest_ptrs[k], 0, TEST_LEN);	// zero pad to check write-over

		for (i = 0; i < srcs; i++)
			for (j = 0; j < size; j++)
				ubuffs[i][j] = rand();

		for (i = 0; i < srcs; i++) {
			g1[i] = rand();
			g2[i] = rand();
			g3[i] = rand();
		}

		for (i = 0; i < srcs; i++) {
			gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
			gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
			gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
		}

		for (k = 0; k < 3; k++)
			gf_vect_dot_prod_base(size, srcs, &g_tbls[k * 32 * srcs], ubuffs,
					      dest_ref[k]);

		FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs);

		for (k = 0; k < 3; k++) {
			if (memcmp(dest_ref[k], udest_ptrs[k], size)) {
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
				       " test ualign srcs=%d\n", srcs);
				dump_matrix(ubuffs, 5, TEST_SOURCES);
				printf("dprod_base:");
				dump(dest_ref[k], 25);
				printf("dprod_dut:");
				dump(udest_ptrs[k], 25);
				return -1;
			}
		}

		// Confirm that padding around dests is unchanged
		memset(dest_ref[0], 0, PTR_ALIGN_CHK_B);	// Make reference zero buff

		for (k = 0; k < 3; k++) {
			offset = udest_ptrs[k] - dest_ptrs[k];

			if (memcmp(dest_ptrs[k], dest_ref[0], offset)) {
				printf("Fail rand ualign pad%d start\n", k + 1);
				return -1;
			}
			if (memcmp(dest_ptrs[k] + offset + size, dest_ref[0],
				   PTR_ALIGN_CHK_B - offset)) {
				printf("Fail rand ualign pad%d end\n", k + 1);
				return -1;
			}
		}

		putchar('.');
	}

	// Test all size alignment
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;

	for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
		srcs = TEST_SOURCES;

		for (i = 0; i < srcs; i++)
			for (j = 0; j < size; j++)
				buffs[i][j] = rand();

		for (i = 0; i < srcs; i++) {
			g1[i] = rand();
			g2[i] = rand();
			g3[i] = rand();
		}

		for (i = 0; i < srcs; i++) {
			gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
			gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
			gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
		}

		for (k = 0; k < 3; k++)
			gf_vect_dot_prod_base(size, srcs, &g_tbls[k * 32 * srcs], buffs,
					      dest_ref[k]);

		FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs);

		for (k = 0; k < 3; k++) {
			if (memcmp(dest_ref[k], dest_ptrs[k], size)) {
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
				       " test ualign len=%d\n", size);
				dump_matrix(buffs, 5, TEST_SOURCES);
				printf("dprod_base:");
				dump(dest_ref[k], 25);
				printf("dprod_dut:");
				dump(dest_ptrs[k], 25);
				return -1;
			}
		}
	}

	printf("Pass\n");
	return 0;

}
/*
 * Benchmark for the three-destination dot-product routine selected by
 * FUNCTION_UNDER_TEST.
 *
 * Random sources and three random coefficient sets are generated, the
 * expected outputs are produced by gf_vect_dot_prod_base() (one call per
 * coefficient table), and TEST_LOOPS rounds of table setup plus
 * FUNCTION_UNDER_TEST are timed.  With DO_REF_PERF defined the reference is
 * timed first, over TEST_LOOPS / 100 rounds.  The run ends by comparing all
 * three outputs with the expected ones.
 *
 * Returns 0 on a match, -1 on allocation failure or mismatch.  Nothing is
 * freed; the allocations live until the process exits.
 */
int main(int argc, char *argv[])
{
	int iter, s, v;
	void *mem;
	u8 coef[3][TEST_SOURCES];	// coef[v][s]: coefficient of source s for output v
	u8 g_tbls[3 * TEST_SOURCES * 32];
	u8 *out[3], *expect[3], *buffs[TEST_SOURCES];
	struct perf start, stop;

	printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);

	// Source buffers first, then the three outputs, then the three references
	for (s = 0; s < TEST_SOURCES; s++) {
		if (posix_memalign(&mem, 64, TEST_LEN) != 0) {
			printf("alloc error: Fail");
			return -1;
		}
		buffs[s] = mem;
	}

	for (v = 0; v < 3; v++) {
		if (posix_memalign(&mem, 64, TEST_LEN) != 0) {
			printf("alloc error: Fail");
			return -1;
		}
		out[v] = mem;
	}

	for (v = 0; v < 3; v++) {
		if (posix_memalign(&mem, 64, TEST_LEN) != 0) {
			printf("alloc error: Fail");
			return -1;
		}
		expect[v] = mem;
	}

	// Performance test: random input bytes
	for (s = 0; s < TEST_SOURCES; s++) {
		u8 *row = buffs[s];
		int pos;

		for (pos = 0; pos < TEST_LEN; pos++)
			row[pos] = rand();
	}

	// Only the first two output/reference pairs are cleared up front
	for (v = 0; v < 2; v++)
		memset(out[v], 0, TEST_LEN);
	for (v = 0; v < 2; v++)
		memset(expect[v], 0, TEST_LEN);

	// Draw the coefficients source by source, output 1 through 3 each time
	for (s = 0; s < TEST_SOURCES; s++) {
		for (v = 0; v < 3; v++)
			coef[v][s] = rand();
	}

	// Table for output v starts at v * 32 * TEST_SOURCES, 32 bytes per source
	for (s = 0; s < TEST_SOURCES; s++) {
		for (v = 0; v < 3; v++)
			gf_vect_mul_init(coef[v][s],
					 &g_tbls[v * (32 * TEST_SOURCES) + s * 32]);
	}

	for (v = 0; v < 3; v++)
		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES,
				      &g_tbls[v * (32 * TEST_SOURCES)], buffs, expect[v]);

#ifdef DO_REF_PERF
	// Reference timing over a hundredth of the rounds
	perf_start(&start);
	for (iter = 0; iter < TEST_LOOPS / 100; iter++) {
		for (s = 0; s < TEST_SOURCES; s++) {
			for (v = 0; v < 3; v++)
				gf_vect_mul_init(coef[v][s],
						 &g_tbls[v * (32 * TEST_SOURCES) + s * 32]);
		}

		for (v = 0; v < 3; v++)
			gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES,
					      &g_tbls[v * (32 * TEST_SOURCES)], buffs,
					      expect[v]);
	}
	perf_stop(&stop);
	printf("gf_3vect_dot_prod_base" TEST_TYPE_STR ": ");
	// iter holds the number of rounds that were run
	perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 3) * iter);
#endif

	// Single untimed call ahead of the measured loop
	FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, out);

	perf_start(&start);
	for (iter = 0; iter < TEST_LOOPS; iter++) {
		for (s = 0; s < TEST_SOURCES; s++) {
			for (v = 0; v < 3; v++)
				gf_vect_mul_init(coef[v][s],
						 &g_tbls[v * (32 * TEST_SOURCES) + s * 32]);
		}

		FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, out);
	}
	perf_stop(&stop);
	printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
	// iter holds the number of rounds that were run
	perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 3) * iter);

	// Outputs are checked in order; the first mismatch ends the run
	for (v = 0; v < 3; v++) {
		if (memcmp(expect[v], out[v], TEST_LEN) == 0)
			continue;

		printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test%d\n", v + 1);
		dump_matrix(buffs, 5, TEST_SOURCES);
		printf("dprod_base:");
		dump(expect[v], 25);
		printf("dprod_dut:");
		dump(out[v], 25);
		return -1;
	}

	printf("pass perf check\n");
	return 0;

}
/*
 * Unit test for the routine selected by FUNCTION_UNDER_TEST, built for VECT
 * destination vectors (1 to 6; any other value makes the test return -1).
 *
 * Unlike a whole-array dot product, FUNCTION_UNDER_TEST is invoked once per
 * source buffer, passing that source's index; the destinations are zeroed
 * before each series of calls.  The expected outputs come from one
 * gf_vect_dot_prod_base() call per destination, and the first section also
 * cross-checks against REF_FUNCTION.
 *
 * Sections, in order: initial check, random data, random data with a varying
 * source count, sources lined up against the end of their allocations,
 * randomly offset pointers (with a check of the bytes around each
 * destination), and every length from TEST_LEN down to TEST_MIN_SIZE.
 *
 * Returns 0 when every section passes, -1 on allocation failure or mismatch.
 * The buffers are never freed; they are reclaimed when the process exits.
 */
int main(int argc, char *argv[])
{
	int i, j, rtest, srcs;
	void *buf;
	u8 gf[6][TEST_SOURCES];	// gf[i][j]: coefficient of source j for destination i
	u8 *g_tbls;
	u8 *dest_ref[VECT];
	u8 *dest_ptrs[VECT], *buffs[TEST_SOURCES];
	int vector = VECT;

	int align, size;
	unsigned char *efence_buffs[TEST_SOURCES];
	unsigned int offset;
	u8 *ubuffs[TEST_SOURCES];
	u8 *udest_ptrs[VECT];
	printf("test" xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);

	// Allocate the arrays
	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		buffs[i] = buf;
	}

	// NOTE(review): this is twice the vector * TEST_SOURCES * 32 bytes that the
	// table-initialisation loops below index; the reason is not visible here.
	if (posix_memalign(&buf, 16, 2 * (vector * TEST_SOURCES * 32))) {
		printf("alloc error: Fail");
		return -1;
	}
	g_tbls = buf;

	// Outputs written by the function under test
	for (i = 0; i < vector; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		dest_ptrs[i] = buf;
		memset(dest_ptrs[i], 0, TEST_LEN);
	}

	// Outputs written by the reference routines
	for (i = 0; i < vector; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		dest_ref[i] = buf;
		memset(dest_ref[i], 0, TEST_LEN);
	}

	// Test of all zeros
	// NOTE(review): the zeroed sources and the fixed coefficients set here are
	// both overwritten with rand() data further down, before anything is
	// computed, so this section effectively runs on random data.
	for (i = 0; i < TEST_SOURCES; i++)
		memset(buffs[i], 0, TEST_LEN);

	// Each case deliberately falls through so that rows vector-1 .. 0 are all set
	switch (vector) {
	case 6:
		memset(gf[5], 0xe6, TEST_SOURCES);
		/* fallthrough */
	case 5:
		memset(gf[4], 4, TEST_SOURCES);
		/* fallthrough */
	case 4:
		memset(gf[3], 9, TEST_SOURCES);
		/* fallthrough */
	case 3:
		memset(gf[2], 7, TEST_SOURCES);
		/* fallthrough */
	case 2:
		memset(gf[1], 1, TEST_SOURCES);
		/* fallthrough */
	case 1:
		memset(gf[0], 2, TEST_SOURCES);
		break;
	default:
		// Unsupported VECT value
		return -1;
	}

	for (i = 0; i < TEST_SOURCES; i++)
		for (j = 0; j < TEST_LEN; j++)
			buffs[i][j] = rand();

	// Table for destination i starts at i * 32 * TEST_SOURCES, 32 bytes per source
	for (i = 0; i < vector; i++)
		for (j = 0; j < TEST_SOURCES; j++) {
			gf[i][j] = rand();
			gf_vect_mul_init(gf[i][j], &g_tbls[i * (32 * TEST_SOURCES) + j * 32]);
		}

	// Expected outputs: one reference dot product per destination
	for (i = 0; i < vector; i++)
		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[i * 32 * TEST_SOURCES],
				      buffs, dest_ref[i]);

	// Clear the destinations, then feed the sources in one at a time.
	// With VECT == 1 the routine takes the destination buffer itself rather
	// than the array of destination pointers.
	for (i = 0; i < vector; i++)
		memset(dest_ptrs[i], 0, TEST_LEN);
	for (i = 0; i < TEST_SOURCES; i++) {
#if (VECT == 1)
		FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i], *dest_ptrs);
#else
		FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i], dest_ptrs);
#endif
	}
	for (i = 0; i < vector; i++) {
		if (0 != memcmp(dest_ref[i], dest_ptrs[i], TEST_LEN)) {
			printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test%d\n", i);
			dump_matrix(buffs, vector, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref[i], 25);
			printf("dprod_dut:");
			dump(dest_ptrs[i], 25);
			return -1;
		}
	}

	// Second opinion: recompute the expected outputs with REF_FUNCTION over
	// the same tables and sources, and compare again.
#if (VECT == 1)
	REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, *dest_ref);
#else
	REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ref);
#endif
	for (i = 0; i < vector; i++) {
		if (0 != memcmp(dest_ref[i], dest_ptrs[i], TEST_LEN)) {
			printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test%d\n", i);
			dump_matrix(buffs, vector, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref[i], 25);
			printf("dprod_dut:");
			dump(dest_ptrs[i], 25);
			return -1;
		}
	}

	putchar('.');

	// Rand data test

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		for (i = 0; i < TEST_SOURCES; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (i = 0; i < vector; i++)
			for (j = 0; j < TEST_SOURCES; j++) {
				gf[i][j] = rand();
				gf_vect_mul_init(gf[i][j],
						 &g_tbls[i * (32 * TEST_SOURCES) + j * 32]);
			}

		for (i = 0; i < vector; i++)
			gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES,
					      &g_tbls[i * 32 * TEST_SOURCES], buffs,
					      dest_ref[i]);

		// Destinations start from zero before the per-source calls
		for (i = 0; i < vector; i++)
			memset(dest_ptrs[i], 0, TEST_LEN);
		for (i = 0; i < TEST_SOURCES; i++) {
#if (VECT == 1)
			FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i],
					    *dest_ptrs);
#else
			FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i],
					    dest_ptrs);
#endif
		}
		for (i = 0; i < vector; i++) {
			if (0 != memcmp(dest_ref[i], dest_ptrs[i], TEST_LEN)) {
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test%d %d\n",
				       i, rtest);
				dump_matrix(buffs, vector, TEST_SOURCES);
				printf("dprod_base:");
				dump(dest_ref[i], 25);
				printf("dprod_dut:");
				dump(dest_ptrs[i], 25);
				return -1;
			}
		}

		putchar('.');
	}

	// Rand data test with varied parameters
	for (rtest = 0; rtest < RANDOMS; rtest++) {
		// Source count steps from TEST_SOURCES down to 1
		for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
			for (i = 0; i < srcs; i++)
				for (j = 0; j < TEST_LEN; j++)
					buffs[i][j] = rand();

			// With fewer sources the per-destination tables are packed
			// closer together (stride 32 * srcs)
			for (i = 0; i < vector; i++)
				for (j = 0; j < srcs; j++) {
					gf[i][j] = rand();
					gf_vect_mul_init(gf[i][j],
							 &g_tbls[i * (32 * srcs) + j * 32]);
				}

			for (i = 0; i < vector; i++)
				gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[i * 32 * srcs],
						      buffs, dest_ref[i]);

			for (i = 0; i < vector; i++)
				memset(dest_ptrs[i], 0, TEST_LEN);
			for (i = 0; i < srcs; i++) {
#if (VECT == 1)
				FUNCTION_UNDER_TEST(TEST_LEN, srcs, i, g_tbls, buffs[i],
						    *dest_ptrs);
#else
				FUNCTION_UNDER_TEST(TEST_LEN, srcs, i, g_tbls, buffs[i],
						    dest_ptrs);
#endif

			}
			for (i = 0; i < vector; i++) {
				if (0 != memcmp(dest_ref[i], dest_ptrs[i], TEST_LEN)) {
					printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
					       " test%d srcs=%d\n", i, srcs);
					dump_matrix(buffs, vector, TEST_SOURCES);
					printf("dprod_base:");
					dump(dest_ref[i], 25);
					printf("dprod_dut:");
					dump(dest_ptrs[i], 25);
					return -1;
				}
			}

			putchar('.');
		}
	}

	// Run tests at end of buffer for Electric Fence
	// Length step: 1 when LEN_ALIGN_CHK_B is non-zero, otherwise ALIGN_SIZE
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : ALIGN_SIZE;
	for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
		for (i = 0; i < TEST_SOURCES; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (i = 0; i < TEST_SOURCES; i++)	// Line up TEST_SIZE from end
			efence_buffs[i] = buffs[i] + TEST_LEN - size;

		for (i = 0; i < vector; i++)
			for (j = 0; j < TEST_SOURCES; j++) {
				gf[i][j] = rand();
				gf_vect_mul_init(gf[i][j],
						 &g_tbls[i * (32 * TEST_SOURCES) + j * 32]);
			}

		for (i = 0; i < vector; i++)
			gf_vect_dot_prod_base(size, TEST_SOURCES,
					      &g_tbls[i * 32 * TEST_SOURCES], efence_buffs,
					      dest_ref[i]);

		// Only the first size bytes are cleared and later compared
		for (i = 0; i < vector; i++)
			memset(dest_ptrs[i], 0, size);
		for (i = 0; i < TEST_SOURCES; i++) {
#if (VECT == 1)
			FUNCTION_UNDER_TEST(size, TEST_SOURCES, i, g_tbls, efence_buffs[i],
					    *dest_ptrs);
#else
			FUNCTION_UNDER_TEST(size, TEST_SOURCES, i, g_tbls, efence_buffs[i],
					    dest_ptrs);
#endif
		}
		for (i = 0; i < vector; i++) {
			if (0 != memcmp(dest_ref[i], dest_ptrs[i], size)) {
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
				       " test%d size=%d\n", i, size);
				dump_matrix(buffs, vector, TEST_SOURCES);
				printf("dprod_base:");
				dump(dest_ref[i], TEST_MIN_SIZE + align);
				printf("dprod_dut:");
				dump(dest_ptrs[i], TEST_MIN_SIZE + align);
				return -1;
			}
		}

		putchar('.');
	}

	// Test rand ptr alignment if available

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		// Leave PTR_ALIGN_CHK_B bytes of slack for the offsets, then round the
		// length down with the TEST_MIN_SIZE - 1 mask
		size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
		srcs = rand() % TEST_SOURCES;
		if (srcs == 0)
			continue;

		// Offset mask below is PTR_ALIGN_CHK_B - 1 when pointer alignment is
		// checked, and 0 (no offset) when PTR_ALIGN_CHK_B is 0
		offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
		// Add random offsets
		for (i = 0; i < srcs; i++)
			ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));

		for (i = 0; i < vector; i++) {
			udest_ptrs[i] = dest_ptrs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
			memset(dest_ptrs[i], 0, TEST_LEN);	// zero pad to check write-over
		}

		for (i = 0; i < srcs; i++)
			for (j = 0; j < size; j++)
				ubuffs[i][j] = rand();

		for (i = 0; i < vector; i++)
			for (j = 0; j < srcs; j++) {
				gf[i][j] = rand();
				gf_vect_mul_init(gf[i][j], &g_tbls[i * (32 * srcs) + j * 32]);
			}

		for (i = 0; i < vector; i++)
			gf_vect_dot_prod_base(size, srcs, &g_tbls[i * 32 * srcs], ubuffs,
					      dest_ref[i]);

		for (i = 0; i < srcs; i++) {
#if (VECT == 1)
			FUNCTION_UNDER_TEST(size, srcs, i, g_tbls, ubuffs[i], *udest_ptrs);
#else
			FUNCTION_UNDER_TEST(size, srcs, i, g_tbls, ubuffs[i], udest_ptrs);
#endif
		}
		for (i = 0; i < vector; i++) {
			if (0 != memcmp(dest_ref[i], udest_ptrs[i], size)) {
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
				       " test%d ualign srcs=%d\n", i, srcs);
				dump_matrix(buffs, vector, TEST_SOURCES);
				printf("dprod_base:");
				dump(dest_ref[i], 25);
				printf("dprod_dut:");
				dump(udest_ptrs[i], 25);
				return -1;
			}
		}

		// Confirm that padding around dests is unchanged
		memset(dest_ref[0], 0, PTR_ALIGN_CHK_B);	// Make reference zero buff

		// The same "pad1" message is printed whichever destination fails
		for (i = 0; i < vector; i++) {
			offset = udest_ptrs[i] - dest_ptrs[i];
			if (memcmp(dest_ptrs[i], dest_ref[0], offset)) {
				printf("Fail rand ualign pad1 start\n");
				return -1;
			}
			if (memcmp
			    (dest_ptrs[i] + offset + size, dest_ref[0],
			     PTR_ALIGN_CHK_B - offset)) {
				printf("Fail rand ualign pad1 end\n");
				return -1;
			}
		}

		putchar('.');
	}

	// Test all size alignment
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : ALIGN_SIZE;

	// Lengths run from TEST_LEN down to TEST_MIN_SIZE in steps of align
	for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
		for (i = 0; i < TEST_SOURCES; i++)
			for (j = 0; j < size; j++)
				buffs[i][j] = rand();

		for (i = 0; i < vector; i++) {
			for (j = 0; j < TEST_SOURCES; j++) {
				gf[i][j] = rand();
				gf_vect_mul_init(gf[i][j],
						 &g_tbls[i * (32 * TEST_SOURCES) + j * 32]);
			}
			memset(dest_ptrs[i], 0, TEST_LEN);	// zero pad to check write-over
		}

		for (i = 0; i < vector; i++)
			gf_vect_dot_prod_base(size, TEST_SOURCES,
					      &g_tbls[i * 32 * TEST_SOURCES], buffs,
					      dest_ref[i]);

		for (i = 0; i < TEST_SOURCES; i++) {
#if (VECT == 1)
			FUNCTION_UNDER_TEST(size, TEST_SOURCES, i, g_tbls, buffs[i],
					    *dest_ptrs);
#else
			FUNCTION_UNDER_TEST(size, TEST_SOURCES, i, g_tbls, buffs[i],
					    dest_ptrs);
#endif
		}
		for (i = 0; i < vector; i++) {
			if (0 != memcmp(dest_ref[i], dest_ptrs[i], size)) {
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
				       " test%d ualign len=%d\n", i, size);
				dump_matrix(buffs, vector, TEST_SOURCES);
				printf("dprod_base:");
				dump(dest_ref[i], 25);
				printf("dprod_dut:");
				dump(dest_ptrs[i], 25);
				return -1;
			}
		}

		putchar('.');

	}

	printf("Pass\n");
	return 0;

}