/**
 * Release the MaxCompiler resources tracked by this module.
 *
 * Unloads the currently selected engine (when cur_engine is not -1) and frees
 * every non-NULL entry of max_files. Each freed slot is cleared so that a
 * second call is a no-op rather than a double free, mirroring how cur_engine
 * is reset after the unload.
 */
void unload_maxfiles(){
    if (-1!=cur_engine){
        max_unload(max_engines[cur_engine]);
        cur_engine = -1;
    }
    for (int i=0;i<K_TOTAL;++i){
        if (max_files[i]){
            max_file_free(max_files[i]);
            max_files[i] = NULL; /* never leave a dangling handle behind */
        }
    }
}
/**
 * EthFwd host program.
 *
 * Configures the TOP and BOT QSFP ports with the given local addresses,
 * resolves the MAC address of the forward target through the BOT port, passes
 * the addresses/port to the "fwdKernel" scalar inputs and then keeps the
 * engine loaded until a character is read from stdin.
 *
 * Usage: <prog> <TOP local IP> <BOT local IP> <forward IP>
 * Returns 0 on normal shutdown, 1 on bad arguments.
 */
int main(int argc, char *argv[]) {
	if (argc < 4) {
		printf("Syntax: %s <TOP local IP> <BOT local IP> <forward IP>\n", argv[0]);
		return 1;
	}

	struct in_addr top_ip;
	struct in_addr bot_ip;
	struct in_addr fwd_ip;
	struct in_addr netmask;

	/* inet_aton() returns 0 for an unparsable address; without this check the
	 * in_addr structures would be used uninitialised. */
	if (!inet_aton(argv[1], &top_ip) ||
	    !inet_aton(argv[2], &bot_ip) ||
	    !inet_aton(argv[3], &fwd_ip)) {
		fprintf(stderr, "Invalid IP address on the command line\n");
		return 1;
	}
	inet_aton("255.255.255.0", &netmask);

	uint16_t port = 7653;

	printf("EthFwd: TOP IP '%s', BOT IP '%s', Forward IP '%s', port %u\n", argv[1], argv[2], argv[3], (unsigned) port);

	max_file_t *maxfile = EthFwd_init();
	max_engine_t * engine = max_load(maxfile, "*");

	max_ip_config(engine, MAX_NET_CONNECTION_QSFP_TOP_10G_PORT1, &top_ip, &netmask);
	max_ip_config(engine, MAX_NET_CONNECTION_QSFP_BOT_10G_PORT1, &bot_ip, &netmask);

	struct ether_addr local_mac2, remote_mac2;
	max_arp_lookup_entry(engine, MAX_NET_CONNECTION_QSFP_BOT_10G_PORT1, &fwd_ip, &remote_mac2);
	max_eth_get_default_mac_address(engine, MAX_NET_CONNECTION_QSFP_BOT_10G_PORT1, &local_mac2);

	/* The kernel takes MAC addresses as 64-bit scalars: copy the 6 address
	 * bytes into the low bytes of a zeroed uint64_t. */
	uint64_t localMac = 0, forwardMac = 0;
	memcpy(&localMac, &local_mac2, 6);
	memcpy(&forwardMac, &remote_mac2, 6);

	max_config_set_bool(MAX_CONFIG_PRINTF_TO_STDOUT, true);

	max_actions_t *action = max_actions_init(maxfile, NULL);
	max_set_uint64t(action, "fwdKernel", "localIp", bot_ip.s_addr);
	max_set_uint64t(action, "fwdKernel", "forwardIp", fwd_ip.s_addr);
	max_set_uint64t(action, "fwdKernel", "localMac", localMac);
	max_set_uint64t(action, "fwdKernel", "forwardMac", forwardMac);
	max_set_uint64t(action, "fwdKernel", "port", port);
	max_run(engine, action);
	/* The action set is no longer needed once it has been run. */
	max_actions_free(action);

	printf("JDFE Running.\n");
	getchar(); /* block until the operator presses a key */

	max_unload(engine);
	max_file_free(maxfile);

	printf("Done.\n");
	return 0;
}
/**
 * Host-side driver for the HTTP server DFE design.
 *
 * Steps, in order:
 *   1. load the two CRC index ROM images and write them to the
 *      "CrcIndexTable" mapped memories;
 *   2. load the LMem image (zero-padded to a multiple of 8 bytes) and write it
 *      to DFE memory;
 *   3. configure the BOT QSFP port and put Nsockets TCP sockets into LISTEN;
 *   4. loop forever: wait for a file size on "toCpuFileSizeBytes" and a socket
 *      number on "toCpuSocketNumber", poll until that socket has transmitted
 *      exactly that many bytes, then close it and listen again.
 *
 * Usage: <prog> <dfe_ip> <netmask>
 * Returns 1 on bad arguments or a setup failure; the service loop itself does
 * not terminate, so the cleanup code after it is only reached if the loop is
 * ever given an exit condition.
 */
int main(int argc, char *argv[])
{
	if (argc != 3) {
		printf("Usage: %s <dfe_ip> <netmask>\n", argv[0]);
		return 1;
	}

	uint16_t Nsockets = 1;
	const int port = 80;
	struct in_addr dfe_ip;
	struct in_addr netmask;
	// inet_aton() returns 0 when the text is not a valid address
	if (!inet_aton(argv[1], &dfe_ip) || !inet_aton(argv[2], &netmask)) {
		printf("Error: invalid dfe_ip or netmask\n");
		return 1;
	}

	// initialization files for crcIndex table, generated by init_code
	char fileCrcIndex1[] = "./results/romCrcIndex1_init.html";
	char fileCrcIndex2[] = "./results/romCrcIndex2_init.html";

	// LMEM initialization file location, generated by init_code
	char fileLmem[] = "./results/lmem_generated_file.html";

	uint64_t *arrCrc1;
	uint64_t *arrCrc2;

	FILE *fpCrc1 = fopen(fileCrcIndex1, "rb");
	FILE *fpCrc2 = fopen(fileCrcIndex2, "rb");
	FILE *fpLmem = fopen(fileLmem, "rb");

	if (!(fpCrc1 && fpCrc2 && fpLmem)) {
		printf("Error with file\n");
		return 1; // a missing input file is a failure, not a clean exit
	}

	// obtain file size
	fseek(fpCrc1, 0, SEEK_END);
	long Lcrc = ftell(fpCrc1);
	rewind(fpCrc1);
	if (Lcrc < 0) {
		printf("Error with file\n");
		return 1;
	}

	// NOTE(review): the size of the first file is used for both ROM images;
	// presumably both files always have the same length - confirm.
	fillRomCrcIndex(fpCrc1, &arrCrc1, Lcrc);
	fillRomCrcIndex(fpCrc2, &arrCrc2, Lcrc);

	printf("Preparing for init() and max_load()\n");
	max_file_t *maxfile = httpserver_init();
	max_engine_t * engine = max_load(maxfile, "*");
	printf("Done\n");

	max_actions_t *actions = max_actions_init(maxfile, NULL);

	// one mapped-memory entry per 64-bit word of the ROM image
	int romDepthCrc = Lcrc / 8;
	for (int i = 0; i < romDepthCrc; i++) {
		max_set_mem_uint64t(actions, "CrcIndexTable", "romCrcIndex1", i, arrCrc1[i]);
		max_set_mem_uint64t(actions, "CrcIndexTable", "romCrcIndex2", i, arrCrc2[i]);
	}

	max_run(engine, actions);
	max_actions_free(actions);

	// obtain file size
	fseek(fpLmem, 0, SEEK_END);
	long L = ftell(fpLmem);
	rewind(fpLmem);
	if (L < 0) {
		printf("Error with file\n");
		max_unload(engine);
		max_file_free(maxfile);
		return 1;
	}

	// round the length up to a whole number of 64-bit words
	L = (L + 7) / 8 * 8;

	// allocate zero-filled memory for the whole file: fread() stops at the
	// real end of file, so the padding bytes must already be NUL
	uint64_t *arrLmem = calloc((size_t) L / 8, sizeof(uint64_t));
	if (arrLmem == NULL && L > 0) {
		printf("Error: cannot allocate LMem image\n");
		max_unload(engine);
		max_file_free(maxfile);
		return 1;
	}

	// a short count is expected when padding was added; only a stream error is fatal
	fread(arrLmem, 1, L, fpLmem);
	if (ferror(fpLmem)) {
		printf("Error with file\n");
		max_unload(engine);
		max_file_free(maxfile);
		return 1;
	}
	fclose(fpLmem);

	int romDepth = L / 8;

	int burstLengthInBytes = max_get_burst_size(maxfile, "cmd_tolmem");

	const int size = romDepth;
	int sizeBytes = size * sizeof(uint64_t);

	printf("Writing to DFE memory.\n");
	uint64_t *inData = arrLmem;

	writeDataToLMem(inData, size, sizeBytes, burstLengthInBytes, engine, maxfile);
	printf("Done\n");

	max_ip_config(engine, MAX_NET_CONNECTION_QSFP_BOT_10G_PORT1, &dfe_ip, &netmask);

	//all sockets MUST be created before first call to max_tcp_connect or max_tcp_listen
	max_tcp_socket_t *(dfe_socket[Nsockets]);
	uint16_t socketNumber[Nsockets];

	for (int i = 0; i < Nsockets; i++) {
		dfe_socket[i] = max_tcp_create_socket_with_number(engine, "tcp_ISCA_QSFP_BOT_10G_PORT1", i);
		socketNumber[i] = max_tcp_get_socket_number(dfe_socket[i]);
		printf("Socket %d was assigned socket number %u\n", i, socketNumber[i]);
	}

	// socket i listens on port + i
	for (int i = 0; i < Nsockets; i++) {
		max_tcp_listen(dfe_socket[i], port + i);
		max_tcp_await_state(dfe_socket[i], MAX_TCP_STATE_LISTEN, NULL);
	}

	printf("CPU code: Total %u socket(s), listening on the port(s) %u-%u\n\n", Nsockets, port, port + Nsockets - 1);

	// low-latency stream carrying the size of each served file
	void *read_ptr;
	void *read_buffer = NULL;
	max_llstream_t *read_llstream;
	printf("CPU code: Setting up 'toCpuByteNumber' stream.\n");
	int Nslots_byteNumber = 512;
	size_t tCBN_buffer_size = Nslots_byteNumber * 16;
	if (posix_memalign(&read_buffer, 4096, tCBN_buffer_size) != 0) {
		printf("Error: cannot allocate stream buffer\n");
		max_unload(engine);
		max_file_free(maxfile);
		return 1;
	}
	read_llstream = max_llstream_setup(engine, "toCpuFileSizeBytes", Nslots_byteNumber, 16, read_buffer);

	// low-latency stream carrying the socket number of each served request
	void *read_buffer_socket = NULL;
	max_llstream_t *read_llstream_socket;
	printf("CPU code: Setting up 'toCpuSocketNumber' stream.\n");
	int Nslots_socketNumber = 512;
	size_t tCSB_buffer_size = Nslots_socketNumber * 16;
	if (posix_memalign(&read_buffer_socket, 4096, tCSB_buffer_size) != 0) {
		printf("Error: cannot allocate stream buffer\n");
		max_unload(engine);
		max_file_free(maxfile);
		return 1;
	}
	read_llstream_socket = max_llstream_setup(engine, "toCpuSocketNumber", Nslots_socketNumber, 16, read_buffer_socket);
	void *read_ptr_socket_slot;

	// fixed 10 second start-up delay, one log line per second
	uint16_t ti = 10;
	while(ti > 0)
	{
		printf("CPU code: time=%u, waiting file size and socket numbers stream data to be sent to CPU\n", ti);
		usleep(1000*1000*1);
		ti--;
	}

	uint64_t num_rx_bytes;
	uint64_t num_tx_bytes;
	uint8_t session_id;
	while (1) {

		//part 1: first wait to receive LengthBytes number

		printf("CPU code: PART 1 - waiting to receive LengthBytes number\n");

		unsigned int fileBytes = 0;
		int FoundByteNumber = 0;
		ti=0;
		while (FoundByteNumber != 1) //first wait to receive LengthBytes number
		{
			usleep(1000*1000*1);

			for (int i = 0; i < Nsockets; i++) {
				max_tcp_get_num_bytes_received(dfe_socket[i], &num_rx_bytes);
				max_tcp_get_num_bytes_transmitted(dfe_socket[i], &num_tx_bytes, &session_id);
				printf("CPU code: waiting, time=%u, port=%u, socket=%i, max_tcp_get_num_bytes_received=%llu, max_tcp_get_num_bytes_transmitted=%llu\n", ti, port + i, i, (long long unsigned int) num_rx_bytes, (long long unsigned int) num_tx_bytes);
			}
			ti++;

			uint8_t ii = max_llstream_read(read_llstream, 1, &read_ptr);
			if (ii) {
				// copy the value out BEFORE discarding: the slot must not be
				// dereferenced once it has been handed back to the stream
				fileBytes = (unsigned int) *((uint64_t*) read_ptr);
				printf("CPU code: number of slots found to contain new data=%u,  fileSizeBytes=%u\n", ii, fileBytes);
				max_llstream_read_discard(read_llstream, 1);
				FoundByteNumber = 1;
			}
		}

		//part 2: receive the socket number the file was served on

		printf("CPU code: PART 2 - receive socket number\n");
		while (max_llstream_read(read_llstream_socket, 1, &read_ptr_socket_slot) == 0)
			;

		uint16_t socket_returned = *((uint16_t*) read_ptr_socket_slot); //event->socketID;
		// release the slot, as done for the file-size stream above; otherwise
		// the next request would see this same slot again
		max_llstream_read_discard(read_llstream_socket, 1);

		printf("CPU code: fileBytes=%u, socket_returned=%u\n", fileBytes, socket_returned);

		// dfe_socket has only Nsockets entries
		if (socket_returned >= Nsockets) {
			printf("CPU code: ignoring out-of-range socket number %u\n", socket_returned);
			continue;
		}

		ti = 0;
		while (1) {
			for (int i = 0; i < Nsockets; i++)
			{
				max_tcp_get_num_bytes_received(dfe_socket[i], &num_rx_bytes);
				max_tcp_get_num_bytes_transmitted(dfe_socket[i], &num_tx_bytes, &session_id);
				printf("CPU code: time=%i, port=%u, socket=%i, max_tcp_get_num_bytes_received=%llu, max_tcp_get_num_bytes_transmitted=%llu\n", ti, port + i, i, (long long unsigned int) num_rx_bytes, (long long unsigned int) num_tx_bytes);
			}
			ti++;
			printf("\n");

			max_tcp_get_num_bytes_transmitted(dfe_socket[socket_returned], &num_tx_bytes, &session_id);
			printf("CPU code: fileSizeBytes=%u, socketReturned=%u, num_tx_bytes=%llu\n", fileBytes, socket_returned, (long long unsigned int) num_tx_bytes);

			// the whole file has gone out: recycle the socket
			if (num_tx_bytes == fileBytes) {
				printf("CPU code: MATCH num_tx_bytes==fileBytes, socket_returned=%u, fileBytes=%u, num_tx_bytes(max_tcp_get_num_bytes_transmitted)=%llu\n", socket_returned, fileBytes, (long long unsigned int) num_tx_bytes);
				printf("CPU code: Closing socket=%u\n", socket_returned);
				max_tcp_close(dfe_socket[socket_returned]);

				printf("CPU code: Waiting for MAX_TCP_STATE_CLOSED\n");
				max_tcp_await_state(dfe_socket[socket_returned], MAX_TCP_STATE_CLOSED, NULL);

				printf("CPU code: Set LISTEN state\n");
				// same port the socket was originally opened on (port + index)
				max_tcp_listen(dfe_socket[socket_returned], port + socket_returned);

				printf("CPU code: Waiting for MAX_TCP_STATE_LISTEN\n");
				max_tcp_await_state(dfe_socket[socket_returned], MAX_TCP_STATE_LISTEN, NULL);

				printf("CPU code: Again opened socket=%u\n", socket_returned);

				printf("\nCPU code: State of rx/tx after socket closing\n");

				break;
			}

			usleep(1000*1000*1);
		}
	}

	for (int i = 0; i < Nsockets; i++) {
		max_tcp_close(dfe_socket[i]);
		printf("max_tcp_close(dfe_socket[i])");
	}

	max_unload(engine);
	printf("max_unload(engine)");
	max_file_free(maxfile);
	printf("max_file_free(maxfile)");

	printf("The end\n");

	return 0;

}
/**
 * FieldAccumulator host program.
 *
 * Sets up a DFE UDP socket and a CPU-side UDP socket, then reads
 * "source_data1.csv" line by line (the first line is a header and is only
 * echoed), parses each record and hands it to calculateDeltas().
 *
 * Usage: <prog> <dfe_ip> <cpu_ip> <netmask>
 * Returns 0 on success, 1 on bad arguments or when the CSV cannot be opened.
 */
int 
main(int argc, char *argv[]) 
{
    if(argc != 4) 
    {
        printf("Usage: %s <dfe_ip> <cpu_ip> <netmask>\n", argv[0]);
        return 1;
    }

    struct in_addr dfe_ip;
    struct in_addr cpu_ip;
    struct in_addr netmask;
    /* inet_aton() returns 0 for text that is not a valid address. */
    if (!inet_aton(argv[1], &dfe_ip) ||
        !inet_aton(argv[2], &cpu_ip) ||
        !inet_aton(argv[3], &netmask))
    {
        fprintf(stderr, "Invalid IP address or netmask\n");
        return 1;
    }
    const int port = 5008;

    /* Open the input first so a missing file fails before the DFE is loaded. */
    FILE *stream = fopen("source_data1.csv", "r");
    if (stream == NULL)
    {
        perror("source_data1.csv");
        return 1;
    }
    
    /* Create DFE Socket, then listen */
    max_file_t *maxfile = FieldAccumulatorTCP_init();
    max_engine_t *engine = max_load(maxfile, "*");
    max_ip_config(engine, MAX_NET_CONNECTION_CH2_SFP1, &dfe_ip, &netmask);
    
    max_udp_socket_t *dfe_socket = max_udp_create_socket(engine, "udp_ch2_sfp1");
    max_udp_bind(dfe_socket, port);
    max_udp_connect(dfe_socket, &cpu_ip, port);
    
    int cpu_socket = create_cpu_udp_socket(&cpu_ip, &dfe_ip, port);
    
    char line[BUFFERSIZE];

    /* Echo and skip the header line. The text comes from a file, so it must
     * never be used as the format string itself. */
    if (fgets(line, BUFFERSIZE, stream) != NULL)
    {
        printf("%s", line);
    }

    while (fgets(line, BUFFERSIZE, stream))
    {
    	struct input_data data;
    	parse(line,&data);
    	printf("\n Instrument id = %d \n level = %d \n side = %d \n Quantity = %d \n Price = %d",data.instrument_id,data.level,data.side,data.quantity,data.price);
    	calculateDeltas(cpu_socket, &data);
    }

    fclose(stream);
    
    max_udp_close(dfe_socket);
    max_unload(engine);
    max_file_free(maxfile);
    
    return 0;
}
/**
 * Gap host program.
 *
 * Sends eight 60-byte frames into the "src" framed stream and reads one frame
 * back from "dst" after each. Byte 20 of every frame is 'G' for odd frame
 * numbers and 'N' for even ones (request a gap every other packet); every
 * received frame is dumped.
 *
 * Takes no command-line arguments. Returns 0 on completion; terminates via
 * err() if a stream buffer cannot be allocated.
 */
int main(int argc, char *argv[]) {

	max_file_t *maxfile = Gap_init();
	max_engine_t * engine = max_load(maxfile, "*");


	max_config_set_bool(MAX_CONFIG_PRINTF_TO_STDOUT, true);

	max_actions_t *action = max_actions_init(maxfile, NULL);
	max_run(engine, action);
	/* The (empty) action set is not needed after the run. */
	max_actions_free(action);


	size_t bufferSize = 4096 * 4096;
	void *inBuffer = NULL;
	void *outBuffer = NULL;
	if (posix_memalign(&inBuffer, 4096, bufferSize)) {
		err(1, "Couldn't allocation input buffer");
	}
	if (posix_memalign(&outBuffer, 4096, bufferSize)) {
		err(1, "Couldn't allocation output buffer");
	}
	max_framed_stream_t *inFrame = max_framed_stream_setup(engine, "src", inBuffer, bufferSize, 2048-16);
	max_framed_stream_t *outFrame = max_framed_stream_setup(engine, "dst", outBuffer, bufferSize, -1);

	// Now, stream in some frames and see what happens.

	for (size_t i=0 ; i < 8; i++) {
		void *f;
		while (max_framed_stream_write_acquire(inFrame, 1, &f) != 1) usleep(10);

		uint8_t *inputData = f;
		size_t frameSize = 60;

		/* The buffer comes from posix_memalign() and is not initialised:
		 * clear the frame so no indeterminate bytes are sent. */
		for (size_t k = 0; k < frameSize; k++) {
			inputData[k] = 0;
		}

		/*
		 * Request a gap every other packet
		 */
		inputData[20] = i % 2 == 1 ? 'G' : 'N';

		printf("Sending frame %zu\n", i);
		max_framed_stream_write(inFrame, 1, &frameSize);


		void *oFrame;
		size_t oFrameSize;
		while (max_framed_stream_read(outFrame, 1, &oFrame, &oFrameSize) != 1) usleep(10);

		printf("Got frame %zu - %zu bytes (Expecting %zu)\n", i, oFrameSize, frameSize);

		dump(oFrame, oFrameSize);


		max_framed_stream_discard(outFrame, 1);
	}



	max_unload(engine);
	max_file_free(maxfile);

	/* The engine is unloaded, so nothing references the stream buffers now. */
	free(inBuffer);
	free(outBuffer);

	printf("Done.\n");
	return 0;
}
/**
 * Tracker host program.
 *
 * Programs the 1024 "filter_<n>" scalars of "filteringKernel" (entry 150 gets
 * 0xCC, all others 0x4D1B), opens a DFE UDP socket and a Linux UDP socket
 * between dfe_ip and cpu_ip, sends a test frame from the CPU side and then
 * collects NUM_MESSAGES_EXPECTED frames from the "toCPU" framed stream.
 *
 * Usage: <prog> dfe_ip cpu_ip
 * Returns 0 on completion, 1 on bad arguments or allocation failure.
 */
int main(int argc, char *argv[]) {
	if(argc < 3) {
		/* C does not expand "$0"; print the real program name. */
		printf("Usage: %s dfe_ip cpu_ip\n", argv[0]);
		return 1;
	}

	struct in_addr dfe_ip;
	struct in_addr cpu_ip;
	struct in_addr netmask;
	/* inet_aton() returns 0 for text that is not a valid address. */
	if (!inet_aton(argv[1], &dfe_ip) || !inet_aton(argv[2], &cpu_ip)) {
		fprintf(stderr, "Invalid IP address on the command line\n");
		return 1;
	}
	inet_aton("255.255.255.0", &netmask);
	const int port = 5007;

	max_file_t *maxfile = Tracker_init();
	max_engine_t * engine = max_load(maxfile, "*");


	max_config_set_bool(MAX_CONFIG_PRINTF_TO_STDOUT, true);

	max_actions_t *actions = max_actions_init(maxfile, NULL);
	char regName[32];
	for (int i=0; i < 1024; i++) {
		snprintf(regName, sizeof regName, "filter_%d", i);
		if (i == 150) {
			max_set_uint64t(actions, "filteringKernel", regName, 0xCC /* a value to match... */);
		} else {
			max_set_uint64t(actions, "filteringKernel", regName, 0x4D1B /* or any value you want */);
		}
	}
	max_run(engine, actions);
	max_actions_free(actions);


	void *buffer = NULL;
	size_t bufferSize = 4096 * 512;
	if (posix_memalign(&buffer, 4096, bufferSize) != 0) {
		fprintf(stderr, "Couldn't allocate stream buffer\n");
		max_unload(engine);
		max_file_free(maxfile);
		return 1;
	}

	max_framed_stream_t *toCpu = max_framed_stream_setup(engine, "toCPU", buffer, bufferSize, -1);

	/*
	 * This executable both creates a normal Linux UDP socket as well as a DFE UDP Socket.
	 * We then exchange data between the two.
	 */

	// DFE Socket
	max_ip_config(engine, MAX_NET_CONNECTION_QSFP_TOP_10G_PORT1, &dfe_ip, &netmask);
	max_udp_socket_t *dfe_socket = max_udp_create_socket(engine, "udpTopPort1");
	max_udp_bind(dfe_socket, port);
	max_udp_connect(dfe_socket, &cpu_ip, port);


	// Linux Socket
	int cpu_socket = create_cpu_udp_socket(&cpu_ip, &dfe_ip, port);

	printf("Sending test frame...\n");
	sendTestFrame(cpu_socket);

	printf("Waiting for kernel response...\n"); fflush(stdout);

	void *f;
	size_t fsz;
	size_t numMessageRx = 0;
	uint8_t received_data[512];
	while (numMessageRx < NUM_MESSAGES_EXPECTED) {
		if (max_framed_stream_read(toCpu, 1, &f, &fsz) == 1) {
			printf("CPU: Got output frame - size %zu - NumMsg = %zu!\n", fsz, numMessageRx); // Frame size would be rounded up to the next 8 bytes.

			/* Never copy more than the local buffer can hold. */
			size_t copyLen = fsz < sizeof received_data ? fsz : sizeof received_data;
			memcpy(received_data, f, copyLen);
			numMessageRx++;
			max_framed_stream_discard(toCpu, 1);
		} else 	usleep(10);
	}

	max_udp_close(dfe_socket);
	max_unload(engine);
	max_file_free(maxfile);
	free(buffer); /* engine is unloaded, stream buffer no longer in use */

	printf("Done.\n"); fflush(stdout);
	return 0;
}
/**
 * SignExtWithPatternMatching host program.
 *
 * Binds a DFE UDP socket on port 2000, connects it to remote_ip:2000 and then
 * prints up to the first three 64-bit words of every frame that arrives on the
 * "toCPU" framed stream. The receive loop has no exit condition, so the
 * cleanup code after it is currently unreachable.
 *
 * Usage: <prog> dfe_ip remote_ip
 * Returns 1 on bad arguments or allocation failure.
 */
int main(int argc, char *argv[]) {
	if(argc < 3) {
		/* C does not expand "$0"; print the real program name. */
		printf("Usage: %s dfe_ip remote_ip\n", argv[0]);
		return 1;
	}

	struct in_addr dfe_ip;
	struct in_addr remote_ip;
	struct in_addr netmask;
	/* inet_aton() returns 0 for text that is not a valid address. */
	if (!inet_aton(argv[1], &dfe_ip) || !inet_aton(argv[2], &remote_ip)) {
		fprintf(stderr, "Invalid IP address on the command line\n");
		return 1;
	}
	inet_aton("255.255.255.0", &netmask);
	const int in_port = 2000;
	const int out_port = 2000;

//	struct in_addr mcastaddr;
//	inet_aton("224.0.0.1", &mcastaddr);

	max_file_t *maxfile = SignExtWithPatternMatching_init();
	max_engine_t * engine = max_load(maxfile, "*");

	max_config_set_bool(MAX_CONFIG_PRINTF_TO_STDOUT, true);

	max_actions_t *actions = max_actions_init(maxfile, NULL);

	max_run(engine, actions);
	max_actions_free(actions);


	void *buffer = NULL;
	size_t bufferSize = 4096 * 512;
	if (posix_memalign(&buffer, 4096, bufferSize) != 0) {
		fprintf(stderr, "Couldn't allocate stream buffer\n");
		max_unload(engine);
		max_file_free(maxfile);
		return 1;
	}

	max_framed_stream_t *toCpu = max_framed_stream_setup(engine, "toCPU", buffer, bufferSize, -1);

	max_ip_config(engine, MAX_NET_CONNECTION_QSFP_TOP_10G_PORT1, &dfe_ip, &netmask);
	max_udp_socket_t *dfe_socket = max_udp_create_socket(engine, "udpTopPort1");
//	max_ip_multicast_join_group(engine, MAX_NET_CONNECTION_QSFP_TOP_10G_PORT1, &mcastaddr);
//	max_udp_bind_ip(dfe_socket, &mcastaddr, in_port);
	max_udp_bind(dfe_socket, in_port);
	max_udp_connect(dfe_socket, &remote_ip, out_port);

	printf("Listening on %s in_port %d\n", argv[1], in_port);

	printf("Waiting for kernel response...\n"); fflush(stdout);

	void *f;
	size_t fsz;
	size_t numMessageRx = 0;
	while (1) {
		if (max_framed_stream_read(toCpu, 1, &f, &fsz) == 1) {
			numMessageRx++;

			printf("CPU: Got output frame %zu - size %zu bytes\n", numMessageRx, fsz);

			/* Print at most three words, and never read past the frame. */
			uint64_t *w = f;
			size_t words = fsz / sizeof(uint64_t);
			if (words > 3) {
				words = 3;
			}
			for (size_t i=0; i < words; i++) {
				printf("Frame [%zu] Word[%zu]: 0x%llx\n", numMessageRx, i, (unsigned long long) w[i]);
			}


			max_framed_stream_discard(toCpu, 1);
		} else 	usleep(10);
	}

//	max_ip_multicast_leave_group(engine, MAX_NET_CONNECTION_QSFP_TOP_10G_PORT1, &mcastaddr);
	max_udp_close(dfe_socket);
	max_unload(engine);
	max_file_free(maxfile);
	free(buffer);

	printf("Done.\n"); fflush(stdout);
	return 0;
}
Example #8
0
/* Allocate a zero-filled array of n elements; terminate the program if the
 * allocation fails. */
static void *jacobi_xcalloc(size_t n, size_t size)
{
	void *p = calloc(n, size);
	if (p == NULL && n != 0 && size != 0)
	{
		fprintf(stderr, "====>Error: out of memory\n");
		exit(1);
	}
	return p;
}

/* Round n up to the next multiple of m (m must be non-zero). */
static size_t jacobi_round_up(size_t n, size_t m)
{
	return (n + m - 1) / m * m;
}

/**
 * Jacobi solver test driver.
 *
 * Builds a random diagonally dominant matrix A and blks*C random right-hand
 * sides, streams them through jacobi() in the interleaved layout it expects,
 * re-streams every equation whose reported error is still above CUR_EPS until
 * none remain, and finally compares the solutions against jacobi_opt().
 *
 * Options: -d <dimension> (even, 1..maxDimLen), -b <blocks> (> 0),
 *          -i <iterations> (> 1), -h (usage).
 * Returns 0 when every component matches within EPS, 1 otherwise or on
 * invalid options.
 */
int main(int argc, char** argv) 
{
	max_file_t *max_file = jacobi_init();
	size_t dim               = 64;   // this should be a scalar input in the bitstream
	size_t MAX_ITER          = 20;
	size_t C                 = max_get_constant_uint64t(max_file, "C");
	size_t blks              = 100;
	size_t max_dim           = max_get_constant_uint64t(max_file, "maxDimLen");
	/* The handle is only used to read the two constants above; releasing it
	 * here means no early return below can leak it. */
	max_file_free(max_file);

	if(C == 0)
	{
		/* C is used as a loop stride and as a modulus further down. */
		fprintf(stderr, "====>Error: maxfile constant C must not be zero\n");
		return 1;
	}
	if(argc == 1)
	{
		fprintf(stderr, "====>Info:Runing Jacobi with default parameter values:[Dimension = %zu, Iteration = %zu, blocks = %zu(%zu*%zu equations)], for details, see the README.txt\n", dim, MAX_ITER, blks, blks, C);
	}

	const char *opt_str = "hd:b:i:";
	int opt = 0;
	int input_dim = (int)dim;
	int input_iter = (int)MAX_ITER;
	int input_blks = (int)blks;
	while( (opt = getopt(argc, argv, opt_str)) != -1)
	{
		switch(opt)
		{
			case 'd':
				input_dim = atoi(optarg);
				break;
			case 'b':
				input_blks = atoi(optarg);
				break;
			case 'i':
				input_iter = atoi(optarg);
				break;
			case 'h':
				usage();
				return 1;
			default:
				fprintf(stderr, "====>Error: Inputs contain invalid command line paramter(s)!\n");
				usage();
				return 1;
		}
	}

	if(input_dim <= 0 || (size_t)input_dim > max_dim || input_dim % 2 != 0)
	{
		fprintf(stderr, "\n====>Error: Input dimension length is invalid, for details, see the usage below:\n");
		usage();
		return 1;
	}
	dim = (size_t)input_dim;

	if(input_blks <= 0)
	{
		fprintf(stderr, "\n====>Error: Input block number is invalid, should bigger than zero.\n");
		usage();
		return 1;
	}
	blks = (size_t)input_blks;

	if(input_iter <= 1)
	{
		fprintf(stderr, "\n====>Error: Input iteration number is invalid, should bigger than 1.\n");
		usage();
		return 1;
	}
	MAX_ITER = (size_t)input_iter;

	const size_t total_equations = blks * C;
	const size_t mat_len = dim * dim;              /* elements in A            */
	const size_t vec_len = total_equations * dim;  /* elements in all vectors  */
	const size_t blk_len = C * dim;                /* elements in one block    */

	/* All buffers start zero-filled; checked allocation, no stack VLA. */
	double *A                  = jacobi_xcalloc(mat_len, sizeof(double));
	double *A_trans            = jacobi_xcalloc(mat_len, sizeof(double));
	double *A_original         = jacobi_xcalloc(mat_len, sizeof(double));
	double *b                  = jacobi_xcalloc(vec_len, sizeof(double));
	double *b_trans            = jacobi_xcalloc(vec_len, sizeof(double));
	double *diagA              = jacobi_xcalloc(dim, sizeof(double));
	double *reverse_diagA      = jacobi_xcalloc(dim, sizeof(double));
	double *x_init             = jacobi_xcalloc(blk_len, sizeof(double));
	double *x_trans_init       = jacobi_xcalloc(blk_len, sizeof(double));
	double *result             = jacobi_xcalloc(vec_len, sizeof(double));
	double *reorder_result     = jacobi_xcalloc(vec_len, sizeof(double));
	double *solutions          = jacobi_xcalloc(vec_len, sizeof(double));
	double *error              = jacobi_xcalloc(total_equations, sizeof(double));
	double *error_bak          = jacobi_xcalloc(total_equations, sizeof(double));
	int    *recacu_error_index = jacobi_xcalloc(total_equations, sizeof(int));
	double *expected_error     = jacobi_xcalloc(total_equations, sizeof(double));
	double *x_base             = jacobi_xcalloc(vec_len, sizeof(double));
	double *x_all_init         = jacobi_xcalloc(vec_len, sizeof(double));
	double *x_all_trans_init   = jacobi_xcalloc(vec_len, sizeof(double));
	double *x_latest_init       = jacobi_xcalloc(vec_len, sizeof(double));
	double *x_latest_trans_init = jacobi_xcalloc(vec_len, sizeof(double));
	double *recacu_b            = jacobi_xcalloc(vec_len, sizeof(double));
	double *recacu_trans_b      = jacobi_xcalloc(vec_len, sizeof(double));

	/* Sentinel values: 1000 marks "not solved yet". */
	for(size_t i = 0; i < total_equations; i ++)
	{
		recacu_error_index[i] = -1;
		expected_error[i]     = 1000;
		error_bak[i]          = 1000;
		for(size_t j = 0; j < dim; j ++)
		{
			solutions[i*dim + j] = 1000;
		}
	}

	/**
	 *  Generating random value for b and A
	 */
	srand(time(NULL));
	for(size_t i = 0; i < dim; ++i) {
		double sum = 0;
		for(size_t j = 0; j < dim; ++j) {
			if(i != j) {
				A[i*dim+j]  = 2.0*rand()/(double)RAND_MAX - 1 ; // random number between -1 and 1
				sum        += fabs(A[i*dim+j]);
			}
		}
		/* diagonal = 1 + sum of |off-diagonal| makes the row strictly dominant */
		A[i * dim + i]   = 1 + sum;
		diagA[i]         = 1.0/A[i * dim + i];
		reverse_diagA[i] = A[i * dim + i];
	}

	for(size_t i = 0; i < vec_len; i ++)
	{
		b[i] = 2.0*rand()/(double)RAND_MAX - 1;
	}

	/* Keep the unscaled matrix for the CPU reference; scale the off-diagonal
	 * entries of each row by the reciprocal of its diagonal. */
	for(size_t i = 0; i < dim; i ++)
	{
		for(size_t j = 0; j < dim; j ++)
		{
			A_original[i * dim + j] = A[i * dim + j];
			if(i != j)
			{
				A[i * dim + j] = A[i*dim + j] * diagA[i];
			}
		}
	}

	/**
	 * Reorder the input A and b 
	 */
	clock_t engine_start = clock();
	for(size_t i = 0; i < dim; i ++)
	{
		for(size_t j = 0; j < dim; j ++)
		{
			A_trans[i * dim + j] = A[j * dim + i];
		}
	}

	/* Within each block of C equations, interleave so that component i of all
	 * C equations is contiguous; b is scaled by 1/diagonal on the way. */
	size_t count = 0;
	for(size_t yy = 0; yy < total_equations; yy += C)
	{
		for(size_t i = 0; i < dim; i ++)
		{
			for(size_t j = yy; j < yy + C; j ++)
			{
				b_trans[count] = b[j * dim + i]*diagA[i];
				count ++;
			}
		}
	}

	/* Initial guess (all zeros), replicated for every block in both layouts. */
	for(size_t k = 0; k < blks; k ++)
	{
		for(size_t i = 0; i < C; i ++)
		{
			for(size_t j = 0; j < dim; j ++)
			{
				x_init[i * dim + j] = 0;
				x_trans_init[j*C + i] = x_init[i * dim + j];
			}
		}
		memcpy(x_all_trans_init + k * blk_len, x_trans_init, sizeof(double)*blk_len);
		memcpy(x_all_init       + k * blk_len, x_init,       sizeof(double)*blk_len);
	}

	jacobi(
		dim,
		total_equations,
		MAX_ITER,
		A_trans,
		mat_len * sizeof(double),
		b_trans,
		vec_len * sizeof(double),
		reverse_diagA,
		dim * sizeof(double),
		x_all_trans_init,
		vec_len * sizeof(double),
		error,
		total_equations * sizeof(double),
		result,
		vec_len * sizeof(double)
	  );

	/* Undo the interleaving: one contiguous solution vector per equation. */
	for(size_t yy = 0; yy < total_equations; yy += C)
	{
		for(size_t i = 0; i < C; i ++)
		{
			for(size_t j = 0; j < dim; j ++)
			{
				reorder_result[yy *dim + i*dim + j] = result[yy * dim + i + j * C];
			}
		}
	}

	/*Check Error to decide whether we need to restream into kernel again*/
	size_t actual_recacu_cnt = 0;   /* equations that really need another pass */
	for(size_t i = 0; i < total_equations; i ++)
	{
		if(error[i] > CUR_EPS)
		{
			/* compact the unfinished equations to the front of the retry buffers */
			memcpy(x_latest_init + actual_recacu_cnt*dim, reorder_result + i*dim, dim*sizeof(double));
			memcpy(recacu_b      + actual_recacu_cnt*dim, b              + i*dim, dim*sizeof(double));
			recacu_error_index[actual_recacu_cnt] = (int)i;
			actual_recacu_cnt ++;
		}
		else
		{
			error_bak[i] = error[i];
			memcpy(solutions + i*dim, reorder_result + i*dim, dim*sizeof(double));
		}
	}
	/* the kernel is always given a whole number of C-sized blocks */
	size_t recacu_cnt = jacobi_round_up(actual_recacu_cnt, C);

	/**
	 *  if recaculate count not zero, we start to restream data into kernel again
	 */
	int times = 1;
	while( recacu_cnt != 0 )
	{
		times ++;

		/*Reorder latest solutions init value and latest b*/
		memset(x_latest_trans_init, 0, recacu_cnt*dim*sizeof(double));
		memset(recacu_trans_b,      0, vec_len*sizeof(double));
		count = 0;
		for(size_t yy = 0; yy < recacu_cnt; yy += C)
		{
			for(size_t i = 0; i < dim; i ++)
			{
				for(size_t j = yy; j < yy + C; j ++)
				{
					x_latest_trans_init[count] = x_latest_init[j * dim + i];
					recacu_trans_b[count]      = recacu_b[j * dim + i]*diagA[i];
					count ++;
				}
			}
		}

		memset(error  , 0 , recacu_cnt * sizeof(double));
		memset(result , 0 , recacu_cnt * dim * sizeof(double));
		jacobi(
			dim,
			recacu_cnt,
			MAX_ITER,
			A_trans,
			mat_len * sizeof(double),
			recacu_trans_b,
			recacu_cnt * dim * sizeof(double),
			reverse_diagA,
			dim * sizeof(double),
			x_latest_trans_init,
			recacu_cnt * dim * sizeof(double),
			error,
			recacu_cnt * sizeof(double),
			result,
			recacu_cnt * dim * sizeof(double)
		  );

		for(size_t yy = 0; yy < recacu_cnt; yy += C)
		{
			for(size_t i = 0; i < C; i ++)
			{
				for(size_t j = 0; j < dim; j ++)
				{
					reorder_result[yy *dim + i*dim + j] = result[yy * dim + i + j * C];
				}
			}
		}

		size_t new_actual_recacu_cnt = 0;
		for(size_t i = 0; i < actual_recacu_cnt; i ++)
		{
			if(error[i] > CUR_EPS)
			{
				memcpy(x_latest_init + new_actual_recacu_cnt*dim, reorder_result + i*dim, dim*sizeof(double));
				/* source and destination are the same buffer and coincide
				 * whenever nothing has been dropped yet: memmove, not memcpy */
				memmove(recacu_b     + new_actual_recacu_cnt*dim, recacu_b       + i*dim, dim*sizeof(double));
				recacu_error_index[new_actual_recacu_cnt] = recacu_error_index[i];
				new_actual_recacu_cnt ++;
			}
			else
			{
				/* converged: store under the equation's original index */
				error_bak[ recacu_error_index[i] ] = error[i];
				memcpy(solutions + recacu_error_index[i] *dim, reorder_result + i*dim, dim*sizeof(double));
			}
		}

		/* update the current recaculating solution numbers, padded to a multiple of C */
		actual_recacu_cnt = new_actual_recacu_cnt;
		recacu_cnt        = jacobi_round_up(actual_recacu_cnt, C);
	}//loop while

	clock_t engine_end       = clock();
	double engine_total_time = (double)(engine_end - engine_start) / CLOCKS_PER_SEC;
	fprintf(stderr, "=========>Kernel Complete, Stream Times: %d\n", times);
	clock_t cpu_start = clock();
	jacobi_opt(A_original, x_base, b, dim, C, total_equations, x_all_init , expected_error);
	clock_t cpu_end = clock();
	double cpu_total_time = (double)(cpu_end - cpu_start) / CLOCKS_PER_SEC;

	/* Compare the result with the standard result */
	int cnt = 0;
	for(size_t i = 0; i < total_equations; i ++)
	{
		for(size_t j = 0; j < dim; j ++)
		{
			double diff = solutions[i * dim + j] - x_base[i*dim + j];
			if(fabs(diff) > EPS)
			{
				fprintf(stderr, "error: atual=%.10f, expect=%.10f, err=%.10e\n",
						solutions[i * dim + j], x_base[i*dim + j], diff);
				cnt ++;
			}
		}
	}
	if(cnt == 0)
	{
		max_print_result(dim, total_equations, MAX_ITER, engine_total_time, cpu_total_time);
		fprintf(stderr, "==========>All Test Passed\n\n");
	}
	else
	{
		fprintf(stderr, "!!!Test Failed:%d\n\n", cnt);
	}

	free ( A                   ) ;
	free ( A_trans             ) ;
	free ( A_original          ) ;
	free ( b                   ) ;
	free ( b_trans             ) ;
	free ( diagA               ) ;
	free ( reverse_diagA       ) ;
	free ( x_init              ) ;
	free ( x_trans_init        ) ;
	free ( error               ) ;
	free ( error_bak           ) ;
	free ( recacu_error_index  ) ;
	free ( expected_error      ) ;
	free ( result              ) ;
	free ( reorder_result      ) ;
	free ( solutions           ) ;
	free ( x_base              ) ;
	free ( x_all_init          ) ;
	free ( x_all_trans_init    ) ;
	free ( x_latest_init       ) ;
	free ( x_latest_trans_init ) ;
	free ( recacu_b            ) ;
	free ( recacu_trans_b      ) ;

	return (cnt == 0) ? 0 : 1;
}