Example #1
0
//----------------------------------------------------------
//--BFS host driver for the OpenCL kernels (bool flag arrays)
//----------------------------------------------------------
// Uploads the graph and its flag arrays, then alternates kernel 0 and
// kernel 1 until the flag read back from d_over is false, and finally
// downloads the cost array into h_cost.
//
//   no_of_nodes            node count; also passed as the work-item count
//   h_graph_nodes          host node array (no_of_nodes entries)
//   edge_list_size         entry count of h_graph_edges
//   h_graph_edges          host edge array
//   h_graph_mask           host mask flags (no_of_nodes entries)
//   h_updating_graph_mask  host updating-mask flags (no_of_nodes entries)
//   h_graph_visited        host visited flags (no_of_nodes entries)
//   h_cost                 host cost array; receives the device result
//
// This variant has no try/catch (the handler is disabled), so anything
// thrown by a _cl* helper propagates to the caller without the device
// buffers being released here.
void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \
		int *h_graph_edges, bool *h_graph_mask, bool *h_updating_graph_mask, \
		bool *h_graph_visited, int *h_cost) 
					throw(std::string){

	// Byte counts shared by the allocation and upload calls below.
	const size_t node_bytes = no_of_nodes*sizeof(Node);
	const size_t edge_bytes = edge_list_size*sizeof(int);
	const size_t flag_bytes = no_of_nodes*sizeof(bool);
	const size_t cost_bytes = no_of_nodes*sizeof(int);

	bool h_over;
	cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \
			d_graph_visited, d_cost, d_over;

	//--1 bring up the runtime and create one device buffer per host array
	_clInit();
	d_graph_nodes = _clMalloc(node_bytes, h_graph_nodes);
	d_graph_edges = _clMalloc(edge_bytes, h_graph_edges);
	d_graph_mask = _clMalloc(flag_bytes, h_graph_mask);
	d_updating_graph_mask = _clMalloc(flag_bytes, h_updating_graph_mask);
	d_graph_visited = _clMalloc(flag_bytes, h_graph_visited);
	d_cost = _clMallocRW(cost_bytes, h_cost);
	d_over = _clMallocRW(sizeof(bool), &h_over);

	// upload the host contents
	_clMemcpyH2D(d_graph_nodes, node_bytes, h_graph_nodes);
	_clMemcpyH2D(d_graph_edges, edge_bytes, h_graph_edges);
	_clMemcpyH2D(d_graph_mask, flag_bytes, h_graph_mask);
	_clMemcpyH2D(d_updating_graph_mask, flag_bytes, h_updating_graph_mask);
	_clMemcpyH2D(d_graph_visited, flag_bytes, h_graph_visited);
	_clMemcpyH2D(d_cost, cost_bytes, h_cost);

	//--2 run the kernel pair until the device stops raising the flag
#ifdef	PROFILING
	timer kernel_timer;
	double kernel_time = 0.0;
	kernel_timer.reset();
	kernel_timer.start();
#endif
	for(;;){
		// clear the flag on the device before each pass
		h_over = false;
		_clMemcpyH2D(d_over, sizeof(bool), &h_over);

		//--kernel 0
		int which_kernel = 0;
		int arg_slot = 0;
		_clSetArgs(which_kernel, arg_slot++, d_graph_nodes);
		_clSetArgs(which_kernel, arg_slot++, d_graph_edges);
		_clSetArgs(which_kernel, arg_slot++, d_graph_mask);
		_clSetArgs(which_kernel, arg_slot++, d_updating_graph_mask);
		_clSetArgs(which_kernel, arg_slot++, d_graph_visited);
		_clSetArgs(which_kernel, arg_slot++, d_cost);
		_clSetArgs(which_kernel, arg_slot++, &no_of_nodes, sizeof(int));
		_clInvokeKernel(which_kernel, no_of_nodes, work_group_size);

		//--kernel 1
		which_kernel = 1;
		arg_slot = 0;
		_clSetArgs(which_kernel, arg_slot++, d_graph_mask);
		_clSetArgs(which_kernel, arg_slot++, d_updating_graph_mask);
		_clSetArgs(which_kernel, arg_slot++, d_graph_visited);
		_clSetArgs(which_kernel, arg_slot++, d_over);
		_clSetArgs(which_kernel, arg_slot++, &no_of_nodes, sizeof(int));
		_clInvokeKernel(which_kernel, no_of_nodes, work_group_size);

		// read the flag back; a false value ends the loop
		_clMemcpyD2H(d_over,sizeof(bool), &h_over);
		if(!h_over)
			break;
	}
	_clFinish();
#ifdef	PROFILING
	kernel_timer.stop();
	kernel_time = kernel_timer.getTimeInSeconds();
#endif

	//--3 fetch the cost array
	_clMemcpyD2H(d_cost,cost_bytes, h_cost);

	//--statistics
#ifdef	PROFILING
	std::cout<<"kernel time(s):"<<kernel_time<<std::endl;		
#endif

	//--4 release device buffers and the runtime
	_clFree(d_graph_nodes);
	_clFree(d_graph_edges);
	_clFree(d_graph_mask);
	_clFree(d_updating_graph_mask);
	_clFree(d_graph_visited);
	_clFree(d_cost);
	_clFree(d_over);
	_clRelease();
}
Example #2
0
int main(int argc, char ** argv)
{
	uint * in = NULL, * out_cpu = NULL, * out_gpu = NULL;
	cl_mem d_in = NULL, d_out = NULL;
try{
	if(argc!=2){
		printf("need 1 parameter here!!!");
		exit(-1);
	}

	_clInit(1, "gpu", 0);
	uint iter = 100;
	
#if defined TIME
	double start_time = 0.0;
	double end_time = 0.0;
	double deltaT = 0.0;
	string dat_name="data.dat";

	FILE * fp = fopen(dat_name.c_str(), "a+");
	if(fp==NULL)
	{
		printf("failed to open file!!!\n");
		exit(-1);
	}
#endif
	
	// parameters
	uint side = atoi(argv[1]);
	uint wData = side;
	uint hData = side;
	uint size = wData * hData;

	printf("wData=%d, hData=%d\n", wData, hData);
	
	// allocate memory space on the host and device side
	in = (uint * )malloc(size * sizeof(uint));
	out_cpu = (uint * )malloc(size * sizeof(uint));
	out_gpu = (uint * )malloc(size * sizeof(uint));
	
	d_in = _clMalloc(size * sizeof(uint));	
	d_out = _clMalloc(size * sizeof(uint));

	// initialization
	fill<uint>(in, size, 16);

	// copy data from host to device
	_clMemcpyH2D(d_in, in, size * sizeof(uint));
	
	// warm-up
	mt_1(d_in, d_out, wData, hData);
	mt_2(d_in, d_out, wData, hData);
	mt_3(d_in, d_out, wData, hData);
	
#ifdef VARIFY	
	CPURun(in, out_cpu, wData, hData);
#endif //VARIFY
	
	/**************************1****************************/
#ifdef TIME
	deltaT = 0.0;
#endif
	for(int i=0; i<iter; i++)
	{
	
#ifdef TIME
	start_time = gettime();
#endif

		mt_1(d_in, d_out, wData, hData);
	
#ifdef TIME	
	end_time = gettime();
	deltaT += end_time - start_time;	
#endif
	}	
#ifdef TIME
	fprintf(fp, "%lf\t", deltaT/(double)iter);
#endif

#ifdef VARIFY
	_clMemcpyD2H(out_gpu, d_out, size * sizeof(uint));
	verify_array_int<uint>(out_cpu, out_gpu, size);
#endif //VARIFY

	/**************************2****************************/
#ifdef TIME
	deltaT = 0.0;
#endif
	for(int i=0; i<iter; i++)
	{
	
#ifdef TIME
	start_time = gettime();
#endif

		mt_2(d_in, d_out, wData, hData);
	
#ifdef TIME	
	end_time = gettime();
	deltaT += end_time - start_time;	
#endif
	}	
#ifdef TIME
	fprintf(fp, "%lf\t", deltaT/(double)iter);
#endif

#ifdef VARIFY
	_clMemcpyD2H(out_gpu, d_out, size * sizeof(uint));
	verify_array_int<uint>(out_cpu, out_gpu, size);
#endif //VARIFY

	/**************************3****************************/
#ifdef TIME
	deltaT = 0.0;
#endif
	for(int i=0; i<iter; i++)
	{
	
#ifdef TIME
	start_time = gettime();
#endif

		mt_3(d_in, d_out, wData, hData);
	
#ifdef TIME	
	end_time = gettime();
	deltaT += end_time - start_time;	
#endif
	}	
#ifdef TIME
	fprintf(fp, "%lf\t", deltaT/(double)iter);
#endif

#ifdef VARIFY
	_clMemcpyD2H(out_gpu, d_out, size * sizeof(uint));
	verify_array_int<uint>(out_cpu, out_gpu, size);
#endif //VARIFY

#ifdef TIME	
	fprintf(fp, "\n");	
	fclose(fp);
#endif	
}
catch(string msg){
	printf("ERR:%s\n", msg.c_str());
	printf("Error catched\n");
	}

	_clFree(d_in);
	_clFree(d_out);
	_clRelease();
	if(in!=NULL) free(in);
	if(out_cpu!=NULL) free(out_cpu);
	if(out_gpu!=NULL) free(out_gpu);

	return 1;
}
Example #3
0
//----------------------------------------------------------
//--breadth first search on GPUs
//----------------------------------------------------------
/**
 * Host driver for the two-kernel BFS loop (char-typed flag arrays).
 *
 * Uploads the graph and flag arrays, alternates kernel 0 and kernel 1 until
 * the flag read back from d_over is zero, then copies the cost array back.
 *
 * @param no_of_nodes            node count; also used as the work-item count
 * @param h_graph_nodes          host node array, no_of_nodes entries
 * @param edge_list_size         entry count of h_graph_edges
 * @param h_graph_edges          host edge array
 * @param h_graph_mask           host mask flags, no_of_nodes entries
 * @param h_updating_graph_mask  host updating-mask flags, no_of_nodes entries
 * @param h_graph_visited        host visited flags, no_of_nodes entries
 * @param h_cost                 host cost array; overwritten with the device result
 * @throws std::string  the caught message prefixed with "in run_bfs_gpu -> "
 *
 * NOTE: dynamic exception specifications are deprecated since C++11 and
 * removed in C++17; the specifier is kept to match existing declarations.
 */
void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \
		int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \
		char *h_graph_visited, int *h_cost) 
					throw(std::string){

	// initialized because its address is handed to _clMallocRW before the loop sets it
	char h_over = 0;
	// Every handle starts as NULL so the catch block never passes an
	// indeterminate value to _clFree when an early call throws.
	// NOTE(review): relies on _clFree accepting NULL -- confirm.
	cl_mem d_graph_nodes = NULL, d_graph_edges = NULL, d_graph_mask = NULL, \
			d_updating_graph_mask = NULL, d_graph_visited = NULL, \
			d_cost = NULL, d_over = NULL;
	try{
		//--1 transfer data from host to device
		_clInit();			
		d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes);
		d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges);
		d_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_graph_mask);
		d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_updating_graph_mask);
		d_graph_visited = _clMallocRW(no_of_nodes*sizeof(char), h_graph_visited);


		d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost);
		d_over = _clMallocRW(sizeof(char), &h_over);
		
		_clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes);
		_clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges);	
		_clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(char), h_graph_mask);	
		_clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(char), h_updating_graph_mask);	
		_clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(char), h_graph_visited);	
		_clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost);	
			
		//--2 invoke kernel
#ifdef	PROFILING
		timer kernel_timer;
		double kernel_time = 0.0;		
		kernel_timer.reset();
		kernel_timer.start();
#endif
		do{
			// clear the flag on the device before each pass
			h_over = 0;
			_clMemcpyH2D(d_over, sizeof(char), &h_over);
			//--kernel 0
			int kernel_id = 0;
			int kernel_idx = 0;
			_clSetArgs(kernel_id, kernel_idx++, d_graph_nodes);
			_clSetArgs(kernel_id, kernel_idx++, d_graph_edges);
			_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
			_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
			_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
			_clSetArgs(kernel_id, kernel_idx++, d_cost);
			_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
			
			_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
			
			//--kernel 1
			kernel_id = 1;
			kernel_idx = 0;			
			_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
			_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
			_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
			_clSetArgs(kernel_id, kernel_idx++, d_over);
			_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
			
			_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);			
			
			// a non-zero flag read back from the device requests another pass
			_clMemcpyD2H(d_over,sizeof(char), &h_over);
			}while(h_over);
			
		_clFinish();
#ifdef	PROFILING
		kernel_timer.stop();
		kernel_time = kernel_timer.getTimeInSeconds();
#endif
		//--3 transfer data from device to host
		_clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost);
		//--statistics
#ifdef	PROFILING
		std::cout<<"kernel time(s):"<<kernel_time<<std::endl;		
#endif
		//--4 release cl resources.
		_clFree(d_graph_nodes);
		_clFree(d_graph_edges);
		_clFree(d_graph_mask);
		_clFree(d_updating_graph_mask);
		_clFree(d_graph_visited);
		_clFree(d_cost);
		_clFree(d_over);
		_clRelease();
	}
	catch(const std::string& msg){		
		// release whatever was created before the failure, then rethrow with context
		_clFree(d_graph_nodes);
		_clFree(d_graph_edges);
		_clFree(d_graph_mask);
		_clFree(d_updating_graph_mask);
		_clFree(d_graph_visited);
		_clFree(d_cost);
		_clFree(d_over);
		_clRelease();
		std::string e_str = "in run_bfs_gpu -> ";
		e_str += msg;
		throw(e_str);
	}
	return ;
}
Example #4
0
/**
 * Runs one of five kernels (selected by kernel_id) on two size x size input
 * matrices and copies the result into h_omatrix_c.
 *
 * @param h_imatrix_a    host input matrix A, size*size elements
 * @param h_imatrix_b    host input matrix B, size*size elements
 * @param h_omatrix_c    host output matrix, size*size elements (written)
 * @param h_omatrix_ref  reference output, only read when verify is true
 * @param size           matrix side length
 * @param kernel_id      0..4; the x-range is size divided by 1, 2, 4, 8 or 16
 * @param verify         when true, compare the result with verify_array
 * @return the accumulated kernel time reported by _clInvokeKernel2D, scaled
 *         by 1e-9 (seconds if the helper reports nanoseconds -- TODO confirm)
 * @throws std::string  the caught message prefixed with "in run_gpu -> "
 */
double run_gpu(datatype *h_imatrix_a, datatype *h_imatrix_b, datatype *h_omatrix_c, datatype *h_omatrix_ref, int size, int kernel_id, bool verify) 
					throw(std::string){	

	int number_elements = size * size;
	// NULL-initialized so the catch block can release them from any failure point.
	// NOTE(review): relies on _clFree accepting NULL -- confirm.
	cl_mem d_imatrix_a = NULL, d_imatrix_b = NULL, d_omatrix_c = NULL;
	
	try{
		//--1 transfer data from host to device				
		d_imatrix_a = _clMalloc(number_elements*sizeof(datatype));
		d_imatrix_b = _clMalloc(number_elements*sizeof(datatype));
		d_omatrix_c = _clMalloc(number_elements*sizeof(datatype));
		_clMemcpyH2D(d_imatrix_a, h_imatrix_a, number_elements*sizeof(datatype));
		_clMemcpyH2D(d_imatrix_b, h_imatrix_b, number_elements*sizeof(datatype));		
	
		//--2 invoke kernel
		int iterations = 1;
		int kernel_idx = 0;
		_clSetArgs(kernel_id, kernel_idx++, d_imatrix_a);
		_clSetArgs(kernel_id, kernel_idx++, d_imatrix_b);
		_clSetArgs(kernel_id, kernel_idx++, d_omatrix_c);
		_clSetArgs(kernel_id, kernel_idx++, &size, sizeof(int));
		_clSetArgs(kernel_id, kernel_idx++, &size, sizeof(int));
		int work_group_unit = 64;
		int range_x, range_y;
		// each kernel id covers the row with fewer work-items along x
		switch(kernel_id){
			case 0:
				range_x = size, range_y = size;
				break;
			case 1:
				range_x = size/2, range_y = size;
				break;
			case 2:
				range_x = size/4, range_y = size;
				break;
			case 3:
				range_x = size/8, range_y = size;
				break;
			case 4:
				range_x = size/16, range_y = size;
				break;
			default:
				// must be a std::string: a thrown string literal (const char*) matches
				// neither the exception specification nor the handler below
				throw std::string("->unknown kernle id->");
		}
		int group_x = work_group_unit, group_y = 1;
		unsigned long deltaT = 0;
		unsigned long totalT = 0;
		// iteration -1 is an untimed warm-up; the remaining ones are accumulated
		for(int i=-1; i<iterations; i++){
			_clInvokeKernel2D(kernel_id, range_x, range_y, group_x, group_y, &deltaT);
			if(i>=0)
			  totalT += deltaT;
		}
		//--3 transfer data from device to host
		_clMemcpyD2H(h_omatrix_c,d_omatrix_c, number_elements*sizeof(datatype));
		//--statistics
		if(verify){
			verify_array<datatype>(h_omatrix_ref, h_omatrix_c, number_elements);			
		}
		//--4 release cl resources; handles are cleared so the handler cannot free them twice
		_clFree(d_imatrix_a);	d_imatrix_a = NULL;
		_clFree(d_imatrix_b);	d_imatrix_b = NULL;
		_clFree(d_omatrix_c);	d_omatrix_c = NULL;

		return totalT*(1e-9);
	}
	catch(const std::string& msg){		
		// release anything still held before propagating the error
		_clFree(d_imatrix_a);
		_clFree(d_imatrix_b);
		_clFree(d_omatrix_c);
		std::string e_str = "in run_gpu -> ";
		e_str += msg;
		throw(e_str);
	}
	return 0.0;
}
Example #5
0
int main(int argc, char ** argv)
{
	cl_mem out = NULL;
try{
	if(argc!=3){
		printf("need 2 parameter here!!!\n");
		exit(-1);
	}
	
	
#if defined TIME
	double start_time = 0;
	double end_time = 0;
	string dat_name="data.dat";

	FILE * fp = fopen(dat_name.c_str(), "a+");
	if(fp==NULL)
	{
		printf("failed to open file!!!\n");
		exit(-1);
	}
#endif

	uint bins = atoi(argv[1]);
	uint size = atoi(argv[2]);
	uint iter = 100;

	
	printf("bins=%d, size=%d\n", bins, size);
	
	_clInit(1, "gpu", 0);
	
	out = _clMalloc((size/BS)*bins);
	
	layout_cyclic(out, bins, size);
	
	/**************************1****************************/

#ifdef TIME
	start_time = gettime();
#endif

	for(int i=1; i<iter; i++)
	{
		layout_blocked(out, bins, size);
	}
	
#ifdef TIME	
	end_time = gettime();
	fprintf(fp, "%lf\t", (end_time-start_time)/(double)iter);	
#endif		

	/**************************2****************************/

#ifdef TIME
	start_time = gettime();
#endif

	for(int i=1; i<iter; i++)
	{
		layout_cyclic(out, bins, size);
	}
	
#ifdef TIME	
	end_time = gettime();
	fprintf(fp, "%lf\t", (end_time-start_time)/(double)iter);	
#endif

	/**************************3****************************/

#ifdef TIME
	start_time = gettime();
#endif

	for(int i=1; i<iter; i++)
	{
		layout_cyclic_2(out, bins, size);
	}
	
#ifdef TIME	
	end_time = gettime();
	fprintf(fp, "%lf\t", (end_time-start_time)/(double)iter);	
#endif


#ifdef TIME	
	fprintf(fp, "\n");	
	fclose(fp);
#endif	
}
catch(string msg){
	printf("ERR:%s\n", msg.c_str());
	printf("Error catched\n");
	}

	_clFree(out);
	_clRelease();
	return 1;
}
Example #6
0
//----------------------------------------------------------
//--breadth first search on GPUs
//----------------------------------------------------------
/**
 * Host driver for the two-kernel BFS loop (int-typed flag arrays).
 *
 * Uploads the graph and flag arrays, alternates kernel 0 and kernel 1 until
 * the flag read back from d_over is zero, prints the elapsed CLOCK_MONOTONIC
 * time of that loop in nanoseconds, then copies the cost array back.
 *
 * @param no_of_nodes            node count; also used as the work-item count
 * @param h_graph_nodes          host node array, no_of_nodes entries
 * @param edge_list_size         entry count of h_graph_edges
 * @param h_graph_edges          host edge array
 * @param h_graph_mask           host mask flags, no_of_nodes entries
 * @param h_updating_graph_mask  host updating-mask flags, no_of_nodes entries
 * @param h_graph_visited        host visited flags, no_of_nodes entries
 * @param h_cost                 host cost array; overwritten with the device result
 * @throws std::string  the caught message prefixed with "in run_bfs_gpu -> "
 *
 * NOTE: dynamic exception specifications are deprecated since C++11 and
 * removed in C++17; the specifier is kept to match existing declarations.
 */
void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \
		int *h_graph_edges, int *h_graph_mask, int *h_updating_graph_mask, \
		int *h_graph_visited, int *h_cost)
					throw(std::string){

	// initialized because its address is handed to _clMallocRW before the loop sets it
	int h_over = 0;
	// Every handle starts as NULL so the catch block never passes an
	// indeterminate value to _clFree when an early call throws.
	// NOTE(review): relies on _clFree accepting NULL -- confirm.
	cl_mem d_graph_nodes = NULL, d_graph_edges = NULL, d_graph_mask = NULL, \
			d_updating_graph_mask = NULL, d_graph_visited = NULL, \
			d_cost = NULL, d_over = NULL;
	try{
		//--1 transfer data from host to device
		_clInit();
		d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes);
		d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges);
		d_graph_mask = _clMallocRW(no_of_nodes*sizeof(int), h_graph_mask);
		d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(int), h_updating_graph_mask);
		d_graph_visited = _clMallocRW(no_of_nodes*sizeof(int), h_graph_visited);


		d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost);
		d_over = _clMallocRW(sizeof(int), &h_over);

		_clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes);
		_clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges);
		_clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(int), h_graph_mask);
		_clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(int), h_updating_graph_mask);
		_clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(int), h_graph_visited);
		_clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost);

		//--2 invoke kernel
#ifdef	PROFILING
		timer kernel_timer;
		double kernel_time = 0.0;
		kernel_timer.reset();
		kernel_timer.start();


#endif
		struct timespec startT, endT;
		clock_gettime(CLOCK_MONOTONIC, &startT);
		do{
			// clear the flag on the device before each pass
			h_over = 0;
			_clMemcpyH2D(d_over, sizeof(int), &h_over);
			//--kernel 0
			int kernel_id = 0;
			int kernel_idx = 0;
			_clSetArgs(kernel_id, kernel_idx++, d_graph_nodes);
			_clSetArgs(kernel_id, kernel_idx++, d_graph_edges);
			_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
			_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
			_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
			_clSetArgs(kernel_id, kernel_idx++, d_cost);
			_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));

			_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);

			//--kernel 1
			kernel_id = 1;
			kernel_idx = 0;
			_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
			_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
			_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
			_clSetArgs(kernel_id, kernel_idx++, d_over);
			_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));

			_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);

			// a non-zero flag read back from the device requests another pass
			_clMemcpyD2H(d_over,sizeof(int), &h_over);
			}while(h_over);

		_clFinish();
		clock_gettime(CLOCK_MONOTONIC, &endT);
		// Signed 64-bit arithmetic: the nanosecond delta may be negative when
		// the seconds field has advanced, and the sum is always non-negative.
		const long long elapsed_ns =
				1000000000LL * (long long)(endT.tv_sec - startT.tv_sec)
				+ (long long)(endT.tv_nsec - startT.tv_nsec);
		// %llu matches the 64-bit argument (the previous %u read only part of it)
		printf(" accelerator time %llu \n", (unsigned long long)elapsed_ns);
#ifdef	PROFILING
		kernel_timer.stop();
		kernel_time = kernel_timer.getTimeInSeconds();
#endif
		//--3 transfer data from device to host
		_clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost);
		//--statistics
#ifdef	PROFILING
		std::cout<<"kernel time(s):"<<kernel_time<<std::endl;
#endif
		//--4 release cl resources.
		_clFree(d_graph_nodes);
		_clFree(d_graph_edges);
		_clFree(d_graph_mask);
		_clFree(d_updating_graph_mask);
		_clFree(d_graph_visited);
		_clFree(d_cost);
		_clFree(d_over);
		_clRelease();
	}
	catch(const std::string& msg){
		// release whatever was created before the failure, then rethrow with context
		_clFree(d_graph_nodes);
		_clFree(d_graph_edges);
		_clFree(d_graph_mask);
		_clFree(d_updating_graph_mask);
		_clFree(d_graph_visited);
		_clFree(d_cost);
		_clFree(d_over);
		_clRelease();
		std::string e_str = "in run_bfs_gpu -> ";
		e_str += msg;
		throw(e_str);
	}
	return ;
}
Example #7
0
int main(int argc, char ** argv)
{
    float * h_raw, * h_out, * outCPU;
    cl_mem d_raw, d_out;
    try {
        if(argc!=2) {
            printf("need one parameter here!!!");
            exit(-1);
        }

        _clInit(1, "gpu", 0);


#if defined TIME
        double start_time = 0;
        double end_time = 0;
        string dat_name="data.dat";

        FILE * fp = fopen(dat_name.c_str(), "a+");
        if(fp==NULL)
        {
            printf("failed to open file!!!\n");
            exit(-1);
        }
#endif

        int cdim = atoi(argv[1]); //{384};
        int rdim = atoi(argv[1]); //{288};
        printf("cdim=%d, rdim=%d\n", cdim, rdim);
        h_raw = (float *)malloc(cdim * rdim * sizeof(float));
        h_out = (float *)malloc(cdim * rdim * sizeof(float));
        outCPU = (float *)malloc(cdim * rdim * sizeof(float));
        fill<float>(h_raw, cdim * rdim, 5);
        d_raw = _clMalloc(cdim * rdim * sizeof(float));
        d_out = _clMalloc(cdim * rdim * sizeof(float));
        _clMemcpyH2D(d_raw, h_raw, cdim * rdim * sizeof(float));
        printf("-0\n");
#ifdef VARIFY
        CPURun(h_raw, outCPU, cdim, rdim);
#endif //VARIFY

        /**************************1****************************/
#ifdef TIME
        start_time = gettime();
#endif
        printf("-1.1\n");
        broadcast(d_raw, d_out, cdim, rdim);
        printf("-1.2\n");
#ifdef TIME
        end_time = gettime();
        fprintf(fp, "%lf\t", (end_time - start_time));
#endif

#ifdef VARIFY
        _clMemcpyD2H(h_out, d_out, cdim * rdim * sizeof(float));
        verify_array<float>(outCPU, h_out, cdim * rdim);
#endif //VARIFY

        /**************************2****************************/
#ifdef TIME
        start_time = gettime();
#endif
        broadcast_lm(d_raw, d_out, cdim, rdim);
        printf("-2\n");
#ifdef TIME
        end_time = gettime();
        fprintf(fp, "%lf\t", (end_time - start_time));
#endif

#ifdef VARIFY
        _clMemcpyD2H(h_out, d_out, cdim * rdim * sizeof(float));
        verify_array<float>(outCPU, h_out, cdim * rdim);
#endif //VARIFY

#ifdef TIME
        fprintf(fp, "\n");
        fclose(fp);
#endif
    }
    catch(string msg) {
        printf("ERR:%s\n", msg.c_str());
        printf("Error catched\n");
    }
    _clFree(d_raw);
    _clFree(d_out);
    _clRelease();
    if(h_raw!=NULL) free(h_raw);
    if(h_out!=NULL) free(h_out);
    if(outCPU!=NULL) free(outCPU);

    return 1;
}
Example #8
0
/**
 * Runs one of five kernels (selected by kernel_id) on a w x h input vector
 * and copies the result into h_o_vector.
 *
 * @param h_i_vector      host input, w*h elements
 * @param h_o_vector      host output, w*h elements (written)
 * @param h_o_vector_ref  reference output, only read when verify is true
 * @param w               width; the x-range is w divided by 1, 2, 4, 8 or 16
 * @param h               height; used as the y-range
 * @param kernel_id       0..4
 * @param verify          when true, compare the result with verify_array
 * @return w*h*(h+1)*sizeof(datatype) divided by the mean kernel time reported
 *         by _clInvokeKernel2D (unit of that time is not visible here --
 *         TODO confirm); the quotient is inf when the reported time is 0
 * @throws std::string  the caught message prefixed with "in run_gpu -> "
 */
double run_gpu(datatype *h_i_vector, datatype *h_o_vector, datatype *h_o_vector_ref,\
			int w, int h, int kernel_id, bool verify)throw(std::string){	
	// NULL-initialized so the catch block can release them from any failure point.
	// NOTE(review): relies on _clFree accepting NULL -- confirm.
	cl_mem d_i_vector = NULL, d_o_vector = NULL;
	int number_elements_out = w * h;
	int number_elements_in = w * h;
	try{
		//--1 transfer data from host to device		
		d_i_vector = _clMalloc(number_elements_in * sizeof(datatype));
		d_o_vector = _clMalloc(number_elements_out * sizeof(datatype));
		_clMemcpyH2D(d_i_vector, h_i_vector, number_elements_in*sizeof(datatype));
		_clFinish();
		//--2 invoke kernel
		int args_idx = 0;
		_clSetArgs(kernel_id, args_idx++, d_i_vector);
		_clSetArgs(kernel_id, args_idx++, d_o_vector);
		_clSetArgs(kernel_id, args_idx++, &w, sizeof(int));
		_clSetArgs(kernel_id, args_idx++, &h, sizeof(int));

		int work_group_unit = 16;
		int range_x = -1; 
		int range_y = -1;
		// each kernel id covers the row with fewer work-items along x
		switch(kernel_id){
			case 0:
				range_x = w, range_y = h;
				break;
			case 1:
				range_x = w/2, range_y = h;
				break;
			case 2:
				range_x = w/4, range_y = h;
				break;				
			case 3:
				range_x = w/8, range_y = h;
				break;								
			case 4:
				range_x = w/16, range_y = h;
				break;								
			default:
				throw std::string("Unknown kernel id!!!");
		}
		int group_x = work_group_unit * 4;
		int group_y = 1;
		int number_iterations = 1;
		// integer counters: initialize with integer zero, not 0.0f
		unsigned long deltaT = 0;
		unsigned long kernel_exe_time = 0;
		std::cout<<"--testing..."<<std::endl;		
		// iteration -1 is an untimed warm-up; every later one is accumulated
		// so the division by number_iterations below is a true mean
		for(int i=-1; i<number_iterations; i++){
			_clInvokeKernel2D(kernel_id, range_x, range_y, group_x, group_y, &kernel_exe_time);			
			if(i>=0)
			  deltaT += kernel_exe_time;
		}		
		deltaT = deltaT/number_iterations;
		std::cout<<"--done."<<std::endl;
		//--3 transfer data from device to host
		_clMemcpyD2H(h_o_vector, d_o_vector, number_elements_out*sizeof(datatype));
	
		if(verify){
			verify_array<datatype>(h_o_vector, h_o_vector_ref, number_elements_out);
		}
		//--4 release cl resources; handles are cleared so the handler cannot free them twice
		_clFree(d_i_vector);	d_i_vector = NULL;
		_clFree(d_o_vector);	d_o_vector = NULL;

		return (double)(((double)w*(double)h*(double)(h+1))*sizeof(datatype))/(double)deltaT;		
	}
	catch(const std::string& msg){		
		// release anything still held before propagating the error
		_clFree(d_i_vector);
		_clFree(d_o_vector);
		std::string e_str = "in run_gpu -> ";
		e_str += msg;
		throw(e_str);
	}
	return 0.0;
}
Example #9
0
// Benchmark driver template. The tokens starting with '@' (@hIn, @dIn,
// @cdimIn, @rdimIn, @elems, @hAlc, @hFill, @dAlc, @h2dTrans, @oclArgs,
// @ompArgs, @clFree, @hFree) are not C++; presumably a generator substitutes
// them before compilation -- TODO confirm. The commented-out hIn1/hIn2 lines
// next to each placeholder show a two-input example of the expected expansion.
//
// Reads argv[1] (used for both cdim and rdim), argv[2] (suffix of the data
// file name) and argv[3] (radius). With TIME defined, one throughput figure is
// appended to "data.<suffix>.dat"; with VARIFY defined, the device result is
// compared against OMPRun's output via verify_array.
//
// NOTE(review): returns 1 on the normal path; the catch block exits with -1.
int main(int argc, char ** argv)
{
	//float *hIn1, *hIn2;
	//cl_mem dIn1, dIn2;
	@hIn;
	@dIn;
	float *hOut, *rOut;
	cl_mem dOut;
try{
	_clParseCommandLine(argc, argv);
	// NOTE(review): argv[1..3] are read without an argc check in this
	// function; presumably _clParseCommandLine validates them -- confirm.
	string strSubfix = string(argv[2]);
	_clInit(platform_id, device_type, device_id);
	int cdim = atoi(argv[1]); 
	int rdim = atoi(argv[1]); 
	int r = atoi(argv[3]);
	@cdimIn
	@rdimIn
	// different between iMAP1 and iMAP2
	printf("cdim=%d, rdim=%d, radius=%d\n", cdim, rdim, r);
	int iIter = 10;
	int elems = @elems;
	// cdim * rdim * elems floats, scaled by 1e-6
	double dataAmount = (double)cdim * (double)rdim * (double)(elems) * (double)sizeof(float) * 1e-6;

	
#if defined TIME
	double start_time = 0;
	double end_time = 0;
	double delta_time = 0;
	int cnt = 0;
	string dat_name= string("data.") + strSubfix + string(".dat");

	FILE * fp = fopen(dat_name.c_str(), "a+");
	if(fp==NULL)
	{
		printf("failed to open file!!!\n");
		exit(-1);
	}
#endif
	
	//hIn1 = (float *)malloc(cdim * rdim * sizeof(float));
	//hIn2 = (float *)malloc(cdim * rdim * sizeof(float));
	@hAlc
	hOut = (float *)malloc(cdim * rdim * sizeof(float));
	rOut = (float *)malloc(cdim * rdim * sizeof(float));

	//fill<float>(hIn1, cdim * rdim, 5);
	//fill<float>(hIn2, cdim * rdim, 5);		
	@hFill

	//dIn1 = _clMalloc(cdim * rdim * sizeof(float));
	//dIn2 = _clMalloc(cdim * rdim * sizeof(float));
	@dAlc
	dOut = _clMalloc(cdim * rdim * sizeof(float));

	//_clMemcpyH2D(dIn1, hIn1, cdim * rdim * sizeof(float));
	//_clMemcpyH2D(dIn2, hIn2, cdim * rdim * sizeof(float));
	@h2dTrans

	_clFinish();
	
	// warmup: one untimed device run before the measured loop
	//OCLRun(dIn1, dIn2, dOut, cdim, rdim);
	OCLRun(@oclArgs, dOut, cdim, rdim, cdimIn, rdimIn);

#ifdef VARIFY	
	// reference result computed on the host into rOut
	//OMPRun(hIn1, hIn2, rOut, cdim, rdim);
	OMPRun(@ompArgs, rOut, cdim, rdim, cdimIn, rdimIn);
#endif //VARIFY
	

#ifdef TIME
	delta_time = 0;
	cnt = 0;
#endif	
	// up to iIter runs; with TIME defined, cnt counts the runs actually timed
	for(int i=0; i<iIter; i++)
	{
#ifdef TIME
	cnt++;
	start_time = gettime();
#endif

		OCLRun(@oclArgs, dOut, cdim, rdim, cdimIn, rdimIn);
#ifdef TIME	
	end_time = gettime();
	delta_time += end_time - start_time;
	// NOTE(review): as written this leaves the loop unless delta_time is
	// within 0.1 of 600000.0, i.e. almost always after the first run; it looks
	// like an unfinished time cap -- confirm the intended condition.
	if(fabs(delta_time-600000.0)>0.1) break;	// ????
#endif	
	}

#ifdef TIME
	// data amount per run times the number of timed runs, over the summed time
	fprintf(fp, "%lf\t", dataAmount * (double)cnt/delta_time);
#endif

#ifdef VARIFY
	_clMemcpyD2H(hOut, dOut, cdim * rdim * sizeof(float));	
	verify_array<float>(rOut, hOut, cdim * rdim);	
#endif //VARIFY


#ifdef TIME	
	fprintf(fp, "\n");	
	fclose(fp);
#endif	
}
catch(string msg){
	// unlike the normal path, an error terminates here without the cleanup below
	printf("ERR:%s\n", msg.c_str());
	printf("Error catched\n");
	exit(-1);
	}
	//_clFree(dIn1);
	//_clFree(dIn2);
	@clFree
	_clFree(dOut);
	_clRelease();
	//if(hIn1!=NULL) free(hIn1);
	//if(hIn2!=NULL) free(hIn2);
	@hFree
	if(hOut!=NULL) free(hOut);
	if(rOut!=NULL) free(rOut);

	return 1;
}
Example #10
0
int main(int argc, char ** argv)
{
	uint * h_in = NULL, * h_out_1 = NULL, * h_out_2 = NULL, * h_out = NULL;
	cl_mem d_in = NULL, d_out_1 = NULL, d_out_2 = NULL;
try{
	if(argc!=3){
		printf("need 2 parameter here!!!\n");
		exit(-1);
	}
	
	
#if defined TIME
	double start_time = 0;
	double end_time = 0;
	string dat_name="data.dat";

	FILE * fp = fopen(dat_name.c_str(), "a+");
	if(fp==NULL)
	{
		printf("failed to open file!!!\n");
		exit(-1);
	}
#endif

	uint w, h;
	uint side = atoi(argv[1]);
	w = side, h = side;
	uint size = w * h;
	uint radius = atoi(argv[2]);
	uint iter = 100;

	
	printf("w=%d, h=%d, radius=%d\n", w, h, radius);
	
	_clInit(1, "gpu", 0);

	h_in = (uint *)malloc(size * sizeof(uint));	
	h_out_1 = (uint *)malloc(size * sizeof(uint));
	h_out_2 = (uint *)malloc(size * sizeof(uint));
	h_out = (uint *)malloc(size * sizeof(uint));
	
	d_in = _clMalloc(size * sizeof(uint));
	d_out_1 = _clMalloc(size * sizeof(uint));
	d_out_2 = _clMalloc(size * sizeof(uint));
	
	fill<uint>(h_in, size, 10);
	_clMemcpyH2D(d_in, h_in, size * sizeof(uint));

	//g2l_CPU(h_in, h_out, w, h, radius);

	/**************************1****************************/

#ifdef TIME
	start_time = gettime();
#endif

	for(int i=1; i<iter; i++)
	{
		g2l_TBT(d_in, d_out_1, radius, w, h);
	}
	
#ifdef TIME	
	end_time = gettime();
	fprintf(fp, "%lf\t", (end_time-start_time)/(double)iter);	
#endif
	
#ifdef VARI
	_clMemcpyD2H(h_out_1, d_out_1, size * sizeof(uint));
#endif	
	
	
	/**************************2****************************/
#ifdef TIME
	start_time = gettime();
#endif
	for(int i=1; i<iter; i++)
	{
		g2l_FCTH(d_in, d_out_2, radius, w, h);
	}
	
#ifdef TIME	
	end_time = gettime();
	fprintf(fp, "%lf\t", (end_time-start_time)/(double)iter);	
#endif

#ifdef VARI	
	_clMemcpyD2H(h_out_2, d_out_2, size * sizeof(uint));
	verify_array_int<uint>(h_out_1, h_out_2, w, h);
#endif

#ifdef TIME	
	fprintf(fp, "\n");	
	fclose(fp);
#endif	
}
catch(string msg){
	printf("ERR:%s\n", msg.c_str());
	printf("Error catched\n");
	}

	_clFree(d_in);
	_clFree(d_out_1);
	_clFree(d_out_2);
	_clRelease();
	if(h_in!=NULL) free(h_in);
	if(h_out_1!=NULL) free(h_out_1);
	if(h_out_2!=NULL) free(h_out_2);
	if(h_out!=NULL) free(h_out);

	return 1;
}