Example #1
0
/*
	Host-side launcher for the First-Central-Then-Halo (FCTH) mode.
	Binds the eight arguments of kernel 1 and launches it over a
	w x h 2D range using 16 x 16 work-groups.

	d_in / d_out : device buffers bound as arguments 0 and 1
	radius       : added twice to each group edge to size the third argument
	w, h         : global range of the launch; also passed to the kernel
*/
void g2l_FCTH(cl_mem d_in, cl_mem d_out, uint radius, uint w, uint h){

	uint kernel = 1;
	uint tile_w = 16;
	uint tile_h = 16;

	// Group edge extended by `radius` elements on both sides.
	uint padded_w = 2 * radius + tile_w;
	uint padded_h = 2 * radius + tile_h;

	// Arguments are bound strictly in this order.
	uint slot = 0;
	_clSetArgs(kernel, slot++, d_in);
	_clSetArgs(kernel, slot++, d_out);
	// NULL pointer with an explicit byte size -- presumably the kernel's
	// local scratch tile; confirm against the kernel signature.
	_clSetArgs(kernel, slot++, NULL, padded_w * padded_h * sizeof(uint));
	_clSetArgs(kernel, slot++, &w, sizeof(uint));
	_clSetArgs(kernel, slot++, &h, sizeof(uint));
	_clSetArgs(kernel, slot++, &radius, sizeof(uint));
	_clSetArgs(kernel, slot++, &padded_w, sizeof(uint));
	_clSetArgs(kernel, slot++, &padded_h, sizeof(uint));

	_clInvokeKernel2D(kernel, w, h, tile_w, tile_h);
}
Example #2
0
//----------------------------------------------------------
//--breadth first search on GPUs
//----------------------------------------------------------
// Uploads the graph and the per-node flag/cost arrays, repeats kernel 0 and
// kernel 1 until the flag read back from d_over is false, then copies the
// cost array back into h_cost.
//
// Params:
//   no_of_nodes     element count of h_graph_nodes and of every per-node array
//   h_graph_nodes   host node array (uploaded)
//   edge_list_size  element count of h_graph_edges
//   h_graph_edges   host edge array (uploaded)
//   h_graph_mask, h_updating_graph_mask, h_graph_visited
//                   host per-node flag arrays (uploaded)
//   h_cost          host per-node cost array; uploaded, then overwritten with
//                   the device contents once the loop has ended
//
// Throws std::string: a std::string raised by any _cl* helper is caught, the
// device buffers allocated so far are released, and the message is rethrown
// prefixed with "in run_bfs_gpu -> ".
//
// work_group_size is defined outside this function.
void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
		int *h_graph_edges, bool *h_graph_mask, bool *h_updating_graph_mask,
		bool *h_graph_visited, int *h_cost)
					throw(std::string){

	// Initialised because its address is handed to _clMallocRW below.
	bool h_over = false;
	// Every handle starts as NULL so the error path can tell which buffers
	// were actually created.
	cl_mem d_graph_nodes = NULL, d_graph_edges = NULL, d_graph_mask = NULL,
			d_updating_graph_mask = NULL, d_graph_visited = NULL,
			d_cost = NULL, d_over = NULL;
	try{
		//--1 transfer data from host to device
		_clInit();
		d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes);
		d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges);
		d_graph_mask = _clMalloc(no_of_nodes*sizeof(bool), h_graph_mask);
		d_updating_graph_mask = _clMalloc(no_of_nodes*sizeof(bool), h_updating_graph_mask);
		d_graph_visited = _clMalloc(no_of_nodes*sizeof(bool), h_graph_visited);
		d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost);
		d_over = _clMallocRW(sizeof(bool), &h_over);

		_clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes);
		_clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges);
		_clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(bool), h_graph_mask);
		_clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(bool), h_updating_graph_mask);
		_clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(bool), h_graph_visited);
		_clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost);

		//--2 invoke kernel
#ifdef	PROFILING
		timer kernel_timer;
		double kernel_time = 0.0;
		kernel_timer.reset();
		kernel_timer.start();
#endif
		do{
			// Clear the device-side flag before each round; the loop
			// continues only if it is read back as true afterwards.
			h_over = false;
			_clMemcpyH2D(d_over, sizeof(bool), &h_over);

			//--kernel 0
			int kernel_id = 0;
			int kernel_idx = 0;
			_clSetArgs(kernel_id, kernel_idx++, d_graph_nodes);
			_clSetArgs(kernel_id, kernel_idx++, d_graph_edges);
			_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
			_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
			_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
			_clSetArgs(kernel_id, kernel_idx++, d_cost);
			_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
			_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);

			//--kernel 1
			kernel_id = 1;
			kernel_idx = 0;
			_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
			_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
			_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
			_clSetArgs(kernel_id, kernel_idx++, d_over);
			_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
			_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);

			_clMemcpyD2H(d_over,sizeof(bool), &h_over);
		}while(h_over);
		_clFinish();
#ifdef	PROFILING
		kernel_timer.stop();
		kernel_time = kernel_timer.getTimeInSeconds();
#endif
		//--3 transfer data from device to host
		_clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost);
		//--statistics
#ifdef	PROFILING
		std::cout<<"kernel time(s):"<<kernel_time<<std::endl;
#endif
		//--4 release cl resources.
		_clFree(d_graph_nodes);
		_clFree(d_graph_edges);
		_clFree(d_graph_mask);
		_clFree(d_updating_graph_mask);
		_clFree(d_graph_visited);
		_clFree(d_cost);
		_clFree(d_over);
		_clRelease();
	}
	catch(const std::string &msg){
		// Release only the buffers that were created before the failure.
		// NOTE(review): if the failure came from one of the _clFree calls
		// in step 4, handles released before it are visited again here.
		if(d_graph_nodes) _clFree(d_graph_nodes);
		if(d_graph_edges) _clFree(d_graph_edges);
		if(d_graph_mask) _clFree(d_graph_mask);
		if(d_updating_graph_mask) _clFree(d_updating_graph_mask);
		if(d_graph_visited) _clFree(d_graph_visited);
		if(d_cost) _clFree(d_cost);
		if(d_over) _clFree(d_over);
		_clRelease();
		std::string e_str = "in run_bfs_gpu -> ";
		e_str += msg;
		throw(e_str);
	}
	return ;
}
Example #3
0
//----------------------------------------------------------
//--breadth first search on GPUs
//----------------------------------------------------------
// Uploads the graph and the per-node flag/cost arrays, repeats kernel 0 and
// kernel 1 until the flag read back from d_over is zero, then copies the
// cost array back into h_cost.
//
// Params:
//   no_of_nodes     element count of h_graph_nodes and of every per-node array
//   h_graph_nodes   host node array (uploaded)
//   edge_list_size  element count of h_graph_edges
//   h_graph_edges   host edge array (uploaded)
//   h_graph_mask, h_updating_graph_mask, h_graph_visited
//                   host per-node flag arrays, one char per node (uploaded)
//   h_cost          host per-node cost array; uploaded, then overwritten with
//                   the device contents once the loop has ended
//
// Throws std::string: a std::string raised by any _cl* helper is caught, the
// device buffers allocated so far are released, and the message is rethrown
// prefixed with "in run_bfs_gpu -> ".
//
// work_group_size is defined outside this function.
void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
		int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask,
		char *h_graph_visited, int *h_cost)
					throw(std::string){

	// Initialised because its address is handed to _clMallocRW below.
	char h_over = false;
	// Every handle starts as NULL so the error path never passes an
	// indeterminate value to _clFree.
	cl_mem d_graph_nodes = NULL, d_graph_edges = NULL, d_graph_mask = NULL,
			d_updating_graph_mask = NULL, d_graph_visited = NULL,
			d_cost = NULL, d_over = NULL;
	try{
		//--1 transfer data from host to device
		_clInit();
		d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes);
		d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges);
		d_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_graph_mask);
		d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_updating_graph_mask);
		d_graph_visited = _clMallocRW(no_of_nodes*sizeof(char), h_graph_visited);
		d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost);
		d_over = _clMallocRW(sizeof(char), &h_over);

		_clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes);
		_clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges);
		_clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(char), h_graph_mask);
		_clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(char), h_updating_graph_mask);
		_clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(char), h_graph_visited);
		_clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost);

		//--2 invoke kernel
#ifdef	PROFILING
		timer kernel_timer;
		double kernel_time = 0.0;
		kernel_timer.reset();
		kernel_timer.start();
#endif
		do{
			// Clear the device-side flag before each round; the loop
			// continues only if it is read back as non-zero afterwards.
			h_over = false;
			_clMemcpyH2D(d_over, sizeof(char), &h_over);

			//--kernel 0
			int kernel_id = 0;
			int kernel_idx = 0;
			_clSetArgs(kernel_id, kernel_idx++, d_graph_nodes);
			_clSetArgs(kernel_id, kernel_idx++, d_graph_edges);
			_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
			_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
			_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
			_clSetArgs(kernel_id, kernel_idx++, d_cost);
			_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
			_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);

			//--kernel 1
			kernel_id = 1;
			kernel_idx = 0;
			_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
			_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
			_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
			_clSetArgs(kernel_id, kernel_idx++, d_over);
			_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
			_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);

			_clMemcpyD2H(d_over,sizeof(char), &h_over);
		}while(h_over);

		_clFinish();
#ifdef	PROFILING
		kernel_timer.stop();
		kernel_time = kernel_timer.getTimeInSeconds();
#endif
		//--3 transfer data from device to host
		_clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost);
		//--statistics
#ifdef	PROFILING
		std::cout<<"kernel time(s):"<<kernel_time<<std::endl;
#endif
		//--4 release cl resources.
		_clFree(d_graph_nodes);
		_clFree(d_graph_edges);
		_clFree(d_graph_mask);
		_clFree(d_updating_graph_mask);
		_clFree(d_graph_visited);
		_clFree(d_cost);
		_clFree(d_over);
		_clRelease();
	}
	catch(const std::string &msg){
		// Release only the buffers that were created before the failure.
		// NOTE(review): if the failure came from one of the _clFree calls
		// in step 4, handles released before it are visited again here.
		if(d_graph_nodes) _clFree(d_graph_nodes);
		if(d_graph_edges) _clFree(d_graph_edges);
		if(d_graph_mask) _clFree(d_graph_mask);
		if(d_updating_graph_mask) _clFree(d_updating_graph_mask);
		if(d_graph_visited) _clFree(d_graph_visited);
		if(d_cost) _clFree(d_cost);
		if(d_over) _clFree(d_over);
		_clRelease();
		// Prefix names this function (the old text said run_transpose_gpu).
		std::string e_str = "in run_bfs_gpu -> ";
		e_str += msg;
		throw(e_str);
	}
	return ;
}
Example #4
0
// Runs kernel `kernel_id` (0-4) on two size x size input matrices, copies the
// result into h_omatrix_c and returns the accumulated kernel time scaled by
// 1e-9 (presumably nanoseconds to seconds -- confirm against
// _clInvokeKernel2D).
//
// Params:
//   h_imatrix_a, h_imatrix_b  host inputs, size*size elements each
//   h_omatrix_c               host output, size*size elements
//   h_omatrix_ref             reference passed to verify_array when `verify`
//   size                      matrix edge length; also passed twice to the kernel
//   kernel_id                 0..4; id k launches size / 2^k work-items in x
//   verify                    when true, compares the output with the reference
//
// Throws std::string, prefixed with "in run_gpu -> ". An unknown kernel_id is
// reported this way before any device buffer is allocated.
double run_gpu(datatype *h_imatrix_a, datatype *h_imatrix_b, datatype *h_omatrix_c, datatype *h_omatrix_ref, int size, int kernel_id, bool verify) 
					throw(std::string){	

	int number_elements = size * size;
	cl_mem d_imatrix_a, d_imatrix_b, d_omatrix_c;
	
	try{
		//--0 choose the launch range; reject unknown ids before device work
		int range_x = 0;
		int range_y = size;
		switch(kernel_id){
			case 0:
				range_x = size;
				break;
			case 1:
				range_x = size/2;
				break;
			case 2:
				range_x = size/4;
				break;
			case 3:
				range_x = size/8;
				break;
			case 4:
				range_x = size/16;
				break;
			default:
				// Must be a std::string: a string literal would match neither
				// the exception specification nor the handler below.
				throw std::string("->unknown kernle id->");
		}

		//--1 transfer data from host to device				
		d_imatrix_a = _clMalloc(number_elements*sizeof(datatype));
		d_imatrix_b = _clMalloc(number_elements*sizeof(datatype));
		d_omatrix_c = _clMalloc(number_elements*sizeof(datatype));
		_clMemcpyH2D(d_imatrix_a, h_imatrix_a, number_elements*sizeof(datatype));
		_clMemcpyH2D(d_imatrix_b, h_imatrix_b, number_elements*sizeof(datatype));		
	
		//--2 invoke kernel
		int iterations = 1;
		int kernel_idx = 0;
		_clSetArgs(kernel_id, kernel_idx++, d_imatrix_a);
		_clSetArgs(kernel_id, kernel_idx++, d_imatrix_b);
		_clSetArgs(kernel_id, kernel_idx++, d_omatrix_c);
		_clSetArgs(kernel_id, kernel_idx++, &size, sizeof(int));
		_clSetArgs(kernel_id, kernel_idx++, &size, sizeof(int));
		int work_group_unit = 64;
		int group_x = work_group_unit, group_y = 1;
		unsigned long deltaT = 0;
		unsigned long totalT = 0;
		// The i == -1 pass is launched but not accumulated (presumably a
		// warm-up run); only the i == 0 pass contributes to totalT.
		for(int i=-1; i<iterations; i++){
			_clInvokeKernel2D(kernel_id, range_x, range_y, group_x, group_y, &deltaT);
			if(i==0)
			  totalT += deltaT;
		}
		//--3 transfer data from device to host
		_clMemcpyD2H(h_omatrix_c,d_omatrix_c, number_elements*sizeof(datatype));
		//--statistics
		if(verify){
			verify_array<datatype>(h_omatrix_ref, h_omatrix_c, number_elements);			
		}
		//--4 release cl resources.
		_clFree(d_imatrix_a);
		_clFree(d_imatrix_b);
		_clFree(d_omatrix_c);

		return totalT*(1e-9);
	}
	catch(const std::string &msg){		
		std::string e_str = "in run_gpu -> ";
		e_str += msg;
		throw(e_str);
	}
	return 0.0;
}
Example #5
0
//----------------------------------------------------------
//--breadth first search on GPUs
//----------------------------------------------------------
// Uploads the graph and the per-node flag/cost arrays, repeats kernel 0 and
// kernel 1 until the flag read back from d_over is zero, prints the wall-clock
// time of that loop, then copies the cost array back into h_cost.
//
// Params:
//   no_of_nodes     element count of h_graph_nodes and of every per-node array
//   h_graph_nodes   host node array (uploaded)
//   edge_list_size  element count of h_graph_edges
//   h_graph_edges   host edge array (uploaded)
//   h_graph_mask, h_updating_graph_mask, h_graph_visited
//                   host per-node flag arrays, one int per node (uploaded)
//   h_cost          host per-node cost array; uploaded, then overwritten with
//                   the device contents once the loop has ended
//
// Throws std::string: a std::string raised by any _cl* helper is caught, the
// device buffers allocated so far are released, and the message is rethrown
// prefixed with "in run_bfs_gpu -> ".
//
// work_group_size is defined outside this function.
void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
		int *h_graph_edges, int *h_graph_mask, int *h_updating_graph_mask,
		int *h_graph_visited, int *h_cost)
					throw(std::string){

	// Initialised because its address is handed to _clMallocRW below.
	int h_over = false;
	// Every handle starts as NULL so the error path never passes an
	// indeterminate value to _clFree.
	cl_mem d_graph_nodes = NULL, d_graph_edges = NULL, d_graph_mask = NULL,
			d_updating_graph_mask = NULL, d_graph_visited = NULL,
			d_cost = NULL, d_over = NULL;
	try{
		//--1 transfer data from host to device
		_clInit();
		d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes);
		d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges);
		d_graph_mask = _clMallocRW(no_of_nodes*sizeof(int), h_graph_mask);
		d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(int), h_updating_graph_mask);
		d_graph_visited = _clMallocRW(no_of_nodes*sizeof(int), h_graph_visited);
		d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost);
		d_over = _clMallocRW(sizeof(int), &h_over);

		_clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes);
		_clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges);
		_clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(int), h_graph_mask);
		_clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(int), h_updating_graph_mask);
		_clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(int), h_graph_visited);
		_clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost);

		//--2 invoke kernel
#ifdef	PROFILING
		timer kernel_timer;
		double kernel_time = 0.0;
		kernel_timer.reset();
		kernel_timer.start();
#endif
		struct timespec startT, endT;
		clock_gettime(CLOCK_MONOTONIC, &startT);
		do{
			// Clear the device-side flag before each round; the loop
			// continues only if it is read back as non-zero afterwards.
			h_over = false;
			_clMemcpyH2D(d_over, sizeof(int), &h_over);

			//--kernel 0
			int kernel_id = 0;
			int kernel_idx = 0;
			_clSetArgs(kernel_id, kernel_idx++, d_graph_nodes);
			_clSetArgs(kernel_id, kernel_idx++, d_graph_edges);
			_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
			_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
			_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
			_clSetArgs(kernel_id, kernel_idx++, d_cost);
			_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
			_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);

			//--kernel 1
			kernel_id = 1;
			kernel_idx = 0;
			_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
			_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
			_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
			_clSetArgs(kernel_id, kernel_idx++, d_over);
			_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
			_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);

			_clMemcpyD2H(d_over,sizeof(int), &h_over);
		}while(h_over);

		_clFinish();
		clock_gettime(CLOCK_MONOTONIC, &endT);
		// Widen to 64 bits before scaling the seconds; the nanosecond part
		// may be negative and is folded in with signed arithmetic.
		long long elapsed_ns =
				(long long)(endT.tv_sec - startT.tv_sec) * 1000000000LL
				+ (long long)(endT.tv_nsec - startT.tv_nsec);
		// %llu matches the 64-bit argument (the old %u did not).
		printf(" accelerator time %llu \n", (unsigned long long)elapsed_ns);
#ifdef	PROFILING
		kernel_timer.stop();
		kernel_time = kernel_timer.getTimeInSeconds();
#endif
		//--3 transfer data from device to host
		_clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost);
		//--statistics
#ifdef	PROFILING
		std::cout<<"kernel time(s):"<<kernel_time<<std::endl;
#endif
		//--4 release cl resources.
		_clFree(d_graph_nodes);
		_clFree(d_graph_edges);
		_clFree(d_graph_mask);
		_clFree(d_updating_graph_mask);
		_clFree(d_graph_visited);
		_clFree(d_cost);
		_clFree(d_over);
		_clRelease();
	}
	catch(const std::string &msg){
		// Release only the buffers that were created before the failure.
		// NOTE(review): if the failure came from one of the _clFree calls
		// in step 4, handles released before it are visited again here.
		if(d_graph_nodes) _clFree(d_graph_nodes);
		if(d_graph_edges) _clFree(d_graph_edges);
		if(d_graph_mask) _clFree(d_graph_mask);
		if(d_updating_graph_mask) _clFree(d_updating_graph_mask);
		if(d_graph_visited) _clFree(d_graph_visited);
		if(d_cost) _clFree(d_cost);
		if(d_over) _clFree(d_over);
		_clRelease();
		// Prefix names this function (the old text said run_transpose_gpu).
		std::string e_str = "in run_bfs_gpu -> ";
		e_str += msg;
		throw(e_str);
	}
	return ;
}
Example #6
0
// Runs kernel `kernel_id` (0-4) on a w x h input, copies the result into
// h_o_vector and returns w*h*(h+1)*sizeof(datatype) divided by the recorded
// kernel time.
//
// Params:
//   h_i_vector      host input, w*h elements
//   h_o_vector      host output, w*h elements
//   h_o_vector_ref  reference passed to verify_array when `verify` is true
//   w, h            problem extents; also passed to the kernel
//   kernel_id       0..4; id k launches w / 2^k work-items in x, h in y
//   verify          when true, compares the output with the reference
//
// Throws std::string, prefixed with "in run_gpu -> ".
double run_gpu(datatype *h_i_vector, datatype *h_o_vector, datatype *h_o_vector_ref,
			int w, int h, int kernel_id, bool verify)throw(std::string){
	cl_mem dev_in, dev_out;
	int in_count = w * h;
	int out_count = w * h;
	try{
		//--1 allocate device buffers and upload the input
		dev_in = _clMalloc(in_count * sizeof(datatype));
		dev_out = _clMalloc(out_count * sizeof(datatype));
		_clMemcpyH2D(dev_in, h_i_vector, in_count*sizeof(datatype));
		_clFinish();

		//--2 bind arguments: input, output, w, h
		int slot = 0;
		_clSetArgs(kernel_id, slot++, dev_in);
		_clSetArgs(kernel_id, slot++, dev_out);
		_clSetArgs(kernel_id, slot++, &w, sizeof(int));
		_clSetArgs(kernel_id, slot++, &h, sizeof(int));

		// Launch range: kernel k uses w / 2^k work-items in x.
		int range_x = -1;
		int range_y = -1;
		if(kernel_id < 0 || kernel_id > 4){
			throw(string("Unknown kernel id!!!"));
		}
		range_x = w / (1 << kernel_id);
		range_y = h;

		int work_group_unit = 16;
		int group_x = 4 * work_group_unit;
		int group_y = 1;
		int number_iterations = 1;
		unsigned long sample = 0;
		unsigned long accumulated = 0;
		std::cout<<"--testing..."<<std::endl;
		// The pass numbered -1 is launched but not recorded; only pass 0
		// is added to the total.
		int pass = -1;
		while(pass < number_iterations){
			_clInvokeKernel2D(kernel_id, range_x, range_y, group_x, group_y, &sample);
			if(pass == 0){
				accumulated += sample;
			}
			++pass;
		}
		accumulated = accumulated / number_iterations;
		std::cout<<"--done."<<std::endl;

		//--3 download the result and optionally check it
		_clMemcpyD2H(h_o_vector, dev_out, out_count*sizeof(datatype));
		if(verify){
			verify_array<datatype>(h_o_vector, h_o_vector_ref, out_count);
		}

		//--4 release cl resources.
		_clFree(dev_in);
		_clFree(dev_out);

		double bytes = (double)w * (double)h * (double)(h+1) * sizeof(datatype);
		return bytes / (double)accumulated;
	}
	catch(std::string msg){
		std::string prefixed = "in run_gpu -> ";
		prefixed += msg;
		throw(prefixed);
	}
	return 0.0;
}