Example #1
void testCusolver(int rows, int cols, int nnz, int *row_ptr, int *col_index, double *values,
		double *valuesB){
    // --- Initialize cuSOLVER and cuSPARSE handles and attach them to one stream
    cusolverSpHandle_t cusolver_handle = NULL; checkCudaErrors(cusolverSpCreate(&cusolver_handle));
    cusparseHandle_t cusparse_handle = NULL; checkCudaErrors(cusparseCreate(&cusparse_handle));
    cudaStream_t cudaStream = NULL; checkCudaErrors(cudaStreamCreate(&cudaStream));
    checkCudaErrors(cusolverSpSetStream(cusolver_handle, cudaStream));
    checkCudaErrors(cusparseSetStream(cusparse_handle, cudaStream));


    cusparseMatDescr_t descrA;      checkCudaErrors(cusparseCreateMatDescr(&descrA));
    checkCudaErrors(cusparseSetMatType     (descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
    checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE));

    double start, stop, time_to_solve;
    start = second();

    // --- Device-side CSR matrix (values, row pointers, column indices) and solution vector
    printf("\nAlloc GPU memory...\n");
    double *d_A;            checkCudaErrors(cudaMalloc(&d_A, nnz * sizeof(double)));
    int *d_A_RowIndices;    checkCudaErrors(cudaMalloc(&d_A_RowIndices, (rows + 1) * sizeof(int)));
    int *d_A_ColIndices;    checkCudaErrors(cudaMalloc(&d_A_ColIndices, nnz * sizeof(int)));
    double *d_x;        checkCudaErrors(cudaMalloc(&d_x, rows * sizeof(double)));

    checkCudaErrors(cudaMemcpy(d_A, values, nnz * sizeof(double), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_A_RowIndices, row_ptr, (rows + 1) * sizeof(int), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_A_ColIndices, col_index, nnz * sizeof(int), cudaMemcpyHostToDevice));

    double *d_b; checkCudaErrors(cudaMalloc(&d_b, rows * sizeof(double)));
    checkCudaErrors(cudaMemcpy(d_b, valuesB, rows * sizeof(double), cudaMemcpyHostToDevice));

    double *h_x = (double *)malloc(rows * sizeof(double));

    double tol = 1.e-12;
    int reorder = 0;
    int singularity = 0;
	printf("\nProcessing in GPU using cusolver QR...\n");


    // Alternative: solve on the host with LU factorization instead of QR on the device:
    // checkCudaErrors(cusolverSpDcsrlsvluHost(cusolver_handle, rows, nnz, descrA, values,
    //                 row_ptr, col_index, valuesB, tol, reorder, h_x, &singularity));
    checkCudaErrors(cusolverSpDcsrlsvqr(cusolver_handle, rows, nnz, descrA, d_A, d_A_RowIndices,
           		d_A_ColIndices, d_b, tol, reorder, d_x, &singularity));
    checkCudaErrors(cudaDeviceSynchronize());
    stop = second();
    time_to_solve = stop - start;
    checkCudaErrors(cudaMemcpy(h_x, d_x, rows * sizeof(double), cudaMemcpyDeviceToHost));

    double minusOne = -1.0;
    double one = 1.0;
    double *d_r; checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double)*rows));
    checkCudaErrors(cudaMemcpy(d_r, d_b, sizeof(double)*rows, cudaMemcpyDeviceToDevice));
    checkCudaErrors(cusparseDcsrmv(cusparse_handle,
            CUSPARSE_OPERATION_NON_TRANSPOSE,
            rows,
            cols,
            nnz,
            &minusOne,
            descrA,
            d_A,
            d_A_RowIndices,
            d_A_ColIndices,
            d_x,
            &one,
            d_r));
    double *h_r = (double *)malloc(rows * sizeof(double));
    checkCudaErrors(cudaMemcpy(h_r, d_r, rows * sizeof(double), cudaMemcpyDeviceToHost));

    double r_inf = vec_norminf(rows, h_r);

    printf("(GPU - cuSolver) Time (sec): %f\n", time_to_solve);
    printf("(Eigen) |b - A*x| = %E \n", r_inf);

    checkCudaErrors(cusparseDestroy(cusparse_handle));
    checkCudaErrors(cusolverSpDestroy(cusolver_handle));
    checkCudaErrors(cudaStreamDestroy(cudaStream));
    checkCudaErrors(cudaFree(d_b));
    checkCudaErrors(cudaFree(d_x));
    checkCudaErrors(cudaFree(d_r));

    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_A_RowIndices));
    checkCudaErrors(cudaFree(d_A_ColIndices));

    free(h_x);
    free(h_r);
}
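
The function above leans on two helpers that are not part of the snippet: second() for wall-clock timing and vec_norminf() for the residual check. A minimal sketch of both is shown below; these bodies are assumptions consistent with the call sites, not the original implementations.

#include <math.h>
#include <sys/time.h>

// Wall-clock time in seconds (used only for coarse timing of the solve).
static double second(void)
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec + 1.e-6 * (double)tv.tv_usec;
}

// Infinity norm of a host vector: max_i |x[i]|.
static double vec_norminf(int n, const double *x)
{
    double norm = 0.0;
    for (int i = 0; i < n; i++) {
        double xi = fabs(x[i]);
        if (xi > norm) norm = xi;
    }
    return norm;
}
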
Example #2
/*
 * Function to be called
 */
void* device_thread(void* passing_ptr) {
    DataArray* data_arr_ptr = (DataArray*) passing_ptr; // casting passed pointer


    cuDoubleComplex* data_r_dev;
    cuDoubleComplex* data_k_dev;


    // init device, allocate suitable variables in gpu memory ...
    //alloc_data_device(data_arr_ptr);
    cudaMalloc((void**) &data_r_dev, sizeof(cuDoubleComplex)*N); // device memory; consider cudaMallocHost for pinned host buffers (could speed up copies)
    cudaMalloc((void**) &data_k_dev, sizeof(cuDoubleComplex)*N);
    data_arr_ptr->data_r_dev = &data_r_dev; // store the address of the local pointer so the host thread can reach the device arrays
    data_arr_ptr->data_k_dev = &data_k_dev;
    printf("data allocated by host thread\n");

    // Does each thread create a new stream automatically?
    // http://devblogs.nvidia.com/parallelforall/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
    cudaStreamCreateWithFlags(streams_arr, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(streams_arr+1, cudaStreamNonBlocking);
    printf("streams created\n");

    // synchronize after allocating memory - data on host should be allocated and ready for copying
    cudaDeviceSynchronize(); // synchronizes host and device regardless of pthreads - check that this does not cause errors
    // cudaStreamSynchronize(streams_arr[<stream>]); // to synchronize with a single stream only
    pthread_barrier_wait (&barrier);
    printf("1st barier device thread - allocating mem on gpu\n");




    //copying data
    cudaMemcpyAsync( *(data_arr_ptr->data_r_dev), *(data_arr_ptr->data_r), N*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice, streams_arr[MEMORY_STREAM] );

    // synchronize after copying data
    cudaDeviceSynchronize(); // wait for the async copy to finish before hitting the barrier
    pthread_barrier_wait (&barrier);
    printf("2nd barrier device thread - copying data on gpu\n");






    printf("data visible in device thread:\n");

    /*for (uint64_t ii = 0; ii < (data_arr_ptr->size <= 32) ? data_arr_ptr->size : 32 ; ii++) {
      printf("%lu.\t",ii);
      printf("%lf + %lfj\t", creal( (*(data_arr_ptr->data_r))[ii] ), cimag( (*(data_arr_ptr->data_r))[ii] ));
      printf("%lf + %lfj\n", creal( (*(data_arr_ptr->data_k))[ii] ), cimag( (*(data_arr_ptr->data_k))[ii] ));
    }*/

    // synchronize after copying
    pthread_barrier_wait (&barrier);
    printf("3rd barier device thread - \n");



    //copying data
    //cudaMemcpyAsync( *(data_arr_ptr->data_r), *(data_arr_ptr->data_r_dev), N*sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost, streams_arr[MEMORY_STREAM] );
    cudaMemcpyAsync( *(data_arr_ptr->data_r), data_r_dev, N*sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost, streams_arr[MEMORY_STREAM] );


    // synchronize after copying back data
    cudaDeviceSynchronize(); // wait for the copy back to finish before hitting the barrier
    pthread_barrier_wait (&barrier);
    printf("4th barrier device thread - \n");


    cudaStreamDestroy(streams_arr[KERNEL_STREAM]);
    cudaStreamDestroy(streams_arr[MEMORY_STREAM]);

    cudaFree(data_r_dev);
    printf("device r space freed\n");
    cudaFree(data_k_dev);
    cudaDeviceSynchronize();
    printf("device k space freed\n");

    printf("closing device thread\n");
    pthread_exit(NULL);
}
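
The thread function above refers to several globals that live outside the snippet: the DataArray struct, the stream array with its index macros, and the pthread barrier. Plausible declarations are sketched below; the field names follow the usage in the code, but the exact layout and the value of N are assumptions.

#include <cuComplex.h>
#include <cuda_runtime.h>
#include <pthread.h>
#include <stdint.h>

#define N 1024            // number of complex samples (assumed)
#define MEMORY_STREAM 0   // stream used for async host<->device copies
#define KERNEL_STREAM 1   // stream used for kernel launches

typedef struct {
    uint64_t size;                 // number of elements
    cuDoubleComplex **data_r;      // host buffer, real-space data
    cuDoubleComplex **data_k;      // host buffer, k-space data
    cuDoubleComplex **data_r_dev;  // set by device_thread to the address of its device pointer
    cuDoubleComplex **data_k_dev;
} DataArray;

cudaStream_t streams_arr[2];       // one copy stream, one kernel stream
pthread_barrier_t barrier;         // shared with the host thread
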
Example #3
////////////////////////////////////////////////////////////////////////////////
// Test driver
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    cudaError_t error;
    printf("%s Starting...\n\n", argv[0]);

    printf("Starting up CUDA context...\n");
    int dev = findCudaDevice(argc, (const char **)argv);

    uint *h_InputKey, *h_InputVal, *h_OutputKeyGPU, *h_OutputValGPU;
    uint *d_InputKey, *d_InputVal,    *d_OutputKey,    *d_OutputVal;
    StopWatchInterface *hTimer = NULL;

    const uint             N = 1048576;
    const uint           DIR = 0;
    const uint     numValues = 65536;
    const uint numIterations = 1;

    printf("Allocating and initializing host arrays...\n\n");
    sdkCreateTimer(&hTimer);
    h_InputKey     = (uint *)malloc(N * sizeof(uint));
    h_InputVal     = (uint *)malloc(N * sizeof(uint));
    h_OutputKeyGPU = (uint *)malloc(N * sizeof(uint));
    h_OutputValGPU = (uint *)malloc(N * sizeof(uint));
    srand(2001);

    for (uint i = 0; i < N; i++)
    {
        h_InputKey[i] = rand() % numValues;
        h_InputVal[i] = i;
    }

    printf("Allocating and initializing CUDA arrays...\n\n");
    error = cudaMalloc((void **)&d_InputKey,  N * sizeof(uint));
    checkCudaErrors(error);
    error = cudaMalloc((void **)&d_InputVal,  N * sizeof(uint));
    checkCudaErrors(error);
    error = cudaMalloc((void **)&d_OutputKey, N * sizeof(uint));
    checkCudaErrors(error);
    error = cudaMalloc((void **)&d_OutputVal, N * sizeof(uint));
    checkCudaErrors(error);
    error = cudaMemcpy(d_InputKey, h_InputKey, N * sizeof(uint), cudaMemcpyHostToDevice);
    checkCudaErrors(error);
    error = cudaMemcpy(d_InputVal, h_InputVal, N * sizeof(uint), cudaMemcpyHostToDevice);
    checkCudaErrors(error);

    int flag = 1;
    printf("Running GPU bitonic sort (%u identical iterations)...\n\n", numIterations);

    for (uint arrayLength = 64; arrayLength <= N; arrayLength *= 2)
    {
        printf("Testing array length %u (%u arrays per batch)...\n", arrayLength, N / arrayLength);
        error = cudaDeviceSynchronize();
        checkCudaErrors(error);

        sdkResetTimer(&hTimer);
        sdkStartTimer(&hTimer);
        uint threadCount = 0;

        for (uint i = 0; i < numIterations; i++)
            threadCount = bitonicSort(
                              d_OutputKey,
                              d_OutputVal,
                              d_InputKey,
                              d_InputVal,
                              N / arrayLength,
                              arrayLength,
                              DIR
                          );

        error = cudaDeviceSynchronize();
        checkCudaErrors(error);

        sdkStopTimer(&hTimer);
        printf("Average time: %f ms\n\n", sdkGetTimerValue(&hTimer) / numIterations);

        if (arrayLength == N)
        {
            double dTimeSecs = 1.0e-3 * sdkGetTimerValue(&hTimer) / numIterations;
            printf("sortingNetworks-bitonic, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u elements, NumDevsUsed = %u, Workgroup = %u\n",
                   (1.0e-6 * (double)arrayLength/dTimeSecs), dTimeSecs, arrayLength, 1, threadCount);
        }

        printf("\nValidating the results...\n");
        printf("...reading back GPU results\n");
        error = cudaMemcpy(h_OutputKeyGPU, d_OutputKey, N * sizeof(uint), cudaMemcpyDeviceToHost);
        checkCudaErrors(error);
        error = cudaMemcpy(h_OutputValGPU, d_OutputVal, N * sizeof(uint), cudaMemcpyDeviceToHost);
        checkCudaErrors(error);

        int keysFlag = validateSortedKeys(h_OutputKeyGPU, h_InputKey, N / arrayLength, arrayLength, numValues, DIR);
        int valuesFlag = validateValues(h_OutputKeyGPU, h_OutputValGPU, h_InputKey, N / arrayLength, arrayLength);
        flag = flag && keysFlag && valuesFlag;

        printf("\n");
    }

    printf("Shutting down...\n");
    sdkDeleteTimer(&hTimer);
    cudaFree(d_OutputVal);
    cudaFree(d_OutputKey);
    cudaFree(d_InputVal);
    cudaFree(d_InputKey);
    free(h_OutputValGPU);
    free(h_OutputKeyGPU);
    free(h_InputVal);
    free(h_InputKey);

    cudaDeviceReset();
    exit(flag ? EXIT_SUCCESS : EXIT_FAILURE);
}
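
The driver calls bitonicSort, validateSortedKeys, and validateValues from the CUDA sortingNetworks sample, whose declarations are not included in the snippet. The prototypes below are inferred from the call sites and are only a sketch of what the sample's headers provide.

// Sorts N/arrayLength independent arrays of arrayLength key/value pairs each
// and returns the thread-block size it used.
extern "C" uint bitonicSort(uint *d_DstKey, uint *d_DstVal,
                            uint *d_SrcKey, uint *d_SrcVal,
                            uint batchSize, uint arrayLength, uint dir);

// Host-side checks: keys are sorted per array (and are a permutation of the
// input), and each value still indexes its original key.
extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize,
                                   uint arrayLength, uint numValues, uint dir);
extern "C" int validateValues(uint *resKey, uint *resVal, uint *srcKey,
                              uint batchSize, uint arrayLength);
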
Example #4
int
main()
{
	int i;
	struct timeval start, stop;
	FILE *fd;
	char *key;

	cudaSetDevice(0);

	/* Allocate memory */
	if ((key = (char *)malloc(40 * sizeof(char))) == NULL) {
		printf("Malloc failed!\n");
		exit(EXIT_FAILURE);
	}

	cudaMallocHost((void **) &batchKeys,
	    ((BATCH_SIZE + 1) * MAX_LEN_ALIGNED) * sizeof(char));
	cudaMallocHost((void **) &nKeys, BATCH_SIZE * sizeof(size_t));
	cudaMallocHost((void **) &batchIndex, (BATCH_SIZE + 1) * sizeof(int));
	cudaMallocHost((void **) &hashedKeys, BATCH_SIZE * sizeof(uint32_t));

	cudaMalloc((void **) &d_keys,
	    ((BATCH_SIZE + 1) * MAX_LEN_ALIGNED) * sizeof(char));
	cudaMalloc((void **) &d_len, BATCH_SIZE * sizeof(size_t));
	cudaMalloc((void **) &d_index, (BATCH_SIZE + 1) * sizeof(int));
	cudaMalloc((void **) &d_res, BATCH_SIZE * sizeof(uint32_t));

	/* Create 'BATCH_SIZE' number of random keys 
	 * and add them to batch table
	 */
	batchNo = 0;
	batchIndex[0] = 0;
	for(i = 0; i < BATCH_SIZE; i++) { 
		gen_random(key, 30);
		add_to_batch(key, 30);
	}

	/* Start Time (execution + memory) */
#ifdef EXEC_MEM
	gettimeofday(&start, NULL);
#endif // EXEC_MEM
	
	/* MemCpy Host -> Device */
	cudaMemcpy(d_keys, batchKeys, (batchIndex[BATCH_SIZE-1] +
	    strlen(&batchKeys[batchIndex[BATCH_SIZE - 1]])) * sizeof(char),
	    cudaMemcpyHostToDevice);
	cudaMemcpy(d_len, nKeys, BATCH_SIZE * sizeof(size_t),
	    cudaMemcpyHostToDevice);
	cudaMemcpy(d_index, batchIndex, BATCH_SIZE * sizeof(int),
	    cudaMemcpyHostToDevice);

	/* Start Time (execution only)*/
#ifndef EXEC_MEM
	gettimeofday(&start, NULL);
#endif // EXEC_MEM

	/* Call the kernel */
	CUDAhash(d_keys, d_index, d_len, d_res);

	/* Start Time (execution only)*/
#ifndef EXEC_MEM
	cudaDeviceSynchronize();
	gettimeofday(&stop, NULL);
#endif // EXEC_MEM

	/* MemCpy Device -> Host */
	cudaMemcpy(hashedKeys, d_res, BATCH_SIZE * sizeof(uint32_t),
	    cudaMemcpyDeviceToHost);	
	
	/* Start Time (execution + memory) */
#ifdef EXEC_MEM
	gettimeofday(&stop, NULL);
#endif // EXEC_MEM

	
#ifdef DEBUG
	for(i = 0; i < BATCH_SIZE; i++) {
		printf("%s\n", &batchKeys[batchIndex[i]]);
		printf("%u\n", hashedKeys[i]);
	}
#endif // DEBUG

	/* Print Time */
	fd = fopen("log.txt", "a+");
	fprintf(fd, "%lu", ((stop.tv_sec * USECS) + stop.tv_usec ) -
	    ((start.tv_sec * USECS) + start.tv_usec));
	fprintf(fd, "\t%1.f\n", ((double)BATCH_SIZE / 
	    ((double)(((stop.tv_sec * USECS) + stop.tv_usec ) -
	    ((start.tv_sec * USECS) + start.tv_usec)) / 1000000 )) / 1000);
	fclose(fd);

#ifdef DEBUG
	printf("Time: %lu \n", ((stop.tv_sec * USECS) + stop.tv_usec ) -
	    ((start.tv_sec * USECS) + start.tv_usec));
#endif // DEBUG
	
	/* Free memory (pinned host buffers were allocated with cudaMallocHost,
	 * so they must be released with cudaFreeHost) */
	cudaFreeHost(batchKeys);
	cudaFreeHost(nKeys);
	cudaFreeHost(hashedKeys);
	cudaFreeHost(batchIndex);
	cudaFree(d_keys);
	cudaFree(d_len);
	cudaFree(d_res);
	cudaFree(d_index);

	return 0;
}
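
gen_random() and add_to_batch() are not shown above. The sketches below are consistent with the call sites and the buffer sizes used in main(); they are assumptions, not the original code (in particular, the slot size MAX_LEN_ALIGNED and how keys are packed into batchKeys may differ).

#include <stdlib.h>
#include <string.h>

#ifndef MAX_LEN_ALIGNED
#define MAX_LEN_ALIGNED 32   /* assumed per-key slot size inside batchKeys */
#endif

/* Globals defined in the snippet's translation unit. */
extern char *batchKeys;      /* pinned buffer holding all keys back to back */
extern size_t *nKeys;        /* length of each key */
extern int *batchIndex;      /* byte offset of each key inside batchKeys */
extern int batchNo;          /* number of keys currently in the batch */

/* Fill s with len random alphanumeric characters plus a '\0' terminator. */
static void gen_random(char *s, const int len)
{
    static const char alphanum[] =
        "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
    for (int i = 0; i < len; i++)
        s[i] = alphanum[rand() % (int)(sizeof(alphanum) - 1)];
    s[len] = '\0';
}

/* Append a key to the next aligned slot and record its offset and length. */
static void add_to_batch(const char *key, size_t len)
{
    memcpy(&batchKeys[batchIndex[batchNo]], key, len + 1);
    nKeys[batchNo] = len;
    batchIndex[batchNo + 1] = batchIndex[batchNo] + MAX_LEN_ALIGNED;
    batchNo++;
}
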
Example #5
// Main ------------------------------------------------------------------------------------------
int main(int argc, char **argv) {

	const Params p(argc, argv);
	CUDASetup setcuda(p.device);
	Timer timer;
	cudaError_t cudaStatus;
	int it_cpu = 0;
	int it_gpu = 0;
	int err = 0;
#ifdef LOGS
	set_iter_interval_print(10);
	char test_info[500];
	snprintf(test_info, 500, "-i %d -g %d -t %d -f %s -l %d\n",p.n_gpu_threads, p.n_gpu_blocks,p.n_threads, p.file_name,p.switching_limit);
	start_log_file("cudaSingleSourceShortestPath", test_info);
	//printf("Com LOG\n");
#endif

	// Allocate
	int n_nodes, n_edges;
//	int n_nodes_o;
	read_input_size(n_nodes, n_edges, p);
	timer.start("Allocation");
	Node * h_nodes = (Node *) malloc(sizeof(Node) * n_nodes);

	//*************************** Allocating memory for the gold (reference) output ****************
	Gold * gold = (Gold *) malloc(sizeof(Gold) * n_nodes);
	if (p.mode == 1) {
		// **********************  Reading the gold *********************************
		read_gold(gold, p);
		// **********************************************************************
	}

	//***********************************************************************************************
	Node * d_nodes;
	cudaStatus = cudaMalloc((void**) &d_nodes, sizeof(Node) * n_nodes);
	Edge * h_edges = (Edge *) malloc(sizeof(Edge) * n_edges);
	Edge * d_edges;
	cudaStatus = cudaMalloc((void**) &d_edges, sizeof(Edge) * n_edges);
	std::atomic_int *h_color = (std::atomic_int *) malloc(
			sizeof(std::atomic_int) * n_nodes);
	int * d_color;
	cudaStatus = cudaMalloc((void**) &d_color, sizeof(int) * n_nodes);
	std::atomic_int *h_cost = (std::atomic_int *) malloc(
			sizeof(std::atomic_int) * n_nodes);
	int * d_cost;
	cudaStatus = cudaMalloc((void**) &d_cost, sizeof(int) * n_nodes);
	int * h_q1 = (int *) malloc(n_nodes * sizeof(int));
	int * d_q1;
	cudaStatus = cudaMalloc((void**) &d_q1, sizeof(int) * n_nodes);
	int * h_q2 = (int *) malloc(n_nodes * sizeof(int));
	int * d_q2;
	cudaStatus = cudaMalloc((void**) &d_q2, sizeof(int) * n_nodes);
	std::atomic_int h_head[1];
	int * d_head;
	cudaStatus = cudaMalloc((void**) &d_head, sizeof(int));
	std::atomic_int h_tail[1];
	int * d_tail;
	cudaStatus = cudaMalloc((void**) &d_tail, sizeof(int));
	std::atomic_int h_threads_end[1];
	int * d_threads_end;
	cudaStatus = cudaMalloc((void**) &d_threads_end, sizeof(int));
	std::atomic_int h_threads_run[1];
	int * d_threads_run;
	cudaStatus = cudaMalloc((void**) &d_threads_run, sizeof(int));
	int h_num_t[1];
	int * d_num_t;
	cudaStatus = cudaMalloc((void**) &d_num_t, sizeof(int));
	int h_overflow[1];
	int * d_overflow;
	cudaStatus = cudaMalloc((void**) &d_overflow, sizeof(int));
	std::atomic_int h_gray_shade[1];
	int * d_gray_shade;
	cudaStatus = cudaMalloc((void**) &d_gray_shade, sizeof(int));
	std::atomic_int h_iter[1];
	int * d_iter;
	cudaStatus = cudaMalloc((void**) &d_iter, sizeof(int));
	cudaDeviceSynchronize();
	CUDA_ERR();
	ALLOC_ERR(h_nodes, h_edges, h_color, h_cost, h_q1, h_q2);
	timer.stop("Allocation");

	// Initialize
	timer.start("Initialization");
	const int max_gpu_threads = setcuda.max_gpu_threads();
	int source;
	read_input(source, h_nodes, h_edges, p);

	for (int i = 0; i < n_nodes; i++) {
		h_cost[i].store(INF);
	}
	h_cost[source].store(0);
	for (int i = 0; i < n_nodes; i++) {
		h_color[i].store(WHITE);
	}
	h_tail[0].store(0);
	h_head[0].store(0);
	h_threads_end[0].store(0);
	h_threads_run[0].store(0);
	h_q1[0] = source;
	h_iter[0].store(0);
	h_overflow[0] = 0;
	h_gray_shade[0].store(GRAY0);
	timer.stop("Initialization");
	//timer.print("Initialization", 1);

	// Copy to device
	timer.start("Copy To Device");
	cudaStatus = cudaMemcpy(d_nodes, h_nodes, sizeof(Node) * n_nodes,
			cudaMemcpyHostToDevice);
	cudaStatus = cudaMemcpy(d_edges, h_edges, sizeof(Edge) * n_edges,
			cudaMemcpyHostToDevice);
	cudaDeviceSynchronize();
	CUDA_ERR();
	timer.stop("Copy To Device");

	for (int rep = 0; rep < p.n_reps; rep++) {

		// Reset
		for (int i = 0; i < n_nodes; i++) {
			h_cost[i].store(INF);
		}
		h_cost[source].store(0);
		for (int i = 0; i < n_nodes; i++) {
			h_color[i].store(WHITE);
		}

		it_cpu = 0;
		it_gpu = 0;

		h_tail[0].store(0);
		h_head[0].store(0);
		h_threads_end[0].store(0);
		h_threads_run[0].store(0);
		h_q1[0] = source;
		h_iter[0].store(0);
		h_overflow[0] = 0;
		h_gray_shade[0].store(GRAY0);

		// if(rep >= p.n_warmup)
		timer.start("Kernel");

#ifdef LOGS
		start_iteration();
#endif

		// Run first iteration in master CPU thread
		h_num_t[0] = 1;
		int pid;
		int index_i, index_o;
		for (index_i = 0; index_i < h_num_t[0]; index_i++) {
			pid = h_q1[index_i];
			h_color[pid].store(BLACK);
			int cur_cost = h_cost[pid].load();
			for (int i = h_nodes[pid].x; i < (h_nodes[pid].y + h_nodes[pid].x);
					i++) {
				int id = h_edges[i].x;
				int cost = h_edges[i].y;
				cost += cur_cost;
				h_cost[id].store(cost);
				h_color[id].store(GRAY0);
				index_o = h_tail[0].fetch_add(1);
				h_q2[index_o] = id;
			}
		}
		h_num_t[0] = h_tail[0].load();
		h_tail[0].store(0);
		h_threads_run[0].fetch_add(1);
		h_gray_shade[0].store(GRAY1);
		h_iter[0].fetch_add(1);
		// if(rep >= p.n_warmup)
		timer.stop("Kernel");

		// Pointers to input and output queues
		int * h_qin = h_q2;
		int * h_qout = h_q1;
		int * d_qin = d_q2;
		int * d_qout = d_q1;

		const int CPU_EXEC = (p.n_threads > 0) ? 1 : 0;
		const int GPU_EXEC =
				(p.n_gpu_blocks > 0 && p.n_gpu_threads > 0) ? 1 : 0;

		// Run subsequent iterations on CPU or GPU until number of input queue elements is 0
		while (*h_num_t != 0) {

			if ((*h_num_t < p.switching_limit || GPU_EXEC == 0)
					&& CPU_EXEC == 1) { // If the number of input queue elements is lower than switching_limit
				it_cpu = it_cpu + 1;
				//   if(rep >= p.n_warmup)
				timer.start("Kernel");

				// Continue until switching_limit condition is not satisfied
				while ((*h_num_t != 0)
						&& (*h_num_t < p.switching_limit || GPU_EXEC == 0)
						&& CPU_EXEC == 1) {

					// Swap queues
					if (h_iter[0] % 2 == 0) {
						h_qin = h_q1;
						h_qout = h_q2;
					} else {
						h_qin = h_q2;
						h_qout = h_q1;
					}

					std::thread main_thread(run_cpu_threads, h_nodes, h_edges,
							h_cost, h_color, h_qin, h_qout, h_num_t, h_head,
							h_tail, h_threads_end, h_threads_run, h_gray_shade,
							h_iter, p.n_threads, p.switching_limit, GPU_EXEC);
					main_thread.join();

					h_num_t[0] = h_tail[0].load(); // Number of elements in output queue
					h_tail[0].store(0);
					h_head[0].store(0);
					if (h_iter[0].load() % 2 == 0)
						h_gray_shade[0].store(GRAY0);
					else
						h_gray_shade[0].store(GRAY1);
				}

				//  if(rep >= p.n_warmup)
				timer.stop("Kernel");

			} else if ((*h_num_t >= p.switching_limit || CPU_EXEC == 0)
					&& GPU_EXEC == 1) { // If the number of input queue elements is higher than or equal to switching_limit
				it_gpu = it_gpu + 1;
				//  if(rep >= p.n_warmup)
				timer.start("Copy To Device");
				cudaStatus = cudaMemcpy(d_cost, h_cost, sizeof(int) * n_nodes,
						cudaMemcpyHostToDevice);
				cudaStatus = cudaMemcpy(d_color, h_color, sizeof(int) * n_nodes,
						cudaMemcpyHostToDevice);
				cudaStatus = cudaMemcpy(d_threads_run, h_threads_run,
						sizeof(int), cudaMemcpyHostToDevice);
				cudaStatus = cudaMemcpy(d_threads_end, h_threads_end,
						sizeof(int), cudaMemcpyHostToDevice);
				cudaStatus = cudaMemcpy(d_overflow, h_overflow, sizeof(int),
						cudaMemcpyHostToDevice);
				cudaStatus = cudaMemcpy(d_q1, h_q1, sizeof(int) * n_nodes,
						cudaMemcpyHostToDevice);
				cudaStatus = cudaMemcpy(d_q2, h_q2, sizeof(int) * n_nodes,
						cudaMemcpyHostToDevice);
				cudaStatus = cudaMemcpy(d_iter, h_iter, sizeof(int),
						cudaMemcpyHostToDevice);
				cudaDeviceSynchronize();
				CUDA_ERR();
				//  if(rep >= p.n_warmup)
				timer.stop("Copy To Device");

				// Continue until switching_limit condition is not satisfied
				while ((*h_num_t != 0)
						&& (*h_num_t >= p.switching_limit || CPU_EXEC == 0)
						&& GPU_EXEC == 1) {

					// Swap queues
					if (h_iter[0] % 2 == 0) {
						d_qin = d_q1;
						d_qout = d_q2;
					} else {
						d_qin = d_q2;
						d_qout = d_q1;
					}

					//  if(rep >= p.n_warmup)
					timer.start("Copy To Device");
					cudaStatus = cudaMemcpy(d_num_t, h_num_t, sizeof(int),
							cudaMemcpyHostToDevice);
					cudaStatus = cudaMemcpy(d_tail, h_tail, sizeof(int),
							cudaMemcpyHostToDevice);
					cudaStatus = cudaMemcpy(d_head, h_head, sizeof(int),
							cudaMemcpyHostToDevice);
					cudaStatus = cudaMemcpy(d_gray_shade, h_gray_shade,
							sizeof(int), cudaMemcpyHostToDevice);
					cudaDeviceSynchronize();
					CUDA_ERR();
					//  if(rep >= p.n_warmup)
					timer.stop("Copy To Device");

					//  if(rep >= p.n_warmup)
					timer.start("Kernel");
					assert(
							p.n_gpu_threads <= max_gpu_threads
									&& "The thread block size is greater than the maximum thread block size that can be used on this device");
					cudaStatus = call_SSSP_gpu(p.n_gpu_blocks, p.n_gpu_threads,
							d_nodes, d_edges, d_cost, d_color, d_qin, d_qout,
							d_num_t, d_head, d_tail, d_threads_end,
							d_threads_run, d_overflow, d_gray_shade, d_iter,
							p.switching_limit, CPU_EXEC,
							sizeof(int) * (W_QUEUE_SIZE + 3));
					cudaDeviceSynchronize();
					CUDA_ERR();
					//  if(rep >= p.n_warmup)
					timer.stop("Kernel");

					// if(rep >= p.n_warmup)
					timer.start("Copy Back and Merge");
					cudaStatus = cudaMemcpy(h_tail, d_tail, sizeof(int),
							cudaMemcpyDeviceToHost);
					cudaStatus = cudaMemcpy(h_iter, d_iter, sizeof(int),
							cudaMemcpyDeviceToHost);
					cudaDeviceSynchronize();
					CUDA_ERR();
					//  if(rep >= p.n_warmup)
					timer.stop("Copy Back and Merge");

					h_num_t[0] = h_tail[0].load(); // Number of elements in output queue
					h_tail[0].store(0);
					h_head[0].store(0);
					if (h_iter[0].load() % 2 == 0)
						h_gray_shade[0].store(GRAY0);
					else
						h_gray_shade[0].store(GRAY1);
				}

				// if(rep >= p.n_warmup)
				timer.start("Copy Back and Merge");
				cudaStatus = cudaMemcpy(h_cost, d_cost, sizeof(int) * n_nodes,
						cudaMemcpyDeviceToHost);
				cudaStatus = cudaMemcpy(h_color, d_color, sizeof(int) * n_nodes,
						cudaMemcpyDeviceToHost);
				cudaStatus = cudaMemcpy(h_threads_run, d_threads_run,
						sizeof(int), cudaMemcpyDeviceToHost);
				cudaStatus = cudaMemcpy(h_threads_end, d_threads_end,
						sizeof(int), cudaMemcpyDeviceToHost);
				cudaStatus = cudaMemcpy(h_overflow, d_overflow, sizeof(int),
						cudaMemcpyDeviceToHost);
				cudaStatus = cudaMemcpy(h_q1, d_q1, sizeof(int) * n_nodes,
						cudaMemcpyDeviceToHost);
				cudaStatus = cudaMemcpy(h_q2, d_q2, sizeof(int) * n_nodes,
						cudaMemcpyDeviceToHost);
				cudaDeviceSynchronize();
				CUDA_ERR();
				// if(rep >= p.n_warmup)
				timer.stop("Copy Back and Merge");
			}
		}

#ifdef LOGS
		end_iteration();
#endif

		// printf("IT CPU:%d\t",it_cpu);
		//printf("IT GPU:%d\n",it_gpu);
		if (p.mode == 1) {
			err = newest_verify(h_cost, n_nodes, n_nodes, gold, it_cpu, it_gpu);
		}		//err=new_verify(h_cost, n_nodes,,it_cpu,it_gpu);

		if (err > 0) {
			printf("Errors: %d\n", err);
			read_input(source, h_nodes, h_edges, p);
			read_gold(gold, p);
		} else {
			printf(".ITERATION %d\n", rep);
		}
#ifdef LOGS
		log_error_count(err);
#endif
		// Read the input again
		//read_input(source, h_nodes, h_edges, p);
		//read_gold(gold,p);

	} // end of iteration
#ifdef LOGS
	end_log_file();
#endif
	// timer.print("Allocation", 1);
	//timer.print("Copy To Device", p.n_reps);
	// timer.print("Kernel", p.n_reps);
	// timer.print("Copy Back and Merge", p.n_reps);
	if (p.mode == 0) {
		create_output(h_cost, n_nodes, n_edges, std::string(p.comparison_file));
	}
	// Verify answer
	verify(h_cost, n_nodes, p.comparison_file);

	// Free memory
	timer.start("Deallocation");
	free(h_nodes);
	free(h_edges);
	free(h_color);
	free(h_cost);
	free(h_q1);
	free(h_q2);
	cudaStatus = cudaFree(d_nodes);
	cudaStatus = cudaFree(d_edges);
	cudaStatus = cudaFree(d_cost);
	cudaStatus = cudaFree(d_color);
	cudaStatus = cudaFree(d_q1);
	cudaStatus = cudaFree(d_q2);
	cudaStatus = cudaFree(d_num_t);
	cudaStatus = cudaFree(d_head);
	cudaStatus = cudaFree(d_tail);
	cudaStatus = cudaFree(d_threads_end);
	cudaStatus = cudaFree(d_threads_run);
	cudaStatus = cudaFree(d_overflow);
	cudaStatus = cudaFree(d_iter);
	cudaStatus = cudaFree(d_gray_shade);
	CUDA_ERR();
	cudaDeviceSynchronize();
	timer.stop("Deallocation");
	//timer.print("Deallocation", 1);

	// Release timers
	timer.release("Allocation");
	timer.release("Initialization");
	timer.release("Copy To Device");
	timer.release("Kernel");
	timer.release("Copy Back and Merge");
	timer.release("Deallocation");

	printf("Test Passed\n");
	return 0;
}
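
CUDA_ERR() and ALLOC_ERR() are error-checking macros used throughout the host code but not defined in the snippet. A plausible minimal version of CUDA_ERR() is sketched below (an assumption, not the benchmark's own macro); ALLOC_ERR() would similarly abort if any of the host pointers passed to it were NULL.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Abort on any pending CUDA error (typically called right after a
// cudaDeviceSynchronize(), as in the code above).
#define CUDA_ERR()                                                        \
    do {                                                                  \
        cudaError_t e_ = cudaGetLastError();                              \
        if (e_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error: %s (%s:%d)\n",                   \
                    cudaGetErrorString(e_), __FILE__, __LINE__);          \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)
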
Example #6
int main(int argc, char *argv[])
{
    // needed to work correctly with piped benchmarkrunner
    setlinebuf(stdout);
    setlinebuf(stdin);

    int n_indices = 1;
    int n_dimensions = 1;
    char inBuf[200]; // ridiculously large input buffer.
    
    bool isFirst = true;

  do {

    // Allocate memory for the arrays
    int *h_indices = 0;
    double        *h_outputGPU  = 0;

    try
    {
        h_indices = new int [n_indices * n_dimensions];
        h_outputGPU  = new double [n_indices * n_dimensions];
    }
    catch (const std::exception &e)
    {
        std::cerr << "Caught exception: " << e.what() << std::endl;
        std::cerr << "Unable to allocate CPU memory (try running with fewer vectors/dimensions)" << std::endl;
        return -1;
    }

    int *d_indices;
    double        *d_output;

    try
    {
        cudaError_t cudaResult;
        cudaResult = cudaMalloc((void **)&d_indices, n_dimensions * n_indices * sizeof(int));

        if (cudaResult != cudaSuccess)
        {
            throw std::runtime_error(cudaGetErrorString(cudaResult));
        }
    }
    catch (const std::runtime_error &e)
    {
        std::cerr << "Caught exception: " << e.what() << std::endl;
        std::cerr << "Unable to allocate GPU memory (try running with fewer vectors/dimensions)" << std::endl;
        return -1;
    }

    // Initialize the indices (done on the host)
    for(int i = 0; i < n_indices; i++) {
      h_indices[i] = i;
    }

    // Copy the indices to the device
    cudaMemcpy(d_indices, h_indices, n_dimensions * n_indices * sizeof(int), cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();

    // Execute the QRNG on the device
    int n_vec;
    sobol_nikola_unsimplified(n_indices, d_indices, n_indices, &d_output, &n_vec);

    cudaDeviceSynchronize();

    cudaMemcpy(h_outputGPU, d_output, n_indices * n_dimensions * sizeof(double), cudaMemcpyDeviceToHost);

    // Cleanup and terminate
    delete [] h_indices;
    cudaFree(d_indices);
    cudaFree(d_output);

    if(!isFirst) {
      printf("RESULT ");

      for(int i = 0; i < std::min(n_indices,10); i++)
        printf("%f ", h_outputGPU[i]);

      printf("\n");
    }
    else {
      printf("OK\n");
      isFirst = false;
    }

    delete [] h_outputGPU;

      if (fgets(inBuf, 200, stdin) == NULL)
        break;

      if (sscanf(inBuf, "%d", &n_indices) == 0)
      {
        // if input is not a number, it has to be "EXIT"
        if (strncmp("EXIT",inBuf,4)==0)
        {
          printf("OK\n");
          break;
        }
        else
        {
          printf("ERROR. Bad input: %s\n", inBuf);
          break;
        }
      }

    } while (true);

    cudaDeviceReset();
    return -1;
}
Example #7
int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
	int dev_id = device_map[thr_id];
	uint32_t *pdata = work->data;
	uint32_t *ptarget = work->target;
	const uint32_t first_nonce = pdata[19];
	const int swap = 1; // to toggle nonce endian

	uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 256*256*8
	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);

	if (opt_benchmark)
		((uint32_t*)ptarget)[7] = 0;

	if (!init[thr_id])
	{
		cudaSetDevice(dev_id);
		if (opt_cudaschedule == -1 && gpu_threads == 1) {
			cudaDeviceReset();
			// reduce cpu usage
			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
			CUDA_LOG_ERROR();
		}

		cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput);

		quark_skein512_cpu_init(thr_id, throughput);
		cuda_check_cpu_init(thr_id, throughput);

		CUDA_SAFE_CALL(cudaDeviceSynchronize());

		init[thr_id] = true;
	}

	uint32_t endiandata[20];
	for (int k=0; k < 19; k++)
		be32enc(&endiandata[k], pdata[k]);

	skein512_cpu_setBlock_80((void*)endiandata);
	cuda_check_cpu_setTarget(ptarget);

	do {
		int order = 0;

		// Hash with CUDA
		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], swap);
		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

		*hashes_done = pdata[19] - first_nonce + throughput;

		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
		if (foundNonce != UINT32_MAX)
		{
			uint32_t _ALIGN(64) vhash64[8];

			endiandata[19] = swab32_if(foundNonce, swap);
			skein2hash(vhash64, endiandata);

			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
				int res = 1;
				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
				work_set_target_ratio(work, vhash64);
				if (secNonce != 0) {
					if (!opt_quiet)
						applog(LOG_BLUE, "GPU #%d: found second nonce %08x !", dev_id, swab32(secNonce));

					endiandata[19] = swab32_if(secNonce, swap);
					skein2hash(vhash64, endiandata);
					if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio)
						work_set_target_ratio(work, vhash64);
					pdata[21] = swab32_if(secNonce, !swap);
					res++;
				}
				pdata[19] = swab32_if(foundNonce, !swap);
				return res;
			} else {
				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
			}
		}

		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
			pdata[19] = max_nonce;
			break;
		}

		pdata[19] += throughput;

	} while (!work_restart[thr_id].restart);

	*hashes_done = pdata[19] - first_nonce;

	return 0;
}
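
The scanhash routine relies on per-GPU globals (init[] and d_hash[]) that sit outside the snippet. The declarations below are a sketch in ccminer's usual style; MAX_GPUS and the exact definitions are assumptions.

#include <stdbool.h>
#include <stdint.h>

#ifndef MAX_GPUS
#define MAX_GPUS 16                         /* assumed upper bound on GPU threads */
#endif

static bool init[MAX_GPUS] = { 0 };         /* set once a thread's buffers are allocated */
static uint32_t *d_hash[MAX_GPUS] = { 0 };  /* per-GPU device buffer, 64 bytes per nonce */
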
Example #8
int main(int argc, char **argv) {
  uchar4        *h_rgbaImage, *d_rgbaImage;
  unsigned char *h_greyImage, *d_greyImage;

  std::string input_file;
  std::string output_file;
  std::string reference_file;
  double perPixelError = 0.0;
  double globalError   = 0.0;
  bool useEpsCheck = false;
  switch (argc)
  {
    case 2:
      input_file = std::string(argv[1]);
      output_file = "HW1_output.png";
      reference_file = "HW1_reference.png";
      break;
    case 3:
      input_file  = std::string(argv[1]);
      output_file = std::string(argv[2]);
      reference_file = "HW1_reference.png";
      break;
    case 4:
      input_file  = std::string(argv[1]);
      output_file = std::string(argv[2]);
      reference_file = std::string(argv[3]);
      break;
    case 6:
      useEpsCheck = true;
      input_file  = std::string(argv[1]);
      output_file = std::string(argv[2]);
      reference_file = std::string(argv[3]);
      perPixelError = atof(argv[4]);
      globalError   = atof(argv[5]);
      break;
    default:
      std::cerr << "Usage: ./HW1 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
      exit(1);
  }
  //load the image and give us our input and output pointers
  preProcess(&h_rgbaImage, &h_greyImage, &d_rgbaImage, &d_greyImage, input_file);

  GpuTimer timer;
  timer.Start();
  //call the students' code
  lineDetect(h_rgbaImage, d_rgbaImage, d_greyImage, numRows(), numCols());
  timer.Stop();
  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());

  int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());

  if (err < 0) {
    //Couldn't print! Probably the student closed stdout - bad news
    std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
    exit(1);
  }

  size_t numPixels = numRows()*numCols();
  checkCudaErrors(cudaMemcpy(h_greyImage, d_greyImage, sizeof(unsigned char) * numPixels, cudaMemcpyDeviceToHost));

  //check results and output the grey image
  postProcess(output_file, h_greyImage);

  referenceCalculation(h_rgbaImage, h_greyImage, numRows(), numCols());

  postProcess(reference_file, h_greyImage);

  //generateReferenceImage(input_file, reference_file);
  compareImages(reference_file, output_file, useEpsCheck, perPixelError, 
                globalError);

  cleanup();

  return 0;
}
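
preProcess, postProcess, numRows, numCols, and cleanup come from the course harness rather than from this file. As a rough sketch, assuming the harness keeps the loaded image in a global cv::Mat named imageRGBA, the two size accessors could look like this (an assumption for illustration only):

#include <opencv2/core/core.hpp>

extern cv::Mat imageRGBA;   // image loaded by preProcess() (assumed)

size_t numRows() { return imageRGBA.rows; }
size_t numCols() { return imageRGBA.cols; }
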
Example #9
int main(int argc, char **argv) {
  uchar4 *h_inputImageRGBA,  *d_inputImageRGBA;
  uchar4 *h_outputImageRGBA, *d_outputImageRGBA;
  unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred;

  float *h_filter;
  int    filterWidth;

  std::string input_file;
  std::string output_file;
  std::string reference_file;
  double perPixelError = 0.0;
  double globalError   = 0.0;
  bool useEpsCheck = false;
  switch (argc)
  {
    case 2:
      input_file = std::string(argv[1]);
      output_file = "HW2_output.png";
      reference_file = "HW2_reference.png";
      break;
    case 3:
      input_file  = std::string(argv[1]);
      output_file = std::string(argv[2]);
      reference_file = "HW2_reference.png";
      break;
    case 4:
      input_file  = std::string(argv[1]);
      output_file = std::string(argv[2]);
      reference_file = std::string(argv[3]);
      break;
    case 6:
      useEpsCheck = true;
      input_file  = std::string(argv[1]);
      output_file = std::string(argv[2]);
      reference_file = std::string(argv[3]);
      perPixelError = atof(argv[4]);
      globalError   = atof(argv[5]);
      break;
    default:
      std::cerr << "Usage: ./HW2 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
      exit(1);
  }
  //load the image and give us our input and output pointers
  preProcess(&h_inputImageRGBA, &h_outputImageRGBA, &d_inputImageRGBA, &d_outputImageRGBA,
             &d_redBlurred, &d_greenBlurred, &d_blueBlurred,
             &h_filter, &filterWidth, input_file);

  allocateMemoryAndCopyToGPU(numRows(), numCols(), h_filter, filterWidth);
  GpuTimer timer;
  timer.Start();
  //call the students' code
  your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA, numRows(), numCols(),
                     d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth);
  timer.Stop();
  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
  int err = printf("Your GPU code ran in: %f msecs.\n", timer.Elapsed());

  if (err < 0) {
    //Couldn't print! Probably the student closed stdout - bad news
    std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
    exit(1);
  }

  //check results and output the blurred image

  size_t numPixels = numRows()*numCols();
  //copy the output back to the host
  checkCudaErrors(cudaMemcpy(h_outputImageRGBA, d_outputImageRGBA__, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost));

  postProcess(output_file, h_outputImageRGBA);

  timer.Start();
  referenceCalculation(h_inputImageRGBA, h_outputImageRGBA,
                       numRows(), numCols(),
                       h_filter, filterWidth);
  timer.Stop();
  printf("Your CPU code ran in: %f msecs.\n", timer.Elapsed());

  postProcess(reference_file, h_outputImageRGBA);

    //  Cheater easy way with OpenCV
    //generateReferenceImage(input_file, reference_file, filterWidth);

  compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);

  checkCudaErrors(cudaFree(d_redBlurred));
  checkCudaErrors(cudaFree(d_greenBlurred));
  checkCudaErrors(cudaFree(d_blueBlurred));

  cleanUp();

  return 0;
}
Example #10
int main(int argc, char *argv[]) {

   int i,j,k,n;

   int nx = NX;
   int ny = NY;
   int nz = NZ;
   int nsteps = NSTEPS;

   if( argc >= 4 ) {
      nx = atoi( argv[1] );
      ny = atoi( argv[2] );
      nz = atoi( argv[3] );
   }
   if( argc >=5 )
      nsteps = atoi( argv[4] );

   StartTimer();

   size_t nbytes = nx * ny * nz * sizeof(float);

   float *restrict x  = (float*)malloc( nbytes );
   float *restrict y  = (float*)malloc( nbytes );
   float *restrict z  = (float*)malloc( nbytes );
   float *restrict f  = (float*)malloc( nbytes );
   float *restrict g  = (float*)malloc( nbytes );
   float *restrict fp = (float*)malloc( nbytes );
   float *restrict gp = (float*)malloc( nbytes );
   if( 0==x || 0==y || 0==z || 0==f || 0==g || 0==fp || 0==gp ) {
      printf( "couldn't allocate fields on the host\n" );
      return (-1);
   }

   float dx = 2.0f/(nx-1);
   float dy = 2.0f/(ny-1);
   float dz = 2.0f/(nz-1);
   float dt = 0.00000005f; // for numerical stability the time step must satisfy dt < dx

   // initialize the grid to run from -1 to 1 in each direction
   for (i=0; i<nx; i++) {
      for (j=0; j<ny; j++) {
         for (k=0; k<nz; k++) {
            int offset = OFFSET(i, j, k, ny, nz);
            x[offset] = -1.0f + (i)*dx;
            y[offset] = -1.0f + (j)*dy;
            z[offset] = -1.0f + (k)*dz;
         }
      }
   } 

   // initialize the field to be a gaussian
   for (i=0; i<nx; i++) {
      for (j=0; j<ny; j++) {
         for (k=0; k<nz; k++) {
            int offset = OFFSET(i, j, k, ny, nz);
            f[offset] = 0.2f*exp( - ( x[offset]*x[offset] + 
                                      y[offset]*y[offset] + 
                                      z[offset]*z[offset] ) / 0.05f);
            g[offset] = 0.0f;
         }
      }
   } 

   // output the initial data; when there is an even number of points,
   // pick the line closest to a coordinate axis
   FILE *fPtr = fopen("wave3d.xline", "w");
   for (i=0; i<nx; i++) {
      int offset = OFFSET(i, ny/2, nz/2, ny, nz);
      fprintf(fPtr,"%5.3f %10.6e\n",x[offset],f[offset]);
   }
   fprintf(fPtr,"\n");

   float step = 0.0f;
   int printevery = 20;
   printf("step = %9.6f \n",step);

   cudaProfilerStart();
   #pragma acc enter data copyin(x[0:nx*ny*nz], y[0:nx*ny*nz], z[0:nx*ny*nz], f[0:nx*ny*nz], g[0:nx*ny*nz]) 
   #pragma acc enter data create(fp[0:nx*ny*nz], gp[0:nx*ny*nz])
   {

      for (n=0; n<nsteps; n++) {

         step = step + dt;
    
         if (((n+1)%printevery)==0)
            printf("step = %9.6f \n",step);
    
         #pragma acc kernels
         {

            // predictor
            #pragma acc loop independent collapse(2) gang
            for (i=0; i<nx; i++) {
               for (j=0; j<ny; j++) {
                  #pragma acc loop independent vector
                  for (k=0; k<nz; k++) {
                     int offset = OFFSET(i, j, k, ny, nz);
                     fp[offset] = f[offset] + dt * g[offset];
                  }
               }
            } 
      
            // static boundaries
            #pragma acc loop independent collapse(2)
            for (j=0; j<ny; j++) {
               for (k=0; k<nz; k++) {
                  int xbeg = OFFSET(0,    j, k, ny, nz);
                  int xend = OFFSET(nx-1, j, k, ny, nz);
                  gp[xbeg] = g[xbeg];
                  gp[xend] = g[xend];
               }
            } 
      
            #pragma acc loop independent collapse(2)
            for (i=0; i<nx; i++) {
               for (k=0; k<nz; k++) {
                  int ybeg = OFFSET(i,    0, k, ny, nz);
                  int yend = OFFSET(i, ny-1, k, ny, nz);
                  gp[ybeg] = g[ybeg];
                  gp[yend] = g[yend];
               }
            } 
      
            #pragma acc loop independent collapse(2)
            for (i=0; i<nx; i++) {
               for (j=0; j<ny; j++) {
                  int zbeg = OFFSET(i, j,    0, ny, nz); 
                  int zend = OFFSET(i, j, nz-1, ny, nz); 
                  gp[zbeg] = g[zbeg];
                  gp[zend] = g[zend];
               }
            } 
      
            // use the predictor to update gp
            #pragma acc loop independent collapse(2) gang
            for (i=1; i<nx-1; i++) {
               for (j=1; j<ny-1; j++) {
                  #pragma acc loop independent vector
                  for (k=1; k<nz-1; k++) {
                     int current = OFFSET(i, j, k, ny, nz);

                     int next_x = OFFSET(i+1,   j,   k,   ny, nz);
                     int next_y = OFFSET(i,   j+1,   k,   ny, nz);
                     int next_z = OFFSET(i,     j, k+1,   ny, nz);

                     int prev_x = OFFSET(i-1,   j,   k,   ny, nz);
                     int prev_y = OFFSET(i,   j-1,   k,   ny, nz);
                     int prev_z = OFFSET(i,     j, k-1,   ny, nz);

                     gp[current] = g[current] + dt * (
                                   (fp[next_x] - 2.0f * fp[current] + fp[prev_x]) / dx / dx +
                                   (fp[next_y] - 2.0f * fp[current] + fp[prev_y]) / dy / dy +
                                   (fp[next_z] - 2.0f * fp[current] + fp[prev_z]) / dz / dz );
                  }
               }
            } 
      
            // use the average g's to update f
            #pragma acc loop independent collapse(2) gang
            for (i=0; i<nx; i++) {
               for (j=0; j<ny; j++) {
                  #pragma acc loop independent vector
                  for (k=0; k<nz; k++) {
                     int offset = OFFSET(i, j, k, ny, nz);
                     fp[offset] = f[offset] + dt * (0.5f * (g[offset] + gp[offset]));
                  }
               }
            } 
      
            // now update all the variables
            #pragma acc loop independent collapse(2) gang
            for (i=0; i<nx; i++) {
               for (j=0; j<ny; j++) {
                  #pragma acc loop independent vector
                  for (int k=0; k<nz; k++) {
                     int offset = OFFSET(i, j, k, ny, nz);
                     f[offset] = fp[offset];
                     g[offset] = gp[offset];
                  }
               }
            } 
      
         } // pragma acc kernels
    
         if (((n+1)%printevery)==0) {
            #pragma acc update host(x[0:nx*(ny*nz)], f[0:nx*(ny*nz)])
            for (i=0; i<nx; i++) {
               int offset = OFFSET(i, ny/2, nz/2, ny, nz);
               fprintf(fPtr,"%5.3f %10.6e\n",x[offset],f[offset]);
            }
            fprintf(fPtr,"\n");
         }
    
    
      } // for nsteps

   } // pragma acc data

   cudaProfilerStop();
   cudaDeviceSynchronize();
   fclose(fPtr);

   free(x);
   free(y);
   free(z);
   free(f);
   free(g);
   free(fp);
   free(gp);

   float totalTime = GetTimer();
   printf("Total time: %f seconds\n", totalTime / 1000.0f);

   exit(0);
}
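
The OFFSET macro and the default problem sizes used above are not shown. The usage (k varying fastest, i selecting x-planes) implies a row-major linearization like the sketch below; the exact original definitions and default values are assumptions.

// Linearize a 3-D index (i, j, k) for an nx*ny*nz array stored with k fastest.
#define OFFSET(i, j, k, ny, nz) (((i) * (ny) + (j)) * (nz) + (k))

// Default grid size and step count when no command-line arguments are given
// (assumed values).
#define NX     128
#define NY     128
#define NZ     128
#define NSTEPS 100
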
Example #11
int main(int argc, char *argv[])
{
  typedef int                 IndexType;
  typedef double              ValueType;
  typedef cusp::device_memory MemorySpace;
  //typedef cusp::row_major     Orientation;

  bool success = true;
  bool verbose = false;
  try {

    // Setup command line options
    Teuchos::CommandLineProcessor CLP;
    CLP.setDocString("This test performance of block multiply routines.\n");
    IndexType n = 32;
    CLP.setOption("n", &n, "Number of mesh points in the each direction");
    IndexType nrhs_begin = 32;
    CLP.setOption("begin", &nrhs_begin,
                  "Staring number of right-hand-sides");
    IndexType nrhs_end = 512;
    CLP.setOption("end", &nrhs_end,
                  "Ending number of right-hand-sides");
    IndexType nrhs_step = 32;
    CLP.setOption("step", &nrhs_step,
                  "Increment in number of right-hand-sides");
    IndexType nits = 10;
    CLP.setOption("nits", &nits,
                  "Number of multiply iterations");
    int device_id = 0;
    CLP.setOption("device", &device_id, "CUDA device ID");
    CLP.parse( argc, argv );

    // Set CUDA device
    cudaSetDevice(device_id);
    cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);

    // create 3D Poisson problem
    cusp::csr_matrix<IndexType, ValueType, MemorySpace> A;
    cusp::gallery::poisson27pt(A, n, n, n);

    std::cout << "nrhs , num_rows , num_entries , row_time , row_gflops , "
              << "col_time , col_gflops" << std::endl;

    for (IndexType nrhs = nrhs_begin; nrhs <= nrhs_end; nrhs += nrhs_step) {

      double flops =
        2.0 * static_cast<double>(A.num_entries) * static_cast<double>(nrhs);

      // test row-major storage
      cusp::array2d<ValueType, MemorySpace, cusp::row_major> x_row(
        A.num_rows, nrhs, 1);
      cusp::array2d<ValueType, MemorySpace, cusp::row_major> y_row(
        A.num_rows, nrhs, 0);

      cusp::detail::timer row_timer;
      row_timer.start();
      for (IndexType iter=0; iter<nits; ++iter) {
        cusp::MVmultiply(A, x_row, y_row);
      }
      cudaDeviceSynchronize();
      double row_time = row_timer.seconds_elapsed() / nits;
      double row_gflops = 1.0e-9 * flops / row_time;

      // test column-major storage
      cusp::array2d<ValueType, MemorySpace, cusp::column_major> x_col(
        A.num_rows, nrhs, 1);
      cusp::array2d<ValueType, MemorySpace, cusp::column_major> y_col(
        A.num_rows, nrhs, 0);

      cusp::detail::timer col_timer;
      col_timer.start();
      for (IndexType iter=0; iter<nits; ++iter) {
        cusp::MVmultiply(A, x_col, y_col);
      }
      cudaDeviceSynchronize();
      double col_time = col_timer.seconds_elapsed() / nits;
      double col_gflops = 1.0e-9 * flops / col_time;

      std::cout << nrhs << " , "
                << A.num_rows << " , "  << A.num_entries << " , "
                << row_time << " , " << row_gflops << " , "
                << col_time << " , " << col_gflops
                << std::endl;

    }

  }
  TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);

  if (success)
    return 0;
  return -1;
}
Example #12
///////////////////////
// Main program entry
///////////////////////
int main(int argc, char** argv)
{
	unsigned int max_iters, Nx, Ny, Nz, blockX, blockY, blockZ;
	int rank, numberOfProcesses;

	if (argc == 8)
	{
		Nx = atoi(argv[1]);
		Ny = atoi(argv[2]);
		Nz = atoi(argv[3]);
		max_iters = atoi(argv[4]);
		blockX = atoi(argv[5]);
		blockY = atoi(argv[6]);
		blockZ = atoi(argv[7]);
	}
	else
	{
		printf("Usage: %s nx ny nz i block_x block_y block_z\n", argv[0]);
		exit(1);
	}

	InitializeMPI(&argc, &argv, &rank, &numberOfProcesses);
	AssignDevices(rank);
	ECCCheck(rank);

	// Define constants
	const _DOUBLE_ L = 1.0;
	const _DOUBLE_ h = L/(Nx+1);
	const _DOUBLE_ dt = h*h/6.0;
	const _DOUBLE_ beta = dt/(h*h);
	const _DOUBLE_ c0 = beta;
	const _DOUBLE_ c1 = (1-6*beta);

	// Copy constants to Constant Memory on the GPUs
	CopyToConstantMemory(c0, c1);

	// Decompose along the z-axis
	const int _Nz = Nz/numberOfProcesses;
	const int dt_size = sizeof(_DOUBLE_);

    // Host memory allocations
    _DOUBLE_ *u_new, *u_old;
    _DOUBLE_ *h_Uold;

    u_new = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
    u_old = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));

    if (rank == 0)
    {
    	h_Uold = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2)); 
    }

    init(u_old, u_new, h, Nx, Ny, Nz);

    // Allocate and generate host subdomains
    _DOUBLE_ *h_s_Uolds, *h_s_Unews, *h_s_rbuf[numberOfProcesses];
    _DOUBLE_ *left_send_buffer, *left_receive_buffer;
    _DOUBLE_ *right_send_buffer, *right_receive_buffer;

    h_s_Unews = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_Nz+2));
    h_s_Uolds = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_Nz+2));

#if defined(DEBUG) || defined(_DEBUG)
  if (rank == 0)
  {
    for (int i = 0; i < numberOfProcesses; i++)
    {
        h_s_rbuf[i] = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_Nz+2));
        checkCuda(cudaHostAlloc((void**)&h_s_rbuf[i], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
    }
  }
#endif

    right_send_buffer = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH));
    left_send_buffer = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH));
    right_receive_buffer = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH));
    left_receive_buffer = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH));

    checkCuda(cudaHostAlloc((void**)&h_s_Unews, dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&h_s_Uolds, dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));

    checkCuda(cudaHostAlloc((void**)&right_send_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&left_send_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&right_receive_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&left_receive_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));

    init_subdomain(h_s_Uolds, u_old, Nx, Ny, _Nz, rank);

	// GPU stream operations
	cudaStream_t compute_stream;
	cudaStream_t data_stream;

	checkCuda(cudaStreamCreate(&compute_stream));
	checkCuda(cudaStreamCreate(&data_stream));

	// GPU Memory Operations
	size_t pitch_bytes, pitch_gc_bytes;

  _DOUBLE_ *d_s_Unews, *d_s_Uolds;
  _DOUBLE_ *d_right_send_buffer, *d_left_send_buffer;
  _DOUBLE_ *d_right_receive_buffer, *d_left_receive_buffer;

  checkCuda(cudaMallocPitch((void**)&d_s_Uolds, &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
  checkCuda(cudaMallocPitch((void**)&d_s_Unews, &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));

  checkCuda(cudaMallocPitch((void**)&d_left_send_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
  checkCuda(cudaMallocPitch((void**)&d_left_receive_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
  checkCuda(cudaMallocPitch((void**)&d_right_send_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
  checkCuda(cudaMallocPitch((void**)&d_right_receive_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));

	// Copy subdomains from host to device and get walltime
	double HtD_timer = 0.;

	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
	HtD_timer -= MPI_Wtime();
  MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

  checkCuda(cudaMemcpy2D(d_s_Uolds, pitch_bytes, h_s_Uolds, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));
  checkCuda(cudaMemcpy2D(d_s_Unews, pitch_bytes, h_s_Unews, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));

  MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
	HtD_timer += MPI_Wtime();
	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

	unsigned int ghost_width = 1;

	int pitch = pitch_bytes/dt_size;
	int gc_pitch = pitch_gc_bytes/dt_size;

  // GPU kernel launch parameters
	dim3 threads_per_block(blockX, blockY, blockZ);
	unsigned int blocksInX = getBlock(Nx, blockX);
	unsigned int blocksInY = getBlock(Ny, blockY);
	unsigned int blocksInZ = getBlock(_Nz-2, k_loop);

	dim3 thread_blocks(blocksInX, blocksInY, blocksInZ);
	dim3 thread_blocks_halo(blocksInX, blocksInY);

	//MPI_Status status;
	MPI_Status status[numberOfProcesses];
	MPI_Request gather_send_request[numberOfProcesses];
	MPI_Request right_send_request[numberOfProcesses], left_send_request[numberOfProcesses];
	MPI_Request right_receive_request[numberOfProcesses], left_receive_request[numberOfProcesses];

	double compute_timer = 0.;

	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
  compute_timer -= MPI_Wtime();
  MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

	for(unsigned int iterations = 0; iterations < max_iters; iterations++)
	{
		// Compute right boundary data on device 0
		if (rank == 0) {
      int kstart = (_Nz+1)-ghost_width;
	    int kstop = _Nz+1;

			ComputeInnerPointsAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
			CopyBoundaryRegionToGhostCellAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_right_send_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 0);
			
			checkCuda(cudaMemcpy2DAsync(right_send_buffer, dt_size*(Nx+2), d_right_send_buffer, pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault, data_stream));
			checkCuda(cudaStreamSynchronize(data_stream));

			MPI_CHECK(MPI_Isend(right_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 1, 0, MPI_COMM_WORLD, &right_send_request[rank]));
		}
		else
		{
			int kstart = 1;
			int kstop = 1+ghost_width;

			ComputeInnerPointsAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
			CopyBoundaryRegionToGhostCellAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_left_send_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 1);
			
			checkCuda(cudaMemcpy2DAsync(left_send_buffer, dt_size*(Nx+2), d_left_send_buffer, pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault, data_stream));
			checkCuda(cudaStreamSynchronize(data_stream));

			MPI_CHECK(MPI_Isend(left_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 0, 1, MPI_COMM_WORLD, &left_send_request[rank]));
		}

		// Compute inner nodes for device 0
		if (rank == 0) {
			int kstart = 1;
			int kstop = (_Nz+1)-ghost_width;

			ComputeInnerPointsAsync(thread_blocks, threads_per_block, compute_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
		}
		// Compute inner nodes for device 1
		else
		{
			int kstart = 1+ghost_width;
			int kstop = _Nz+1;

			ComputeInnerPointsAsync(thread_blocks, threads_per_block, compute_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
		}

		// Receive data from device 1
		if (rank == 0) {
			MPI_CHECK(MPI_Irecv(left_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 1, 1, MPI_COMM_WORLD, &right_receive_request[rank]));
		}
		else
		{
			MPI_CHECK(MPI_Irecv(right_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &left_receive_request[rank]));
		}

		if (rank == 0) {
      MPI_CHECK(MPI_Wait(&right_receive_request[rank], &status[rank]));

			checkCuda(cudaMemcpy2DAsync(d_right_receive_buffer, pitch_gc_bytes, left_send_buffer, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault, data_stream));
			CopyGhostCellToBoundaryRegionAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_right_receive_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 0);
		}
		else
		{
			MPI_CHECK(MPI_Wait(&left_receive_request[rank], &status[rank]));

			checkCuda(cudaMemcpy2DAsync(d_left_receive_buffer, pitch_gc_bytes, right_send_buffer, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault, data_stream));
			CopyGhostCellToBoundaryRegionAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_left_receive_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 1);
		}

		if (rank == 0)
		{
			MPI_CHECK(MPI_Wait(&right_send_request[rank], MPI_STATUS_IGNORE));
		}
		else
		{
			MPI_CHECK(MPI_Wait(&left_send_request[rank], MPI_STATUS_IGNORE));
		}

		// Swap pointers on the host
		checkCuda(cudaDeviceSynchronize());
		swap(_DOUBLE_*, d_s_Unews, d_s_Uolds);
	}

	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
	compute_timer += MPI_Wtime();
	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

	// Copy data from device to host
	double DtH_timer = 0;

	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
  DtH_timer -= MPI_Wtime();
  MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

	checkCuda(cudaMemcpy2D(h_s_Uolds, dt_size*(Nx+2), d_s_Uolds, pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2), cudaMemcpyDefault));

	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
	DtH_timer += MPI_Wtime();
	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

  // Gather results from subdomains
  MPI_CHECK(MPI_Isend(h_s_Uolds, (Nx+2)*(Ny+2)*(_Nz+2), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &gather_send_request[rank]));

	if (rank == 0)
	{
		for (int i = 0; i < numberOfProcesses; i++)
		{
			MPI_CHECK(MPI_Recv(h_s_rbuf[i], (Nx+2)*(Ny+2)*(_Nz+2), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, &status[rank]));
			merge_domains(h_s_rbuf[i], h_Uold, Nx, Ny, _Nz, i);
		}
	}

	// Complete the non-blocking gather send before any further MPI teardown
	MPI_CHECK(MPI_Wait(&gather_send_request[rank], MPI_STATUS_IGNORE));

	// Calculate on host
#if defined(DEBUG) || defined(_DEBUG)
	if (rank == 0)
	{
		cpu_heat3D(u_new, u_old, c0, c1, max_iters, Nx, Ny, Nz);
	}
#endif

	if (rank == 0)
	{
		float gflops = CalcGflops(compute_timer, max_iters, Nx, Ny, Nz);
		PrintSummary("3D Heat (7-pt)", "Plane sweeping", compute_timer, HtD_timer, DtH_timer, gflops, max_iters, Nx);

		_DOUBLE_ t = max_iters * dt;
		CalcError(h_Uold, u_old, t, h, Nx, Ny, Nz);
	}

	Finalize();

  // Free device memory
  checkCuda(cudaFree(d_s_Unews));
  checkCuda(cudaFree(d_s_Uolds));
  checkCuda(cudaFree(d_right_send_buffer));
  checkCuda(cudaFree(d_left_send_buffer));
  checkCuda(cudaFree(d_right_receive_buffer));
  checkCuda(cudaFree(d_left_receive_buffer));

  // Free host memory
  checkCuda(cudaFreeHost(h_s_Unews));
  checkCuda(cudaFreeHost(h_s_Uolds));

#if defined(DEBUG) || defined(_DEBUG)
  if (rank == 0)
  {
  	for (int i = 0; i < numberOfProcesses; i++)
  	{
  		checkCuda(cudaFreeHost(h_s_rbuf[i]));
  	}

    free(h_Uold);
  }
#endif

  checkCuda(cudaFreeHost(left_send_buffer));
  checkCuda(cudaFreeHost(left_receive_buffer));
  checkCuda(cudaFreeHost(right_send_buffer));
  checkCuda(cudaFreeHost(right_receive_buffer));

  checkCuda(cudaDeviceReset());

  free(u_old);
  free(u_new);

  return 0;
}
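The main loop above ends every iteration with swap(_DOUBLE_*, d_s_Unews, d_s_Uolds), but the helper itself is not part of the listing. A minimal sketch, assuming a macro-style definition (the actual project may declare it differently):

// Hypothetical macro-style pointer swap matching the call swap(_DOUBLE_*, a, b).
#define swap(type, a, b) do { type _tmp = (a); (a) = (b); (b) = _tmp; } while (0)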
Example No. 13
int main(int argc, char **argv) {
  unsigned int *inputVals;
  unsigned int *inputPos;
  unsigned int *outputVals;
  unsigned int *outputPos;

  size_t numElems;

  std::string input_file;
  std::string template_file;
  std::string output_file;
  std::string reference_file = "red_eye_effect.gold";
  double perPixelError = 0.0;
  double globalError   = 0.0;
  bool useEpsCheck = false;

  switch (argc)
  {
	case 3:
	  input_file  = std::string(argv[1]);
      template_file = std::string(argv[2]);
	  output_file = "HW4_output.png";
	  break;
	case 4:
	  input_file  = std::string(argv[1]);
      template_file = std::string(argv[2]);
	  output_file = std::string(argv[3]);
	  break;
	default:
          std::cerr << "Usage: ./HW4 input_file template_file [output_filename]" << std::endl;
          exit(1);
  }
  //load the image and give us our input and output pointers
  preProcess(&inputVals, &inputPos, &outputVals, &outputPos, numElems, input_file, template_file);
  
  GpuTimer timer;
  timer.Start();

  //call the students' code
  your_sort(inputVals, inputPos, outputVals, outputPos, numElems);

  timer.Stop();
  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
  printf("\n");
  int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());

  if (err < 0) {
    //Couldn't print! Probably the student closed stdout - bad news
    std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
    exit(1);
  }

  //check results and output the red-eye corrected image
  postProcess(outputVals, outputPos, numElems, output_file);


  // check code moved from HW4.cu
  /****************************************************************************
  * You can use the code below to help with debugging, but make sure to       *
  * comment it out again before submitting your assignment for grading,       *
  * otherwise this code will take too much time and make it seem like your    *
  * GPU implementation isn't fast enough.                                     *
  *                                                                           *
  * This code MUST RUN BEFORE YOUR CODE in case you accidentally change       *
  * the input values when implementing your radix sort.                       *
  *                                                                           *
  * This code performs the reference radix sort on the host and compares your *
  * sorted values to the reference.                                           *
  *                                                                           *
  * Thrust containers are used for copying memory from the GPU                *
  * ************************************************************************* */
  thrust::device_ptr<unsigned int> d_inputVals(inputVals);
  thrust::device_ptr<unsigned int> d_inputPos(inputPos);

  thrust::host_vector<unsigned int> h_inputVals(d_inputVals,
                                                d_inputVals+numElems);
  thrust::host_vector<unsigned int> h_inputPos(d_inputPos,
                                               d_inputPos + numElems);

  thrust::host_vector<unsigned int> h_outputVals(numElems);
  thrust::host_vector<unsigned int> h_outputPos(numElems);
  
  reference_calculation(&h_inputVals[0], &h_inputPos[0],
						&h_outputVals[0], &h_outputPos[0],
						numElems);

  //postProcess(valsPtr, posPtr, numElems, reference_file);

  compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);

  thrust::device_ptr<unsigned int> d_outputVals(outputVals);
  thrust::device_ptr<unsigned int> d_outputPos(outputPos);

  thrust::host_vector<unsigned int> h_yourOutputVals(d_outputVals,
                                                     d_outputVals + numElems);
  thrust::host_vector<unsigned int> h_yourOutputPos(d_outputPos,
                                                    d_outputPos + numElems);

  checkResultsExact(&h_outputVals[0], &h_yourOutputVals[0], numElems);
  //checkResultsExact(&h_outputPos[0], &h_yourOutputPos[0], numElems);

  checkCudaErrors(cudaFree(inputVals));
  checkCudaErrors(cudaFree(inputPos));
  checkCudaErrors(cudaFree(outputVals));
  checkCudaErrors(cudaFree(outputPos));

  return 0;
}
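The harness above compares the student's sorted output against a host reference with checkResultsExact, a helper that ships with the course utilities and is not reproduced here. A minimal sketch of what an exact element-wise check might look like (hypothetical name and error handling, not the original implementation):

#include <cstddef>
#include <cstdio>
#include <cstdlib>

// Hypothetical exact comparison of two host arrays, in the spirit of
// checkResultsExact; the real helper is provided by the assignment utilities.
static void checkResultsExactSketch(const unsigned int *ref,
                                    const unsigned int *gpu, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    if (ref[i] != gpu[i]) {
      std::fprintf(stderr, "Difference at pos %zu: reference %u, GPU %u\n",
                   i, ref[i], gpu[i]);
      std::exit(EXIT_FAILURE);
    }
  }
}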
Example No. 14
void testCuda(int m, int n, int nnz, std::vector<int>& rows, std::vector<int>& cols,
		std::vector<double>& values, double* matB){

    double tol=1e-9;
    double start, stop, time_to_build, time_to_solve;

    int cudaDevice = 0;

    checkCudaErrors(cudaSetDevice(cudaDevice));

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, cudaDevice);
    printf("Device Number: %d\n", cudaDevice);
    printf("  Device name: %s\n", prop.name);
    checkCudaErrors(cudaDeviceReset());

	size_t mem_tot = 0;
	size_t mem_free = 0;

	cudaMemGetInfo(&mem_free, &mem_tot);
	printf("\nFree memory: %zu", mem_free);

	MatSparse matA;
    matA.setSize(m, n);

    std::vector<int> I, J;
    std::vector<double> V;

    for (int k = 0; k < nnz; k++){
    	double _val = values[k];
    	int i = rows[k];
    	int j = cols[k];

    	if (fabs(_val) > tol){
        	I.push_back(i-1);
        	J.push_back(j-1);
        	V.push_back(_val);
        }
    }

    start = second();
    matA.fromTruples(I, J, V);
    stop = second();
    time_to_build = stop - start;
    std::cerr << "Time to Build in GPU (second): " << time_to_build << std::endl;


    // ******************************** GPU SOLVER ******************************** //

    // --- Initialize cuSPARSE
     	cusolverSpHandle_t cusolver_handle = NULL; checkCudaErrors(cusolverSpCreate(&cusolver_handle));
     	cusparseHandle_t cusparse_handle = NULL; checkCudaErrors(cusparseCreate(&cusparse_handle));
     	cudaStream_t cudaStream = NULL; checkCudaErrors(cudaStreamCreate(&cudaStream));
     	checkCudaErrors(cusolverSpSetStream(cusolver_handle, cudaStream));
     	checkCudaErrors(cusparseSetStream(cusparse_handle, cudaStream));


        cusparseMatDescr_t descrA;      checkCudaErrors(cusparseCreateMatDescr(&descrA));
        checkCudaErrors(cusparseSetMatType     (descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
        checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO));


        printf("\nAlloc GPU memory...\n");
        double *d_A;            checkCudaErrors(cudaMalloc(&d_A, nnz * sizeof(double)));
        int *d_A_RowIndices;    checkCudaErrors(cudaMalloc(&d_A_RowIndices, (m + 1) * sizeof(int)));
        int *d_A_ColIndices;    checkCudaErrors(cudaMalloc(&d_A_ColIndices, nnz * sizeof(int)));
        double *d_x;        checkCudaErrors(cudaMalloc(&d_x, m * sizeof(double)));
        double *d_b; checkCudaErrors(cudaMalloc(&d_b, m * sizeof(double)));
        printf("\nError: %s", cudaGetErrorString(cudaGetLastError()));

        printf("\nCopying data...\n");
        checkCudaErrors(cudaMemcpy(d_A, matA.valuesPtr(), nnz * sizeof(double), cudaMemcpyHostToDevice));
        checkCudaErrors(cudaMemcpy(d_A_RowIndices, matA.RowPtr(), (m + 1) * sizeof(int), cudaMemcpyHostToDevice));
        checkCudaErrors(cudaMemcpy(d_A_ColIndices, matA.ColIdxPtr(), nnz * sizeof(int), cudaMemcpyHostToDevice));
        checkCudaErrors(cudaMemcpy(d_b, matB, m * sizeof(double), cudaMemcpyHostToDevice));

        double *h_x = (double *)malloc(m * sizeof(double));

        printf("\nError: %s", cudaGetErrorString(cudaGetLastError()));
        cudaMemGetInfo(&mem_free, &mem_tot);
        printf("\nFree memory: %zu", mem_free);

        int reorder = 0;
        int singularity = 0;
        start = second();
        //checkCudaErrors(cusolverSpDcsrlsvluHost(cusolver_handle, Nrows, nnz, descrA, sparse.Values(),
        	//	sparse.RowPtr(), sparse.ColIdx(), mB.values, tol, reorder, h_x, &singularity));
        checkCudaErrors(cusolverSpDcsrlsvqr(cusolver_handle, m, nnz, descrA, d_A, d_A_RowIndices,
               		d_A_ColIndices, d_b, tol, reorder, d_x, &singularity));
        checkCudaErrors(cudaDeviceSynchronize());
        stop = second();
        time_to_solve = stop - start;


        checkCudaErrors(cudaMemcpy(h_x, d_x, m * sizeof(double), cudaMemcpyDeviceToHost));

//        for (int k=0; k<mA.getNumRows(); k++) solution[k] = h_x[k];


        checkCudaErrors(cusparseDestroy(cusparse_handle));
        checkCudaErrors(cusolverSpDestroy(cusolver_handle));
        checkCudaErrors(cudaStreamDestroy(cudaStream));
        checkCudaErrors(cudaFree(d_b));
        checkCudaErrors(cudaFree(d_x));

        checkCudaErrors(cudaFree(d_A));
        checkCudaErrors(cudaFree(d_A_RowIndices));
        checkCudaErrors(cudaFree(d_A_ColIndices));

        free(h_x);

        std::cerr << "Time to Build in GPU (second): " << time_to_build << std::endl;
        std::cerr << "Time to Solve in GPU (second): " << time_to_solve << std::endl;
        std::cerr << "done!";

    // ****************************************************************************** //
}
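testCuda filters near-zero entries into zero-based COO triplets and hands them to MatSparse::fromTruples, whose implementation is not shown. As an illustration of the COO-to-CSR assembly such a method typically performs, here is a sketch under the assumption that the triplets arrive sorted by row and without duplicates (the real class may sort and merge internally):

#include <vector>

// Sketch: build CSR arrays (rowPtr, colIdx, vals) from row-sorted COO triplets.
static void cooToCsrSketch(int m,
                           const std::vector<int>& I, const std::vector<int>& J,
                           const std::vector<double>& V,
                           std::vector<int>& rowPtr, std::vector<int>& colIdx,
                           std::vector<double>& vals) {
    rowPtr.assign(m + 1, 0);
    for (size_t k = 0; k < I.size(); ++k) rowPtr[I[k] + 1]++;  // per-row counts
    for (int r = 0; r < m; ++r) rowPtr[r + 1] += rowPtr[r];    // exclusive scan
    colIdx = J;                                                // already row-ordered
    vals   = V;
}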
Example No. 15
void CinderCUDASampleApp::update()
{
	generateCUDAImage();
	cudaDeviceSynchronize();
}
Example No. 16
int main(int argc, char **argv)
{
    int N = 0, nz = 0, *I = NULL, *J = NULL;
    float *val = NULL;
    const float tol = 1e-5f;
    const int max_iter = 10000;
    float *x;
    float *rhs;
    float a, b, na, r0, r1;
    float dot;
    float *r, *p, *Ax;
    int k;
    float alpha, beta, alpham1;

    printf("Starting [%s]...\n", sSDKname);

    // This will pick the best possible CUDA capable device
    cudaDeviceProp deviceProp;
    int devID = findCudaDevice(argc, (const char **)argv);
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

    if (!deviceProp.managedMemory) { 
        // This sample requires a device that supports Unified Memory
        fprintf(stderr, "Unified Memory not supported on this device\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        exit(EXIT_WAIVED);
    }

    // Statistics about the GPU device
    printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
           deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

    /* Generate a random tridiagonal symmetric matrix in CSR format */
    N = 1048576;
    nz = (N-2)*3 + 4;

    cudaMallocManaged((void **)&I, sizeof(int)*(N+1));
    cudaMallocManaged((void **)&J, sizeof(int)*nz);
    cudaMallocManaged((void **)&val, sizeof(float)*nz);

    genTridiag(I, J, val, N, nz);

    cudaMallocManaged((void **)&x, sizeof(float)*N);
    cudaMallocManaged((void **)&rhs, sizeof(float)*N);

    for (int i = 0; i < N; i++)
    {
        rhs[i] = 1.0;
        x[i] = 0.0;
    }

    /* Get handle to the CUBLAS context */
    cublasHandle_t cublasHandle = 0;
    cublasStatus_t cublasStatus;
    cublasStatus = cublasCreate(&cublasHandle);

    checkCudaErrors(cublasStatus);

    /* Get handle to the CUSPARSE context */
    cusparseHandle_t cusparseHandle = 0;
    cusparseStatus_t cusparseStatus;
    cusparseStatus = cusparseCreate(&cusparseHandle);

    checkCudaErrors(cusparseStatus);

    cusparseMatDescr_t descr = 0;
    cusparseStatus = cusparseCreateMatDescr(&descr);

    checkCudaErrors(cusparseStatus);

    cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);

    // temp memory for CG
    checkCudaErrors(cudaMallocManaged((void **)&r, N*sizeof(float)));
    checkCudaErrors(cudaMallocManaged((void **)&p, N*sizeof(float)));
    checkCudaErrors(cudaMallocManaged((void **)&Ax, N*sizeof(float)));

    cudaDeviceSynchronize();

    for (int i=0; i < N; i++)
    {
        r[i] = rhs[i];
    }

    alpha = 1.0;
    alpham1 = -1.0;
    beta = 0.0;
    r0 = 0.;

    cusparseScsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &alpha, descr, val, I, J, x, &beta, Ax);

    cublasSaxpy(cublasHandle, N, &alpham1, Ax, 1, r, 1);
    cublasStatus = cublasSdot(cublasHandle, N, r, 1, r, 1, &r1);

    k = 1;

    while (r1 > tol*tol && k <= max_iter)
    {
        if (k > 1)
        {
            b = r1 / r0;
            cublasStatus = cublasSscal(cublasHandle, N, &b, p, 1);
            cublasStatus = cublasSaxpy(cublasHandle, N, &alpha, r, 1, p, 1);
        }
        else
        {
            cublasStatus = cublasScopy(cublasHandle, N, r, 1, p, 1);
        }

        cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &alpha, descr, val, I, J, p, &beta, Ax);
        cublasStatus = cublasSdot(cublasHandle, N, p, 1, Ax, 1, &dot);
        a = r1 / dot;

        cublasStatus = cublasSaxpy(cublasHandle, N, &a, p, 1, x, 1);
        na = -a;
        cublasStatus = cublasSaxpy(cublasHandle, N, &na, Ax, 1, r, 1);

        r0 = r1;
        cublasStatus = cublasSdot(cublasHandle, N, r, 1, r, 1, &r1);
        cudaDeviceSynchronize();
        printf("iteration = %3d, residual = %e\n", k, sqrt(r1));
        k++;
    }

    printf("Final residual: %e\n",sqrt(r1));

    fprintf(stdout,"&&&& uvm_cg test %s\n", (sqrt(r1) < tol) ? "PASSED" : "FAILED");

    float rsum, diff, err = 0.0;

    for (int i = 0; i < N; i++)
    {
        rsum = 0.0;

        for (int j = I[i]; j < I[i+1]; j++)
        {
            rsum += val[j]*x[J[j]];
        }

        diff = fabs(rsum - rhs[i]);

        if (diff > err)
        {
            err = diff;
        }
    }

    cusparseDestroy(cusparseHandle);
    cublasDestroy(cublasHandle);

    cudaFree(I);
    cudaFree(J);
    cudaFree(val);
    cudaFree(x);
    cudaFree(rhs);
    cudaFree(r);
    cudaFree(p);
    cudaFree(Ax);

    cudaDeviceReset();

    printf("Test Summary:  Error amount = %f, result = %s\n", err, (k <= max_iter) ? "SUCCESS" : "FAILURE");
    exit((k <= max_iter) ? EXIT_SUCCESS : EXIT_FAILURE);
}
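This conjugate-gradient sample relies on cusparseScsrmv, which was removed in CUDA 11. On a recent toolkit the generic SpMV API is the usual replacement; the sketch below shows the shape of that call for the Ax = alpha*A*x + beta*Ax products used above. Enum names such as the algorithm selector have shifted slightly across CUDA versions, so treat this as an assumption-laden illustration rather than a drop-in patch.

// Sketch: Ax = alpha*A*x + beta*Ax with the generic cuSPARSE API (CUDA >= 10.1).
cusparseSpMatDescr_t matA;
cusparseDnVecDescr_t vecX, vecAx;
void *dBuffer = NULL; size_t bufferSize = 0;

cusparseCreateCsr(&matA, N, N, nz, I, J, val,
                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                  CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
cusparseCreateDnVec(&vecX,  N, x,  CUDA_R_32F);
cusparseCreateDnVec(&vecAx, N, Ax, CUDA_R_32F);

cusparseSpMV_bufferSize(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                        &alpha, matA, vecX, &beta, vecAx, CUDA_R_32F,
                        CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize);
cudaMalloc(&dBuffer, bufferSize);

cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
             &alpha, matA, vecX, &beta, vecAx, CUDA_R_32F,
             CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);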
Example No. 17
/**
 * Synchronizes the CUDA device in the case of a GPU build
 */
inline void SynchronizeCUDA()
{
#ifdef __CUDA_BACKEND__
    cudaDeviceSynchronize();
#endif
}
Example No. 18
void runAutoTest(int argc, char *argv[])
{
    printf("[%s] (automated testing w/ readback)\n", sSDKsample);
    int devID = findCudaDevice(argc, (const char **)argv);

    // Ensure that SM 2.0 or higher device is available before running
    checkDeviceMeetComputeSpec(argc, argv);

    loadDefaultImage(argv[0]);

    Pixel *d_result;
    checkCudaErrors(cudaMalloc((void **)&d_result, imWidth*imHeight*sizeof(Pixel)));

    char *ref_file = NULL;
    char  dump_file[256];

    int mode = 0;
    mode = getCmdLineArgumentInt(argc, (const char **)argv, "mode");
    getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);

    switch (mode)
    {
        case 0:
            g_SobelDisplayMode = SOBELDISPLAY_IMAGE;
            sprintf(dump_file, "lena_orig.pgm");
            break;

        case 1:
            g_SobelDisplayMode = SOBELDISPLAY_SOBELTEX;
            sprintf(dump_file, "lena_tex.pgm");
            break;

        case 2:
            g_SobelDisplayMode = SOBELDISPLAY_SOBELSHARED;
            sprintf(dump_file, "lena_shared.pgm");
            break;

        default:
            printf("Invalid Filter Mode File\n");
            exit(EXIT_FAILURE);
            break;
    }

    printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]);
    sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale, blockOp, pointOp);
    checkCudaErrors(cudaDeviceSynchronize());

    unsigned char *h_result = (unsigned char *)malloc(imWidth*imHeight*sizeof(Pixel));
    checkCudaErrors(cudaMemcpy(h_result, d_result, imWidth*imHeight*sizeof(Pixel), cudaMemcpyDeviceToHost));
    sdkSavePGM(dump_file, h_result, imWidth, imHeight);

    if (!sdkComparePGM(dump_file, sdkFindFilePath(ref_file, argv[0]), MAX_EPSILON_ERROR, 0.15f, false))
    {
        g_TotalErrors++;
    }

    checkCudaErrors(cudaFree(d_result));
    free(h_result);

    if (g_TotalErrors != 0)
    {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }

    printf("Test passed!\n");
    exit(EXIT_SUCCESS);
}
Example No. 19
void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight,
              GpuMat& bottom, GpuMat& bottomLeft, GpuMat& bottomRight, GpuMat& labels, GpuMat& buf, Stream& s)
{
#if (CUDA_VERSION < 5000)
    CV_Assert(terminals.type() == CV_32S);
#else
    CV_Assert(terminals.type() == CV_32S || terminals.type() == CV_32F);
#endif

    Size src_size = terminals.size();

    CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width));
    CV_Assert(leftTransp.type() == terminals.type());

    CV_Assert(rightTransp.size() == Size(src_size.height, src_size.width));
    CV_Assert(rightTransp.type() == terminals.type());

    CV_Assert(top.size() == src_size);
    CV_Assert(top.type() == terminals.type());

    CV_Assert(topLeft.size() == src_size);
    CV_Assert(topLeft.type() == terminals.type());

    CV_Assert(topRight.size() == src_size);
    CV_Assert(topRight.type() == terminals.type());

    CV_Assert(bottom.size() == src_size);
    CV_Assert(bottom.type() == terminals.type());

    CV_Assert(bottomLeft.size() == src_size);
    CV_Assert(bottomLeft.type() == terminals.type());

    CV_Assert(bottomRight.size() == src_size);
    CV_Assert(bottomRight.type() == terminals.type());

    labels.create(src_size, CV_8U);

    NppiSize sznpp;
    sznpp.width = src_size.width;
    sznpp.height = src_size.height;

    int bufsz;
    nppSafeCall( nppiGraphcut8GetSize(sznpp, &bufsz) );

    ensureSizeIsEnough(1, bufsz, CV_8U, buf);

    cudaStream_t stream = StreamAccessor::getStream(s);

    NppStreamHandler h(stream);

    NppiGraphcutStateHandler state(sznpp, buf.ptr<Npp8u>(), nppiGraphcut8InitAlloc);

#if (CUDA_VERSION < 5000)
    nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(),
        top.ptr<Npp32s>(), topLeft.ptr<Npp32s>(), topRight.ptr<Npp32s>(),
        bottom.ptr<Npp32s>(), bottomLeft.ptr<Npp32s>(), bottomRight.ptr<Npp32s>(),
        static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
#else
    if (terminals.type() == CV_32S)
    {
        nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(),
            top.ptr<Npp32s>(), topLeft.ptr<Npp32s>(), topRight.ptr<Npp32s>(),
            bottom.ptr<Npp32s>(), bottomLeft.ptr<Npp32s>(), bottomRight.ptr<Npp32s>(),
            static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
    }
    else
    {
        nppSafeCall( nppiGraphcut8_32f8u(terminals.ptr<Npp32f>(), leftTransp.ptr<Npp32f>(), rightTransp.ptr<Npp32f>(),
            top.ptr<Npp32f>(), topLeft.ptr<Npp32f>(), topRight.ptr<Npp32f>(),
            bottom.ptr<Npp32f>(), bottomLeft.ptr<Npp32f>(), bottomRight.ptr<Npp32f>(),
            static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
    }
#endif

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
Example No. 20
void DeviceMemory::upload(const void *host_ptr_arg, size_t sizeBytes_arg)
{
    create(sizeBytes_arg);
    cudaSafeCall( cudaMemcpy(data_, host_ptr_arg, sizeBytes_, cudaMemcpyHostToDevice) );
    cudaSafeCall( cudaDeviceSynchronize() );
}
Example No. 21
int main(int argc, char **argv)
{
    // Start logs
    printf("%s Starting...\n\n", argv[0]);

    unsigned int useDoublePrecision;

    char *precisionChoice;
    getCmdLineArgumentString(argc, (const char **)argv, "type", &precisionChoice);

    if (precisionChoice == NULL)
    {
        useDoublePrecision = 0;
    }
    else
    {
        if (!STRCASECMP(precisionChoice, "double"))
        {
            useDoublePrecision = 1;
        }
        else
        {
            useDoublePrecision = 0;
        }
    }

    unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION];

    float *h_OutputGPU, *d_Output;

    int dim, pos;
    double delta, ref, sumDelta, sumRef, L1norm, gpuTime;

    StopWatchInterface *hTimer = NULL;

    if (sizeof(INT64) != 8)
    {
        printf("sizeof(INT64) != 8\n");
        return 0;
    }

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    int dev = findCudaDevice(argc, (const char **)argv);

    sdkCreateTimer(&hTimer);

    int deviceIndex;
    checkCudaErrors(cudaGetDevice(&deviceIndex));
    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, deviceIndex));
    int version = deviceProp.major * 10 + deviceProp.minor;

    if (useDoublePrecision && version < 13)
    {
        printf("Double precision not supported.\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return 0;
    }

    printf("Allocating GPU memory...\n");
    checkCudaErrors(cudaMalloc((void **)&d_Output, QRNG_DIMENSIONS * N * sizeof(float)));

    printf("Allocating CPU memory...\n");
    h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float));

    printf("Initializing QRNG tables...\n\n");
    initQuasirandomGenerator(tableCPU);

    if (useDoublePrecision)
    {
        initTable_SM13(tableCPU);
    }
    else
    {
        initTable_SM10(tableCPU);
    }

    printf("Testing QRNG...\n\n");
    checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)));
    int numIterations = 20;

    for (int i = -1; i < numIterations; i++)
    {
        if (i == 0)
        {
            checkCudaErrors(cudaDeviceSynchronize());
            sdkResetTimer(&hTimer);
            sdkStartTimer(&hTimer);
        }

        if (useDoublePrecision)
        {
            quasirandomGenerator_SM13(d_Output, 0, N);
        }
        else
        {
            quasirandomGenerator_SM10(d_Output, 0, N);
        }
    }

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer)/(double)numIterations*1e-3;
    printf("quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n",
           (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128*QRNG_DIMENSIONS);

    printf("\nReading GPU results...\n");
    checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost));

    printf("Comparing to the CPU results...\n\n");
    sumDelta = 0;
    sumRef = 0;

    for (dim = 0; dim < QRNG_DIMENSIONS; dim++)
        for (pos = 0; pos < N; pos++)
        {
            ref       = getQuasirandomValue63(pos, dim);
            delta     = (double)h_OutputGPU[dim * N + pos] - ref;
            sumDelta += fabs(delta);
            sumRef   += fabs(ref);
        }

    printf("L1 norm: %E\n", sumDelta / sumRef);

    printf("\nTesting inverseCNDgpu()...\n\n");
    checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)));

    for (int i = -1; i < numIterations; i++)
    {
        if (i == 0)
        {
            checkCudaErrors(cudaDeviceSynchronize());
            sdkResetTimer(&hTimer);
            sdkStartTimer(&hTimer);
        }

        if (useDoublePrecision)
        {
            inverseCND_SM13(d_Output, NULL, QRNG_DIMENSIONS * N);
        }
        else
        {
            inverseCND_SM10(d_Output, NULL, QRNG_DIMENSIONS * N);
        }
    }

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer)/(double)numIterations*1e-3;
    printf("quasirandomGenerator-inverse, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n",
           (double)QRNG_DIMENSIONS * (double)N * 1E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128);

    printf("Reading GPU results...\n");
    checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost));

    printf("\nComparing to the CPU results...\n");
    sumDelta = 0;
    sumRef = 0;
    unsigned int distance = ((unsigned int)-1) / (QRNG_DIMENSIONS * N + 1);

    for (pos = 0; pos < QRNG_DIMENSIONS * N; pos++)
    {
        unsigned int d = (pos + 1) * distance;
        ref       = MoroInvCNDcpu(d);
        delta     = (double)h_OutputGPU[pos] - ref;
        sumDelta += fabs(delta);
        sumRef   += fabs(ref);
    }

    printf("L1 norm: %E\n\n", L1norm = sumDelta / sumRef);

    printf("Shutting down...\n");
    sdkDeleteTimer(&hTimer);
    free(h_OutputGPU);
    checkCudaErrors(cudaFree(d_Output));

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    exit(L1norm < 1e-6 ? EXIT_SUCCESS : EXIT_FAILURE);
}
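The timing loops above run one untimed pass (the i == -1 iteration) to absorb lazy initialization before resetting the SDK timer. The same warm-up idea expressed with CUDA events, purely as a sketch (generatorKernel, grid and block are placeholders, not names from this sample):

// Sketch: warm-up launch followed by event-based timing of numIterations runs.
cudaEvent_t evStart, evStop;
checkCudaErrors(cudaEventCreate(&evStart));
checkCudaErrors(cudaEventCreate(&evStop));

generatorKernel<<<grid, block>>>(d_Output);          // warm-up, not timed
checkCudaErrors(cudaEventRecord(evStart));
for (int i = 0; i < numIterations; i++)
    generatorKernel<<<grid, block>>>(d_Output);
checkCudaErrors(cudaEventRecord(evStop));
checkCudaErrors(cudaEventSynchronize(evStop));

float elapsedMs = 0.0f;
checkCudaErrors(cudaEventElapsedTime(&elapsedMs, evStart, evStop));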
Example No. 22
void DeviceMemory::download(void *host_ptr_arg) const
{    
    cudaSafeCall( cudaMemcpy(host_ptr_arg, data_, sizeBytes_, cudaMemcpyDeviceToHost) );
    cudaSafeCall( cudaDeviceSynchronize() );
}          
Example No. 23
void magmablas_ssymm_mgpu_spec(
    magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n,
    float alpha,
    float *dA[],    magma_int_t ldda,  magma_int_t offset,
    float *dB[],    magma_int_t lddb,
    float beta,
    float *dC[],    magma_int_t lddc,
    float *dwork[], magma_int_t dworksiz,
    float *C,       magma_int_t ldc,
    float *work[],  magma_int_t ldwork,
    magma_int_t ngpu, magma_int_t nb, 
    magma_queue_t streams[][20], magma_int_t nstream, 
    magma_event_t redevents[][MagmaMaxGPUs*MagmaMaxGPUs+10],magma_int_t nbevents, 
    magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2], magma_int_t nbcmplx )
{
    #define dA(dev, i, j) (dA[dev] + (i) + (j)*ldda)
    #define dB(dev, i, j) (dB[dev] + (i) + (j)*lddb)
    #define dC(dev, i, j) (dC[dev] + (i) + (j)*lddc)
    #define dwork(dev, i, j) (dwork[dev] + (i) + (j)*lddwork)
    #define C(i, j) (C + (i) + (j)*ldc)
    
    if ( side != MagmaLeft || uplo != MagmaLower ) {
        fprintf( stderr, "%s: only Left Lower implemented\n", __func__ );
    }
    
    assert( ldda >= m );
    assert( lddb >= m );
    assert( lddc >= m );
    assert( nstream >= ngpu );
    assert( nbevents >= ngpu*ngpu );
    
    float *dwork1[MagmaMaxGPUs];
    float *dwork2[MagmaMaxGPUs];


    magma_int_t lddwork = lddc;
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        dwork1[dev] = dwork[dev];
        dwork2[dev] = dwork[dev]+n*lddwork;
    }
    assert( dworksiz >= (2*n*lddwork) );




        
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_t cstream;
    magmablasGetKernelStream(&cstream);


    magma_int_t dev,devperm,myblk,mycolsize,myblkoffst;
    magma_int_t gdev,gcolsize,gmaster,gngpu;
    magma_int_t masterdev,lcdev,lccolsize,myngpu;

    magma_int_t stdev       = (offset/nb)%ngpu;  
    magma_int_t blockoffset = offset % nb;  
    magma_int_t fstblksiz   = 0;
    if(blockoffset>0){
        fstblksiz   = min(m, (nb - blockoffset));
    }
    //magma_int_t nbblk       = magma_ceildiv(m,nb);
    magma_int_t nbblk       = magma_ceildiv((m+blockoffset),nb);
    magma_int_t maxgsize    = n*nb*magma_ceildiv(nbblk,ngpu);
    magma_int_t remm        = m- fstblksiz;
    magma_int_t nbblkoffst  = offset/nb;


    magma_int_t nblstblks = -1;
    magma_int_t devlstblk = -1;
    magma_int_t lstblksiz = remm%nb;
    if(lstblksiz>0){
        nblstblks = nbblk%ngpu;
        devlstblk = (nblstblks-1+ngpu)%ngpu;
    }

    magma_int_t nbcmplxactive =  0;
    magma_int_t cmplxisactive[MagmaMaxGPUs];
    magma_int_t gpuisactive[MagmaMaxGPUs];
    memset(gpuisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t));
    memset(cmplxisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t));


    //*******************************
    //  each GPU make a GEMM with the
    //  transpose of its blocks to compute
    //  a final portion of X=A*VT
    //*******************************
    /* dB = V*T already ==> dB' = T'*V'
     * compute T'*V'*X is equal to compute locally (VT)'_i*X_i 
     * then  each GPU broadcast its X_i to assemble the full X which is used
     * to compute W  =  X  - 0.5 * V * T'*V'*X  = X - 0.5 * V *dwork3
     */
    if(ngpu ==1){
        magma_setdevice( 0 );
        magmablasSetKernelStream( streams[ 0 ][ 0 ] );
        // compute X[me] = A*VT = A[me]^tr *VT;
        magma_sgemm( MagmaTrans, MagmaNoTrans, m, n, m,
                     alpha, dA(0,offset,offset), ldda,
                            dB[0],         lddb,
                     beta,  dC[0], lddc );
        return;
    }
    //ngpu>1
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        masterdev     = -1;
        gnode[cmplxid][MagmaMaxGPUs+1] = -1;
        myngpu = gnode[cmplxid][MagmaMaxGPUs];
        for( magma_int_t idev = 0; idev < myngpu; ++idev ) {
            dev         = gnode[cmplxid][idev];
            devperm     = (dev-stdev+ngpu)%ngpu;
            myblk       = (nbblk/ngpu) + (nbblk%ngpu > devperm ?  1:0 );
            mycolsize   = myblk*nb;
            myblkoffst  = nb*((nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0));            
            if(dev==stdev){
                mycolsize  -=  blockoffset;
                myblkoffst +=  blockoffset;     // local index in parent matrix
            }
            if((devperm==devlstblk)&&(lstblksiz>0)){
                mycolsize -=  (nb-(remm%nb));
            }
            mycolsize = min(mycolsize,m);

        
            if(mycolsize>0){
                if(masterdev==-1) masterdev     = dev;
                //printf("dev %d devperm %d on cmplx %d  master %d nbblk %d myblk %d m %d n %d mycolsize %d stdev %d fstblksize %d lastdev %d lastsize %d dA(%d,%d,%d) ==> dwork(%d,%d)\n",dev,devperm,cmplxid,masterdev,nbblk,myblk,m,n,mycolsize,stdev,fstblksiz,devlstblk,remm%nb,dev,offset,myblkoffst,dev,maxgsize*dev);
                gpuisactive[dev] = mycolsize;
                magma_setdevice( dev );
                magmablasSetKernelStream( streams[ dev ][ dev ] );    

                magma_sgemm( MagmaTrans, MagmaNoTrans, mycolsize, n, m,
                             alpha, dA(dev,offset,myblkoffst), ldda,
                                    dB(dev,0,0),    lddb,
                             beta,  &dwork[dev][maxgsize*dev], mycolsize );
                magma_event_record(redevents[dev][dev*ngpu+dev], streams[dev][dev]);
            }
            if(dev == masterdev){
                nbcmplxactive = nbcmplxactive +1;
                cmplxisactive[cmplxid] = 1;
                gnode[cmplxid][MagmaMaxGPUs+1] = masterdev;
            }
        }
    }



/*
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magma_queue_sync( streams[ dev ][ dev ] );
    }
*/


    //*******************************
    //  Each master GPU now holds its final
    //  result, either received from the CPU
    //  or accumulated locally, so it is time
    //  to broadcast it to the GPUs on its
    //  board.
    //*******************************
    //printf("=======================================================================\n");
    //printf("                           sending                                     \n");
    //printf("=======================================================================\n");
    
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        myngpu    = gnode[cmplxid][MagmaMaxGPUs];
        masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
        for( magma_int_t idev = 0; idev < myngpu; ++idev ) {
            dev         = gnode[cmplxid][idev];
            mycolsize   = gpuisactive[dev];
            if(mycolsize>0){
                // I am an active GPU: send my portion locally to all
                // active GPUs of my complex, and globally to the active
                // master of each other complex, which then forwards it
                // to its own active GPUs.
                magma_setdevice( dev );        
                //==============================================
                // sending to the masters of the other active complexes
                //==============================================
                //printf     ("\n\n**************GPU %d\n ",dev);
                //printf     ("             GPU %d sending to cmplx masters\n",dev);
                for( magma_int_t k = 0; k < nbcmplx; ++k ) {
                    if(k!=cmplxid){
                        gmaster = gnode[k][MagmaMaxGPUs+1];
                        if(gmaster!=-1){ //real is active
                            //printf     ("                    device %d from cmplx %d is sending to master %d on cmplx %d block of size %d event %d\n",dev,cmplxid,gmaster,k,mycolsize,redevents[dev][gmaster*ngpu+dev]);
                            magma_queue_wait_event(streams[ dev ][ gmaster ], redevents[dev][dev*ngpu+dev]);
                            cudaMemcpy2DAsync(&dwork[gmaster][maxgsize*dev], mycolsize*sizeof(float),
                                         &dwork[dev][maxgsize*dev], mycolsize*sizeof(float),
                                         mycolsize*sizeof(float), n,
                                         cudaMemcpyDeviceToDevice, streams[dev][gmaster]);
                            magma_event_record(redevents[dev][gmaster*ngpu+dev], streams[dev][gmaster]);
                        }
                    }
                }
                //==============================================
                //
                //==============================================
                // sending to the active GPUs of my complex
                //==============================================
                //printf     ("              GPU %d sending internal\n",dev);                
                for( magma_int_t l = 0; l < myngpu; ++l ) {
                    lcdev         = gnode[cmplxid][l];
                    lccolsize     = gpuisactive[lcdev];
                    if((lcdev!=dev)&&(lccolsize>0)){
                        //printf     ("                    device %d from cmplx %d is sending internal to dev %d block of size %d event %d\n",dev,cmplxid,lcdev,mycolsize,redevents[dev][lcdev*ngpu+dev]);
                        magma_queue_wait_event(streams[ dev ][ lcdev ], redevents[dev][dev*ngpu+dev]);
                        cudaMemcpy2DAsync(&dwork[lcdev][maxgsize*dev], mycolsize*sizeof(float),
                                         &dwork[dev][maxgsize*dev], mycolsize*sizeof(float),
                                         mycolsize*sizeof(float), n,
                                         cudaMemcpyDeviceToDevice, streams[dev][lcdev]);
                        magma_event_record(redevents[dev][lcdev*ngpu+dev], streams[dev][lcdev]);
                    }
                }
                //==============================================
            }// end if mycolsize>0
        }// for idev
    }// for cmplxid


    //printf("=======================================================================\n");
    //printf("                master wait and resend internally                      \n");
    //printf("=======================================================================\n");
    
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        myngpu    = gnode[cmplxid][MagmaMaxGPUs];
        masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
        //==============================================
        // If I am an active master, wait for the contributions from the
        // GPUs of the other complexes and forward them locally.
        //==============================================
        if(masterdev != -1){
            mycolsize   = gpuisactive[masterdev];
            magma_setdevice( masterdev );
            //printf("              GPU %d distributing internal\n",masterdev);
            for( magma_int_t k = 0; k < nbcmplx; ++k ) {
                if(k!=cmplxid){
                    gngpu   = gnode[k][MagmaMaxGPUs];
                    for( magma_int_t g = 0; g < gngpu; ++g ) {
                        gdev         = gnode[k][g];
                        gcolsize     = gpuisactive[gdev];
                        // check if I received from this GPU,
                        // if yes send it to my group
                        if(gcolsize>0){
                           magma_queue_wait_event(streams[ masterdev ][ gdev ], redevents[gdev][masterdev*ngpu+gdev]);
                           for( magma_int_t l = 0; l < myngpu; ++l ) {
                                lcdev         = gnode[cmplxid][l];
                                lccolsize     = gpuisactive[lcdev];
                                if((lcdev!=masterdev)&&(lccolsize>0)){
                                    //printf("                    Master %d on cmplx %d waiting on event %d is distributing internal results of %d to lcdev %d block of size %d event %d\n", masterdev,cmplxid,redevents[gdev][masterdev*ngpu+gdev],gdev,lcdev,gcolsize,redevents[masterdev][lcdev*ngpu+gdev]);
                                    cudaMemcpy2DAsync(&dwork[lcdev][maxgsize*gdev], gcolsize*sizeof(float),
                                                    &dwork[masterdev][maxgsize*gdev], gcolsize*sizeof(float),
                                                    gcolsize*sizeof(float), n,
                                                    cudaMemcpyDeviceToDevice, streams[masterdev][gdev]);
                                    magma_event_record(redevents[masterdev][lcdev*ngpu+gdev], streams[masterdev][gdev]);
                                }
                            }
                        }
                    }
                }
            }
        }// if active master 
        //==============================================
    }// for cmplxid





/*

    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
                magma_queue_sync( streams[ dev ][ 0 ] );
        for( magma_int_t s = 0; s < ngpu; ++s ) {
                magma_queue_sync( streams[ dev ][ s ] );
        }
    }
*/
    //printf("=======================================================================\n");
    //printf("                           distributing                                \n");
    //printf("=======================================================================\n");

    magma_int_t lcblki,gbblki,gblk,ib;
    
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        myngpu    = gnode[cmplxid][MagmaMaxGPUs];
        masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
        for( magma_int_t idev = 0; idev < myngpu; ++idev ) {
            dev         = gnode[cmplxid][idev];
            mycolsize   = gpuisactive[dev];
            if(mycolsize>0){ // I am an active GPU
                //printf("\n\n==============GPU %d collecting\n",dev);
                magma_setdevice( dev );        
                // Collect my own results first: there is nothing to receive,
                // I only have to wait for my own GEMMs to finish.
                // In theory this could live inside the loop below, but CUDA did
                // not launch it first for every GPU; on gpu > 0 it stalled even
                // though it runs on a different stream, possibly because of the
                // large number of API calls involved. This special case can be
                // removed once CUDA launches it first without waiting.
                gdev = dev;
                gcolsize     = gpuisactive[gdev];
                if(gcolsize>0){
                    devperm     = (gdev-stdev+ngpu)%ngpu;
                    gblk        = (nbblk/ngpu) + (nbblk%ngpu > devperm ?  1:0 );
                    magmablasSetKernelStream( streams[ dev ][ gdev ] );
                    magma_queue_wait_event(streams[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev]);
                    //printf     ("              GPU %d stream %d doing slacpy\n",dev,streams[ dev ][ gdev ]);
                    for( magma_int_t blki = 0; blki < gblk; ++blki){
                        gbblki = (blki*ngpu + devperm)*nb - blockoffset;
                        lcblki = blki*nb;
                        ib     = nb;//min(nb,m-gbblki);
                        if(gdev==stdev){
                            lcblki = blki*nb-blockoffset;
                            if(blki==0){
                                gbblki = 0;
                                lcblki = 0;
                                ib     = nb-blockoffset;
                            }
                        }
                        ib     = min(ib,m-gbblki);
                        //printf("                    blockoffset %d nbblk %d stdev %d  receiving from gdev %d gblk %d  gcolsize %d copying blki %d of size ibxn %dx%d from work[%d] to C[%d]\n", blockoffset,nbblk,stdev,gdev,gblk,gcolsize,blki,ib,n,lcblki,gbblki);
                        magmablas_slacpy( MagmaFull, ib, n, &dwork[dev][maxgsize*gdev+lcblki], gcolsize, &dC[dev][gbblki], lddc);
                    }// end blki
                }


                
                for( magma_int_t k = 0; k < nbcmplx; ++k ) {
                    gngpu   = gnode[k][MagmaMaxGPUs];
                    for( magma_int_t g = 0; g < gngpu; ++g ) {
                        gdev         = gnode[k][g];
                        gcolsize     = gpuisactive[gdev];
                        // If gcolsize > 0, GPU gdev was active, so I have
                        // received (or computed) a portion of dwork from it;
                        // walk over its gblk blocks and distribute them into dC.
                        if(gdev!=dev){
                            if(gcolsize>0){
                                devperm     = (gdev-stdev+ngpu)%ngpu;
                                gblk        = (nbblk/ngpu) + (nbblk%ngpu > devperm ?  1:0 );
                                magmablasSetKernelStream( streams[ dev ][ gdev ] );
                                if(k==cmplxid){
                                    //we are on the same group so wait on event issued by gdev for me citing his id
                                    magma_queue_wait_event(streams[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev]);
                                    //printf     ("              GPU %d stream %d waiting on event %d to collecte from %d the size of gcolsize %d\n",dev,streams[ dev ][ gdev ],redevents[gdev][dev*ngpu+gdev],gdev,gcolsize);
                                }else{
                                    //we are on different group so:
                                    //if I am the master wait on the event issued by gdev for me citing his id
                                    //else  wait event issued by my master for me on the behalf of gdev
                                    //printf     ("              GPU %d stream %d waiting on event %d to collecte from %d the size of gcolsize %d\n",dev,streams[ dev ][ gdev ],redevents[masterdev][dev*ngpu+gdev],gdev,gcolsize);
                                    if(dev==masterdev)
                                        magma_queue_wait_event(streams[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev]);
                                    else
                                        magma_queue_wait_event(streams[ dev ][ gdev ], redevents[masterdev][dev*ngpu+gdev]);
                                }
                                //printf     ("              GPU %d stream %d doing slacpy\n",dev,streams[ dev ][ gdev ]);
                                for( magma_int_t blki = 0; blki < gblk; ++blki){
                                    gbblki = (blki*ngpu + devperm)*nb - blockoffset;
                                    lcblki = blki*nb;
                                    ib     = nb;//min(nb,m-gbblki);
                                    if(gdev==stdev){
                                        lcblki = blki*nb-blockoffset;
                                        if(blki==0){
                                            gbblki = 0;
                                            lcblki = 0;
                                            ib     = nb-blockoffset;
                                        }
                                    }
                                    ib     = min(ib,m-gbblki);
                                    //printf("                    blockoffset %d nbblk %d stdev %d  receiving from gdev %d gblk %d  gcolsize %d copying blki %d of size ibxn %dx%d from work[%d] to C[%d]\n", blockoffset,nbblk,stdev,gdev,gblk,gcolsize,blki,ib,n,lcblki,gbblki);
                                    magmablas_slacpy( MagmaFull, ib, n, &dwork[dev][maxgsize*gdev+lcblki], gcolsize, &dC[dev][gbblki], lddc);
                                }// end blki
                            }// en gcolsize>0 meaning gdev is active
                        } // end if gdev != dev
                    }// end loop over the g gpus of the cmplx k
                }// end loop over the other complexes k
            }// end mycolsize>0 meaning that I am active
        }// end loop over idev of cmplxid
    }// end loop of the cmplx







    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        cudaDeviceSynchronize();
    }

    // put back the input gpu and its input stream 
    magma_setdevice( cdev );
    magmablasSetKernelStream( cstream );

}
Example No. 24
int main (int argc, char **argv){

	unsigned int size = 4000;
	unsigned int interval = 4000;
	int *data = (int*)malloc(size * sizeof(int));
	int threshold = 127;
	srand(0);

	unsigned int count = 0;
	int val;
	for (unsigned int s = 0; s < size; ++s) {
		val = rand() % (2*threshold + 1);
		if (val > threshold) {
			data[s] = 1;
			++count;
		} else {
			data[s] = 0;
		}
	}

	int * output = (int*)malloc(size * sizeof(int));
	unsigned int total = 0;

	cudaSetDevice(0);

	cudaStream_t stream;
	cudaError_t error;
	error = cudaStreamCreate(&stream);


	// now do the tests
	// CPU
	long t1, t2;
	t1 = ClockGetTime3();
	total = nscale::gpu::SelectCPUTesting(data, size, output );
	t2 = ClockGetTime3();
	printf("cpu: %d total, %lu ms\n", total, t2-t1);

	// thrust
	for (unsigned int s = 0; s < size; s++) {
		if ((s % (size / interval)) == 0) {
			printf("%d, ", data[s]);
		}
	}
	printf("\n");
	t1 = ClockGetTime3();
	total = nscale::gpu::SelectThrustScanTesting(data, size, output, stream);
	error = cudaStreamSynchronize(stream);
	t2 = ClockGetTime3();
	printf("thrust scan: %d total, %lu ms\n", total, t2-t1);
	for (unsigned int s = 0; s < size; s++) {
		if ((s % (size / interval)) == 0) {
			printf("%d, ", output[s]);
		}
	}
	printf("\n");
	cudaDeviceSynchronize();

	// warp scan unordered
//	for (unsigned int s = 0; s < size; s++) {
//		if ((s % 10000) == 0) {
//			printf("%d, ", data[s]);
//		}
//	}
	t1 = ClockGetTime3();
	total = nscale::gpu::SelectWarpScanUnorderedTesting(data, size, output, stream);
//	error = cudaStreamSynchronize(stream);
//	cudaDeviceSynchronize();
	t2 = ClockGetTime3();
	printf("warp scan unordered: %d total, %lu ms\n", total, t2-t1);
	for (unsigned int s = 0; s < size; s++) {
		if ((s % (size / interval)) == 0) {
			printf("%d, ", output[s]);
		}
	}
	printf("\n");
//	cudaDeviceSynchronize();
	int count2;
	// warp scan ordered
	t1 = ClockGetTime3();
	total = nscale::gpu::SelectWarpScanOrderedTesting(data, size, output, stream);
//	cudaDeviceSynchronize();
	t2 = ClockGetTime3();
	printf("warp scan ordered: %d total, %lu ms\n", total, t2-t1);
	for (unsigned int s = 0; s < size; s++) {
		if ((s % (size / interval)) == 0) {
			printf("%d, ", output[s]);
		}
	}
	printf("\n");

	error = cudaStreamDestroy(stream);

	free(data);
	free(output);

	return 0;
}
Example No. 25
int main( int argc, char **argv ) {
  printf("Starting\n");
  int size;
  cudaError_t cudaStat;
  magma_err_t magmaStat;
  cublasStatus_t stat;
  cublasHandle_t handle;
  int it,i;

  cublasOperation_t N = 'N';
  cublasOperation_t T = 'T';
  char N2 = 'N';
  char T2 = 'T';
  double one = 1., zero=0.;
  char uplo = 'L';
  int info;
  
  int err; double* A; double* B;
  magmaStat = magma_init();

  int use_pinned;
  if(argc > 1) {
    use_pinned = atoi(argv[1]);
  } else use_pinned = 0;
  printf("Setting use_pinned to %d\n", use_pinned);

  for( size = 256; size <= 8192; size*=2 ) {
 
     if(use_pinned) {
       // allocate pinned memory on CPU
       err = magma_dmalloc_pinned( &A,  size*size );  assert( err == 0 );
       err = magma_dmalloc_pinned( &B,  size*size );  assert( err == 0 );
     } else {
       // allocate standard memory on CPU
       A = (double*) malloc( sizeof(double)*size*size );
       B = (double*) malloc( sizeof(double)*size*size );
     }

    cudaDeviceSynchronize();
    double tInit = read_timer();     
    double *dA,*dB;
    // allocate memory on GPU
    magma_malloc( (void**) &dA, sizeof(double)*size*size );
    magma_malloc( (void**) &dB, sizeof(double)*size*size );
    
    cudaDeviceSynchronize();
    double tAlloc = read_timer();     
 
    fillMatrix(B, size*size);
 

    cudaDeviceSynchronize();
    double tInit2 = read_timer();

    // transfer data to GPU
    magma_dsetmatrix( size, size, B, size, dB, size );

    cudaDeviceSynchronize();
    double tTransferToGPU = read_timer();

    // matrix multiply
    magmablas_dgemm('N', 'T', size, size, size, one, dB, size, dB, size, zero, dA, size );
    // magma_dgemm is apparently synonymous with magmablas_dgemm

    cudaDeviceSynchronize();
    double tMatMult = read_timer();
 
    // Cholesky decomposition on GPU with GPU interface (called with object on GPU)
    magma_dpotrf_gpu( 'L', size, dA, size, &info );

    cudaDeviceSynchronize();
    double tChol = read_timer();

    // transfer data back to CPU
    magma_dgetmatrix( size, size, dA, size, A, size );
    cudaDeviceSynchronize();
    double tTransferFromGPU = read_timer();
 
    // standard BLAS matrix multiply on CPU
    dgemm_( &N2, &T2, &size, &size, &size, &one, B, &size, B, &size, &zero, A, &size );

    cudaDeviceSynchronize();
    double tMatMultBlas = read_timer();

    // Cholesky decomposition on GPU with CPU interface (called with object on CPU)
    magma_dpotrf( 'L', size, A, size, &info );

    cudaDeviceSynchronize();
    double tCholCpuInterface = read_timer();

    // recreate A = B * B (could just do a save and copy instead....)
    dgemm_( &N2, &T2, &size, &size, &size, &one, B, &size, B, &size, &zero, A, &size );

    cudaDeviceSynchronize();
    double tInit3 = read_timer();

    // standard Lapack Cholesky decomposition on CPU
    dpotrf_(&uplo, &size, A, &size, &info);
  
    cudaDeviceSynchronize();
    double tCholCpu= read_timer();
 

    printf("====================================================\n");
    printf("Timing results for n = %d\n", size);
    printf("GPU memory allocation time: %f\n", tAlloc - tInit);
    printf("Transfer to GPU time: %f\n", tTransferToGPU - tInit2);
    printf("Matrix multiply time (GPU): %f\n", tMatMult - tTransferToGPU);
    printf("Matrix multiply time (BLAS): %f\n", tMatMultBlas - tTransferToGPU);
    printf("Cholesky factorization time (GPU w/ GPU interface): %f\n", tChol - tMatMult);
    printf("Cholesky factorization time (GPU w/ CPU interface): %f\n", tCholCpuInterface - tMatMultBlas);
    printf("Cholesky factorization time (LAPACK): %f\n", tCholCpu - tInit3);
    printf("Transfer from GPU time: %f\n", tTransferFromGPU - tChol);

    if(use_pinned) {
      magma_free_pinned(A);
      magma_free_pinned(B);
    } else {
      free(A);
      free(B);
    }
    magma_free(dA);
    magma_free(dB);
 
  }
  return EXIT_SUCCESS;
}
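The MAGMA benchmark above brackets every phase with cudaDeviceSynchronize() followed by read_timer(), which is not defined in the listing. A minimal sketch, assuming it returns wall-clock seconds as a double (the original helper may be implemented differently):

#include <sys/time.h>

// Hypothetical wall-clock helper in the spirit of the read_timer() used above.
double read_timer(void) {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec + (double)tv.tv_usec * 1.0e-6;
}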
Example No. 26
double
do_compute_and_probe(double seconds, MPI_Request* request)
{
    double t1 = 0.0, t2 = 0.0;
    double test_time = 0.0;
    int num_tests = 0;
    double target_seconds_for_compute = 0.0;
    int flag = 0;
    MPI_Status status;

    if (options.num_probes) {
        target_seconds_for_compute = (double) seconds/options.num_probes;
        if (DEBUG) fprintf(stderr, "setting target seconds to %f\n", (target_seconds_for_compute * 1e6 ));
    } 
    else {
        target_seconds_for_compute = seconds;
        if (DEBUG) fprintf(stderr, "setting target seconds to %f\n", (target_seconds_for_compute * 1e6 ));
    }

#ifdef _ENABLE_CUDA_KERNEL_
    if (options.target == gpu) {
        if (options.num_probes) {
            /* Do the dummy compute on GPU only */
            do_compute_gpu(target_seconds_for_compute);
            num_tests = 0;
            while (num_tests < options.num_probes) {
                t1 = MPI_Wtime();
                MPI_Test(request, &flag, &status);
                t2 = MPI_Wtime();
                test_time += (t2-t1);
                num_tests++;
            }
        }
        else {
            do_compute_gpu(target_seconds_for_compute);
        }
    }
    else if (options.target == both) {
        if (options.num_probes) {
            /* Do the dummy compute on GPU and CPU*/
            do_compute_gpu(target_seconds_for_compute);
            num_tests = 0;
            while (num_tests < options.num_probes) {
                t1 = MPI_Wtime();
                MPI_Test(request, &flag, &status);
                t2 = MPI_Wtime();
                test_time += (t2-t1);
                num_tests++;
                do_compute_cpu(target_seconds_for_compute);
            }
        } 
        else {
            do_compute_gpu(target_seconds_for_compute);
            do_compute_cpu(target_seconds_for_compute);
        }        
    }
    else
#endif
    if (options.target == cpu) {
        if (options.num_probes) {
            num_tests = 0;
            while (num_tests < options.num_probes) {
                do_compute_cpu(target_seconds_for_compute);
                t1 = MPI_Wtime();
                MPI_Test(request, &flag, &status);
                t2 = MPI_Wtime();
                test_time += (t2-t1);
                num_tests++;
            }
        }
        else {
            do_compute_cpu(target_seconds_for_compute);
        }
    }

#ifdef _ENABLE_CUDA_KERNEL_
    if (options.target == gpu || options.target == both) {
        cudaDeviceSynchronize();    
        cudaStreamDestroy(stream);
    }
#endif
    
    return test_time;
}
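
/* The do_compute_cpu()/do_compute_gpu() routines called above are defined
 * elsewhere in the benchmark.  As a rough, hypothetical sketch, the CPU
 * variant could simply burn cycles until the requested wall-clock budget is
 * spent; the real routine may perform actual dummy work (e.g. a small matrix
 * multiply) instead of a pure busy-wait. */

#include <mpi.h>

/* Hypothetical sketch of do_compute_cpu(): spin for roughly target_seconds. */
static void do_compute_cpu(double target_seconds)
{
    volatile double sink = 0.0;          /* keeps the loop alive under optimization */
    double t_start = MPI_Wtime();
    while (MPI_Wtime() - t_start < target_seconds) {
        sink += 1.0;
    }
    (void) sink;
}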
Ejemplo n.º 27
0
/*
 * main should only control threads
 *
 * the threads should be invoked on different cores:
 * http://stackoverflow.com/questions/1407786/how-to-set-cpu-affinity-of-a-particular-pthread
 * https://www.google.pl/search?client=ubuntu&channel=fs&q=how+to+schedule+pthreads+through+cores&ie=utf-8&oe=utf-8&gfe_rd=cr&ei=PSudVePFOqeA4AShra2AAQ
 */
int main() {

    cudaDeviceReset();
    cudaDeviceSynchronize();

    // print device properties
    print_device();

    // create pointers to data
    const uint64_t size = N;
    double complex* data_r_host = NULL; // initialize with NULL for debugging purposes
    double complex* data_k_host = NULL; // initialize with NULL for debugging purposes
    DataArray* data_arr_ptr = (DataArray*) malloc((size_t) sizeof(DataArray)); // could be made a global variable for simpler code
    create_data_arr(data_arr_ptr, &data_r_host, &data_k_host, size);

    // allocate memory for array of streams
    const uint8_t num_streams = 2; // could be a #define instead?
    streams_arr = (cudaStream_t*) malloc( (size_t) sizeof(cudaStream_t)*num_streams);

    // create threads
    const uint8_t num_threads = 2;
    printf("host thread id\t %u\ndevice thread id %u\n",HOST_THRD, DEVICE_THRD);

    pthread_t* thread_ptr_arr = (pthread_t*) malloc( (size_t) sizeof(pthread_t)*num_threads ); // alternatively pthread_t* thread_ptr_arr[num_threads];

    // init barrier for threads
    pthread_barrier_init (&barrier, NULL, num_threads); // the last argument is how many threads this barrier synchronizes

    pthread_create(&thread_ptr_arr[HOST_THRD], NULL, host_thread, (void*) data_arr_ptr);
    pthread_create(&thread_ptr_arr[DEVICE_THRD], NULL, device_thread, (void*) data_arr_ptr);

//   for (uint8_t ii = 0; ii < num_threads; ii++) {
//     pthread_create(thread_ptr_arr[ii], NULL, host_thread, (void*) data_arr_ptr);
//   }

    //cudaStream_t stream1;
    //cudaStream_t stream2;
    //cudaStream_t* streams_arr[2] = {&stream1, &stream2};

    void* status;
    pthread_join(thread_ptr_arr[HOST_THRD], &status);
    pthread_join(thread_ptr_arr[DEVICE_THRD], &status);

    printf("data visible in main thread:\n");
    /*for (uint64_t ii=0; ii < ((data_arr_ptr->size <= 32) ? data_arr_ptr->size : 32); ii++) {
      printf( "%lu.\t",ii );
      printf( "%lf + %lf\t", creal(data_r_host[ii]), cimag(data_r_host[ii]) );
      printf( "%lf + %lf\n", creal(data_k_host[ii]), cimag(data_k_host[ii]) );
    }*/



    free(thread_ptr_arr);
    free(streams_arr);
    free_data_arr(data_arr_ptr);
    cudaDeviceSynchronize();
    free(data_arr_ptr);

    cudaDeviceReset(); // cudaThreadExit() is deprecated; cudaDeviceReset() is the modern equivalent
    cudaDeviceSynchronize();

    printf("Main: program completed. Exiting...\n");
    return EXIT_SUCCESS;
}
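
/* The comment block at the top of this example asks how to schedule the two
 * worker threads on different cores.  On Linux this can be done with
 * pthread_setaffinity_np(); a minimal, hypothetical sketch follows (the code
 * above does not actually set affinity). */

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

/* Hypothetical helper: pin an existing pthread to a single core (Linux-specific). */
static int pin_thread_to_core(pthread_t thread, int core_id)
{
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(core_id, &cpuset);
    return pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
}

/* e.g. after pthread_create():
 *   pin_thread_to_core(thread_ptr_arr[HOST_THRD],   0);
 *   pin_thread_to_core(thread_ptr_arr[DEVICE_THRD], 1);
 */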
Ejemplo n.º 28
0
void TEMPLATE2 (CHOLMOD (gpu_final_assembly))
(
    cholmod_common *Common,
    double *Lx,
    Int psx,
    Int nscol,
    Int nsrow,
    int supernodeUsedGPU,
    int *iHostBuff,
    int *iDevBuff,
    cholmod_gpu_pointers *gpu_p
)
{
    Int iidx, i, j;
    Int iHostBuff2 ;
    Int iDevBuff2 ;

    if ( supernodeUsedGPU ) {

        /* ------------------------------------------------------------------ */
        /* Apply all of the Schur-complement updates, computed on the GPU, to */
        /* the supernode. */
        /* ------------------------------------------------------------------ */

        *iHostBuff = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS;
        *iDevBuff = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS;

        if ( nscol * L_ENTRY >= CHOLMOD_POTRF_LIMIT ) {

            /* If this supernode is going to be factored using the GPU (potrf)
             * then it will need the portion of the update assembled on the
             * CPU.  So copy that to a pinned buffer and do an H2D copy to the device. */

            /* wait until a buffer is free */
            cudaEventSynchronize ( Common->updateCBuffersFree[*iHostBuff] );

            /* copy update assembled on CPU to a pinned buffer */

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS)   \
    private(iidx) if (nscol>32)

            for ( j=0; j<nscol; j++ ) {
                for ( i=j; i<nsrow*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    gpu_p->h_Lx[*iHostBuff][iidx] = Lx[psx*L_ENTRY+iidx];
                }
            }

            /* H2D transfer of update assembled on CPU */
            cudaMemcpyAsync ( gpu_p->d_A[1], gpu_p->h_Lx[*iHostBuff],
                              nscol*nsrow*L_ENTRY*sizeof(double),
                              cudaMemcpyHostToDevice,
                              Common->gpuStream[*iDevBuff] );
        }

        Common->ibuffer++;

        iHostBuff2 = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS;
        iDevBuff2 = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS;

        /* wait for all kernels to complete */
        cudaEventSynchronize( Common->updateCKernelsComplete );

        /* copy assembled Schur-complement updates computed on GPU */
        cudaMemcpyAsync ( gpu_p->h_Lx[iHostBuff2], gpu_p->d_A[0],
                          nscol*nsrow*L_ENTRY*sizeof(double),
                          cudaMemcpyDeviceToHost,
                          Common->gpuStream[iDevBuff2] );

        if ( nscol * L_ENTRY >= CHOLMOD_POTRF_LIMIT ) {

            /* with the current implementation, potrf still uses data from the
             * CPU - so put the fully assembled supernode in a pinned buffer for
             * fastest access */

            /* need both H2D and D2H copies to be complete */
            cudaDeviceSynchronize();

            /* sum updates from cpu and device on device */
#ifdef REAL
            sumAOnDevice ( gpu_p->d_A[1], gpu_p->d_A[0], -1.0, nsrow, nscol );
#else
            sumComplexAOnDevice ( gpu_p->d_A[1], gpu_p->d_A[0],
                                  -1.0, nsrow, nscol );
#endif

            /* place final assembled supernode in pinned buffer */

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS)   \
    private(iidx) if (nscol>32)

            for ( j=0; j<nscol; j++ ) {
                for ( i=j*L_ENTRY; i<nscol*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    gpu_p->h_Lx[*iHostBuff][iidx] -=
                        gpu_p->h_Lx[iHostBuff2][iidx];
                }
            }

        }
        else
        {

            /* assemble with CPU updates */
            cudaDeviceSynchronize();

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS)   \
    private(iidx) if (nscol>32)

            for ( j=0; j<nscol; j++ ) {
                for ( i=j*L_ENTRY; i<nsrow*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    Lx[psx*L_ENTRY+iidx] -= gpu_p->h_Lx[iHostBuff2][iidx];
                }
            }
        }
    }
    return;
}
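
/* sumAOnDevice()/sumComplexAOnDevice() are CHOLMOD GPU helpers not shown in
 * this snippet.  Purely as a hypothetical illustration of the operation the
 * comment describes ("sum updates from cpu and device on device"), the real
 * case could look roughly like the kernel below; the actual CHOLMOD
 * implementation may differ in layout and launch configuration. */

/* Hypothetical sketch only: dst += alpha * src over an nsrow-by-nscol
 * column-major block, as a stand-in for sumAOnDevice(). */
__global__ void sumA_kernel(double *dst, const double *src, double alpha,
                            int nsrow, int nscol)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;   /* row index    */
    int j = blockIdx.y * blockDim.y + threadIdx.y;   /* column index */
    if (i < nsrow && j < nscol) {
        int idx = j * nsrow + i;
        dst[idx] += alpha * src[idx];
    }
}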
Ejemplo n.º 29
0
static int cutorch_synchronize(lua_State *L)
{
  cudaDeviceSynchronize();  
  return 0;
}
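/* cutorch_synchronize() above is a Lua C binding.  A hypothetical sketch of
 * how it might be registered so Lua code can call cutorch.synchronize(); the
 * real cutorch init table registers many more functions. */

#include <lua.h>
#include <lauxlib.h>

static const struct luaL_Reg cutorch_sketch[] = {
    {"synchronize", cutorch_synchronize},
    {NULL, NULL}                          /* sentinel terminates the table */
};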
int _tmain(int argc, _TCHAR* argv[]) 
{
	uchar4 *h_inputImageRGBA,  *d_inputImageRGBA;
	uchar4 *h_outputImageRGBA, *d_outputImageRGBA;
	unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred;

	float *h_filter;
	int    filterWidth;

	//PreProcess
	const std::string *filename = new std::string("./cinque_terre_small.jpg");
	cv::Mat imageInputRGBA;
	cv::Mat imageOutputRGBA;

	//make sure the context initializes ok
	checkCudaErrors(cudaFree(0));

	cv::Mat image = cv::imread(filename->c_str(), CV_LOAD_IMAGE_COLOR);
  
	if (image.empty())
	{
		std::cerr << "Couldn't open file: " << *filename << std::endl;
		cv::waitKey(0);
		exit(1);
	}

	cv::cvtColor(image, imageInputRGBA, CV_BGR2RGBA);

	//allocate memory for the output
	imageOutputRGBA.create(image.rows, image.cols, CV_8UC4);

	//This shouldn't ever happen given the way the images are created
	//at least based upon my limited understanding of OpenCV, but better to check
	if (!imageInputRGBA.isContinuous() || !imageOutputRGBA.isContinuous()) {
		std::cerr << "Images aren't continuous!! Exiting." << std::endl;
		exit(1);
	}

	h_inputImageRGBA  = (uchar4 *)imageInputRGBA.ptr<unsigned char>(0);
	h_outputImageRGBA = (uchar4 *)imageOutputRGBA.ptr<unsigned char>(0);

	const size_t numPixels = image.rows * image.cols;
	//allocate memory on the device for both input and output
	checkCudaErrors(cudaMalloc(&d_inputImageRGBA, sizeof(uchar4) * numPixels));
	checkCudaErrors(cudaMalloc(&d_outputImageRGBA, sizeof(uchar4) * numPixels));
	checkCudaErrors(cudaMemset(d_outputImageRGBA, 0, numPixels * sizeof(uchar4))); //make sure no memory is left lying around

	//copy input array to the GPU
	checkCudaErrors(cudaMemcpy(d_inputImageRGBA, h_inputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice));

	//now create the filter that they will use
	const int blurKernelWidth = 9;
	const float blurKernelSigma = 2.;

	filterWidth = blurKernelWidth;

	//create and fill the filter we will convolve with
	h_filter = new float[blurKernelWidth * blurKernelWidth];

	float filterSum = 0.f; //for normalization

	for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) 
	{
		for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c)
		{
			float filterValue = expf( -(float)(c * c + r * r) / (2.f * blurKernelSigma * blurKernelSigma));
			h_filter[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] = filterValue;
			filterSum += filterValue;
		}
	}

	float normalizationFactor = 1.f / filterSum;

	for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r)
	{
		for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c)
		{
			h_filter[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] *= normalizationFactor;
		}
	}

	//blurred
	checkCudaErrors(cudaMalloc(&d_redBlurred,    sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMalloc(&d_greenBlurred,  sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMalloc(&d_blueBlurred,   sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMemset(d_redBlurred,   0, sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMemset(d_greenBlurred, 0, sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMemset(d_blueBlurred,  0, sizeof(unsigned char) * numPixels));


	allocateMemoryAndCopyToGPU(image.rows, image.cols, h_filter, filterWidth);
	GpuTimer timer;
	timer.Start();
	//call the students' code
	your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA, image.rows, image.cols,
						d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth);
	timer.Stop();
	cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
	int err = printf("%f msecs.\n", timer.Elapsed());

	if (err < 0) {
		//Couldn't print! Probably the student closed stdout - bad news
		std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
		exit(1);
	}

	cleanup();

	//check results and output the blurred image
	//PostProcess

	//copy the output back to the host
	checkCudaErrors(cudaMemcpy(imageOutputRGBA.ptr<unsigned char>(0), d_outputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost));

	cv::Mat imageOutputBGR;
	cv::cvtColor(imageOutputRGBA, imageOutputBGR, CV_RGBA2BGR);
	//output the image
	cv::imwrite("./blurredResult.jpg", imageOutputBGR);

	cv::namedWindow( "Display window", CV_WINDOW_NORMAL);
	cv::imshow("Display window", imageOutputBGR);
	
	cv::waitKey(0);


	checkCudaErrors(cudaFree(d_redBlurred));
	checkCudaErrors(cudaFree(d_greenBlurred));
	checkCudaErrors(cudaFree(d_blueBlurred));

	return 0;
}
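
// The GpuTimer used above is not shown in this snippet.  Below is a minimal,
// hypothetical sketch of the usual cudaEvent-based timer it appears to assume
// (Start/Stop record events, Elapsed returns milliseconds); the project's
// actual timer header may differ.

#include <cuda_runtime.h>

struct GpuTimer {
    cudaEvent_t start, stop;
    GpuTimer()  { cudaEventCreate(&start); cudaEventCreate(&stop); }
    ~GpuTimer() { cudaEventDestroy(start); cudaEventDestroy(stop); }
    void Start() { cudaEventRecord(start, 0); }
    void Stop()  { cudaEventRecord(stop, 0); }
    float Elapsed() {
        float ms = 0.0f;
        cudaEventSynchronize(stop);            // ensure the stop event has completed
        cudaEventElapsedTime(&ms, start, stop);
        return ms;                             // elapsed time in milliseconds
    }
};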