Example #1
    template <typename Container>
    typename Container::value_type* cuda_get_mapped_device_pointer(Container& c)
    {
      typename Container::value_type* out = nullptr;
      using locality = typename meta::option<Container, tag::locality_>::type;

      // A device-side alias exists only for mapped, pinned host allocations.
      if ((cuda_alloc_type == cudaHostAllocMapped) && std::is_same<locality, pinned_>::value)
        cudaHostGetDevicePointer((void**)&out, (void*)c.data(), 0);

      return out;
    }
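Example #1 only returns a device alias when the container's storage is pinned and the allocation mode is cudaHostAllocMapped. A minimal, self-contained sketch of the zero-copy pattern this helper wraps is shown below; the buffer size, kernel, and function names are illustrative and not taken from the snippet above.

#include <cuda_runtime.h>

__global__ void scale(float* data, int n, float s)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= s;
}

void zero_copy_sketch(int n)
{
    // Must be set before the CUDA context is created for mapped allocations.
    cudaSetDeviceFlags(cudaDeviceMapHost);

    float* host = nullptr;
    float* dev  = nullptr;
    cudaHostAlloc((void**)&host, n * sizeof(float), cudaHostAllocMapped);
    cudaHostGetDevicePointer((void**)&dev, host, 0);  // device alias of the same memory

    scale<<<(n + 255) / 256, 256>>>(dev, n, 2.0f);
    cudaDeviceSynchronize();  // kernel writes are visible on the host after this
    cudaFreeHost(host);
}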
Example #2
extern "C" double gsum(UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error) 
{
	DBUG_ENTER("udf_sum::gsum");

	double* h_in;
	double* h_out;
	double* d_in;
	double* d_out;
	unsigned int index = 0;

	cudaHostAlloc((void**)&h_in, MAXIMUM_ELEMENTS_IN_CACHE*sizeof(double), cudaHostAllocWriteCombined | cudaHostAllocMapped );
	CUDA_CHECK_ERRORS("cudaHostAlloc -> h_in");
	cudaHostAlloc((void**)&h_out, CUDA_BLOCK_SIZE*sizeof(double), cudaHostAllocWriteCombined | cudaHostAllocMapped );
	CUDA_CHECK_ERRORS("cudaHostAlloc -> h_out");

	cudaHostGetDevicePointer((void**)&d_in, h_in, 0);
	cudaHostGetDevicePointer((void**)&d_out, h_out, 0);


	char* column_name = (char*) args->args[0];
	char* table_name = (char*) args->args[1];
	char* schema_name = (char*) args->args[2];

	DBUG_PRINT("info", ("column_name [%s], table_name [%s], schema_name [%s]", column_name, table_name, schema_name));
	fprintf(stderr, "column_name [%s], table_name [%s], schema_name [%s]\n", column_name, table_name, schema_name);
	fflush(stderr);

	THD *thd = current_thd;

	TABLE_LIST* table_list = new TABLE_LIST;	
	memset((char*) table_list, 0, sizeof(TABLE_LIST));

	DBUG_PRINT("info", ("table_list->init_one_table"));
	table_list->init_one_table(schema_name, strlen(schema_name), table_name, strlen(table_name), table_name, TL_READ);
	DBUG_PRINT("info", ("open_and_lock_tables"));
	open_and_lock_tables(thd, table_list, FALSE, MYSQL_OPEN_IGNORE_FLUSH | MYSQL_LOCK_IGNORE_TIMEOUT);

	TABLE* table = table_list->table;

	clock_t cpu_clock;
	cpu_clock = clock();
	table->file->ha_rnd_init(true);

	// Guard against overrunning the fixed-size input cache.
	while (index < MAXIMUM_ELEMENTS_IN_CACHE && table->file->ha_rnd_next(table->record[0]) == 0){
		h_in[index++] = table->field[1]->val_real();
	}
	// Zero the unused tail so the reduction does not sum uninitialized values.
	for (unsigned int i = index; i < MAXIMUM_ELEMENTS_IN_CACHE; i++) h_in[i] = 0.0;
	table->file->ha_rnd_end();
	cpu_clock = clock() - cpu_clock;
	fprintf(stderr, "gsum -> index [%d]\n", index);
	fprintf(stderr, "gsum -> fill cache within [%f seconds]\n", ((float)cpu_clock)/CLOCKS_PER_SEC);
	fflush(stderr);
	

	cudaEvent_t start, stop;
	float elapsedTime;

	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	cudaEventRecord(start, 0);
	ReductionTask reduction_task(MAXIMUM_ELEMENTS_IN_CACHE, sizeof(double), CUDA_BLOCK_SIZE, CUDA_THREAD_PER_BLOCK_SIZE, R_SUM, R_DOUBLE);
	reductionWorkerUsingMappedMemory<double>(d_in, d_out, &reduction_task);
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&elapsedTime, start, stop);

	double gpu_sum = 0;
	for (unsigned int i = 0; i < CUDA_BLOCK_SIZE; i++)
	{
		gpu_sum += ((double*)h_out)[i];
	}

	float bandwidthInMBs = (1e3f * MAXIMUM_ELEMENTS_IN_CACHE*sizeof(double)) / (elapsedTime * (float)(1 << 20));
	fprintf(stderr, "gpu result [%f], gpu time [%f seconds] bandwidth (mb) [%f]\n", gpu_sum, elapsedTime/1000.0, bandwidthInMBs);
	fflush(stderr);

	cudaFreeHost(h_in);
	CUDA_CHECK_ERRORS("cudaFreeHost -> h_in");
	cudaFreeHost(h_out);
	CUDA_CHECK_ERRORS("cudaFreeHost -> h_out");

	thd->cleanup_after_query();
	DBUG_PRINT("info", ("about to delete table_list"));
	delete table_list;

	DBUG_RETURN(gpu_sum);
}
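The UDF above assumes that mapped (zero-copy) allocations are available; the plugin's initialization is not shown in the snippet. Below is a hedged sketch of the one-time device check and flag setup such code typically needs before the first cudaHostAllocMapped call (the gsum_plugin_init name is hypothetical):

// Hypothetical one-time setup; must run before the first mapped allocation.
static int gsum_plugin_init(void)
{
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, 0);
	if (!prop.canMapHostMemory)
		return 1;	// zero copy is not supported on this device

	// Enable host-mapped allocations before the CUDA context is created.
	cudaSetDeviceFlags(cudaDeviceMapHost);
	return 0;
}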
Example #3
void MFNHashTypePlainCUDA::allocateThreadAndDeviceMemory() {
    trace_printf("MFNHashTypePlainCUDA::allocateThreadAndDeviceMemory()\n");
    /**
     * Error variable - stores the result of the various mallocs & such.
     */
    cudaError_t err, err2;

    /**
     * Flags for calling cudaHostAlloc - will be set to cudaHostAllocMapped
     * if we are mapping memory to the host with zero copy.
     */
    unsigned int cudaHostMallocFlags = 0;

    if (this->useZeroCopy) {
        cudaHostMallocFlags |= cudaHostAllocMapped;
    }


    /*
     * Malloc the device hashlist space.  This is the number of available hashes
     * times the hash length in bytes.  The data will be copied later.
     */
    err = cudaMalloc((void **)&this->DeviceHashlistAddress,
        this->activeHashesProcessed.size() * this->hashLengthBytes);
    if (err != cudaSuccess) {
        printf("Unable to allocate %d bytes for device hashlist!  Exiting!\n",
                this->activeHashesProcessed.size() * this->hashLengthBytes);
        printf("return code: %d\n", err);
        exit(1);
    }

    /*
     * Allocate the host/device space for the success list (flags for found passwords).
     * This is a byte per password.  To avoid atomic write issues, each password
     * gets a full addressible byte, and the GPU handles the dependencies between
     * multiple threads trying to set a flag in the same segment of memory.
     *
     * On the host, it will be allocated as mapped memory if we are using zerocopy.
     *
     * As this region of memory is frequently copied back to the host, mapping it
     * improves performance.  In theory.
     */
    err = cudaHostAlloc((void **)&this->HostSuccessAddress,
        this->activeHashesProcessed.size(), cudaHostMallocFlags);
    if (err != cudaSuccess) {
        printf("Unable to allocate %d bytes for success flags!  Exiting!\n",
                this->activeHashesProcessed.size());
        printf("return code: %d\n", err);
        exit(1);
    }

    // Clear host success flags region - if we are mapping the memory, the GPU
    // will directly write this.
    memset(this->HostSuccessAddress, 0, this->activeHashesProcessed.size());

    // Allocate memory for the reported flags.
    this->HostSuccessReportedAddress = new uint8_t [this->activeHashesProcessed.size()];
    memset(this->HostSuccessReportedAddress, 0, this->activeHashesProcessed.size());

    // If zero copy is in use, get the device pointer for the success data, else
    // malloc a region of memory on the device.
    if (this->useZeroCopy) {
        err = cudaHostGetDevicePointer((void **)&this->DeviceSuccessAddress,
            this->HostSuccessAddress, 0);
        err2 = cudaSuccess;
    } else {
        err = cudaMalloc((void **)&this->DeviceSuccessAddress,
            this->activeHashesProcessed.size());
        err2 = cudaMemset(this->DeviceSuccessAddress, 0,
            this->activeHashesProcessed.size());
    }
    if ((err != cudaSuccess) || (err2 != cudaSuccess)) {
        printf("Unable to allocate %d bytes for device success list!  Exiting!\n",
                this->activeHashesProcessed.size());
        printf("return code: %d\n", err);
        printf("return code: %d\n", err2);
        exit(1);
    }

    /*
     * Allocate memory for the found passwords.  As this is commonly copied
     * back and forth, it will be made zero copy if requested.
     *
     * This requires (number hashes * passwordLength) bytes of data.
     */

    err = cudaHostAlloc((void **)&this->HostFoundPasswordsAddress,
        this->passwordLength * this->activeHashesProcessed.size() , cudaHostMallocFlags);
    if (err != cudaSuccess) {
        printf("Unable to allocate %d bytes for host password list!  Exiting!\n",
                this->passwordLength * this->activeHashesProcessed.size());
        printf("return code: %d\n", err);
        exit(1);
    }
    // Clear the host found password space.
    memset(this->HostFoundPasswordsAddress, 0,
            this->passwordLength * this->activeHashesProcessed.size());

    if (this->useZeroCopy) {
        err = cudaHostGetDevicePointer((void **)&this->DeviceFoundPasswordsAddress,
            this->HostFoundPasswordsAddress, 0);
        err2 = cudaSuccess;
    } else {
        err = cudaMalloc((void **)&this->DeviceFoundPasswordsAddress,
            this->passwordLength * this->activeHashesProcessed.size());
        err2 = cudaMemset(this->DeviceFoundPasswordsAddress, 0,
            this->passwordLength * this->activeHashesProcessed.size());
    }
    if ((err != cudaSuccess) || (err2 != cudaSuccess)) {
        printf("Unable to allocate %d bytes for device password list!  Exiting!\n",
                this->passwordLength * this->activeHashesProcessed.size());
        printf("return code: %d\n", err);
        printf("return code: %d\n", err2);
        exit(1);
    }

    /**
     * Allocate space for host and device start positions.  To improve performance,
     * this space is now aligned for improved coalescing performance.  All the
     * position 0 elements are together, followed by all the position 1 elements,
     * etc.
     *
     * This memory can be allocated as write combined, as it is not read by
     * the host ever - only written.  Since it is regularly transferred to the
     * GPU, this should help improve performance.
     */
    err = cudaHostAlloc((void**)&this->HostStartPointAddress,
        this->TotalKernelWidth * this->passwordLength,
        cudaHostAllocWriteCombined | cudaHostMallocFlags);

    err2 = cudaMalloc((void **)&this->DeviceStartPointAddress,
        this->TotalKernelWidth * this->passwordLength);

    if ((err != cudaSuccess) || (err2 != cudaSuccess)) {
        printf("Unable to allocate %d bytes for host/device startpos list!  Exiting!\n",
                this->TotalKernelWidth * this->passwordLength);
        printf("return code: %d\n", err);
        printf("return code: %d\n", err2);
        exit(1);
    }
    
    /**
     * Allocate space for the device start password values.  This is a copy of
     * the MFNHashTypePlain::HostStartPasswords32 vector for the GPU.
     */
    err = cudaMalloc((void **)&this->DeviceStartPasswords32Address,
        this->TotalKernelWidth * this->passwordLengthWords);
    
    if ((err != cudaSuccess)) {
        printf("Unable to allocate %d bytes for host/device startpos list!  Exiting!\n",
                this->TotalKernelWidth * this->passwordLengthWords);
        printf("return code: %d\n", err);
        exit(1);
    }


    /**
     * Finally, attempt to allocate space for the giant device bitmaps.  There
     * are 4x128MB bitmaps, and any number can be allocated.  If they are not
     * fully allocated, their address is set to null as an indicator to the device
     * that there is no data present.  Attempt to allocate as many as possible.
     *
     * This will be accessed regularly, so should probably not be zero copy.
     * Also, I'm not sure how mapping host memory into multiple threads would
     * work.  Typically, if the GPU doesn't have enough RAM for the full
     * set of bitmaps, it's a laptop, and therefore may be short on host RAM
     * for the pinned access.
     *
     * If there is an error in allocation, call cudaGetLastError() to clear it -
     * we know there has been an error, and do not want it to persist.
     */
    err = cudaMalloc((void **)&this->DeviceBitmap128mb_a_Address,
        128 * 1024 * 1024);
    if (err == cudaSuccess) {
        memalloc_printf("Successfully allocated Bitmap A\n");
    } else {
        memalloc_printf("Unable to allocate 128MB bitmap A\n");
        this->DeviceBitmap128mb_a_Address = 0;
        cudaGetLastError();
    }

    err = cudaMalloc((void **)&this->DeviceBitmap128mb_b_Address,
        128 * 1024 * 1024);
    if (err == cudaSuccess) {
        memalloc_printf("Successfully allocated Bitmap B\n");
    } else {
        memalloc_printf("Unable to allocate 128MB bitmap B\n");
        this->DeviceBitmap128mb_b_Address = 0;
        cudaGetLastError();
    }

    err = cudaMalloc((void **)&this->DeviceBitmap128mb_c_Address,
        128 * 1024 * 1024);
    if (err == cudaSuccess) {
        memalloc_printf("Successfully allocated Bitmap C\n");
    } else {
        memalloc_printf("Unable to allocate 128MB bitmap C\n");
        this->DeviceBitmap128mb_c_Address = 0;
        cudaGetLastError();
    }

    err = cudaMalloc((void **)&this->DeviceBitmap128mb_d_Address,
        128 * 1024 * 1024);
    if (err == cudaSuccess) {
        memalloc_printf("Successfully allocated Bitmap D\n");
    } else {
        memalloc_printf("Unable to allocate 128MB bitmap D\n");
        this->DeviceBitmap128mb_d_Address = 0;
        cudaGetLastError();
    }
    //printf("Thread %d memory allocated successfully\n", this->threadId);
}
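The allocation above is asymmetric: host regions come from cudaHostAlloc and are released with cudaFreeHost, while separate device copies exist only when zero copy is off. Below is a hedged sketch of the matching teardown; the method name is assumed and only the members allocated above are handled.

void MFNHashTypePlainCUDA::freeThreadAndDeviceMemory() {
    trace_printf("MFNHashTypePlainCUDA::freeThreadAndDeviceMemory()\n");

    cudaFree(this->DeviceHashlistAddress);

    // Host regions always come from cudaHostAlloc.
    cudaFreeHost(this->HostSuccessAddress);
    cudaFreeHost(this->HostFoundPasswordsAddress);
    cudaFreeHost(this->HostStartPointAddress);
    delete[] this->HostSuccessReportedAddress;

    // With zero copy the "device" success/password pointers alias the host
    // allocations and must not be freed separately.
    if (!this->useZeroCopy) {
        cudaFree(this->DeviceSuccessAddress);
        cudaFree(this->DeviceFoundPasswordsAddress);
    }
    cudaFree(this->DeviceStartPointAddress);
    cudaFree(this->DeviceStartPasswords32Address);

    // Bitmap pointers may be null if allocation failed; cudaFree(0) is a no-op.
    cudaFree(this->DeviceBitmap128mb_a_Address);
    cudaFree(this->DeviceBitmap128mb_b_Address);
    cudaFree(this->DeviceBitmap128mb_c_Address);
    cudaFree(this->DeviceBitmap128mb_d_Address);
}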
Example #4
cudaError_t WINAPI wine_cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags) {
    WINE_TRACE("\n");
    return cudaHostGetDevicePointer( pDevice, pHost, flags );
}
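A call through this thunk behaves exactly like a direct runtime call, so the caller still checks the returned cudaError_t. A small hedged usage sketch (hostPtr is assumed to be a previously mapped host allocation):

void* devPtr = NULL;
cudaError_t status = wine_cudaHostGetDevicePointer(&devPtr, hostPtr, 0);
if (status != cudaSuccess) {
    WINE_ERR("cudaHostGetDevicePointer failed with status %d\n", status);
}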
Example #5
void lanczos(complex double * A, 	// chunk of A
		complex double * evecs, //the eigenvectors
		double * evals,		//evals, real
		int n, 			// full size of A
		int m,			// rows of A for this process
		int myOffset,			// where to begin			
		int subSize,			// the subspace size
		int commSize,			// MPI size
		int commRank){			// MPI rank


	MPI_Errhandler_set(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
	// args for gemv
	char type = 'N';
	int info,inc=1,dim=n;



#ifdef _USE_GPU
	// check the device
	char hostname[256];
	gethostname(hostname,255);

	struct cudaDeviceProp p;
	cudaGetDeviceProperties(&p,0);
	int support = p.canMapHostMemory;

	if(support == 0){
		fprintf(stderr,"%s does not support mapping host memory\n",hostname);
		MPI_Finalize();
		exit(1);
	}

#endif

	// malloc vectors for use in lanczos
	complex double * alpha	= (complex double*) malloc(sizeof(complex double) * subSize);
	complex double * beta	= (complex double*) malloc(sizeof(complex double) * (subSize-1));
	complex double * r = NULL;

#ifndef _USE_GPU
	// in GPU builds r is allocated below as mapped (zero copy) host memory
	r = (complex double*) malloc(sizeof(complex double) * n);
#endif

	complex double * scratch= (complex double*) malloc(sizeof(complex double) * n);
	complex double * Q	= (complex double*) malloc(sizeof(complex double) * n * subSize);

	// zero the full n x subSize Krylov basis
	for (int i=0; i<n*subSize; i++)
		Q[i] = 0.0+0.0*_Complex_I;


	// an initial q-vector in first column of Q
	for (int i=0; i<n; i++)
		Q[i] = (1.0+1.0*_Complex_I) / sqrt(2.0f* (double) n);


	//dump_mat("Q",Q);

#ifdef _USE_GPU

	cudaError_t cerror;
	cublasStatus_t status = cublasInit();
	check_cu_error("CUBLAS initialization error on host");

	cuDoubleComplex * d_ortho;
	cuDoubleComplex * d_r;
	cuDoubleComplex * d_A;
	cuDoubleComplex * d_Q;
	cuDoubleComplex * d_beta;
	cuDoubleComplex * d_alpha;
	cuDoubleComplex * d_output;

	// zero copy memory for vector r, for use with MPI
	cerror = cudaHostAlloc((void**) &r,sizeof(cuDoubleComplex)*n,cudaHostAllocMapped);
	check_cu_error("cudaHostAlloc failed for r on host");
	cerror = cudaHostGetDevicePointer(&d_r,r,0);
	check_cu_error("cudaHostGetDevicePointer failed for d_r on host");
	// regular mallocs for everyone else
	cerror = cudaMalloc((void**) &d_ortho, sizeof(cuDoubleComplex));
	check_cu_error("cudaMalloc failed for d_ortho on host");
	cerror = cudaMalloc((void**) &d_alpha, sizeof(cuDoubleComplex) * subSize);
	check_cu_error("cudaMalloc failed for d_alpha on host");
	cerror = cudaMalloc((void**) &d_beta, sizeof(cuDoubleComplex) * (subSize-1));
	check_cu_error("cudaMalloc failed for d_beta on host");

	cerror = cudaMalloc((void**) &d_Q, sizeof(cuDoubleComplex) * subSize*n);
	check_cu_error("cudaMalloc failed for d_Q on host");
	cerror = cudaMalloc((void**) &d_A, sizeof(cuDoubleComplex) * m * n);
	check_cu_error("cudaMalloc failed for d_A on host");
	cerror = cudaMalloc((void**) &d_output, sizeof(cuDoubleComplex) * n);
	check_cu_error("cudaMalloc failed for d_output on host");
	// gpu running configuration
	cublasHandle_t handle;
	cublasCreate(&handle);

	dim3 threads,blocks;
	threads.x 	= _LAN_THREADS;
	blocks.x 	= n / threads.x +1;

	threads.y=1,threads.z=1,blocks.y=1,blocks.z	= 1;

#endif

	// multiplicative factors in gemv
	complex double mula 	= 1.0+0.0*_Complex_I;
	complex double mulb 	= 0.0+0.0*_Complex_I;
	complex double mulc 	= -1.0+0.0*_Complex_I;

	// args for gemv
	//char type = 'N';
	//int m=m,n=n,info;
	//int inc=1,dim=n;


	// init vectors
	zgemv_(&type,&m,&n,&mula,A,&m,Q,&inc,&mulb,&r[myOffset],&inc);


	// need to gather into r
	int success = MPI_Allgather((void*) &r[myOffset], m, MPI_LONG_DOUBLE, \
			(void*) r, m, MPI_LONG_DOUBLE,MPI_COMM_WORLD);

	//dump_vec(commRank,"r",r);


#ifdef _DEBUG_LANCZOS
	if (success != MPI_SUCCESS) {

		char error_string[256];
		int length_of_error_string;

		MPI_Error_string(success, error_string, &length_of_error_string);
		fprintf(stderr,"MPI_Allgather failed in file %s around line %d with code : %s\n",__FILE__,__LINE__,error_string);
		MPI_Finalize();
		exit(1);
	}

#endif
	for (int i=0; i<subSize; i++) alpha[i] 	= 0.0f;
	for (int i=0; i<subSize-1; i++) beta[i] = 0.0f;

	for (int i=0; i<n; i++) alpha[0] 	+= r[i] * conj(Q[i]);
	for (int i=0; i<n; i++) r[i] 		-= alpha[0] * Q[i];
	for (int i=0; i<n; i++) beta[0]		+= conj(r[i]) * r[i];	
	beta[0] = sqrt(beta[0]);

	//test subsequent lanczos vectors
	double ortho;

#ifdef _USE_GPU

	// send to device
	status =cublasSetVector(subSize,sizeof(cuDoubleComplex),alpha,1,d_alpha,1);
	check_last_cublas_error(status,"cublasSetVector failed for d_alpha on host",hostname,__LINE__);
	status =cublasSetVector(subSize-1,sizeof(cuDoubleComplex),beta,1,d_beta,1);
	check_cb_error("cublasSetVector failed for d_beta on host");
	status = cublasSetMatrix(m,n,sizeof(cuDoubleComplex),A,m,d_A,m);
	check_cb_error("cublasSetMatrix failed for d_A on host");
	status = cublasSetMatrix(n,subSize,sizeof(cuDoubleComplex),Q,n,d_Q,n);
	check_cb_error("cublasSetMatrix failed for d_Q on host");
#endif


#ifdef _GATHER_SCALAR
	//reduction not currently supported for cuda
	complex double * alpha_temp = (complex double * ) malloc (sizeof(complex double) * commSize);
	complex double * beta_temp = (complex double * ) malloc (sizeof(complex double) * commSize);

#endif
	// main lanczos loops
	for (int i=1; i<subSize; i++){

		MPI_Barrier(MPI_COMM_WORLD);
		ortho = 0.0;

#ifndef _USE_GPU


		// new column to Q, updated q
		
		for (int j=0; j<n; j++) Q[i*n+j] = r[j] / beta[i-1];

		// update r 
		zgemv_(&type,&m,&n,&mula,A,&m,&Q[i*n],&inc,&mulb,&r[myOffset],&inc);

		lanczos_diagnostic_c(r,Q,beta,alpha,n,i);

#ifndef _GATHER_SCALAR
		// need to gather into r
		int success = MPI_Allgather((void*) &r[myOffset], m, MPI_LONG_DOUBLE, \
				(void*) r, m, MPI_LONG_DOUBLE,MPI_COMM_WORLD);


#ifdef _DEBUG_LANCZOS
		if (success != MPI_SUCCESS) {

			char error_string[256];
			int length_of_error_string;

			MPI_Error_string(success, error_string, &length_of_error_string);
			fprintf(stderr,"MPI_Allgather failed in file %s around line %d with code : %s\n",__FILE__,__LINE__,error_string);
			MPI_Finalize();
			exit(1);
		}

#endif

#endif
		//
		int ind = (commSize==1) ? i-1 : i;

		// another r update
		for (int j=0; j<n; j++) r[j] 	-= beta[ind] * Q[(i-1)*n+j];


#ifndef _GATHER_SCALAR
		// update alpha
		for (int j=0; j<n; j++) alpha[i]+= r[j] * conj(Q[i*n+j]);

#else
		alpha_temp[commRank]=0.0+0.0*I;
		for (int j=0; j<m; j++) alpha_temp[commRank] +=r[j+myOffset] * conj(Q[i*n+j+myOffset]);
		// need to gather into r
		int success = MPI_Allgather((void*) &alpha_temp[commRank], 1, MPI_LONG_DOUBLE, \
				(void*) alpha_temp, commSize-1, MPI_LONG_DOUBLE,MPI_COMM_WORLD);

		for (int j=0; j<commSize; j++) alpha[i]+=alpha_temp[j];


#endif

		// r update
		for (int j=0; j<n; j++) r[j] 	-= alpha[i] * Q[i*n+j];

		// weak orthogonality test
		for (int j=0; j<n; j++)	ortho 	+= cabs(conj(Q[j]) * Q[i*n+j]);



		//exit(0);
		// re-orthogonalize
		// r -= Q(Q^T * r)
		if ( ortho > _EVECS_NORM){

#ifdef _GATHER_SCALAR
			// need to gather into r
			int success = MPI_Allgather((void*) &r[myOffset], m, MPI_LONG_DOUBLE, \
					(void*) r, m, MPI_LONG_DOUBLE,MPI_COMM_WORLD);


#ifdef _DEBUG_LANCZOS
			if (success != MPI_SUCCESS) {

				char error_string[256];
				int length_of_error_string;

				MPI_Error_string(success, error_string, &length_of_error_string);
				fprintf(stderr,"MPI_Allgather failed in file %s around line %d with code : %s\n",__FILE__,__LINE__,error_string);
				MPI_Finalize();
				exit(1);
			}

#endif

#endif

			//if (1){

			char typet = 'C';
			zgemv_(&typet,&n,&subSize,&mula,Q,&dim,r,&inc,&mulb,scratch,&inc);
			zgemv_(&type,&n,&subSize,&mulc,Q,&dim,scratch,&inc,&mula,r,&inc);


		}

		// update beta
		if (i<subSize-1){

#ifndef _GATHER_SCALAR

			for (int j=0; j<n; j++) beta[i]	+= conj(r[j]) * r[j];	

#else

			beta_temp[commRank]=0.0+0.0*I;
			for (int j=0; j<m; j++) beta_temp[commRank] +=conj(r[j+myOffset]) * r[j+myOffset];
			int success = MPI_Allgather((void*) &beta_temp[commRank], 1, MPI_LONG_DOUBLE, \
					(void*) beta_temp, commSize-1, MPI_LONG_DOUBLE,MPI_COMM_WORLD);

			for (int j=0; j<commSize; j++) beta[i]+=beta_temp[j];


#endif
			beta[i] = sqrt(beta[i]);
		}

#else

		//lanczos_diagnostic(blocks,threads,d_r,d_Q,d_beta,d_alpha,n,i);
		cerror = lanczos_first_update(blocks, threads, d_r, d_Q, d_beta, n, i);
		check_cu_error("lanczos_first_update failed on host");

		//exit(0);
		cublasGetError();


		cublasZgemv(handle,CUBLAS_OP_N,m,n,&mula,d_A,m,&d_Q[i*n],1,&mulb,&d_r[myOffset],1);

		status = cublasGetError();
		check_cb_error("cublasZgemv failed on host");

		// need to gather into r
		int success = MPI_Allgather((void*) &d_r[myOffset], m, MPI_LONG_DOUBLE, (void*) d_r, m, MPI_LONG_DOUBLE,MPI_COMM_WORLD);


#ifdef _DEBUG_LANCZOS
		if (success != MPI_SUCCESS) {

			char error_string[256];
			int length_of_error_string;

			MPI_Error_string(success, error_string, &length_of_error_string);
			fprintf(stderr,"gpu MPI_Allgather failed in file %s around line %d with code %s\n",__FILE__,__LINE__,error_string);
			MPI_Finalize();
			exit(1);
		}

#endif


		int ind = i; //(commSize==1) ? i-1 : i;
		cerror = lanczos_second_update(blocks, threads, d_r, d_Q, d_beta, n, i, ind);
		check_cu_error("lanczos_second_update failed on host");

		cerror = vector_dot(d_Q,d_r,d_output,&d_alpha[i],1,n,i*n,0,0);
		check_cu_error("vector_dot failed on host");

		cerror = lanczos_third_update(blocks, threads, d_r, d_Q, d_alpha, n, i);
		check_cu_error("lanczos_third_update failed on host");

		if (i<subSize-1){
			cerror = vector_dot(d_r,d_r,d_output,&d_beta[i],1,n,0,0,1);
		}

		check_cu_error("vector_dot failed on host");



		// crude orthogonality test
		//
		cerror = vector_dot(d_Q,d_Q,d_output,d_ortho,1,n,0,i*n,1);
		check_cu_error("vector_dot failed on host");

		//lanczos_diagnostic(blocks,threads,d_r,d_Q,d_beta,d_alpha,n,i);

		cudaMemcpy(&ortho,&d_ortho,sizeof(double), cudaMemcpyDeviceToHost);


		if (fabs(ortho) > _EVECS_NORM){
			//if (0){


			cublasGetError();

			cublasZgemv(handle,CUBLAS_OP_T,n,subSize,&mula,d_Q,dim,d_r,1,&mulb,d_output,1);
			cublasZgemv(handle,CUBLAS_OP_N,n,subSize,&mula,d_Q,dim,d_output,1,&mulb,d_output,1);

			status = cublasGetError();
			check_cb_error("cublasZgemv failed on host");

			cerror = lanczos_fourth_update(blocks, threads, d_r, d_output, n);
			check_cu_error("lanczos_fourth_update failed on host");
		}



#endif
		}

#ifdef _USE_GPU

		if (commRank==0){

			cerror = cudaMemcpy(alpha,d_alpha,sizeof(cuDoubleComplex) * subSize, cudaMemcpyDeviceToHost);
			check_cu_error("cudaMemcpy of d_alpha to host");
			cerror = cudaMemcpy(beta,d_beta,sizeof(cuDoubleComplex) * (subSize-1), cudaMemcpyDeviceToHost);
			check_cu_error("cudaMemcpy of d_beta to host");
			cerror = cudaMemcpy(Q,d_Q,sizeof(cuDoubleComplex) * subSize*n, cudaMemcpyDeviceToHost);
			check_cu_error("cudaMemcpy of d_Q to host");

		}
		cudaFree(d_alpha);
		cudaFree(d_output);
		cudaFree(d_beta);
		cudaFree(d_Q);
		cudaFreeHost(r);
		cudaFree(d_A);

#endif


#ifdef _DEBUG_LANCZOS
		if (commRank==0){

			printf("alpha & beta :\n");
			for (int i=0; i<subSize; i++)
				printf("%f+%fi ",creal(alpha[i]),cimag(alpha[i]));
			printf("\n");
			for (int i=0; i<subSize-1; i++)
				printf("%f+%fi ",creal(beta[i]),cimag(beta[i]));
			printf("\n");
		}
#endif
		// calculate spectrum of (now) tridiagonal matrix

		double * alp = (double*) malloc(sizeof(double) * subSize);
		double * bet = (double*) malloc(sizeof(double) * (subSize-1));

		for (int i=0; i<subSize; i++) alp[i] = creal(alpha[i]);
		for (int i=0; i<(subSize-1); i++) bet[i] = creal(beta[i]);

#ifdef _CALC_EVECS

		complex double * evecs_lan = (complex double*) malloc(sizeof(complex double) * subSize * subSize);


		type = 'I';

		zsteqr_(&type,&subSize,alp,bet,evecs_lan,&subSize,(double*) evecs,&info);

		type = 'N';

		for (int i=0; i<subSize; i++)
			zgemv_(&type,&n,&subSize,&mula,Q,&n,&evecs_lan[i*subSize],&inc,&mulb,&evecs[i*n],&inc);

		free(evecs_lan);
#else

		dsterf_(&subSize,alp,bet,&info);
		free(bet);

#endif

		for (int i=0; i<subSize; i++) evals[i] = alp[i];

#ifdef _DEBUG_LANCZOS

		if (commRank==0){
			printf("evals :\n");

			for (int i=0; i<subSize; i++)
				printf("%f ",evals[i]);
			printf("\n");

		}
#endif


		free(alp); 
		free(alpha); 	
		free(beta);
#ifndef _USE_GPU
		free(r);
#endif
		free(Q);
		}
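Each CUDA call above is followed by check_cu_error, whose definition is not part of the snippet. The following is a hedged sketch of what such a helper might look like; it is an assumption about the missing macro, which reads the cerror variable from the enclosing scope.

#define check_cu_error(msg)                                             \
	do {                                                            \
		if (cerror != cudaSuccess) {                            \
			fprintf(stderr, "%s : %s (%s:%d)\n", (msg),     \
				cudaGetErrorString(cerror),             \
				__FILE__, __LINE__);                    \
			MPI_Finalize();                                 \
			exit(1);                                        \
		}                                                       \
	} while (0)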
Example #6
template <typename T>
void BodySystemGPU<T>::_initialize(unsigned numBodies)
{
    assert(!m_bInitialized);

    m_numBodies = numBodies;

    unsigned int memSize = sizeof(T) * 4 * numBodies;

    m_deviceData = new DeviceData<T>[m_numDevices];

    // divide up the workload amongst Devices
    float *weights = new float[m_numDevices];
    int *numSms = new int[m_numDevices];
    float total = 0;

    for (unsigned int i = 0; i < m_numDevices; i++)
    {
        cudaDeviceProp props;
        checkCudaErrors(cudaGetDeviceProperties(&props, i));

        // Choose the weight based on the Compute Capability
        // We estimate that a CC2.0 SM is about 4.0x faster than a CC 1.x SM for
        // this application (since a 15-SM GF100 is about 2X faster than a 30-SM GT200).
        numSms[i] = props.multiProcessorCount;
        weights[i] = numSms[i] * (props.major >= 2 ? 4.f : 1.f);
        total += weights[i];

    }

    unsigned int offset = 0;
    unsigned int remaining = m_numBodies;

    for (unsigned int i = 0; i < m_numDevices; i++)
    {
        unsigned int count = (int)((weights[i] / total) * m_numBodies);
        unsigned int round = numSms[i] * 256;
        count = round * ((count + round - 1) / round);

        if (count > remaining)
        {
            count = remaining;
        }

        remaining -= count;
        m_deviceData[i].offset = offset;
        m_deviceData[i].numBodies = count;
        offset += count;

        if ((i == m_numDevices - 1) && (offset < m_numBodies-1))
        {
            m_deviceData[i].numBodies += m_numBodies - offset;
        }
    }

    delete [] weights;
    delete [] numSms;

    if (m_bUseSysMem)
    {
        checkCudaErrors(cudaHostAlloc((void **)&m_hPos[0], memSize, cudaHostAllocMapped | cudaHostAllocPortable));
        checkCudaErrors(cudaHostAlloc((void **)&m_hPos[1], memSize, cudaHostAllocMapped | cudaHostAllocPortable));
        checkCudaErrors(cudaHostAlloc((void **)&m_hVel,    memSize, cudaHostAllocMapped | cudaHostAllocPortable));

        memset(m_hPos[0], 0, memSize);
        memset(m_hPos[1], 0, memSize);
        memset(m_hVel, 0, memSize);

        for (unsigned int i = 0; i < m_numDevices; i++)
        {
            if (m_numDevices > 1)
            {
                checkCudaErrors(cudaSetDevice(i));
            }

            checkCudaErrors(cudaEventCreate(&m_deviceData[i].event));
            checkCudaErrors(cudaHostGetDevicePointer((void **)&m_deviceData[i].dPos[0], (void *)m_hPos[0], 0));
            checkCudaErrors(cudaHostGetDevicePointer((void **)&m_deviceData[i].dPos[1], (void *)m_hPos[1], 0));
            checkCudaErrors(cudaHostGetDevicePointer((void **)&m_deviceData[i].dVel, (void *)m_hVel, 0));
        }
    }
    else
    {
        m_hPos[0] = new T[m_numBodies*4];
        m_hVel = new T[m_numBodies*4];

        memset(m_hPos[0], 0, memSize);
        memset(m_hVel, 0, memSize);

        checkCudaErrors(cudaEventCreate(&m_deviceData[0].event));

        if (m_bUsePBO)
        {
            // create the position pixel buffer objects for rendering
            // we will actually compute directly from this memory in CUDA too
            glGenBuffers(2, (GLuint *)m_pbo);

            for (int i = 0; i < 2; ++i)
            {
                glBindBuffer(GL_ARRAY_BUFFER, m_pbo[i]);
                glBufferData(GL_ARRAY_BUFFER, memSize, m_hPos[0], GL_DYNAMIC_DRAW);

                int size = 0;
                glGetBufferParameteriv(GL_ARRAY_BUFFER, GL_BUFFER_SIZE, (GLint *)&size);

                if ((unsigned)size != memSize)
                {
                    fprintf(stderr, "WARNING: Pixel Buffer Object allocation failed!n");
                }

                glBindBuffer(GL_ARRAY_BUFFER, 0);
                checkCudaErrors(cudaGraphicsGLRegisterBuffer(&m_pGRes[i],
                                                             m_pbo[i],
                                                             cudaGraphicsMapFlagsNone));
            }
        }
        else
        {
            checkCudaErrors(cudaMalloc((void **)&m_deviceData[0].dPos[0], memSize));
            checkCudaErrors(cudaMalloc((void **)&m_deviceData[0].dPos[1], memSize));
        }

        checkCudaErrors(cudaMalloc((void **)&m_deviceData[0].dVel, memSize));
    }

    m_bInitialized = true;
}
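When m_bUseSysMem is set, the device pointers obtained above are aliases of the host buffers, so results only need a synchronization point before the host reads them; no cudaMemcpy is involved. A hedged sketch of such a readback follows (the method name is an assumption; the members come from the code above):

template <typename T>
void BodySystemGPU<T>::_syncHostRead()
{
    for (unsigned int i = 0; i < m_numDevices; i++)
    {
        if (m_numDevices > 1)
        {
            checkCudaErrors(cudaSetDevice(i));
        }

        // Wait for the event recorded after this device's kernel; afterwards the
        // mapped host buffers m_hPos/m_hVel already hold that device's results.
        checkCudaErrors(cudaEventSynchronize(m_deviceData[i].event));
    }
}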
Example #7
HostReflectionHost::BootUp::BootUp(const std::string& module)
    : _module(module)
{
    report("Booting up host reflection...");

    // add message handlers
    _addMessageHandlers();

    // allocate memory for the queue
    size_t queueDataSize = maxMessageSize() * 2;
    size_t size = 2 * (queueDataSize + sizeof(QueueMetaData));

    _deviceHostSharedMemory = new char[size];

    // setup the queue meta data
    QueueMetaData* hostToDeviceMetaData =
        (QueueMetaData*)_deviceHostSharedMemory;
    QueueMetaData* deviceToHostMetaData =
        (QueueMetaData*)_deviceHostSharedMemory + 1;

    char* hostToDeviceData = _deviceHostSharedMemory +
                             2 * sizeof(QueueMetaData);
    char* deviceToHostData = _deviceHostSharedMemory +
                             2 * sizeof(QueueMetaData) + queueDataSize;

    hostToDeviceMetaData->hostBegin = hostToDeviceData;
    hostToDeviceMetaData->size      = queueDataSize;
    hostToDeviceMetaData->head      = 0;
    hostToDeviceMetaData->tail      = 0;
    hostToDeviceMetaData->mutex     = (size_t)-1;

    deviceToHostMetaData->hostBegin = deviceToHostData;
    deviceToHostMetaData->size      = queueDataSize;
    deviceToHostMetaData->head      = 0;
    deviceToHostMetaData->tail      = 0;
    deviceToHostMetaData->mutex     = (size_t)-1;

    // Allocate the queues
    _hostToDeviceQueue = new HostQueue(hostToDeviceMetaData);
    _deviceToHostQueue = new HostQueue(deviceToHostMetaData);

    // Map the memory onto the device
    cudaHostRegister(_deviceHostSharedMemory, size, cudaHostRegisterMapped);

    char* devicePointer = 0;

    cudaHostGetDevicePointer((void**)&devicePointer,
                             _deviceHostSharedMemory, 0);

    // Send the metadata to the device
    QueueMetaData* hostToDeviceMetaDataPointer =
        (QueueMetaData*)devicePointer;
    QueueMetaData* deviceToHostMetaDataPointer =
        (QueueMetaData*)devicePointer + 1;

    hostToDeviceMetaData->deviceBegin = devicePointer +
                                        2 * sizeof(QueueMetaData);
    deviceToHostMetaData->deviceBegin = devicePointer +
                                        2 * sizeof(QueueMetaData) + queueDataSize;

    cudaConfigureCall(dim3(1, 1, 1), dim3(1, 1, 1), 0, 0);

    cudaSetupArgument(&hostToDeviceMetaDataPointer, 8, 0 );
    cudaSetupArgument(&deviceToHostMetaDataPointer, 8, 8 );
    ocelot::launch(_module, "_bootupHostReflection");

    // start up the host worker thread
    _kill   = false;
    _thread = new boost::thread(_runThread, this);
}
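Memory pinned with cudaHostRegister has to be unregistered before the backing allocation is freed, and the worker thread has to be stopped first. A hedged sketch of the matching shutdown path (the destructor and its ordering are assumptions; the members are the ones set up above):

HostReflectionHost::BootUp::~BootUp()
{
    report("Tearing down host reflection...");

    // stop the host worker thread
    _kill = true;
    _thread->join();
    delete _thread;

    delete _hostToDeviceQueue;
    delete _deviceToHostQueue;

    // unmap the shared region before releasing the host allocation
    cudaHostUnregister(_deviceHostSharedMemory);
    delete[] _deviceHostSharedMemory;
}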
Example #8
// Initialize a test instance on the given grid.
struct test_config_t* test_init(
	const char* name, const char* mode,
	int n, int nt, int sx, int sy, int ss, int rank, int szcomm,
	real xmin, real ymin, real zmin,
	real xmax, real ymax, real zmax,
	int bx, int by, int bs, int ex, int ey, int es
#ifdef CUDA
	, struct cudaDeviceProp* props
#endif
)
{
	// TODO: replace n with nx, ny, ns.

	// TODO: parameterize.
	int szelem = sizeof(real);
	int narrays = 3;

	//
	// 1) Calculate the dimensions of the entire grid domain.
	//
#ifdef MPI
	// For each MPI node create a view of decomposed grid topology.
	struct grid_domain_t* domains = grid_init_simple(
		n, n, n, sx, sy, ss, bx, by, bs, ex, ey, es);

	// The rank-th subdomain is assigned to the current MPI process.
	struct grid_domain_t* domain = domains + rank;

	// Set domain data copying callbacks and user-defined pointer
	// - the test config, in this case.
	int ndomains = domain->parent->nsubdomains;
	for (int i = 0; i < ndomains; i++)
	{
		struct grid_domain_t* domain = domains + i;
		domain->scatter_memcpy = &grid_subcpy;
		domain->gather_memcpy = &grid_subcpy;
		domain->narrays = narrays;
		domain->szelem = szelem;
	}

	// The problem X, Y, Z dimensions are set relative to the
	// subdomain owned by the current MPI process.
	int nx = domain->grid[0].nx, ny = domain->grid[0].ny, ns = domain->grid[0].ns;
	size_t nxys = domain->grid[0].extsize;
	size_t nxysb = nxys * szelem;
#else
	int nx = n, ny = n, ns = n;
	size_t nxys = nx * ny * ns;
	size_t nxysb = nxys * szelem;
#endif

	//
	// 2) Allocate the test config structure together with
	// the array of pointers to keep CPU and GPU data arrays.
	// Assign dimensions and data pointers.
	//
#ifdef CUDA
	int gpu = !strcmp(mode, "GPU");
#else
	int gpu = 0;
#endif
	struct test_config_t* t = (struct test_config_t*)malloc(
		sizeof(struct test_config_t) + (1 + gpu) * narrays * sizeof(char*));
#ifdef MPI
	t->cpu = *domain;
#ifdef CUDA
	t->gpu = *domain;
#endif
	// Track MPI node rank, and decomposition grid domains
	// in test config structure.
	t->rank = rank;
	t->domains = domains;
#else
	t->cpu.grid->nx = nx; t->cpu.grid->ny = ny; t->cpu.grid->ns = ns; t->cpu.grid->extsize = nxys;
	t->cpu.parent = &t->cpu;
	t->cpu.narrays = narrays;
#ifdef CUDA
	t->gpu.grid->nx = nx; t->gpu.grid->ny = ny; t->gpu.grid->ns = ns; t->gpu.grid->extsize = nxys;
	t->gpu.parent = &t->gpu;
	t->gpu.narrays = narrays;
#endif
#endif
	t->cpu.arrays = (char**)(t + 1);
#ifdef CUDA
	t->gpu.arrays = t->cpu.arrays + narrays;
#endif

	//
	// 3) Set the simple properties of test config.
	//
	t->name = name; t->mode = mode;
	t->nx = nx; t->ny = ny; t->ns = ns; t->nt = nt;

	// Grid steps.
	t->dx = (xmax - xmin) / (n - 1);
	t->dy = (ymax - ymin) / (n - 1);
	t->ds = (zmax - zmin) / (n - 1);
	t->dt = t->dx / 2.0;
		
	// Set scheme coefficients.
	double dt2dx2 = (t->dt * t->dt) / (t->dx * t->dx);
	t->c0 = 2.0 - dt2dx2 * 7.5;
	t->c1 = dt2dx2 * (4.0 / 3.0);
	t->c2 = dt2dx2 * (-1.0 / 12.0);

	//
	// 4) Allocate the CPU data arrays.
	//
#if defined(CUDA)
	if (!strcmp(mode, "GPU"))
	{
		for (int iarray = 0; iarray < narrays; iarray++)
		{
#if defined(CUDA_MAPPED)
			// Allocate memory as host-mapped memory accessible both from
			// CPU and GPU.
			CUDA_SAFE_CALL(cudaHostAlloc((void**)&t->cpu.arrays[iarray],
				nxysb, cudaHostAllocMapped));
#elif defined(CUDA_PINNED)
			// Allocate host memory as pinned to get faster CPU-GPU data
			// transfers.
			CUDA_SAFE_CALL(cudaMallocHost((void**)&t->cpu.arrays[iarray],
				nxysb));
#endif // CUDA_MAPPED
		}
	}
	else
#endif // CUDA
	{
		// Allocate regular CPU memory.
		for (int iarray = 0; iarray < narrays; iarray++)
			t->cpu.arrays[iarray] = (char*)malloc(nxysb);
	}

	// Initially flush CPU array data to zero.
	for (int iarray = 0; iarray < narrays; iarray++)
		memset(t->cpu.arrays[iarray], 0, nxysb);
#if defined(MPI)
	struct grid_domain_t* subdomains = domain->subdomains;
	int nsubdomains = domain->nsubdomains;

#if defined(CUDA) && !defined(CUDA_MAPPED)
	if (!strcmp(mode, "GPU"))
	{
		// Assign domain main arrays.
		domain->arrays = t->gpu.arrays;
	}
	else
#endif // CUDA && !CUDA_MAPPED
	{
		// Assign domain main arrays.
		domain->arrays = t->cpu.arrays;
	}

	// Allocate memory required to keep the rest of domain data.
	// In addition to main data arrays, each domain also allocates data
	// for its subdomains (nested domains). In this case the nested domains
	// represent boundaries for data buffering.
#if defined(CUDA) && defined(CUDA_MAPPED)
	if (!strcmp(mode, "GPU"))
	{
		for (int i = 0; i < nsubdomains; i++)
		{
			struct grid_domain_t* subdomain = subdomains + i;
	
			subdomain->arrays = (char**)malloc(sizeof(char*) * narrays);
			subdomain->narrays = narrays;
			for (int iarray = 0; iarray < narrays; iarray++)
			{
				size_t size = subdomain->grid[0].extsize * szelem;

				// Allocate a host-mapped array for the subdomain in order
				// to make it possible to perform GPU-initiated boundary
				// updates.
				CUDA_SAFE_CALL(cudaHostAlloc((void**)&subdomain->arrays[iarray],
					size, cudaHostAllocMapped));

				// TODO: mapping
				
				// TODO: flushing to zero.
			}
		}
	}
	else
#endif // CUDA && CUDA_MAPPED
	{
		for (int i = 0; i < nsubdomains; i++)
		{
			struct grid_domain_t* subdomain = subdomains + i;
	
			subdomain->arrays = (char**)malloc(sizeof(char*) * narrays);
			subdomain->narrays = narrays;
			for (int iarray = 0; iarray < narrays; iarray++)
			{
				size_t size = subdomain->grid[0].extsize * szelem;

				// Allocate regular CPU memory.
				subdomain->arrays[iarray] = (char*)malloc(size);
				
				// Flush to zero.
				memset(subdomain->arrays[iarray], 0, size);
			}
		}
	}
#endif // MPI
	
	//
	// 5) Allocate the GPU data arrays.
	//
#if defined(CUDA)
	if (!strcmp(mode, "GPU"))
	{
#if defined(CUDA_MAPPED)
		// In case of host-mapped memory the GPU arrays pointers are
		// either same as for CPU arrays or contain specially mapped
		// pointers, depending on device capability.
		int use_mapping = props->major < 2;
		if (use_mapping)
		{
#ifdef VERBOSE
			printf("requires mapping\n");
#endif
			for (int i = 0; i < narrays; i++)
				CUDA_SAFE_CALL(cudaHostGetDevicePointer(
					(void**)&t->gpu.arrays[i], t->cpu.arrays[i], 0));
		}
		else
		{
#ifdef VERBOSE
			printf("does not require mapping\n");
#endif
			for (int iarray = 0; iarray < narrays; iarray++)
				t->gpu.arrays[iarray] = t->cpu.arrays[iarray];
		}
#else
		for (int iarray = 0; iarray < narrays; iarray++)
		{
			// Allocate regular GPU memory.
			CUDA_SAFE_CALL(cudaMalloc((void**)&t->gpu.arrays[iarray], nxysb));

			// Initially flush GPU array data to zero.
			CUDA_SAFE_CALL(cudaMemset(t->gpu.arrays[iarray], 0, nxysb));
			
			// TODO: reassign arrays of MPI domain.
		}
#endif // CUDA_MAPPED
	}
#endif // CUDA

	return t;
}
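test_init allocates the main arrays differently depending on CUDA_MAPPED, CUDA_PINNED, and the run mode, so a disposer has to mirror the same branches. A hedged sketch is given below; test_dispose is a hypothetical name and the MPI subdomain buffers are left out.

void test_dispose(struct test_config_t* t)
{
	int narrays = t->cpu.narrays;

	for (int iarray = 0; iarray < narrays; iarray++)
	{
#if defined(CUDA)
		if (!strcmp(t->mode, "GPU"))
		{
#if defined(CUDA_MAPPED) || defined(CUDA_PINNED)
			// cudaHostAlloc and cudaMallocHost allocations are both
			// released with cudaFreeHost.
			CUDA_SAFE_CALL(cudaFreeHost(t->cpu.arrays[iarray]));
#endif
#if !defined(CUDA_MAPPED)
			// A separate device copy exists only when host mapping is off.
			CUDA_SAFE_CALL(cudaFree(t->gpu.arrays[iarray]));
#endif
		}
		else
#endif // CUDA
		{
			// Regular CPU memory.
			free(t->cpu.arrays[iarray]);
		}
	}

	free(t);
}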