int main(int argc, char* argv[]) { // Initialize MPI. From this point the specified // number of processes will be executed in parallel. int mpi_status = MPI_Init(&argc, &argv); int mpi_error_msg_length; char mpi_error_msg[MPI_MAX_ERROR_STRING]; if (mpi_status != MPI_SUCCESS) { MPI_Error_string(mpi_status, mpi_error_msg, &mpi_error_msg_length); fprintf(stderr, "Cannot initialize MPI, status = %s\n", mpi_error_msg); return 1; } // Get the size of the MPI global communicator, // that is get the total number of MPI processes. int nprocesses; mpi_status = MPI_Comm_size(MPI_COMM_WORLD, &nprocesses); if (mpi_status != MPI_SUCCESS) { MPI_Error_string(mpi_status, mpi_error_msg, &mpi_error_msg_length); fprintf(stderr, "Cannot retrieve the number of MPI processes, status = %s\n", mpi_error_msg); return 1; } // Get the rank (index) of the current MPI process // in the global communicator. int iprocess; mpi_status = MPI_Comm_rank(MPI_COMM_WORLD, &iprocess); if (mpi_status != MPI_SUCCESS) { MPI_Error_string(mpi_status, mpi_error_msg, &mpi_error_msg_length); fprintf(stderr, "Cannot retrieve the rank of current MPI process, status = %s\n", mpi_error_msg); return 1; } int ndevices = 0; cudaError_t cuda_status = cudaGetDeviceCount(&ndevices); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot get the cuda device count by process %d, status = %s\n", iprocess, cudaGetErrorString(cuda_status)); return 1; } // Return if no cuda devices present. if (iprocess == 0) printf("%d CUDA device(s) found\n", ndevices); if (!ndevices) return 0; // Get problem size from the command line. if (argc != 3) { printf("Usage: %s <n> <npasses>\n", argv[0]); return 0; } int n = atoi(argv[1]); int npasses = atoi(argv[2]); size_t size = n * n * sizeof(float); if ((n <= 0) || (npasses <= 0)) return 0; // Assign unique device to each MPI process. cuda_status = cudaSetDevice(iprocess); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot set CUDA device by process %d, status= %s\n", iprocess, cudaGetErrorString(cuda_status)); return 1; } // Create two device input buffers. float *din1, *din2; cuda_status = cudaMalloc((void**)&din1, size); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot allocate input device buffer by process %d, status = %s\n", iprocess, cudaGetErrorString(cuda_status)); return 1; } cuda_status = cudaMalloc((void**)&din2, size); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot allocate input device buffer by process %d, status = %s\n", iprocess, cudaGetErrorString(cuda_status)); return 1; } // Create device output buffer. float* dout; cuda_status = cudaMalloc((void**)&dout, size); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot allocate output device buffer by process %d, status = %s\n", iprocess, cudaGetErrorString(cuda_status)); return 1; } float* hin = (float*)malloc(size); float* hout = (float*)malloc(size); // Generate random input data. double dinvrmax = 1.0 / RAND_MAX; for (int i = 0; i < n * n; i++) { for (int j = 0; j < iprocess + 1; j++) hin[i] += rand() * dinvrmax; hin[i] /= iprocess + 1; } // Copy input data generated on host to device buffer. cuda_status = cudaMemcpy(din1, hin, size, cudaMemcpyHostToDevice); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot copy input data from host to device by process %d, status = %s\n", iprocess, cudaGetErrorString(cuda_status)); return 1; } // Perform the specified number of processing passes. for (int ipass = 0; ipass < npasses; ipass++) { // Fill output device buffer will zeros. cuda_status = cudaMemset(dout, 0, size); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot fill output device buffer with zeros by process %d, status = %s\n", iprocess, cudaGetErrorString(cuda_status)); return 1; } // Process data on GPU. pattern2d_gpu(1, n, 1, 1, n, 1, din1, dout); // Wait for GPU kernels to finish processing. cuda_status = cudaThreadSynchronize(); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot synchronize GPU kernel by process %d, status = %s\n", iprocess, cudaGetErrorString(cuda_status)); return 1; } // Copy output data back from device to host. cuda_status = cudaMemcpy(hout, dout, size, cudaMemcpyDeviceToHost); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot copy output data from device to host by process %d, status = %s\n", iprocess, cudaGetErrorString(cuda_status)); return 1; } // Output average value of the resulting field. float avg = 0.0; for (int i = 0; i < n * n; i++) avg += hout[i]; avg /= n * n; printf("Sending process %d resulting field with average = %f to process %d\n", iprocess, avg, (iprocess + 1) % nprocesses); MPI_Request request; int inext = (iprocess + 1) % nprocesses; int iprev = (iprocess - 1) % nprocesses; iprev += (iprev < 0) ? nprocesses : 0; // Pass entire process input device buffer directly to input device buffer // of next process. mpi_status = MPI_Isend(din1, n * n, MPI_FLOAT, inext, 0, MPI_COMM_WORLD, &request); mpi_status = MPI_Recv(din2, n * n, MPI_FLOAT, iprev, 0, MPI_COMM_WORLD, NULL); mpi_status = MPI_Wait(&request, MPI_STATUS_IGNORE); // Swap buffers. float* swap = din1; din1 = din2; din2 = swap; } cuda_status = cudaFree(din1); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot free input device buffer by process %d, status = %s\n", iprocess, cudaGetErrorString(cuda_status)); return 1; } cuda_status = cudaFree(dout); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot free output device buffer by process %d, status = %s\n", iprocess, cudaGetErrorString(cuda_status)); return 1; } free(hin); free(hout); mpi_status = MPI_Finalize(); if (mpi_status != MPI_SUCCESS) { MPI_Error_string(mpi_status, mpi_error_msg, &mpi_error_msg_length); fprintf(stderr, "Cannot finalize MPI, status = %s\n", mpi_error_msg); return 1; } return 0; }
//return types are void since any internal error will be handled by quitting //no point in returning error codes... //returns a pointer to an RGBA version of the input image //and a pointer to the single channel grey-scale output //on both the host and device void preProcess(uchar4 **h_inputImageRGBA, uchar4 **h_outputImageRGBA, uchar4 **d_inputImageRGBA, uchar4 **d_outputImageRGBA, unsigned char **d_redBlurred, unsigned char **d_greenBlurred, unsigned char **d_blueBlurred, float **h_filter, int *filterWidth, const std::string &filename) { //make sure the context initializes ok checkCudaErrors(cudaFree(0)); cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR); if (image.empty()) { std::cerr << "Couldn't open file: " << filename << std::endl; exit(1); } cv::cvtColor(image, imageInputRGBA, CV_BGR2RGBA); //allocate memory for the output imageOutputRGBA.create(image.rows, image.cols, CV_8UC4); //This shouldn't ever happen given the way the images are created //at least based upon my limited understanding of OpenCV, but better to check if (!imageInputRGBA.isContinuous() || !imageOutputRGBA.isContinuous()) { std::cerr << "Images aren't continuous!! Exiting." << std::endl; exit(1); } *h_inputImageRGBA = (uchar4 *)imageInputRGBA.ptr<unsigned char>(0); *h_outputImageRGBA = (uchar4 *)imageOutputRGBA.ptr<unsigned char>(0); const size_t numPixels = numRows() * numCols(); //allocate memory on the device for both input and output checkCudaErrors(cudaMalloc(d_inputImageRGBA, sizeof(uchar4) * numPixels)); checkCudaErrors(cudaMalloc(d_outputImageRGBA, sizeof(uchar4) * numPixels)); checkCudaErrors(cudaMemset(*d_outputImageRGBA, 0, numPixels * sizeof(uchar4))); //make sure no memory is left laying around //copy input array to the GPU checkCudaErrors(cudaMemcpy(*d_inputImageRGBA, *h_inputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice)); d_inputImageRGBA__ = *d_inputImageRGBA; d_outputImageRGBA__ = *d_outputImageRGBA; //now create the filter that they will use const int blurKernelWidth = 9; const float blurKernelSigma = 2.; *filterWidth = blurKernelWidth; //create and fill the filter we will convolve with *h_filter = new float[blurKernelWidth * blurKernelWidth]; h_filter__ = *h_filter; float filterSum = 0.f; //for normalization for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) { for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) { float filterValue = expf( -(float)(c * c + r * r) / (2.f * blurKernelSigma * blurKernelSigma)); (*h_filter)[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] = filterValue; filterSum += filterValue; } } float normalizationFactor = 1.f / filterSum; for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) { for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) { (*h_filter)[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] *= normalizationFactor; } } //blurred checkCudaErrors(cudaMalloc(d_redBlurred, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMalloc(d_greenBlurred, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMalloc(d_blueBlurred, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMemset(*d_redBlurred, 0, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMemset(*d_greenBlurred, 0, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMemset(*d_blueBlurred, 0, sizeof(unsigned char) * numPixels)); }
float gpuDecode(EntropyCodingTaskInfo *infos, type_image *img, mem_mg_t *mem_mg, int count) { int codeBlocks = count; int maxOutLength = /*MAX_CODESTREAM_SIZE*/(1 << img->cblk_exp_w) * (1 << img->cblk_exp_h) * 14; int n = 0; for(int i = 0; i < codeBlocks; i++) n += infos[i].width * infos[i].height; byte *d_inbuf; GPU_JPEG2K::CoefficientState *d_stBuffors; CodeBlockAdditionalInfo *h_infos = (CodeBlockAdditionalInfo *)mem_mg->alloc->host(sizeof(CodeBlockAdditionalInfo) * codeBlocks, mem_mg->ctx); CodeBlockAdditionalInfo *d_infos; d_inbuf = (byte *)mem_mg->alloc->dev(sizeof(byte) * codeBlocks * maxOutLength, mem_mg->ctx); d_infos = (CodeBlockAdditionalInfo *)mem_mg->alloc->dev(sizeof(CodeBlockAdditionalInfo) * codeBlocks, mem_mg->ctx); int magconOffset = 0; for(int i = 0; i < codeBlocks; i++) { h_infos[i].width = infos[i].width; h_infos[i].height = infos[i].height; h_infos[i].nominalWidth = infos[i].nominalWidth; h_infos[i].stripeNo = (int) ceil(infos[i].height / 4.0f); h_infos[i].subband = infos[i].subband; h_infos[i].magconOffset = magconOffset + infos[i].width; h_infos[i].magbits = infos[i].magbits; h_infos[i].length = infos[i].length; h_infos[i].significantBits = infos[i].significantBits; h_infos[i].coefficients = (int *)mem_mg->alloc->dev(sizeof(int) * infos[i].nominalWidth * infos[i].nominalHeight, mem_mg->ctx); infos[i].coefficients = h_infos[i].coefficients; cuda_memcpy_htd(infos[i].codeStream, (void *) (d_inbuf + i * maxOutLength), sizeof(byte) * infos[i].length); magconOffset += h_infos[i].width * (h_infos[i].stripeNo + 2); } d_stBuffors = (GPU_JPEG2K::CoefficientState *)mem_mg->alloc->dev(sizeof(GPU_JPEG2K::CoefficientState) * magconOffset, mem_mg->ctx); cudaMemset((void *) d_stBuffors, 0, sizeof(GPU_JPEG2K::CoefficientState) * magconOffset); cuda_memcpy_htd(h_infos, d_infos, sizeof(CodeBlockAdditionalInfo) * codeBlocks); // cudaEvent_t start, end; // cudaEventCreate(&start); // cudaEventCreate(&end); // // cudaEventRecord(start, 0); GPU_JPEG2K::launch_decode((int) ceil((float) codeBlocks / THREADS), THREADS, d_stBuffors, d_inbuf, maxOutLength, d_infos, codeBlocks); // cudaEventRecord(end, 0); mem_mg->dealloc->dev(d_inbuf, mem_mg->ctx); mem_mg->dealloc->dev(d_stBuffors, mem_mg->ctx); mem_mg->dealloc->dev(d_infos, mem_mg->ctx); mem_mg->dealloc->host(h_infos, mem_mg->ctx); float elapsed = 0.0f; // cudaEventElapsedTime(&elapsed, start, end); return elapsed; }
int main(int argc, char **argv) { char *output; int x; int y; struct cuda_device device; int available_words = 1; int current_words = 0; struct wordlist_file file; char input_hash[4][9]; print_info(); if (argc != ARG_COUNT) { printf("Usage: %s WORDLIST_FILE MD5_HASH\n", argv[0]); return -1; } if (process_wordlist(argv[ARG_WORDLIST], &file) == -1) { printf("Error Opening Wordlist File: %s\n", argv[ARG_WORDLIST]); return -1; } if (read_wordlist(&file) == 0) { printf("No valid passwords in the wordlist file: %s\n", argv[ARG_WORDLIST]); return -1; } // first things first, we need to select our CUDA device if (get_cuda_device(&device) == -1) { printf("No Cuda Device Installed\n"); return -1; } // we now need to calculate the optimal amount of threads to use for this card calculate_cuda_params(&device); // now we input our target hash if (strlen(argv[ARG_MD5]) != 32) { printf("Not a valid MD5 Hash (should be 32 bytes and only Hex Chars\n"); return -1; } // we split the input hash into 4 blocks memset(input_hash, 0, sizeof(input_hash)); for(x=0; x < 4; x++) { strncpy(input_hash[x], argv[ARG_MD5] + (x * 8), 8); device.target_hash[x] = htonl(_httoi(input_hash[x])); } // allocate global memory for use on device if (cudaMalloc(&device.device_global_memory, device.device_global_memory_len) != CUDA_SUCCESS) { printf("Error allocating memory on device (global memory)\n"); return -1; } // allocate the 'stats' that will indicate if we are successful in cracking if (cudaMalloc(&device.device_stats_memory, sizeof(struct device_stats)) != CUDA_SUCCESS) { printf("Error allocating memory on device (stats memory)\n"); return -1; } // allocate debug memory if required if (cudaMalloc(&device.device_debug_memory, device.device_global_memory_len) != CUDA_SUCCESS) { printf("Error allocating memory on device (debug memory)\n"); return -1; } // make sure the stats are clear on the device if (cudaMemset(device.device_stats_memory, 0, sizeof(struct device_stats)) != CUDA_SUCCESS) { printf("Error Clearing Stats on device\n"); return -1; } // this is our host memory that we will copy to the graphics card if ((device.host_memory = malloc(device.device_global_memory_len)) == NULL) { printf("Error allocating memory on host\n"); return -1; } // put our target hash into the GPU constant memory as this will not change (and we can't spare shared memory for speed) if (cudaMemcpyToSymbol("target_hash", device.target_hash, 16, 0, cudaMemcpyHostToDevice) != CUDA_SUCCESS) { printf("Error initalizing constants\n"); return -1; } #ifdef BENCHMARK // these will be used to benchmark int counter = 0; struct timeval start, end; gettimeofday(&start, NULL); #endif int z; while(available_words) { memset(device.host_memory, 0, device.device_global_memory_len); for(x=0; x < (device.device_global_memory_len / 64) && file.words[current_words] != (char *)0; x++, current_words++) { #ifdef BENCHMARK counter++; // increment counter for this word #endif output = md5_pad(file.words[current_words]); memcpy(device.host_memory + (x * 64), output, 64); } if (file.words[current_words] == (char *)0) { // read some more words ! current_words = 0; if (!read_wordlist(&file)) { // no more words available available_words = 0; // we continue as we want to flush the cache ! } } // now we need to transfer the MD5 hashes to the graphics card for preperation if (cudaMemcpy(device.device_global_memory, device.host_memory, device.device_global_memory_len, cudaMemcpyHostToDevice) != CUDA_SUCCESS) { printf("Error Copying Words to GPU\n"); return -1; } md5_calculate(&device); // launch the kernel of the CUDA device if (cudaMemcpy(&device.stats, device.device_stats_memory, sizeof(struct device_stats), cudaMemcpyDeviceToHost) != CUDA_SUCCESS) { printf("Error Copying STATS from the GPU\n"); return -1; } #ifdef DEBUG // For debug, we will receive the hashes for verification memset(device.host_memory, 0, device.device_global_memory_len); if (cudaMemcpy(device.host_memory, device.device_debug_memory, device.device_global_memory_len, cudaMemcpyDeviceToHost) != CUDA_SUCCESS) { printf("Error Copying words to GPU\n"); return; } cudaThreadSynchronize(); // prints out the debug hash'es printf("MD5 registers:\n\n"); unsigned int *m = (unsigned int *)device.host_memory; for(y=0; y <= (device.max_blocks * device.max_threads); y++) { printf("------ [%d] -------\n", y); printf("A: %08x\n", m[(y * 4) + 0]); printf("B: %08x\n", m[(y * 4) + 1]); printf("C: %08x\n", m[(y * 4) + 2]); printf("D: %08x\n", m[(y * 4) + 3]); printf("-------------------\n\n"); } #endif if (device.stats.hash_found == 1) { printf("WORD FOUND: [%s]\n", md5_unpad(device.stats.word)); break; } } if (device.stats.hash_found != 1) { printf("No word could be found for the provided MD5 hash\n"); } #ifdef BENCHMARK gettimeofday(&end, NULL); long long time = (end.tv_sec * (unsigned int)1e6 + end.tv_usec) - (start.tv_sec * (unsigned int)1e6 + start.tv_usec); printf("Time taken to check %d hashes: %f seconds\n", counter, (float)((float)time / 1000.0) / 1000.0); printf("Words per second: %d\n", counter / (time / 1000) * 1000); #endif }
cuda_pattern_data( cuda_pattern_config const& cpc ) { device_id = cpc.device_id; int current_id; cuda_assert( cudaGetDevice(¤t_id) ); if ( current_id != device_id ) cuda_assert( cudaSetDevice( device_id ) ); size_type const kt_factor_size = sizeof(value_type) * cpc.tilt_size * 3; cuda_assert( cudaMalloc( reinterpret_cast<void**>(&kt_factor), kt_factor_size ) ); cuda_assert( cudaMemset( reinterpret_cast<void*>(kt_factor), 0, kt_factor_size ) ); size_type const beams_size = sizeof(value_type) * cpc.tilt_size * 10; cuda_assert( cudaMalloc( reinterpret_cast<void**>(&beams), beams_size ) ); cuda_assert( cudaMemset( reinterpret_cast<void*>(beams), 0, beams_size ) ); size_type const ug_size = sizeof(value_type) * cpc.ug_size * 2; cuda_assert( cudaMalloc( reinterpret_cast<void**>(&ug), ug_size ) ); cuda_assert( cudaMemset( reinterpret_cast<void*>(ug), 0, ug_size ) ); size_type const ar_size = sizeof(size_type) * cpc.tilt_size * cpc.max_dim * cpc.max_dim; cuda_assert( cudaMalloc( reinterpret_cast<void**>(&ar), ar_size ) ); cuda_assert( cudaMemset( reinterpret_cast<void*>(ar), 0, ar_size ) ); size_type const diag_size = sizeof(value_type) * cpc.tilt_size * cpc.max_dim; cuda_assert( cudaMalloc( reinterpret_cast<void**>(&diag), diag_size ) ); cuda_assert( cudaMemset( reinterpret_cast<void*>(diag), 0, diag_size ) ); size_type const dim_size = sizeof(size_type) * cpc.tilt_size; cuda_assert( cudaMalloc( reinterpret_cast<void**>(&dim), dim_size ) ); cuda_assert( cudaMemset( reinterpret_cast<void*>(dim), 0, dim_size ) ); size_type const I_exp_size = sizeof(value_type) * cpc.tilt_size * cpc.max_dim; cuda_assert( cudaMalloc( reinterpret_cast<void**>(&I_exp), I_exp_size ) ); cuda_assert( cudaMemset( reinterpret_cast<void*>(I_exp), 0, I_exp_size ) ); size_type const I_diff_size = sizeof(value_type) * cpc.tilt_size * cpc.max_dim; cuda_assert( cudaMalloc( reinterpret_cast<void**>(&I_diff), I_diff_size ) ); cuda_assert( cudaMemset( reinterpret_cast<void*>(I_diff), 0, I_diff_size ) ); size_type const I_zigmoid_size = sizeof(value_type) * cpc.tilt_size * cpc.max_dim; cuda_assert( cudaMalloc( reinterpret_cast<void**>(&I_zigmoid), I_zigmoid_size ) ); cuda_assert( cudaMemset( reinterpret_cast<void*>(I_zigmoid), 0, I_zigmoid_size ) ); size_type const cache_size = sizeof(complex_type) * cpc.tilt_size * cpc.max_dim * cpc.max_dim * 6; cuda_assert( cudaMalloc( reinterpret_cast<void**>(&cache), cache_size ) ); cuda_assert( cudaMemset( reinterpret_cast<void*>(cache), 0, cache_size ) ); size_type const gvec_size = sizeof(value_type) * cpc.ug_size * 2; cuda_assert( cudaMalloc( reinterpret_cast<void**>(&gvec), gvec_size ) ); cuda_assert( cudaMemset( reinterpret_cast<void*>(gvec), 0, gvec_size ) ); size_type const tilt_size = sizeof(value_type) * cpc.tilt_size * 2; cuda_assert( cudaMalloc( reinterpret_cast<void**>(&tilt), tilt_size ) ); cuda_assert( cudaMemset( reinterpret_cast<void*>(tilt), 0, tilt_size ) ); }
/** Purpose ------- CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] dA COMPLEX array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_cposv_comp ********************************************************************/ extern "C" magma_int_t magma_cpotrf2_mgpu(int num_gpus, magma_uplo_t uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, magmaFloatComplex **d_lA, magma_int_t ldda, magmaFloatComplex **d_lP, magma_int_t lddp, magmaFloatComplex *A, magma_int_t lda, magma_int_t h, magma_queue_t stream[][3], magma_event_t event[][5], magma_int_t *info ) { #define Alo(i, j) (A + ((j)+off_j)*lda + (nb*(((i)/nb)%h)+off_i)) #define Aup(i, j) (A + (nb*(((j)/nb)%h)+off_j)*lda + (i+off_i)) #define dlA(id, i, j) (d_lA[(id)] + (j)*ldda + (i)) #define dlP(id, i, j, k) (d_lP[(id)] + (k)*nb*lddp + (j)*lddp + (i)) #define dlPT(id, i, j, k) (d_lP[(id)] + (k)*nb*lddp + (j)*nb + (i)) magma_int_t j, jb, nb0, nb2, dd, d, id, j_local, j_local2, buf; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; float d_one = 1.0; float d_neg_one = -1.0; int upper = (uplo == MagmaUpper); magmaFloatComplex *dlpanel; //magma_event_t event0[MagmaMaxGPUs], // syrk // event1[MagmaMaxGPUs], // send off-diagonal // event2[MagmaMaxGPUs], // send diagonal // event3[MagmaMaxGPUs]; // trsm magma_int_t n_local[MagmaMaxGPUs], ldpanel; int stream0 = 0, stream1 = 1; #ifdef CTRSM_WORK magmaFloatComplex *d_dinvA[MagmaMaxGPUs][2], *d_x[MagmaMaxGPUs][2]; /* used by ctrsm_work */ #endif *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (!upper && num_gpus*ldda < max(1,n)) { *info = -4; } else if (upper && ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } for( d=0; d < num_gpus; d++ ) { /* local-n and local-ld */ if (upper) { n_local[d] = ((n/nb)/num_gpus)*nb; if (d < (n/nb)%num_gpus) n_local[d] += nb; else if (d == (n/nb)%num_gpus) n_local[d] += n%nb; } else { n_local[d] = ((m/nb)/num_gpus)*nb; if (d < (m/nb)%num_gpus) n_local[d] += nb; else if (d == (m/nb)%num_gpus) n_local[d] += m%nb; } //magma_setdevice(d); //magma_event_create( &event0[d] ); //magma_event_create( &event1[d] ); //magma_event_create( &event2[d] ); //magma_event_create( &event3[d] ); } magma_setdevice(0); /* == initialize the trace */ trace_init( 1, num_gpus, 3, (magma_queue_t*)stream ); /* Use blocked code. */ if (upper) { /* ---------------------------------------------- */ /* Upper-triangular case */ /* > Compute the Cholesky factorization A = U'*U. */ /* ---------------------------------------------- */ #if defined(PRECISION_d) && defined(CTRSM_WORK) /* invert the diagonals * Allocate device memory for the inversed diagonal blocks, size=m*NB */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); for( j=0; j < 2; j++ ) { magma_cmalloc( &d_dinvA[d][j], nb*nb ); magma_cmalloc( &d_x[d][j], n*nb ); cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(magmaFloatComplex)); cudaMemset(d_x[d][j], 0, n*nb*sizeof(magmaFloatComplex)); } } magma_setdevice(0); #endif for (j=0; j < m; j += nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (m-j)); if ( j > 0 ) { /* needed on pluto... */ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU /* broadcast off-diagonal column to all gpus */ d = (j/nb+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { if ( d != id ) { magma_setdevice(d); /* wait for it on CPU */ magma_queue_wait_event( stream[d][stream0], event[id][1] ); /* send it to GPU */ trace_gpu_start( d, stream0, "comm", "rows to GPUs" ); magma_csetmatrix_async( j, jb, Aup(0,j), lda, dlP(d,jb,0,buf), lddp, stream[d][stream0] ); trace_gpu_end( d, stream0 ); magma_event_record( event[d][1], stream[d][stream0] ); } d = (d+1)%num_gpus; } } /* Update the current diagonal block */ magma_setdevice(id); if ( j > 0 ) { magmablasSetKernelStream(stream[id][stream1]); trace_gpu_start( id, stream1, "syrk", "syrk" ); magma_cherk(MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dlA(id, 0, nb*j_local), ldda, d_one, dlA(id, j, nb*j_local), ldda); trace_gpu_end( id, stream1 ); magma_event_record( event[id][0], stream[id][stream1] ); } /* send the diagonal to cpu */ magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk trace_gpu_start( id, stream0, "comm", "D to CPU" ); magma_cgetmatrix_async( jb, jb, dlA(id, j, nb*j_local), ldda, Aup(j,j), lda, stream[id][stream0] ); trace_gpu_end( id, stream0 ); if ( j > 0 ) { /* Compute the local block column of the panel. */ d = (j/nb+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { j_local2 = j_local+1; if ( d > id ) j_local2 --; nb0 = nb*j_local2; if ( n_local[d] > nb0 ) { /* wait for the off-diagonal */ if ( d != id ) { //magma_queue_sync( stream[id][3] ); dlpanel = dlP(d, jb, 0, buf); ldpanel = lddp; /* wait for the offdiagonal column */ magma_queue_wait_event( stream[d][stream1], event[d][1] ); } else { dlpanel = dlA(d, 0, nb*j_local); ldpanel = ldda; } /* update the panel */ magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); trace_gpu_start( d, stream1, "gemm", "gemm" ); magma_cgemm(MagmaConjTrans, MagmaNoTrans, jb, n_local[d]-nb0, j, c_neg_one, dlpanel, ldpanel, dlA(d, 0, nb0), ldda, c_one, dlA(d, j, nb0), ldda); trace_gpu_end( d, stream1 ); } d = (d+1)%num_gpus; } } /* factor the diagonal */ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); // wait for the diagonal trace_cpu_start( 0, "getrf", "getrf" ); lapackf77_cpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { magma_setdevice(d); if ( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d, 0, 0, buf); ldpanel = lddp; } trace_gpu_start( d, stream0, "comm", "D to GPUs" ); magma_csetmatrix_async( jb, jb, Aup(j,j), lda, dlpanel, ldpanel, stream[d][stream0] ); trace_gpu_end( d, stream0 ); magma_event_record( event[d][2], stream[d][stream0] ); d = (d+1)%num_gpus; } } else { magma_setdevice(id); trace_gpu_start( id, stream0, "comm", "D to GPUs" ); magma_csetmatrix_async( jb, jb, Aup(j,j), lda, dlA(id, j, nb*j_local), ldda, stream[id][stream0] ); trace_gpu_end( id, stream0 ); } /* panel-factorize the off-diagonal */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { /* next column */ j_local2 = j_local+1; if ( d > id ) j_local2--; if ( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d, 0, 0, buf); ldpanel = lddp; } nb2 = n_local[d]-nb*j_local2; nb0 = min(nb, nb2 ); magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal if ( j+jb < m && d == (j/nb+1)%num_gpus ) { /* owns the next column, look-ahead the column */ trace_gpu_start( d, stream1, "trsm", "trsm" ); #if defined(PRECISION_d) && defined(CTRSM_WORK) magmablas_ctrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, d_dinvA[d][0], d_x[d][0] ); /*nb2 = n_local[d] - j_local2*nb; magmablas_ctrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, d_dinvA[d], d_x[d] ); */ #else /*nb2 = n_local[d] - j_local2*nb; magma_ctrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldda, dlA(d, j, nb*j_local2), ldda); */ magma_ctrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif trace_gpu_end( d, stream1 ); magma_event_record( event[d][3], stream[d][stream1] ); /* send the column to cpu */ if ( j+jb < m ) { trace_gpu_start( d, stream0, "comm", "rows to CPU" ); magma_queue_wait_event( stream[d][stream0], event[d][3] ); // wait for lookahead magma_cgetmatrix_async( (j+jb), nb0, dlA(d, 0, nb*j_local2), ldda, Aup(0,j+jb), lda, stream[d][stream0] ); trace_gpu_end( d, stream0 ); magma_event_record( event[d][1], stream[d][stream0] ); } /* update the remaining blocks */ nb2 = nb2 - nb0; #if defined(PRECISION_d) && defined(CTRSM_WORK) magmablas_ctrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_ctrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda); #endif } else if ( nb2 > 0 ) { /* update the entire trailing matrix */ trace_gpu_start( d, stream1, "trsm", "trsm" ); #if defined(PRECISION_d) && defined(CTRSM_WORK) magmablas_ctrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_ctrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif trace_gpu_end( d, stream1 ); } d = (d+1)%num_gpus; } } /* end of ctrsm */ } /* end of for j=1, .., n */ } else { /* -------------------------------------------- */ /* Lower-triangular case */ /* Compute the Cholesky factorization A = L*L'. */ /* -------------------------------------------- */ #if defined(PRECISION_d) && defined(CTRSM_WORK) /* * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); for( j=0; j < 2; j++ ) { magma_cmalloc( &d_dinvA[d][j], nb*nb ); magma_cmalloc( &d_x[d][j], nb*m ); cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(magmaFloatComplex)); cudaMemset(d_x[d][j], 0, nb* m*sizeof(magmaFloatComplex)); } } magma_setdevice(0); #endif for (j=0; j < n; j += nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (n-j)); if ( j > 0 ) { /* needed on pluto... */ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU /* broadcast offdiagonal row to all gpus */ d = (j/nb+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { if ( d != id ) { magma_setdevice(d); /* wait for it on CPU */ magma_queue_wait_event( stream[d][stream0], event[id][1] ); /* send it to GPU */ magma_csetmatrix_async( jb, j, Alo(j,0), lda, dlPT(d,0,jb,buf), nb, stream[d][stream0] ); magma_event_record( event[d][1], stream[d][stream0] ); } d = (d+1)%num_gpus; } } /* Update the current diagonal block */ magma_setdevice(id); if ( j > 0 ) { magmablasSetKernelStream(stream[id][stream1]); magma_cherk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dlA(id, nb*j_local, 0), ldda, d_one, dlA(id, nb*j_local, j), ldda); magma_event_record( event[id][0], stream[id][stream1] ); } /* send the diagonal to cpu */ magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk magma_cgetmatrix_async( jb, jb, dlA(id, nb*j_local, j), ldda, Alo(j,j), lda, stream[id][stream0] ); /* update the offdiagonal blocks */ if ( j > 0 ) { /* compute the block-rows of the panel */ d = (j/nb+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { j_local2 = j_local+1; if ( d > id ) j_local2 --; nb0 = nb*j_local2; if ( nb0 < n_local[d] ) { if ( d != id ) { dlpanel = dlPT(d, 0, jb, buf); ldpanel = nb; /* wait for offdiagonal row */ magma_queue_wait_event( stream[d][stream1], event[d][1] ); } else { dlpanel = dlA(d, nb*j_local, 0); ldpanel = ldda; } magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); magma_cgemm( MagmaNoTrans, MagmaConjTrans, n_local[d]-nb0, jb, j, c_neg_one, dlA(d, nb0, 0), ldda, dlpanel, ldpanel, c_one, dlA(d, nb0, j), ldda); } d = (d+1)%num_gpus; } } /* factor the diagonal */ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); lapackf77_cpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus */ if ( (j+jb) < m ) { d = (j/nb+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { magma_setdevice(d); if ( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } magma_csetmatrix_async( jb, jb, Alo(j,j), lda, dlpanel, ldpanel, stream[d][stream0] ); magma_event_record( event[d][2], stream[d][stream0] ); d = (d+1)%num_gpus; } } else { magma_setdevice(id); magma_csetmatrix_async( jb, jb, Alo(j,j), lda, dlA(id, nb*j_local, j), ldda, stream[id][stream0] ); } /* factorize off-diagonal blocks */ if ( (j+jb) < m ) { d = (j/nb+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { /* next column */ j_local2 = j_local+1; if ( d > id ) j_local2--; if ( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } nb2 = n_local[d] - j_local2*nb; nb0 = min(nb, nb2 ); magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal if ( j+jb < n && d == (j/nb+1)%num_gpus ) { /* owns the next column, look-ahead the column */ #if defined(PRECISION_d) && defined(CTRSM_WORK) magmablas_ctrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, d_dinvA[d][0], d_x[d][0]); #else magma_ctrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif magma_event_record( event[d][3], stream[d][stream1] ); /* send the column to cpu */ if ( j+jb < n ) { magma_queue_wait_event( stream[d][stream0], event[d][3] ); // wait for lookahead magma_cgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo(j+jb,0), lda, stream[d][stream0] ); magma_event_record( event[d][1], stream[d][stream0] ); } /* update the remaining blocks */ nb2 = nb2 - nb0; #if defined(PRECISION_d) && defined(CTRSM_WORK) magmablas_ctrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_ctrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda); #endif } else if ( nb2 > 0 ) { /* update the entire trailing matrix */ #if defined(PRECISION_d) && defined(CTRSM_WORK) magmablas_ctrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_ctrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif } d = (d+1)%num_gpus; } } } } /* end of else not upper */ /* == finalize the trace == */ trace_finalize( "cpotrf.svg", "trace.css" ); /* clean up */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); //magma_event_destroy( event0[d] ); //magma_event_destroy( event1[d] ); //magma_event_destroy( event2[d] ); //magma_event_destroy( event3[d] ); } magma_setdevice(0); return *info; } /* magma_cpotrf_mgpu */
/***************************************************************************//** Purpose ------- SGEQRF computes a QR factorization of a real M-by-N matrix A: A = Q * R. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA_array Array of pointers, dimension (batchCount). Each is a REAL array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] dtau_array Array of pointers, dimension (batchCount). Each is a REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] info_array Array of INTEGERs, dimension (batchCount), for corresponding matrices. - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. @param[in] batchCount INTEGER The number of matrices to operate on. @param[in] queue magma_queue_t Queue to execute in. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_geqrf_batched *******************************************************************************/ extern "C" magma_int_t magma_sgeqrf_batched( magma_int_t m, magma_int_t n, float **dA_array, magma_int_t ldda, float **dtau_array, magma_int_t *info_array, magma_int_t batchCount, magma_queue_t queue) { #define dA(i, j) (dA + (i) + (j)*ldda) /* Local Parameter */ magma_int_t nb = magma_get_sgeqrf_batched_nb(m); magma_int_t min_mn = min(m, n); /* Check arguments */ cudaMemset(info_array, 0, batchCount*sizeof(magma_int_t)); magma_int_t arginfo = 0; if (m < 0) arginfo = -1; else if (n < 0) arginfo = -2; else if (ldda < max(1,m)) arginfo = -4; if (arginfo != 0) { magma_xerbla( __func__, -(arginfo) ); return arginfo; } /* Quick return if possible */ if (m == 0 || n == 0) return arginfo; float *dT = NULL; float *dR = NULL; float **dR_array = NULL; float **dT_array = NULL; magma_malloc((void**)&dR_array, batchCount * sizeof(*dR_array)); magma_malloc((void**)&dT_array, batchCount * sizeof(*dT_array)); magma_int_t lddt = min(nb, min_mn); magma_int_t lddr = min(nb, min_mn); magma_smalloc(&dR, lddr * lddr * batchCount); magma_smalloc(&dT, lddt * lddt * batchCount); /* check allocation */ if ( dR_array == NULL || dT_array == NULL || dR == NULL || dT == NULL ) { magma_free(dR_array); magma_free(dT_array); magma_free(dR); magma_free(dT); magma_int_t info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(info) ); return info; } magma_sset_pointer( dR_array, dR, lddr, 0, 0, lddr*min(nb, min_mn), batchCount, queue ); magma_sset_pointer( dT_array, dT, lddt, 0, 0, lddt*min(nb, min_mn), batchCount, queue ); arginfo = magma_sgeqrf_expert_batched(m, n, dA_array, ldda, dR_array, lddr, dT_array, lddt, dtau_array, 0, info_array, batchCount, queue); magma_free(dR_array); magma_free(dT_array); magma_free(dR); magma_free(dT); return arginfo; }
int main() { //Checks for memory leaks in debug mode _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF); glfwInit(); glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4); glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 4); glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); glfwWindowHint(GLFW_RESIZABLE, GL_FALSE); GLFWwindow* window = glfwCreateWindow(width, height, "Hikari", nullptr, nullptr); glfwMakeContextCurrent(window); //Set callbacks for keyboard and mouse glfwSetInputMode(window, GLFW_CURSOR, GLFW_CURSOR_DISABLED); glewExperimental = GL_TRUE; glewInit(); glGetError(); //Define the viewport dimensions glViewport(0, 0, width, height); //Initialize cuda->opengl context cudaCheck(cudaGLSetGLDevice(0)); cudaGraphicsResource *resource; //Create a texture to store ray tracing result GLuint tex; glActiveTexture(GL_TEXTURE0); glGenTextures(1, &tex); glBindTexture(GL_TEXTURE_2D, tex); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F, width, height, 0, GL_RGBA, GL_FLOAT, NULL); cudaCheck(cudaGraphicsGLRegisterImage(&resource, tex, GL_TEXTURE_2D, cudaGraphicsMapFlagsWriteDiscard)); glBindTexture(GL_TEXTURE_2D, 0); Shader final = Shader("fsQuad.vert", "fsQuad.frag"); FullscreenQuad fsQuad = FullscreenQuad(); float4* buffer; cudaCheck(cudaMalloc((void**)&buffer, width * height * sizeof(float4))); cudaCheck(cudaMemset(buffer, 0, width * height * sizeof(float4))); //Mesh float3 offset = make_float3(0); float3 scale = make_float3(15); Mesh cBox("objs/Avent", 0, scale, offset); offset = make_float3(0, 55, 0); scale = make_float3(100); Mesh light("objs/plane", (int)cBox.triangles.size(), scale, offset); cBox.triangles.insert(cBox.triangles.end(), light.triangles.begin(), light.triangles.end()); cBox.aabbs.insert(cBox.aabbs.end(), light.aabbs.begin(), light.aabbs.end()); std::cout << "Num triangles: " << cBox.triangles.size() << std::endl; cBox.root = AABB(fminf(cBox.root.minBounds, light.root.minBounds), fmaxf(cBox.root.maxBounds, light.root.maxBounds)); BVH bvh(cBox.aabbs, cBox.triangles, cBox.root); Camera cam(make_float3(14, 15, 80), make_int2(width, height), 45.0f, 0.04f, 80.0f); Camera* dCam; cudaCheck(cudaMalloc((void**)&dCam, sizeof(Camera))); cudaCheck(cudaMemcpy(dCam, &cam, sizeof(Camera), cudaMemcpyHostToDevice)); cudaCheck(cudaGraphicsMapResources(1, &resource, 0)); cudaArray* pixels; cudaCheck(cudaGraphicsSubResourceGetMappedArray(&pixels, resource, 0, 0)); cudaResourceDesc viewCudaArrayResourceDesc; viewCudaArrayResourceDesc.resType = cudaResourceTypeArray; viewCudaArrayResourceDesc.res.array.array = pixels; cudaSurfaceObject_t viewCudaSurfaceObject; cudaCheck(cudaCreateSurfaceObject(&viewCudaSurfaceObject, &viewCudaArrayResourceDesc)); cudaCheck(cudaGraphicsUnmapResources(1, &resource, 0)); while (!glfwWindowShouldClose(window)) { float currentFrame = float(glfwGetTime()); deltaTime = currentFrame - lastFrame; lastFrame = currentFrame; //Check and call events glfwPollEvents(); handleInput(window, cam); if (cam.moved) { frameNumber = 0; cudaCheck(cudaMemset(buffer, 0, width * height * sizeof(float4))); } cam.rebuildCamera(); cudaCheck(cudaMemcpy(dCam, &cam, sizeof(Camera), cudaMemcpyHostToDevice)); frameNumber++; if (frameNumber < 20000) { cudaCheck(cudaGraphicsMapResources(1, &resource, 0)); std::chrono::time_point<std::chrono::system_clock> start, end; start = std::chrono::system_clock::now(); render(cam, dCam, viewCudaSurfaceObject, buffer, bvh.dTriangles, bvh.dNodes, frameNumber, cam.moved); end = std::chrono::system_clock::now(); std::chrono::duration<double> elapsed = end - start; std::cout << "Frame: " << frameNumber << " --- Elapsed time: " << elapsed.count() << "s\n"; cudaCheck(cudaGraphicsUnmapResources(1, &resource, 0)); } cam.moved = false; glUseProgram(final.program); glActiveTexture(GL_TEXTURE0); glBindTexture(GL_TEXTURE_2D, tex); glClear(GL_COLOR_BUFFER_BIT); final.setUniformi("tRender", 0); fsQuad.render(); //std::cout << glGetError() << std::endl; //Swap the buffers glfwSwapBuffers(window); glfwSetCursorPos(window, lastX, lastY); }
void InitCudaLayers() { mmGridSizeX = sim_width/blockSizex; mmGridSizeY = sim_height/blockSizey; mmGridSize = mmGridSizeX*mmGridSizeY; memset(mmGrid, 0, sizeof(mmGrid)); memset(mmYGGrid, 0, sizeof(mmYGGrid)); tempHostData = (float*)malloc(sim_width*sim_height*TEMP_HOST_ELEM*sizeof(float)); tempHostDataNoCuda = (float*)malloc(sim_width*sim_height*TEMP_HOST_ELEM*sizeof(float)); grid8ValTick = (float*)malloc(sim_width*sim_height*8*sizeof(float)); initColors(); memset(gCudaLayer, 0, sizeof(gCudaLayer)); memset(gCudaFuncLayer, 0, sizeof(gCudaFuncLayer)); memset(gPhysLayer, 0, sizeof(gPhysLayer)); memset(gStateLayer, 0, sizeof(gStateLayer)); srand(0); int seed = rand(); const cudaChannelFormatDesc desc4 = cudaCreateChannelDesc<float4>(); cudaMallocArray(&gCudaVectArray, &desc4, sim_width, sim_height); #if NFLAYERS ==2 const cudaChannelFormatDesc desc2 = cudaCreateChannelDesc<float2>(); #else if NFLAYERS ==4 const cudaChannelFormatDesc descF = desc4; #endif cudaMallocArray(&gCudaFlArray, &descF, sim_width, sim_height); const cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>(); cudaMallocArray(&gCudaFuncWavePack, &desc, sim_width); cudaMallocArray(&gCudaFuncSmooth, &desc, sim_width); cudaMallocArray(&(gCudaLayer[0]), &desc, sim_width, sim_height); cudaMallocArray(&(gCudaLayer[1]), &desc, sim_width, sim_height); cudaMallocArray(&(gCudaFuncLayer[0]), &desc, sim_width, sim_height); cudaMalloc(&cuTempData, TEMP_SIZE*sizeof(float)*sim_width*sim_height); cudaMalloc(&cuRandArr, sizeof(unsigned int)*sim_width*sim_height); cudaMalloc(&gStateLayer[0], sim_rect*sizeof(float)); cudaMemset(gStateLayer[0], 0, sim_rect*sizeof(float)); cudaMalloc(&gStateLayer[1], sim_rect*sizeof(float)); cudaMemset(gStateLayer[1], 0, sim_rect*sizeof(float)); cudaMalloc(&gPhysLayer[0], sim_rect*sizeof(float)); cudaMemset(gPhysLayer[0], 0, sim_rect*sizeof(float)); cudaMalloc(&gPhysLayer[1], sim_rect*sizeof(float)); cudaMemset(gPhysLayer[1], 0, sim_rect*sizeof(float)); cudaMalloc(&gRedBlueField, NFLAYERS*sim_rect*sizeof(float)); cudaMemset(gRedBlueField, 0, NFLAYERS*sim_rect*sizeof(float)); size_t pitch = 4*sim_width*sizeof(float); cudaMallocPitch((void**)&gVectorLayer, &pitch, 4*sim_width*sizeof(float), sim_height); cudaMemset2D(gVectorLayer, 4*sim_width*sizeof(float), 0, 4*sim_width*sizeof(float), sim_height); InitWavePack(32, 1.f, sim_width, sim_height, cuTempData, gCudaFuncWavePack); InitSmooth(1, sim_width, cuTempData, gCudaFuncSmooth); InitRnd2DInt(seed, cuRandArr, sim_width, sim_height); InitFuncLayer(gCudaFuncLayer[0], cuTempData, sim_width, sim_height); InitPhysLayer(gPhysLayer[0], gStateLayer[0], cuRandArr, sim_width, sim_height); float* gridIni = cuTempData+3*sim_rect/2; float* halfTemp = cuTempData + sim_rect; float* out = cuTempData + 2*sim_rect; cudaMemset(out, 0, sim_rect*sizeof(float)); seed = rand(); int gridx = INTERP_SIZEX; int gridy = INTERP_SIZEX; InitRnd2DF(seed, gridIni, gridx, gridy); float scaleadd = .7f; Spline2D(gridIni, gridx, gridy, halfTemp, scaleadd, out, sim_width, sim_height); seed = rand(); gridx = (int)(gridx*2); gridy = (int)(gridy*2); InitRnd2DF(seed, gridIni, gridx, gridy); scaleadd = .3f; Spline2D(gridIni, gridx, gridy, halfTemp, scaleadd, out, sim_width, sim_height); cudaMemcpyToArray(gCudaLayer[0], 0, 0, out, sizeof(float)*sim_rect, cudaMemcpyDeviceToDevice); cudaMemset(out, 0, sim_rect*sizeof(float)); gridx = INTERP_SIZEX; gridy = INTERP_SIZEX; seed = rand(); InitRnd2DF(seed, gridIni, gridx, gridy); scaleadd = .7f; Spline2D(gridIni, gridx, gridy, halfTemp, scaleadd, out, sim_width, sim_height); seed = rand(); gridx = (int)(gridx*1.5); gridy = (int)(gridy*1.5); InitRnd2DF(seed, gridIni, gridx, gridy); scaleadd = .3f; Spline2D(gridIni, gridx, gridy, halfTemp, scaleadd, out, sim_width, sim_height); cudaMemcpyToArray(gCudaLayer[1], 0, 0, out, sizeof(float)*sim_rect, cudaMemcpyDeviceToDevice); float2 pos0; pos0.x = gObj0X; pos0.y = gObj0Y; float2 pos1; pos1.x = gObj1X; pos1.y = gObj1Y; gObjInertia.Init(pos0, pos1); LayerProc(sim_width, sim_height, gCudaLayer[0], gCudaFuncLayer[0], cuTempData, pos0.x , pos0.y, pos1.x , pos1.y); ParticleStateInit(cuTempData, cuRandArr, gStateLayer[0], gPhysLayer[0], gRedBlueField); InitBhv(); }
void clearTSDF(size_t batchID){ //GPU_set_negones(xSize*ySize*zSize, cacheGPU+batchID*xSize*ySize*zSize*sizeofStorageT); GPU_set_zeros(xSize*ySize*zSize, cacheGPU+batchID*xSize*ySize*zSize); checkCUDA(__LINE__, cudaMemset(weightGPU, 0, sizeof(uint8_t) * xSize*ySize*zSize)); };
magma_int_t magmablas_zhemv_mgpu( magma_int_t num_gpus, magma_int_t k, char uplo, magma_int_t n, magma_int_t nb, magmaDoubleComplex alpha, magmaDoubleComplex **da, magma_int_t ldda, magma_int_t offset, magmaDoubleComplex **dx, magma_int_t incx, magmaDoubleComplex beta, magmaDoubleComplex **dy, magma_int_t incy, magmaDoubleComplex **dwork, magma_int_t ldwork, magmaDoubleComplex *work, magmaDoubleComplex *w, magma_queue_t stream[][10] ) { #define dX(id, i) (dx[(id)]+incx*(i)) #define dY(id, i, j) (dy[(id)]+incy*(i)+n*(j)) magma_int_t id; #ifdef MAGMABLAS_ZHEMV_MGPU for( id=0; id<num_gpus; id++ ) { magma_setdevice(id); magmablasSetKernelStream(stream[id][0]); trace_gpu_start( id, 0, "memset", "memset" ); cudaMemset( dwork[id], 0, ldwork*sizeof(magmaDoubleComplex) ); trace_gpu_end( id, 0 ); trace_gpu_start( id, 0, "symv", "symv" ); } if( nb == 32 ) { magmablas_zhemv_mgpu_32_offset( uplo, offset+n, alpha, da, ldda, dx, incx, beta, dy, incy, dwork, ldwork, num_gpus, nb, offset, stream ); } else { magmablas_zhemv_mgpu_offset( uplo, offset+n, alpha, da, ldda, dx, incx, beta, dy, incy, dwork, ldwork, num_gpus, nb, offset, stream ); } for( id=0; id<num_gpus; id++ ) { magma_setdevice(id); trace_gpu_end( id, 0 ); magmablasSetKernelStream(NULL); } //magma_setdevice(0); //magmablasSetKernelStream(stream[0][0]); //magma_zhemv('L', n, alpha, &da[0][offset+offset*ldda], ldda, &dx[0][offset], incx, beta, &dy[0][offset], incy ); //magmablasSetKernelStream(NULL); /* send to CPU */ magma_setdevice(0); trace_gpu_start( 0, 0, "comm", "comm" ); magma_zgetvector_async( n, dY(0, offset, 0), 1, w, 1, stream[0][0] ); trace_gpu_end( 0, 0 ); magmablasSetKernelStream(NULL); for( id=1; id<num_gpus; id++ ) { magma_setdevice(id); trace_gpu_start( id, 0, "comm", "comm" ); magma_zgetvector_async( n, dY(id, offset, 0), 1, &work[id*n], 1, stream[id][0] ); trace_gpu_end( id, 0 ); magmablasSetKernelStream(NULL); } #else magmaDoubleComplex c_one = MAGMA_Z_ONE; char uplo_[2] = {uplo, 0}; magma_int_t i, ii, j, kk, ib, ib0, i_1, i_local, idw; magma_int_t i_0=n; magma_int_t loffset0 = nb*(offset/(nb*num_gpus)); magma_int_t loffset1 = offset%nb; magma_int_t loffset; //magma_zhemv(uplo, n, alpha, da, ldda, dx, incx, beta, dy, incy ); idw = (offset/nb)%num_gpus; for( id=0; id<num_gpus; id++ ) { magma_setdevice(id); magmablasSetKernelStream(stream[id][0]); cudaMemset( dy[id], 0, n*k*sizeof(magmaDoubleComplex) ); } if( lapackf77_lsame( uplo_, "L" ) ) { /* the first block */ if( loffset1 > 0 ) { id = idw; kk = 0; magma_setdevice(id); magmablasSetKernelStream(stream[id][kk]); loffset = loffset0+loffset1; ib0 = min(nb-loffset1,n); // diagonal magma_zhemv(MagmaLower, ib0, c_one, dA(id, 0, 0 ), ldda, dX(id, 0), incx, c_one, dY(id, 0, kk), incy); // off-diagonl if( ib0 < n ) { for( j=ib0; j<n; j+= i_0 ) { i_1 = min(i_0, n-j); magma_zgemv(MagmaNoTrans, i_1, ib0, c_one, dA(id, j, 0), ldda, dX(id, 0), incx, c_one, dY(id, j, kk), incy); magma_zgemv(MagmaConjTrans, i_1, ib0, c_one, dA(id, j, 0), ldda, dX(id, j), incx, c_one, dY(id, 0, kk), incy); } } } else { ib0 = 0; } /* diagonal */ for( i=ib0; i<n; i+=nb ) { id = ((i+offset)/nb)%num_gpus; kk = ((i+loffset1)/(nb*num_gpus))%k; magma_setdevice(id); magmablasSetKernelStream(stream[id][kk]); i_local = (i+loffset1)/(nb*num_gpus); ib = min(nb,n-i); ii = nb*i_local; loffset = loffset0; if( id < idw ) loffset += nb; magma_zhemv(MagmaLower, ib, c_one, dA(id, i, ii), ldda, dX(id, i), incx, c_one, dY(id, i, kk), incy); } /* off-diagonal */ for( i=ib0; i<n-nb; i+=nb ) { id = ((i+offset)/nb)%num_gpus; kk = ((i+loffset1)/(nb*num_gpus))%k; magma_setdevice(id); magmablasSetKernelStream(stream[id][kk]); i_local = ((i+loffset1)/nb)/num_gpus; ii = nb*i_local; ib = min(nb,n-i); loffset = loffset0; if( id < idw ) loffset += nb; for( j=i+ib; j<n; j+= i_0 ) { i_1 = min(i_0, n-j); magma_zgemv(MagmaNoTrans, i_1, ib, c_one, dA(id, j, ii), ldda, dX(id, i), incx, c_one, dY(id, j, kk), incy); magma_zgemv(MagmaConjTrans, i_1, ib, c_one, dA(id, j, ii), ldda, dX(id, j), incx, c_one, dY(id, i, kk), incy); } } } else { /* upper-triangular storage */ loffset = 0; /* diagonal */ for( i=0; i<n; i+=nb ) { id = (i/nb)%num_gpus; kk = (i/(nb*num_gpus))%k; ib = min(nb,n-i); magma_setdevice(id); magmablasSetKernelStream(stream[id][kk]); i_local = i/(nb*num_gpus); ii = nb*i_local; magma_zhemv(MagmaUpper, ib, c_one, dA(id, i, ii), ldda, dX(id, i), incx, c_one, dY(id, i, kk), incy); } /* off-diagonal */ for( i=nb; i<n; i+=nb ) { id = (i/nb)%num_gpus; kk = (i/(nb*num_gpus))%k; magma_setdevice(id); magmablasSetKernelStream(stream[id][kk]); i_local = (i/nb)/num_gpus; ii = nb*i_local; ib = min(nb,n-i); magma_zgemv(MagmaNoTrans, i, ib, c_one, dA(id, 0, ii), ldda, dX(id, i), incx, c_one, dY(id, 0, kk), incy); magma_zgemv(MagmaConjTrans, i, ib, c_one, dA(id, 0, ii), ldda, dX(id, 0), incx, c_one, dY(id, i, kk), incy); } } /* send to CPU */ magma_setdevice(0); magma_zgetvector_async( n, dY(0, 0, 0), 1, w, 1, stream[0][0] ); for( kk=1; kk<k; kk++ ) { magma_zgetvector_async( n, dY(0, 0, kk), 1, &work[kk*n], 1, stream[0][kk] ); } magmablasSetKernelStream(NULL); for( id=1; id<num_gpus; id++ ) { magma_setdevice(id); for( kk=0; kk<k; kk++ ) { magma_zgetvector_async( n, dY(id, 0, kk), 1, &work[id*k*n + kk*n], 1, stream[id][kk] ); } magmablasSetKernelStream(NULL); } #endif return 0; }
// Initialize a test instance on the given grid. struct test_config_t* test_init( const char* name, const char* mode, int n, int nt, int sx, int sy, int ss, int rank, int szcomm, real xmin, real ymin, real zmin, real xmax, real ymax, real zmax, int bx, int by, int bs, int ex, int ey, int es #ifdef CUDA , struct cudaDeviceProp* props #endif ) { // TODO: replace n with nx, ny, ns. // TODO: parameterize. int szelem = sizeof(real); int narrays = 3; // // 1) Calculate the dimensions of entire grid domain. // #ifdef MPI // For each MPI node create a view of decomposed grid topology. struct grid_domain_t* domains = grid_init_simple( n, n, n, sx, sy, ss, bx, by, bs, ex, ey, es); // The rank-th subdomain is assigned to entire MPI process. struct grid_domain_t* domain = domains + rank; // Set domain data copying callbacks and user-defined pointer // - the test config, in this case. int ndomains = domain->parent->nsubdomains; for (int i = 0; i < ndomains; i++) { struct grid_domain_t* domain = domains + i; domain->scatter_memcpy = &grid_subcpy; domain->gather_memcpy = &grid_subcpy; domain->narrays = narrays; domain->szelem = szelem; } // The problem X, Y, Z dimensions are set relative to the // subdomain of entire MPI process. int nx = domain->grid[0].nx, ny = domain->grid[0].ny, ns = domain->grid[0].ns; size_t nxys = domain->grid[0].extsize; size_t nxysb = nxys * szelem; #else int nx = n, ny = n, ns = n; size_t nxys = nx * ny * ns; size_t nxysb = nxys * szelem; #endif // // 2) Allocate the test config structure together with // the array of pointers to keep CPU and GPU data arrays. // Assign dimensions and data pointers. // #ifdef CUDA int gpu = !strcmp(mode, "GPU"); #else int gpu = 0; #endif struct test_config_t* t = (struct test_config_t*)malloc( sizeof(struct test_config_t) + (1 + gpu) * narrays * sizeof(char*)); #ifdef MPI t->cpu = *domain; #ifdef CUDA t->gpu = *domain; #endif // Track MPI node rank, and decomposition grid domains // in test config structure. t->rank = rank; t->domains = domains; #else t->cpu.grid->nx = nx; t->cpu.grid->ny = ny; t->cpu.grid->ns = ns; t->cpu.grid->extsize = nxys; t->cpu.parent = &t->cpu; t->cpu.narrays = narrays; #ifdef CUDA t->gpu.grid->nx = nx; t->gpu.grid->ny = ny; t->gpu.grid->ns = ns; t->gpu.grid->extsize = nxys; t->gpu.parent = &t->gpu; t->cpu.narrays = narrays; #endif #endif t->cpu.arrays = (char**)(t + 1); #ifdef CUDA t->gpu.arrays = t->cpu.arrays + narrays; #endif // // 3) Set the simple properties of test config. // t->name = name; t->mode = mode; t->nx = nx; t->ny = ny; t->ns = ns; t->nt = nt; // Grid steps. t->dx = (xmax - xmin) / (n - 1); t->dy = (ymax - ymin) / (n - 1); t->ds = (zmax - zmin) / (n - 1); t->dt = t->dx / 2.0; // Set scheme coefficients. double dt2dx2 = (t->dt * t->dt) / (t->dx * t->dx); t->c0 = 2.0 - dt2dx2 * 7.5; t->c1 = dt2dx2 * (4.0 / 3.0); t->c2 = dt2dx2 * (-1.0 / 12.0); // // 4) Allocate the CPU data arrays. // #if defined(CUDA) if (!strcmp(mode, "GPU")) { for (int iarray = 0; iarray < narrays; iarray++) { #if defined(CUDA_MAPPED) // Allocate memory as host-mapped memory accessible both from // CPU and GPU. CUDA_SAFE_CALL(cudaHostAlloc((void**)&t->cpu.arrays[iarray], nxysb, cudaHostAllocMapped)); #elif defined(CUDA_PINNED) // Allocate host memory as pinned to get faster CPU-GPU data // transfers. CUDA_SAFE_CALL(cudaMallocHost((void**)&t->cpu.arrays[iarray], nxysb)); #endif // CUDA_MAPPED } } else #endif // CUDA { // Allocate regular CPU memory. for (int iarray = 0; iarray < narrays; iarray++) t->cpu.arrays[iarray] = (char*)malloc(nxysb); } // Initially flush CPU array data to zero. for (int iarray = 0; iarray < narrays; iarray++) memset(t->cpu.arrays[iarray], 0, nxysb); #if defined(MPI) struct grid_domain_t* subdomains = domain->subdomains; int nsubdomains = domain->nsubdomains; #if defined(CUDA) && !defined(CUDA_MAPPED) if (!strcmp(mode, "GPU")) { // Assign domain main arrays. domain->arrays = t->gpu.arrays; } else #endif // CUDA && !CUDA_MAPPED { // Assign domain main arrays. domain->arrays = t->cpu.arrays; } // Allocate memory required to keep the rest of domain data. // In addition to main data arrays, each domain also allocates data // for its subdomains (nested domains). In this case the nested domains // represent boundaries for data buffering. #if defined(CUDA) && defined(CUDA_MAPPED) if (!strcmp(mode, "GPU")) { for (int i = 0; i < nsubdomains; i++) { struct grid_domain_t* subdomain = subdomains + i; subdomain->arrays = (char**)malloc(sizeof(char*) * narrays); subdomain->narrays = narrays; for (int iarray = 0; iarray < narrays; iarray++) { size_t size = subdomain->grid[0].extsize * szelem; // Allocate a host-mapped array for subdomain in order // to make in possible to perform GPU-initiated boundaries // update. CUDA_SAFE_CALL(cudaHostAlloc((void**)&subdomain->arrays[iarray], size, cudaHostAllocMapped)); // TODO: mapping // TODO: flushing to zero. } } } else #endif // CUDA && CUDA_MAPPED { for (int i = 0; i < nsubdomains; i++) { struct grid_domain_t* subdomain = subdomains + i; subdomain->arrays = (char**)malloc(sizeof(char*) * narrays); subdomain->narrays = narrays; for (int iarray = 0; iarray < narrays; iarray++) { size_t size = subdomain->grid[0].extsize * szelem; // Allocate regular CPU memory. subdomain->arrays[iarray] = (char*)malloc(size); // Flush to zero. memset(subdomain->arrays[iarray], 0, size); } } } #endif // MPI // // 5) Allocate the GPU data arrays. // #if defined(CUDA) if (!strcmp(mode, "GPU")) { #if defined(CUDA_MAPPED) // In case of host-mapped memory the GPU arrays pointers are // either same as for CPU arrays or contain specially mapped // pointers, depending on device capability. int use_mapping = props->major < 2; if (use_mapping) { #ifdef VERBOSE printf("requires mapping\n"); #endif for (int i = 0; i < narrays; i++) CUDA_SAFE_CALL(cudaHostGetDevicePointer( (void**)&t->gpu.arrays[i], t->cpu.arrays[i], 0)); } else { #ifdef VERBOSE printf("does not require mapping\n"); #endif for (int iarray = 0; iarray < narrays; iarray++) t->gpu.arrays[iarray] = t->cpu.arrays[iarray]; } #else for (int iarray = 0; iarray < narrays; iarray++) { // Allocate regular GPU memory. CUDA_SAFE_CALL(cudaMalloc((void**)&t->gpu.arrays[iarray], nxysb)); // Initially flush GPU array data to zero. CUDA_SAFE_CALL(cudaMemset(t->gpu.arrays[iarray], 0, nxysb)); // TODO: reassign arrays of MPI domain. } #endif // CUDA_MAPPED } #endif // CUDA return t; }
void reset_gpu_write_buffer( ) { cudaError_t err = cudaMemset( device_A, 0x0, n_rows * n_cols * sizeof( T ) ); CUDA_V_THROW( err, "cudaMemset reset_gpu_write_buffer" ); }
/** Purpose ------- SPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. If the current stream is NULL, this version replaces it with a new stream to overlap computation with communication. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] dA REAL array on the GPU, dimension (LDDA,N) On entry, the symmetric matrix dA. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_sposv_comp ********************************************************************/ extern "C" magma_int_t magma_spotrf_batched( magma_uplo_t uplo, magma_int_t n, float **dA_array, magma_int_t ldda, magma_int_t *info_array, magma_int_t batchCount) { #define A(i_, j_) (A + (i_) + (j_)*ldda) cudaMemset(info_array, 0, batchCount*sizeof(magma_int_t)); magma_int_t arginfo = 0; if ( uplo != MagmaUpper && uplo != MagmaLower) { arginfo = -1; } else if (n < 0) { arginfo = -2; } else if (ldda < max(1,n)) { arginfo = -4; } if (arginfo != 0) { magma_xerbla( __func__, -(arginfo) ); return arginfo; } // Quick return if possible if (n == 0) { return arginfo; } if( n > 2048 ){ printf("=========================================================================================\n"); printf(" WARNING batched routines are designed for small sizes it might be better to use the\n Native/Hybrid classical routines if you want performance\n"); printf("=========================================================================================\n"); } magma_int_t j, k, ib; magma_int_t nb = POTRF_NB; magma_int_t gemm_crossover = 127;//nb > 32 ? 127 : 160; #if defined(USE_CUOPT) cublasHandle_t myhandle; cublasCreate_v2(&myhandle); #else cublasHandle_t myhandle=NULL; #endif float **dA_displ = NULL; float **dW0_displ = NULL; float **dW1_displ = NULL; float **dW2_displ = NULL; float **dW3_displ = NULL; float **dW4_displ = NULL; float **dinvA_array = NULL; float **dx_array = NULL; magma_malloc((void**)&dA_displ, batchCount * sizeof(*dA_displ)); magma_malloc((void**)&dW0_displ, batchCount * sizeof(*dW0_displ)); magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ)); magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ)); magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ)); magma_malloc((void**)&dW4_displ, batchCount * sizeof(*dW4_displ)); magma_malloc((void**)&dinvA_array, batchCount * sizeof(*dinvA_array)); magma_malloc((void**)&dx_array, batchCount * sizeof(*dx_array)); float* dinvA; float* dx;// dinvA and x are workspace in strsm magma_int_t invA_msize = ((n+TRI_NB-1)/TRI_NB)*TRI_NB*TRI_NB; magma_int_t x_msize = n*nb; magma_smalloc( &dinvA, invA_msize * batchCount); magma_smalloc( &dx, x_msize * batchCount ); sset_pointer(dx_array, dx, 1, 0, 0, x_msize, batchCount); sset_pointer(dinvA_array, dinvA, TRI_NB, 0, 0, invA_msize, batchCount); cudaMemset( dinvA, 0, batchCount * ((n+TRI_NB-1)/TRI_NB)*TRI_NB*TRI_NB * sizeof(float) ); float **cpuAarray = NULL; magma_malloc_cpu((void**) &cpuAarray, batchCount*sizeof(float*)); magma_getvector( batchCount, sizeof(float*), dA_array, 1, cpuAarray, 1); float d_alpha = -1.0; float d_beta = 1.0; magma_queue_t cstream; magmablasGetKernelStream(&cstream); magma_int_t streamid; const magma_int_t nbstreams=32; magma_queue_t stream[nbstreams]; for(k=0; k<nbstreams; k++){ magma_queue_create( &stream[k] ); } magmablasSetKernelStream(NULL); if (uplo == MagmaUpper) { printf("Upper side is unavailable \n"); goto fin; } else { for(j = 0; j < n; j+=nb) { ib = min(nb, n-j); #if 1 //=============================================== // panel factorization //=============================================== magma_sdisplace_pointers(dA_displ, dA_array, ldda, j, j, batchCount); sset_pointer(dx_array, dx, 1, 0, 0, x_msize, batchCount); sset_pointer(dinvA_array, dinvA, TRI_NB, 0, 0, invA_msize, batchCount); #if 0 arginfo = magma_spotrf_panel_batched( uplo, n-j, ib, dA_displ, ldda, dx_array, x_msize, dinvA_array, invA_msize, dW0_displ, dW1_displ, dW2_displ, dW3_displ, dW4_displ, info_array, j, batchCount, myhandle); #else //arginfo = magma_spotrf_rectile_batched( arginfo = magma_spotrf_recpanel_batched( uplo, n-j, ib, 32, dA_displ, ldda, dx_array, x_msize, dinvA_array, invA_msize, dW0_displ, dW1_displ, dW2_displ, dW3_displ, dW4_displ, info_array, j, batchCount, myhandle); #endif if(arginfo != 0 ) goto fin; //=============================================== // end of panel //=============================================== #endif #if 1 //real_Double_t gpu_time; //gpu_time = magma_sync_wtime(NULL); if( (n-j-ib) > 0){ if( (n-j-ib) > gemm_crossover) { //------------------------------------------- // USE STREAM HERK //------------------------------------------- // since it use different stream I need to wait the panel. // But since the code use the NULL stream everywhere, // so I don't need it, because the NULL stream do the sync by itself //magma_queue_sync(NULL); /* you must know the matrix layout inorder to do it */ for(k=0; k<batchCount; k++) { streamid = k%nbstreams; magmablasSetKernelStream(stream[streamid]); // call herk, class ssyrk must call cpu pointer magma_ssyrk(MagmaLower, MagmaNoTrans, n-j-ib, ib, d_alpha, (const float*) cpuAarray[k] + j+ib+j*ldda, ldda, d_beta, cpuAarray[k] + j+ib+(j+ib)*ldda, ldda); } // need to synchronise to be sure that panel do not start before // finishing the update at least of the next panel // BUT no need for it as soon as the other portion of the code // use the NULL stream which do the sync by itself //magma_device_sync(); magmablasSetKernelStream(NULL); } else { //------------------------------------------- // USE BATCHED GEMM(which is a HERK in fact, since it only access the lower part) //------------------------------------------- magma_sdisplace_pointers(dA_displ, dA_array, ldda, j+ib, j, batchCount); magma_sdisplace_pointers(dW1_displ, dA_array, ldda, j+ib, j+ib, batchCount); magmablas_ssyrk_batched(uplo, MagmaNoTrans, n-j-ib, ib, d_alpha, dA_displ, ldda, d_beta, dW1_displ, ldda, batchCount); } } //gpu_time = magma_sync_wtime(NULL) - gpu_time; //real_Double_t flops = (n-j-ib) * (n-j-ib) * ib / 1e9 * batchCount; //real_Double_t gpu_perf = flops / gpu_time; //printf("Rows= %d, Colum=%d, herk time = %7.2fms, Gflops= %7.2f\n", n-j-ib, ib, gpu_time*1000, gpu_perf); #endif } } fin: magma_queue_sync(NULL); for(k=0; k<nbstreams; k++){ magma_queue_destroy( stream[k] ); } magmablasSetKernelStream(cstream); #if defined(USE_CUOPT) cublasDestroy_v2(myhandle); #endif magma_free(dA_displ); magma_free(dW0_displ); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dinvA_array); magma_free(dx_array); magma_free(dinvA); magma_free(dx); magma_free_cpu(cpuAarray); return arginfo; }
inline void caffe_gpu_memset(const uint_tp N, const int_tp alpha, void* X) { CUDA_CHECK(cudaMemset(X, alpha, N)); // NOLINT(caffe/alt_fn) }
extern "C" magma_int_t magma_dgeqrf_expert_batched( magma_int_t m, magma_int_t n, double **dA_array, magma_int_t ldda, double **dR_array, magma_int_t lddr, double **dT_array, magma_int_t lddt, double **dtau_array, magma_int_t provide_RT, magma_int_t *info_array, magma_int_t batchCount, magma_queue_t queue) { #define dA(i, j) (dA + (i) + (j)*ldda) // A(i, j) means at i row, j column /* Local Parameter */ magma_int_t nb = magma_get_dgeqrf_batched_nb(m); magma_int_t nnb = 8; magma_int_t min_mn = min(m, n); /* Check arguments */ cudaMemset(info_array, 0, batchCount*sizeof(magma_int_t)); magma_int_t arginfo = 0; if (m < 0) arginfo = -1; else if (n < 0) arginfo = -2; else if (ldda < max(1,m)) arginfo = -4; else if (lddr < min_mn && provide_RT == 1) arginfo = -6; else if (lddr < min(min_mn, nb)) arginfo = -6; else if (lddt < min(min_mn, nb)) arginfo = -8; if (arginfo != 0) { magma_xerbla( __func__, -(arginfo) ); return arginfo; } /* Quick return if possible */ if (m == 0 || n == 0) if (min_mn == 0 ) return arginfo; if ( m > 2048 || n > 2048 ) { printf("=========================================================================================\n"); printf(" WARNING batched routines are designed for small sizes it might be better to use the\n Native/Hybrid classical routines if you want performance\n"); printf("=========================================================================================\n"); } magma_int_t i, k, ib=nb, jb=nnb, offset_RT=0, use_stream; magma_int_t ldw, offset; double **dW0_displ = NULL; double **dW1_displ = NULL; double **dW2_displ = NULL; double **dW3_displ = NULL; double **dW4_displ = NULL; double **dW5_displ = NULL; double **dR_displ = NULL; double **dT_displ = NULL; double *dwork = NULL; double **cpuAarray = NULL; double **cpuTarray = NULL; magma_malloc((void**)&dW0_displ, batchCount * sizeof(*dW0_displ)); magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ)); magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ)); magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ)); magma_malloc((void**)&dW4_displ, batchCount * sizeof(*dW4_displ)); magma_malloc((void**)&dW5_displ, batchCount * sizeof(*dW5_displ)); magma_malloc((void**)&dR_displ, batchCount * sizeof(*dR_displ)); magma_malloc((void**)&dT_displ, batchCount * sizeof(*dT_displ)); magma_dmalloc(&dwork, (2 * nb * n) * batchCount); magma_malloc_cpu((void**) &cpuAarray, batchCount*sizeof(double*)); magma_malloc_cpu((void**) &cpuTarray, batchCount*sizeof(double*)); /* check allocation */ if ( dW0_displ == NULL || dW1_displ == NULL || dW2_displ == NULL || dW3_displ == NULL || dW4_displ == NULL || dW5_displ == NULL || dR_displ == NULL || dT_displ == NULL || dwork == NULL || cpuAarray == NULL || cpuTarray == NULL ) { magma_free(dW0_displ); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dW5_displ); magma_free(dR_displ); magma_free(dT_displ); magma_free(dwork); magma_free_cpu(cpuAarray); magma_free_cpu(cpuTarray); magma_int_t info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(info) ); return info; } magma_ddisplace_pointers(dR_displ, dR_array, lddr, 0, 0, batchCount, queue); magma_ddisplace_pointers(dT_displ, dT_array, lddt, 0, 0, batchCount, queue); // set dR and dT to zero. if provide_RT == 0 only a tile of size nbxnb is used and overwritten at each step magmablas_dlaset_batched( MagmaFull, lddr, (provide_RT > 0 ? n:min(min_mn,nb)), MAGMA_D_ZERO, MAGMA_D_ZERO, dR_displ, lddr, batchCount, queue ); magmablas_dlaset_batched( MagmaFull, lddt, (provide_RT > 0 ? n:min(min_mn,nb)), MAGMA_D_ZERO, MAGMA_D_ZERO, dT_displ, lddt, batchCount, queue ); /* if ( provide_RT > 0 ) { magmablas_dlaset_q( MagmaFull, lddr, n*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dR, lddr, queue ); magmablas_dlaset_q( MagmaFull, lddt, n*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dT, lddt, queue ); } else { magmablas_dlaset_q( MagmaFull, lddr, nb*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dR, lddr, queue ); magmablas_dlaset_q( MagmaFull, lddt, nb*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dT, lddt, queue ); } */ magma_int_t streamid; const magma_int_t nbstreams=10; magma_queue_t queues[nbstreams]; for (i=0; i < nbstreams; i++) { magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[i] ); } magma_getvector( batchCount, sizeof(double*), dA_array, 1, cpuAarray, 1, queue); magma_getvector( batchCount, sizeof(double*), dT_array, 1, cpuTarray, 1, queue); for (i=0; i < min_mn; i += nb) { ib = min(nb, min_mn-i); //=============================================== // panel factorization //=============================================== magma_ddisplace_pointers(dW0_displ, dA_array, ldda, i, i, batchCount, queue); magma_ddisplace_pointers(dW2_displ, dtau_array, 1, i, 0, batchCount, queue); if ( provide_RT > 0 ) { offset_RT = i; magma_ddisplace_pointers(dR_displ, dR_array, lddr, (provide_RT == 1 ? offset_RT:0), offset_RT, batchCount, queue); magma_ddisplace_pointers(dT_displ, dT_array, lddt, 0, offset_RT, batchCount, queue); } //dwork is used in panel factorization and trailing matrix update //dW4_displ, dW5_displ are used as workspace and configured inside magma_dgeqrf_panel_batched(m-i, ib, jb, dW0_displ, ldda, dW2_displ, dT_displ, lddt, dR_displ, lddr, dW1_displ, dW3_displ, dwork, dW4_displ, dW5_displ, info_array, batchCount, queue); //=============================================== // end of panel //=============================================== //=============================================== // update trailing matrix //=============================================== if ( (n-ib-i) > 0) { //dwork is used in panel factorization and trailing matrix update //reset dW4_displ ldw = nb; magma_dset_pointer( dW4_displ, dwork, 1, 0, 0, ldw*n, batchCount, queue ); offset = ldw*n*batchCount; magma_dset_pointer( dW5_displ, dwork + offset, 1, 0, 0, ldw*n, batchCount, queue ); // set the diagonal of v as one and the upper triangular part as zero already set inside geqrf_panel //magmablas_dlaset_batched( MagmaUpper, ib, ib, MAGMA_D_ZERO, MAGMA_D_ONE, dW0_displ, ldda, batchCount, queue ); //magma_ddisplace_pointers(dW2_displ, dtau_array, 1, i, 0, batchCount, queue); // it is faster since it is using BLAS-3 GEMM routines, different from lapack implementation magma_dlarft_batched(m-i, ib, 0, dW0_displ, ldda, dW2_displ, dT_displ, lddt, dW4_displ, nb*lddt, batchCount, queue); // perform C = (I-V T^H V^H) * C, C is the trailing matrix //------------------------------------------- // USE STREAM GEMM //------------------------------------------- use_stream = magma_drecommend_cublas_gemm_stream(MagmaNoTrans, MagmaNoTrans, m-i-ib, n-i-ib, ib); if ( use_stream ) { magma_queue_sync(queue); for (k=0; k < batchCount; k++) { streamid = k%nbstreams; // the queue gemm must take cpu pointer magma_dlarfb_gpu_gemm( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-i, n-i-ib, ib, cpuAarray[k] + i + i * ldda, ldda, cpuTarray[k] + offset_RT*lddt, lddt, cpuAarray[k] + i + (i+ib) * ldda, ldda, dwork + nb * n * k, -1, dwork + nb * n * batchCount + nb * n * k, -1, queues[streamid] ); } // need to synchronise to be sure that panel does not start before // finishing the update at least of the next panel // if queue is NULL, no need to sync if ( queue != NULL ) { for (magma_int_t s=0; s < nbstreams; s++) magma_queue_sync(queues[s]); } } //------------------------------------------- // USE BATCHED GEMM //------------------------------------------- else { //direct trailing matrix in dW1_displ magma_ddisplace_pointers(dW1_displ, dA_array, ldda, i, i+ib, batchCount, queue); magma_dlarfb_gemm_batched( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-i, n-i-ib, ib, (const double**)dW0_displ, ldda, (const double**)dT_displ, lddt, dW1_displ, ldda, dW4_displ, ldw, dW5_displ, ldw, batchCount, queue ); } }// update the trailing matrix //=============================================== // copy dR back to V after the trailing matrix update, // only when provide_RT=0 otherwise the nbxnb block of V is set to diag=1/0 // The upper portion of V could be set totaly to 0 here if ( provide_RT == 0 ) { magmablas_dlacpy_batched( MagmaUpper, ib, ib, dR_displ, lddr, dW0_displ, ldda, batchCount, queue ); } } magma_queue_sync(queue); for (k=0; k < nbstreams; k++) { magma_queue_destroy( queues[k] ); } magma_free(dW0_displ); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dW5_displ); magma_free(dR_displ); magma_free(dT_displ); magma_free(dwork); magma_free_cpu(cpuAarray); magma_free_cpu(cpuTarray); return arginfo; }
void PointerFreeHashGrid::updateLookupTable() { // __BENCH.LOOP_STAGE_START("Process Iterations > Iterations > Build Point Free lookup"); if (hashGridLists) delete[] hashGridLists; hashGridLists = new uint[hashGridEntryCount]; if (hashGridLenghts) memset(hashGridLenghts, 0, hashGridSize * sizeof(uint)); else hashGridLenghts = new uint[hashGridSize]; if (hashGridListsIndex) memset(hashGridListsIndex, 0, hashGridSize * sizeof(uint)); else hashGridListsIndex = new uint[hashGridSize]; uint listIndex = 0; for (unsigned int i = 0; i < hashGridSize; ++i) { std::list<uint> *hps = hashGrid[i]; hashGridListsIndex[i] = listIndex; if (hps) { hashGridLenghts[i] = hps->size(); std::list<uint>::iterator iter = hps->begin(); while (iter != hps->end()) { hashGridLists[listIndex++] = *iter++; } } else { hashGridLenghts[i] = 0; } } // __BENCH.LOOP_STAGE_STOP("Process Iterations > Iterations > Build Point Free lookup"); // __BENCH.LOOP_STAGE_START("Process Iterations > Iterations > Copy lookup to device"); //checkCUDAmemory("before updateLookupTable"); uint size1 = sizeof(uint) * hashGridEntryCount; if (hashGridListsBuff) cudaFree(hashGridListsBuff); cudaMalloc((void**) (&hashGridListsBuff), size1); cudaMemset(hashGridListsBuff, 0, size1); cudaMemcpy(hashGridListsBuff, hashGridLists, size1, cudaMemcpyHostToDevice); uint size2 = sizeof(uint) * hashGridSize; if (!hashGridListsIndexBuff) cudaMalloc((void**) (&hashGridListsIndexBuff), size2); cudaMemset(hashGridListsIndexBuff, 0, size2); cudaMemcpy(hashGridListsIndexBuff, hashGridListsIndex, size2, cudaMemcpyHostToDevice); if (!hashGridLenghtsBuff) cudaMalloc((void**) (&hashGridLenghtsBuff), size2); cudaMemset(hashGridLenghtsBuff, 0, size2); cudaMemcpy(hashGridLenghtsBuff, hashGridLenghts, size2, cudaMemcpyHostToDevice); checkCUDAError(); // __BENCH.LOOP_STAGE_STOP("Process Iterations > Iterations > Copy lookup to device"); //checkCUDAmemory("After updateLookupTable"); }
/** Purpose ------- DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. If the current stream is NULL, this version replaces it with a new stream to overlap computation with communication. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA DOUBLE_PRECISION array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_dgesv_comp ********************************************************************/ extern "C" magma_int_t magma_dgetrf_batched( magma_int_t m, magma_int_t n, double **dA_array, magma_int_t ldda, magma_int_t **ipiv_array, magma_int_t *info_array, magma_int_t batchCount, magma_queue_t queue) { #define A(i_, j_) (A + (i_) + (j_)*ldda) magma_int_t min_mn = min(m, n); cudaMemset(info_array, 0, batchCount*sizeof(magma_int_t)); /* Check arguments */ magma_int_t arginfo = 0; if (m < 0) arginfo = -1; else if (n < 0) arginfo = -2; else if (ldda < max(1,m)) arginfo = -4; if (arginfo != 0) { magma_xerbla( __func__, -(arginfo) ); return arginfo; } /* Quick return if possible */ if (m == 0 || n == 0) if(min_mn == 0 ) return arginfo; if( m > 2048 || n > 2048 ){ printf("=========================================================================================\n"); printf(" WARNING batched routines are designed for small sizes it might be better to use the\n Native/Hybrid classical routines if you want performance\n"); printf("=========================================================================================\n"); } //#define ENABLE_TIMER3 #if defined(ENABLE_TIMER3) real_Double_t tall=0.0, tloop=0., talloc=0., tdalloc=0.; tall = magma_sync_wtime(0); talloc = magma_sync_wtime(0); #endif double neg_one = MAGMA_D_NEG_ONE; double one = MAGMA_D_ONE; magma_int_t ib, i, k, pm; magma_int_t nb = BATRF_NB; magma_int_t gemm_crossover = nb > 32 ? 127 : 160; // magma_int_t gemm_crossover = n;// use only stream gemm #if defined(USE_CUOPT) cublasHandle_t myhandle; cublasCreate_v2(&myhandle); #else cublasHandle_t myhandle=NULL; #endif magma_int_t **dipiv_displ = NULL; double **dA_displ = NULL; double **dW0_displ = NULL; double **dW1_displ = NULL; double **dW2_displ = NULL; double **dW3_displ = NULL; double **dW4_displ = NULL; double **dinvA_array = NULL; double **dwork_array = NULL; magma_malloc((void**)&dipiv_displ, batchCount * sizeof(*dipiv_displ)); magma_malloc((void**)&dA_displ, batchCount * sizeof(*dA_displ)); magma_malloc((void**)&dW0_displ, batchCount * sizeof(*dW0_displ)); magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ)); magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ)); magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ)); magma_malloc((void**)&dW4_displ, batchCount * sizeof(*dW4_displ)); magma_malloc((void**)&dinvA_array, batchCount * sizeof(*dinvA_array)); magma_malloc((void**)&dwork_array, batchCount * sizeof(*dwork_array)); magma_int_t invA_msize = ((n+TRI_NB-1)/TRI_NB)*TRI_NB*TRI_NB; magma_int_t dwork_msize = n*nb; magma_int_t **pivinfo_array = NULL; magma_int_t *pivinfo = NULL; double* dinvA = NULL; double* dwork = NULL;// dinvA and dwork are workspace in dtrsm double **cpuAarray = NULL; magma_dmalloc( &dinvA, invA_msize * batchCount); magma_dmalloc( &dwork, dwork_msize * batchCount ); magma_malloc((void**)&pivinfo_array, batchCount * sizeof(*pivinfo_array)); magma_malloc((void**)&pivinfo, batchCount * m * sizeof(magma_int_t)); magma_malloc_cpu((void**) &cpuAarray, batchCount*sizeof(double*)); /* check allocation */ if ( dA_displ == NULL || dW0_displ == NULL || dW1_displ == NULL || dW2_displ == NULL || dW3_displ == NULL || dW4_displ == NULL || dinvA_array == NULL || dwork_array == NULL || dinvA == NULL || dwork == NULL || cpuAarray == NULL || dipiv_displ == NULL || pivinfo_array == NULL || pivinfo == NULL) { magma_free(dA_displ); magma_free(dW0_displ); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dinvA_array); magma_free(dwork_array); magma_free( dinvA ); magma_free( dwork ); free(cpuAarray); magma_free(dipiv_displ); magma_free(pivinfo_array); magma_free(pivinfo); magma_int_t info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(info) ); return info; } magmablas_dlaset_q(MagmaFull, invA_msize, batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dinvA, invA_msize, queue); magmablas_dlaset_q(MagmaFull, dwork_msize, batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dwork, dwork_msize, queue); dset_pointer(dwork_array, dwork, 1, 0, 0, dwork_msize, batchCount, queue); dset_pointer(dinvA_array, dinvA, TRI_NB, 0, 0, invA_msize, batchCount, queue); set_ipointer(pivinfo_array, pivinfo, 1, 0, 0, m, batchCount, queue); // printf(" I am in dgetrfbatched\n"); magma_queue_t cstream; magmablasGetKernelStream(&cstream); magma_int_t streamid; const magma_int_t nbstreams=32; magma_queue_t stream[nbstreams]; for(i=0; i<nbstreams; i++){ magma_queue_create( &stream[i] ); } magma_getvector( batchCount, sizeof(double*), dA_array, 1, cpuAarray, 1); #if defined(ENABLE_TIMER3) printf(" I am after malloc\n"); talloc = magma_sync_wtime(0) - talloc; tloop = magma_sync_wtime(0); #endif for(i = 0; i < min_mn; i+=nb) { magmablasSetKernelStream(NULL); ib = min(nb, min_mn-i); pm = m-i; magma_idisplace_pointers(dipiv_displ, ipiv_array, ldda, i, 0, batchCount, queue); magma_ddisplace_pointers(dA_displ, dA_array, ldda, i, i, batchCount, queue); //=============================================== // panel factorization //=============================================== #if 0 arginfo = magma_dgetf2_batched( pm, ib, dA_displ, ldda, dW1_displ, dW2_displ, dW3_displ, dipiv_displ, info_array, i, batchCount, myhandle); #else arginfo = magma_dgetrf_recpanel_batched( pm, ib, 16, dA_displ, ldda, dipiv_displ, pivinfo_array, dwork_array, nb, dinvA_array, invA_msize, dW0_displ, dW1_displ, dW2_displ, dW3_displ, dW4_displ, info_array, i, batchCount, myhandle, queue); #endif if(arginfo != 0 ) goto fin; //=============================================== // end of panel //=============================================== #define RUN_ALL #ifdef RUN_ALL // setup pivinfo before adjusting ipiv setup_pivinfo_batched(pivinfo_array, dipiv_displ, pm, ib, batchCount, queue); adjust_ipiv_batched(dipiv_displ, ib, i, batchCount, queue); // stepinit_ipiv(pivinfo_array, pm, batchCount);// for debug and check swap, it create an ipiv #if 0 dlaswp_batched( i, dA_displ, ldda, i, i+ib, dipiv_displ, pivinfo_array, batchCount); #else magma_ddisplace_pointers(dA_displ, dA_array, ldda, i, 0, batchCount, queue); magma_ddisplace_pointers(dW0_displ, dA_array, ldda, i, 0, batchCount, queue); magma_dlaswp_rowparallel_batched( i, dA_displ, ldda, dW0_displ, ldda, i, i+ib, pivinfo_array, batchCount, queue); #endif if( (i + ib) < n) { // swap right side and trsm magma_ddisplace_pointers(dA_displ, dA_array, ldda, i, i+ib, batchCount, queue); dset_pointer(dwork_array, dwork, nb, 0, 0, dwork_msize, batchCount, queue); // I don't think it is needed Azzam magma_dlaswp_rowparallel_batched( n-(i+ib), dA_displ, ldda, dwork_array, nb, i, i+ib, pivinfo_array, batchCount, queue); magma_ddisplace_pointers(dA_displ, dA_array, ldda, i, i, batchCount, queue); magma_ddisplace_pointers(dW0_displ, dA_array, ldda, i, i+ib, batchCount, queue); magmablas_dtrsm_outofplace_batched(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, 1, ib, n-i-ib, MAGMA_D_ONE, dA_displ, ldda, // dA dwork_array, nb, // dB dW0_displ, ldda, // dX dinvA_array, invA_msize, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 0, batchCount, queue); if( (i + ib) < m) { // if gemm size is >160 use a streamed classical cublas gemm since it is faster // the batched is faster only when M=N<=160 for K40c //------------------------------------------- // USE STREAM GEMM //------------------------------------------- if( (m-i-ib) > gemm_crossover && (n-i-ib) > gemm_crossover) { //printf("caling streamed dgemm %d %d %d \n", m-i-ib, n-i-ib, ib); // since it use different stream I need to wait the TRSM and swap. // But since the code use the NULL stream everywhere, // so I don't need it, because the NULL stream do the sync by itself //magma_queue_sync(NULL); // for(k=0; k<batchCount; k++) { streamid = k%nbstreams; magmablasSetKernelStream(stream[streamid]); magma_dgemm(MagmaNoTrans, MagmaNoTrans, m-i-ib, n-i-ib, ib, neg_one, cpuAarray[k] + (i+ib)+i*ldda, ldda, cpuAarray[k] + i+(i+ib)*ldda, ldda, one, cpuAarray[k] + (i+ib)+(i+ib)*ldda, ldda); } // need to synchronise to be sure that dgetf2 do not start before // finishing the update at least of the next panel // BUT no need for it as soon as the other portion of the code // use the NULL stream which do the sync by itself //magma_device_sync(); } //------------------------------------------- // USE BATCHED GEMM //------------------------------------------- else { magma_ddisplace_pointers(dA_displ, dA_array, ldda, i+ib, i, batchCount, queue); magma_ddisplace_pointers(dW1_displ, dA_array, ldda, i, i+ib, batchCount, queue); magma_ddisplace_pointers(dW2_displ, dA_array, ldda, i+ib, i+ib, batchCount, queue); //printf("caling batched dgemm %d %d %d \n", m-i-ib, n-i-ib, ib); magmablas_dgemm_batched( MagmaNoTrans, MagmaNoTrans, m-i-ib, n-i-ib, ib, neg_one, dA_displ, ldda, dW1_displ, ldda, one, dW2_displ, ldda, batchCount, queue); } // end of batched/stream gemm } // end of if( (i + ib) < m) } // end of if( (i + ib) < n) #endif }// end of for fin: magma_queue_sync(NULL); #if defined(ENABLE_TIMER3) tloop = magma_sync_wtime(0) - tloop; tdalloc = magma_sync_wtime(0); #endif for(i=0; i<nbstreams; i++){ magma_queue_destroy( stream[i] ); } magmablasSetKernelStream(cstream); #if defined(USE_CUOPT) cublasDestroy_v2(myhandle); #endif magma_free(dA_displ); magma_free(dW0_displ); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dinvA_array); magma_free(dwork_array); magma_free( dinvA ); magma_free( dwork ); free(cpuAarray); magma_free(dipiv_displ); magma_free(pivinfo_array); magma_free(pivinfo); #if defined(ENABLE_TIMER3) tdalloc = magma_sync_wtime(0) - tdalloc; tall = magma_sync_wtime(0) - tall; printf("here is the timing from inside dgetrf_batched talloc: %10.5f tloop: %10.5f tdalloc: %10.5f tall: %10.5f sum: %10.5f\n", talloc, tloop, tdalloc, tall, talloc+tloop+tdalloc ); #endif return arginfo; }
void* safe_cudaMalloc(size_t size) { void* devicePtr; void* goldPtr; void* outputPtr; if(is_crash == 0) { char errorDescription[250]; sprintf(errorDescription, "Trying_to_alloc_memory_GPU_may_crash"); #ifdef LOGS log_info_detail((char *)errorDescription); #endif is_crash = 1; } // First, alloc DEVICE proposed memory and HOST memory for device memory checking check_framework_errors(cudaMalloc(&devicePtr, size)); outputPtr = malloc(size); goldPtr = malloc(size); if ((outputPtr == NULL) || (goldPtr == NULL)) { #ifdef LOGS log_error_detail((char *) "error host malloc"); end_log_file(); #endif printf("error host malloc\n"); exit(EXIT_FAILURE); } // ===> FIRST PHASE: CHECK SETTING BITS TO 10101010 check_framework_errors(cudaMemset(devicePtr, 0xAA, size)); memset(goldPtr, 0xAA, size); check_framework_errors( cudaMemcpy(outputPtr, devicePtr, size, cudaMemcpyDeviceToHost)); if (memcmp(outputPtr, goldPtr, size)) { // Failed free(outputPtr); free(goldPtr); void* newDevicePtr = safe_cudaMalloc(size); check_framework_errors(cudaFree(devicePtr)); return newDevicePtr; } // ===> END FIRST PHASE // ===> SECOND PHASE: CHECK SETTING BITS TO 01010101 check_framework_errors(cudaMemset(devicePtr, 0x55, size)); memset(goldPtr, 0x55, size); check_framework_errors(cudaMemcpy(outputPtr, devicePtr, size, cudaMemcpyDeviceToHost)); if (memcmp(outputPtr, goldPtr, size)) { // Failed free(outputPtr); free(goldPtr); void* newDevicePtr = safe_cudaMalloc(size); check_framework_errors(cudaFree(devicePtr)); return newDevicePtr; } // ===> END SECOND PHASE free(outputPtr); free(goldPtr); return devicePtr; }
int main(int argc, char* argv[]) { const int bufsize = 512; char buffer[bufsize]; int m,n,S; double time_st,time_end,time_avg; //omp_set_num_threads(2); // printf("\n-----------------\nnumber of threads fired = %d\n-----------------\n",(int)omp_get_num_threads()); if(argc!=2) { cout<<"Insufficient arguments"<<endl; return 1; } graph G; cerr<<"Start reading "; // time_st=dsecnd(); G.create_graph(argv[1]); // time_end=dsecnd(); // time_avg = (time_end-time_st); // cout<<"Success "<<endl; // cerr<<"Reading time "<<time_avg<<endl; cerr<<"Constructing Matrices "; // time_st=dsecnd(); G.construct_MNA(); G.construct_NA(); // time_end=dsecnd(); // time_avg = (time_end-time_st); // cerr<<"Done "<<time_avg<<endl; // G.construct_sparse_MNA(); m=G.node_array.size()-1; n=G.voltage_edge_id.size(); cout<<endl; cout<<"MATRIX STAT:"<<endl; cout<<"Nonzero elements: "<<G.nonzero<<endl; cout<<"Number of Rows: "<<m+n<<endl; cout<<"Nonzero in G: "<<G.Gnonzero<<endl; cout<<"Number of rows in G: "<<m<<endl; cout<<"Nonzero in P: "<<G.Pnonzero<<endl; cout<<"Number of rows in P: "<<m<<endl; // printf("\n Nonzero = %d", G.nonzero); // printf("\n Rows = %d", m+n); cout<<"MAT val: "<<endl; int i,j; G.Mat_val[0] += 100; G.Gmat[0] +=100; /* for(i=0;i<G.Gnonzero;i++) cout<<" "<<G.Gmat[i]; cout<<endl; for(i=0;i<G.Gnonzero;i++) cout<<" "<<G.Gcolumns[i]; cout<<endl; for(i=0;i<m+1;i++) cout<<" "<<G.GrowIndex[i]; cout<<endl; for(i=0;i<m;i++) printf(" %.8f", G.b1[i]); cout<<endl; for(i=0;i<m;i++) printf(" %.8f", G.x1[i]); cout<<endl; */ SuiteSparse_long *Gnz = (SuiteSparse_long*)calloc(m,sizeof(SuiteSparse_long)); for(i=0;i<m;i++) { // cout<<endl; SuiteSparse_long startindex=G.GrowIndex[i]; SuiteSparse_long endindex=G.GrowIndex[i+1]; Gnz[i] = endindex - startindex; // for(j=startindex;j<endindex;j++) // cout<<" "<<G.Gmat[j]; // cout<<endl; } /* for(i=0;i<G.Pnonzero;i++) cout<<" "<<G.Pmat[i]; cout<<endl; for(i=0;i<G.Pnonzero;i++) cout<<" "<<G.Pcolumns[i]; cout<<endl; for(i=0;i<m+1;i++) cout<<" "<<G.ProwIndex[i]; cout<<endl; /* for(i=0;i<m;i++) printf(" %.8f", G.b1[i]); cout<<endl; for(i=0;i<m;i++) printf(" %.8f", G.x1[i]); cout<<endl; for(i=0;i<m;i++) { cout<<endl; int startindex=G.ProwIndex[i]; int endindex=G.ProwIndex[i+1]; for(j=startindex;j<endindex;j++) cout<<" "<<G.Pmat[j]; cout<<endl; } /* for(i=0;i<G.nonzero;i++) cout<<" "<<G.Mat_val[i]; cout<<endl; for(i=0;i<G.nonzero;i++) cout<<" "<<G.columns[i]; cout<<endl; for(i=0;i<m+n+1;i++) cout<<" "<<G.rowIndex[i]; cout<<endl; for(i=0;i<m+n;i++) printf(" %.8f", G.b[i]); cout<<endl; for(i=0;i<m+n;i++) printf(" %.8f", G.x[i]); cout<<endl; for(i=0;i<m+n;i++) { cout<<endl; int startindex=G.rowIndex[i]; int endindex=G.rowIndex[i+1]; for(j=startindex;j<endindex;j++) cout<<" "<<G.Mat_val[j]; cout<<endl; } */ /* for (i=0;i<m+n+1;i++) { //cout<<endl; if(G.rowIndex[i]==G.rowIndex[i+1]) break; for(j=G.rowIndex[i];j<G.rowIndex[i+1];j++) { if(G.Mat_val[j]>10) cout<<G.Mat_val[j]<<"\t"; } //cout<<endl; /*for(j=G.rowIndex[i];j<G.rowIndex[i+1];j++) { cout<<G.columns[j]<<"\t"; } //cout<<endl; } cout<<endl; */ //printing the matrix printf("\n Fine till here"); printf("\n"); // int* rowmIndex=(int*)calloc(m+1,sizeof(int)); printf("\n Fine till here"); printf("\n"); //int rowmIndex[5]={1,2,3,4,5}; /* for(i=0;i<m+1;i++) { rowmIndex[i]=G.rowIndex[i]; printf(" %d", rowmIndex[i]); } */ printf("\n Allocating GPU memory\n"); cudaDeviceReset(); size_t free, total; cudaMemGetInfo(&free, &total); printf("\n Free Mem = %lf MB, Total mem = %lf MB\n", (double)(free)/(1024*1024), (double)(total)/(1024*1024)); double *dev_csrValA, *dev_b, *dev_x; int *dev_csrRowIdxA, *dev_csrColA; double *dev_GcsrVal, *dev_b1, *dev_x1; double *dev_PcsrVal, *dev_b2, *dev_x2; int *dev_GcsrRowIdx, *dev_PcsrRowIdx, *dev_GcsrCol, *dev_PcsrCol; cudaMalloc((void**)&dev_PcsrVal, G.Pnonzero*sizeof(double)); cudaMalloc((void**)&dev_PcsrRowIdx, (m+1)*sizeof(int)); cudaMalloc((void**)&dev_PcsrCol, G.Pnonzero*sizeof(int)); cudaMalloc((void**)&dev_b1, (m)*sizeof(double)); cudaMalloc((void**)&dev_b2, n*sizeof(double)); cudaMalloc((void**)&dev_x1, m*sizeof(double)); cudaMalloc((void**)&dev_x2, n*sizeof(double)); cudaMemcpy(dev_b1, G.b1, (m)*sizeof(double), cudaMemcpyHostToDevice); cudaMemcpy(dev_x1, G.x1, (m)*sizeof(double), cudaMemcpyHostToDevice); cudaMemcpy(dev_PcsrVal, G.Pmat, G.Pnonzero*sizeof(double), cudaMemcpyHostToDevice); cudaMemcpy(dev_b2, G.b2, (n)*sizeof(double), cudaMemcpyHostToDevice); cudaMemcpy(dev_x2, G.x2, (n)*sizeof(double), cudaMemcpyHostToDevice); cudaMemcpy(dev_PcsrRowIdx, G.ProwIndex, (m+1)*sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(dev_PcsrCol, G.Pcolumns, (G.Pnonzero)*sizeof(int), cudaMemcpyHostToDevice); /* Matrix has been created and stored in CSR format. However, CHOLMOD requires CSC format. Since our matrix is symmetric positive definite, we can simply swap csrColA with csrRowIdx and vice versa */ /* Starting the CHOLMOD routine now*/ printf("\n Initiating CHOLMOD\n"); cholmod_sparse *A, *P; cholmod_dense *x, *b, *r, *midvec; cholmod_factor *L; cholmod_common *Common, cm; Common = &cm; cholmod_l_start(Common); // &Common->useGPU=1; printf("\n m = %d, G.Gnonzero = %d\n", m, G.Gnonzero); cholmod_sparse *C = cholmod_l_allocate_sparse((size_t)(m), (size_t)(m), (size_t)(G.Gnonzero), 1, 0, 1, 1, Common); // P = cholmod_l_allocate_sparse((size_t)(m), (size_t)(n), (size_t)(G.Pnonzero), 1, 0, 0, 1, Common); // printf("\n Allocated \n"); C->itype = CHOLMOD_LONG; // printf("\n Itype \n"); C->p = &G.GrowIndex[0]; // printf("\n Columns \n"); C->nz = &Gnz[0]; // printf("\n Rows \n"); C->i = &G.Gcolumns[0]; C->dtype = 0; C->x = &G.Gmat[0]; /* P->itype = CHOLMOD_LONG; P->p = &G.ProwIndex[0]; P->nz = &Pnz[0]; P->i = &G.Pcolumns[0]; P->dtype = 0; P->x = &G.Pmat[0]; */ b = cholmod_l_allocate_dense((size_t)(m), 1, (size_t)(m), 1, Common); b->dtype=0; b->x = &G.b1[0]; b->xtype = 1; printf("\n CHOLMOD manually set\n"); cholmod_l_print_sparse(C, "A", Common); cholmod_l_print_dense(b, "b", Common); cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start, 0); L = cholmod_l_analyze(C, Common); printf("\n Analysis: Flops: %g \t lnz: %g\n", Common->fl, Common->lnz); cholmod_l_factorize(C, L, Common); x = cholmod_l_solve(CHOLMOD_A, L, b, Common); cudaEventRecord(stop, 0); cudaEventSynchronize(stop); float elapsedTime; cudaEventElapsedTime(&elapsedTime, start, stop); printf("\n Time : %.6f secs :\n", elapsedTime); cholmod_l_print_dense(x, "X", Common); double *x1_mod = (double*)x->x; cudaMemcpy(dev_x1, x1_mod, m*sizeof(double), cudaMemcpyHostToDevice); cusparseStatus_t cuSparseStatus; cusparseHandle_t cuspHandle; cuSparseStatus = cusparseCreate(&cuspHandle); cusparseMatDescr_t descrP; cusparseCreateMatDescr(&descrP); cusparseSetMatType(descrP, CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descrP, CUSPARSE_INDEX_BASE_ZERO); double *dev_res1, *dev_simple; double *res1 = (double*)calloc(n,sizeof(double)); cudaMalloc((void**)&dev_res1, n*sizeof(double)); cudaMalloc((void**)&dev_simple, n*sizeof(double)); const double alpha = 1.0, beta=0.0; //alpha = 1.0; //beta = 0.0; //solving P^T * G^-1 * b1 Result stored in dev_res1 cuSparseStatus = cusparseDcsrmv(cuspHandle, CUSPARSE_OPERATION_TRANSPOSE, m, n, G.Pnonzero, &alpha, descrP, dev_PcsrVal, dev_PcsrRowIdx, dev_PcsrCol, dev_x1, &beta, dev_res1); if(cuSparseStatus == CUSPARSE_STATUS_SUCCESS) { /* cudaMemcpy(res1, dev_res1, n*sizeof(double), cudaMemcpyDeviceToHost); for(i=0;i<n;i++) { printf("\nres1[%d] = %.8f", i, res1[i]); } printf("\n P^T * G^-1 * b1 done! Vector stored in res1"); */ } else { printf("\n P^T * G^-1 * b1 failed\n"); exit(1); } const double alphaneg = -1.0; //Solving P^T * G^-1 * b1 - b2 ; Result stored in dev_res1 cublasStatus_t cuBlasStatus; cublasHandle_t cubHandle; cuBlasStatus = cublasCreate(&cubHandle); cuBlasStatus = cublasDaxpy(cubHandle, n, &alphaneg, dev_b2, 1, dev_res1, 1); if(cuBlasStatus == CUBLAS_STATUS_SUCCESS) { // cudaMemcpy(res1, dev_res1, n*sizeof(double), cudaMemcpyDeviceToHost); // for(i=0;i<n;i++) // { // printf("\nres1[%d] = %.8f", i, res1[i]); // } printf("\n res1 = res1 - b2 done\n"); } else { printf("\n res1 = res1 - b2 failed\n"); } ///NOW COMPUTING G^-1 * P int k = 0; int breakloop=0; double **midMat = (double**)malloc(m*sizeof(double*)); for(i=0;i<m;i++) { midMat[i] = (double*)calloc(n,sizeof(double)); } cudaEventRecord(start, 0); for(i=0;i<n;i++) { breakloop = 0; double *vect = (double*)calloc(m,sizeof(double*)); for(j=0;j<m;j++) { int startin = G.ProwIndex[j]; int endin = G.ProwIndex[j+1]; if(startin == endin) continue; k = startin; while(k<endin) { if(G.Pcolumns[k] == i) { vect[j] = G.Pmat[k]; breakloop=1; break; } k++; } if(breakloop == 1) { break; } } midvec = cholmod_l_allocate_dense((size_t)(m), 1, (size_t)(m), 1, Common); midvec->dtype=0; midvec->x=&vect[0]; midvec->xtype = 1; cholmod_dense *res2; res2 = cholmod_l_solve(CHOLMOD_A, L, midvec, Common); double *re = (double*)res2->x; // printf("\n vector %d is:\n", i); int i1, j1, k1; // for(j1=0;j1<m;j1++) // { // midmat2flat[i+j1*n] = re[j1]; // printf(" %lf", re[j1]); // } // printf("\n"); for(i1=0;i1<m;i1++) { midMat[i1][i] = re[i1]; } cholmod_l_free_dense(&midvec, Common); } /* printf("\n Midmat = \n"); for(i=0;i<m;i++) { for(j=0;j<n;j++) { printf(" %lf", midMat[i][j]); } printf("\n"); } */ double *midMatflat = (double*)calloc((m*n),sizeof(double)); double *dev_midMat; double *dev_solut; int counter = 0; for(i=0;i<n;i++) { for(j=0;j<m;j++) { midMatflat[counter] = midMat[j][i]; counter++; } } cudaMalloc((void**)&dev_midMat, m*n*sizeof(double)); cudaMalloc((void**)&dev_solut, n*n*sizeof(double)); cudaMemcpy(dev_midMat, midMatflat, m*n*sizeof(double), cudaMemcpyHostToDevice); //Solving P^T * midMat; Result stored in dev_solut cuSparseStatus = cusparseDcsrmm(cuspHandle, CUSPARSE_OPERATION_TRANSPOSE, m, n, n, G.Pnonzero, &alpha, descrP, dev_PcsrVal, dev_PcsrRowIdx, dev_PcsrCol, dev_midMat, m, &beta, dev_solut, n); if(cuSparseStatus == CUSPARSE_STATUS_SUCCESS) { printf("\n Solved P^T * G^-1 * P. Result stored in solut\n"); } else { printf("\n Failed to Solve P^T * G^-1 * P \n"); exit(1); } /* double *matGflat = (double*)calloc(n*n,sizeof(double)); cudaMemcpy(matGflat, dev_solut, n*n*sizeof(double), cudaMemcpyDeviceToHost); counter = 0; printf("\nBefore LU starts\n"); for(i=0;i<n;i++) { for(j=0;j<n;j++) { printf(" %lf ", matGflat[counter]); counter++; } printf("\n"); } printf("\n"); */ cusolverStatus_t cuSolverStatus; cusolverDnHandle_t cudenHandle; cuSolverStatus = cusolverDnCreate(&cudenHandle); int Lwork = 0; cuSolverStatus = cusolverDnDgetrf_bufferSize(cudenHandle, n, n, dev_solut, n, &Lwork); if(cuSolverStatus == CUSOLVER_STATUS_SUCCESS) { printf("\n Buffer works\n Lwork = %d\n", Lwork); } else { exit(1); } double *dev_Workspace; int *dev_Ipiv, *dev_Info; cudaMalloc((void**)&dev_Workspace, Lwork*sizeof(double)); cudaMalloc((void**)&dev_Ipiv, n*sizeof(int)); cudaMalloc((void**)&dev_Info, sizeof(int)); //Calculating LU for dev_solut // double *nnmat = (double*)calloc(n*n,sizeof(double)); // cudaMemcpy(nnmat, dev_solut, n*n*sizeof(double), cudaMemcpyDeviceToHost); // cuSolverStatus = cusolverDnDgetrfHost(cudenHandle, n, n, cuSolverStatus = cusolverDnDgetrf(cudenHandle, n, n, dev_solut, n, dev_Workspace, dev_Ipiv, dev_Info); if(cuSolverStatus == CUSOLVER_STATUS_SUCCESS) { printf("\n solut has be defactorized into L and U. dev_Ipiv * solut = L * U\n"); } else { printf("\n Unable to defactorize solut into LU\n"); exit(1); } //solving dev_solut * x = dev_res1. Result stored in dev_res1 cuSolverStatus = cusolverDnDgetrs(cudenHandle, CUBLAS_OP_N, n, 1, dev_solut, n, dev_Ipiv, dev_res1, n, dev_Info); if(cuSolverStatus == CUSOLVER_STATUS_SUCCESS) { printf("\n Solution obtained for x2 \n"); } else { printf("\n LU decomposition obtained by LU solver failed\n"); } /* cudaMemcpy(G.x2, dev_res1, n*sizeof(double), cudaMemcpyDeviceToHost); printf("\n x2 = \n"); for(i=0;i<n;i++) { printf("\n x2[%d] = %lf", i, G.x2[i]); } */ double *dev_dummy; cudaMalloc((void**)&dev_dummy, m*sizeof(double)); cudaMemset(dev_dummy, 0.0, m*sizeof(double)); printf("\n Starting solving for x1 \n"); //Solving for x1 //Solving G^-1 * P * x2; G^-1 * P is stored in midMat cuBlasStatus = cublasDgemv(cubHandle, CUBLAS_OP_N, m, n, &alpha, dev_midMat, m, dev_res1, 1, &beta, dev_dummy, 1); if(cuBlasStatus == CUBLAS_STATUS_SUCCESS) { /* double *toprint = (double*)calloc(m,sizeof(double)); cudaMemcpy(toprint, dev_dummy, m*sizeof(double), cudaMemcpyDeviceToHost); printf("\n Intermediate vector :\n"); for(i=0;i<m;i++) { printf("\ndummy[%d] = %lf", i, toprint[i]); } */ printf("\n midmat * x2 obtained. Stored in dummy\n"); } else { printf("\n Failed to obtain midmat * x2\n"); } cuBlasStatus = cublasDaxpy(cubHandle, m, &alphaneg, dev_dummy, 1, dev_x1, 1); if(cuBlasStatus == CUBLAS_STATUS_SUCCESS) { /* cudaMemcpy(G.x1, dev_x1, m*sizeof(double), cudaMemcpyDeviceToHost); printf("\n x1 = \n"); for(i=0;i<m;i++) { printf("\n x1[%d] = %.15f", i, G.x1[i]); } */ printf("\n x1 obtained"); } else { printf("\n Failed to obtain x1"); } printf("\n Solver finished its work\n"); /* cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsedTime, start, stop); printf("\n Time: %.6f msecs :\n", elapsedTime); */ cholmod_l_finish(Common); return 0; }
int window_loop() { GLFWwindow* window; window = glfwCreateWindow(640, 480, "Shader test", NULL, NULL); if (!window) { glfwTerminate(); return 0; } glfwMakeContextCurrent(window); //glfwSetInputMode(window, GLFW_CURSOR, GLFW_CURSOR_DISABLED); glfwSetKeyCallback(window, key_callback); //glfwSetCursorPosCallback(window, cursor_callback); //glEnable(GL_CULL_FACE); glEnable(GL_LIGHT0); //glEnable(GL_DEPTH_TEST); // glEnable(GL_LIGHTING); // glEnable(GL_BLEND); // glBlendEquationSeparate(GL_FUNC_ADD, GL_FUNC_ADD); // glBlendFuncSeparate(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ZERO); // glEnable(GL_NORMALIZE); //glEnable(GL_NORMALIZE); glEnable(GL_TEXTURE_2D); //glPolygonMode( GL_FRONT_AND_BACK, GL_LINE ); // glPolygonMode( GL_FRONT, GL_LINE ); // glPolygonMode( GL_BACK, GL_POINT ); // glEnable(GL_COLOR_MATERIAL); cudaGLSetGLDevice(0); double ot, nt = glfwGetTime(); GLuint textureID[6]; glGenTextures(1, textureID); png_bytep* tex1; int lw, lh; printf("Laddar PNG\n"); read_png_file("/srv/texturer/Slate Tiles - (Normal Map).png", &tex1, &lw, &lh); printf("Laddade textur som är %i x %i pixelitaz stor.\n", lw, lh); float3* normal_map = NULL; size_t normal_map_bufferSize = 1024 * 1024 * sizeof(float3); cudaMalloc( &normal_map, normal_map_bufferSize ); float3* host_normal_map = calloc(1024*1024, sizeof(float3)); glBindTexture(GL_TEXTURE_2D, textureID[0]); for (int y=0; y<1024; y++) { for (int x=0; x<1024; x++) { host_normal_map[y*1024+x].x = (float)(tex1[y][x*3+0]-127) / 127; host_normal_map[y*1024+x].y = (float)(tex1[y][x*3+1]-127) / 127; host_normal_map[y*1024+x].z = (float)(tex1[y][x*3+2]-127) / 127; } } cudaMemcpy(normal_map, host_normal_map, normal_map_bufferSize, cudaMemcpyHostToDevice); glTexImage2D(GL_TEXTURE_2D, 0,GL_RGBA, 1024, 1024, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); // glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); // glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); glMatrixMode(GL_MODELVIEW); glLoadIdentity(); double cx, cy; glfwGetCursorPos(window, &cx, &cy); struct cudaGraphicsResource *test1; int r1=cudaGraphicsGLRegisterImage(&test1, textureID[0], GL_TEXTURE_2D, cudaGraphicsMapFlagsWriteDiscard); printf("r1=%i\n"); uchar4* g_dstBuffer = NULL; size_t bufferSize = 1024 * 1024 * sizeof(uchar4); cudaMalloc( &g_dstBuffer, bufferSize ); cudaMemset(g_dstBuffer, 0x7F, bufferSize); //Make texture gray to start with printf("cuda alloc: %p\n", g_dstBuffer); double fps_time =0 ; int fps_count=0; while (!glfwWindowShouldClose(window)) { ot=nt; nt =glfwGetTime(); float dt = nt - ot; fps_time += dt; fps_count++; if (fps_time > 1) { printf("FPS: %f\n", fps_count/fps_time); fps_time=0; fps_count =0; } int width, height; glfwGetFramebufferSize(window, &width, &height); glClearColor(0.0, 0.0, 0.1, 1.0); glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); glViewport(0, 0, width-1, height-1); glMatrixMode(GL_PROJECTION); glLoadIdentity(); glOrtho(0, width-1, height-1, 0,0,1); glMatrixMode(GL_MODELVIEW); for (int testa_flera=0; testa_flera<16; testa_flera++) { glLoadIdentity(); glTranslatef(testa_flera*150, testa_flera*50+100, 0); glRotatef(testa_flera*10, 0,0,1); glTranslatef(0, testa_flera*50, 0); glScalef(0.5, 0.5, 0.5); float ta = fmod(nt+testa_flera*0.2, M_PI*2.0); float tb = fmod(nt*0.7+testa_flera*0.4, M_PI*2.0); float tc = fmod(nt*0.3+testa_flera*0.1, M_PI*2.0); float3 cam_vec = {sin(ta), sin(tb), sin(tc)}; int res=cudaGraphicsMapResources(1, &test1, 0); //printf("res: %i (succ=%i)\n", res, cudaSuccess); struct cudaArray* dstArray = 0; int r2 = cudaGraphicsSubResourceGetMappedArray( &dstArray, test1, 0, 0 ); //printf("r2: %i array: %p\n", r2, dstArray); first_test(g_dstBuffer, normal_map, cam_vec, 1024, 1024); cudaMemcpyToArray( dstArray, 0, 0, g_dstBuffer, bufferSize, cudaMemcpyDeviceToDevice ); cudaGraphicsUnmapResources(1, &test1, 0); glColor3f(1,1,1); glBegin(GL_QUADS); glTexCoord2f(0,0); glVertex3f(0,0,0); glTexCoord2f(1,0); glVertex3f(511,0,0); glTexCoord2f(1,1); glVertex3f(511,511,0); glTexCoord2f(0,1); glVertex3f(0,511,0); glEnd(); } glfwSwapBuffers(window); glfwPollEvents(); } glfwDestroyWindow(window); glfwTerminate(); return(EXIT_SUCCESS); }
void PathPlanner::slotComputePath(const QVector3D& vehiclePosition, const WayPointList& wayPointList) { Q_ASSERT(!wayPointList.isEmpty()); if(!mIsInitialized) initialize(); WayPointList wayPointsWithHighInformationGain = wayPointList; wayPointsWithHighInformationGain.prepend(WayPoint(vehiclePosition, 0)); qDebug() << __PRETTY_FUNCTION__ << "computing path for waypointlist" << wayPointsWithHighInformationGain.toString(); // mDeviceOccupancyGrid points to device memory filled with grid-values of quint8. // After fillOccupancyGrid(), empty cells contain a 0, occupied cells contain a 254/255. // The cell containing the start is set to 1. Then, one thread is launched per cell, // looking into the neighboring cells. If a neighboring cell (26 3d-neighbors) contains // a value other than 0 or 254/255, the current cell is set to min(neighbor)+1. // This is executed often, so that all cells reachable from start get filled with the // distance TO start alignPathPlannerGridToColliderCloud(); QTime t; t.start(); WayPointList computedPath; copyParametersToGpu(&mParametersPathPlanner); const bool haveToMapGridOccupancyTemplate = checkAndMapGridOccupancy(mCudaVboResourceGridOccupancyTemplate); const bool haveToMapGridOccupancyPathPlanner = checkAndMapGridOccupancy(mCudaVboResourceGridPathPlanner); // Only re-creates the occupancy grid if the collider cloud's content changed populateOccupancyGrid(); // The first path leads from vehicle position to first waypoint. Clear the occupancy grid above the vehicle! // This is currently necessary, because we also scan bernd and the fishing rod, occupying our own cells. clearOccupancyGridAboveVehiclePosition( mGridOccupancyPathPanner, vehiclePosition.x(), vehiclePosition.y(), vehiclePosition.z(), &mCudaStream); // We have freed the occupancy grid a little to make path planning easier/possible. Restore it to the real grid asap. mRepopulateOccupanccyGrid = true; // Make some room for waypoints in host memory. The first float4's x=y=z=w will store just the number of waypoints float* waypointsHost = new float[mMaxWaypoints * 4]; // Find a path between every pair of waypoints quint32 indexWayPointStart = 0; quint32 indexWayPointGoal = 1; quint32 pathNumber = 0; do { mParametersPathPlanner.start = CudaHelper::convert(wayPointsWithHighInformationGain.at(indexWayPointStart)); mParametersPathPlanner.goal = CudaHelper::convert(wayPointsWithHighInformationGain.at(indexWayPointGoal)); qDebug() << __PRETTY_FUNCTION__ << "now computing path from" << indexWayPointStart << ":" << wayPointsWithHighInformationGain.at(indexWayPointStart).toString() << "to" << indexWayPointGoal << ":" << wayPointsWithHighInformationGain.at(indexWayPointGoal).toString(); copyParametersToGpu(&mParametersPathPlanner); // Copy the populated and dilated occupancy grid into the PathFinder's domain cudaSafeCall(cudaMemcpy( mGridOccupancyPathPanner, mGridOccupancyTemplate, mParametersPathPlanner.grid.getCellCount(), cudaMemcpyDeviceToDevice)); // Now start path planning. markStartCell(mGridOccupancyPathPanner, &mCudaStream); growGrid( mGridOccupancyPathPanner, &mParametersPathPlanner, &mCudaStream); // We must set the waypoints-array on the device to a special value because // a bug can lead to some waypoints not being written. This is ok // as long as we can detect this. When memsetting with 0, we can, // otherwise we'd find a waypoint from a previous run and couldn't // detect this incidence. cudaSafeCall(cudaMemset( (void*)mDeviceWaypoints, 255, // interpreted as float, should be NaN! 4 * mMaxWaypoints * sizeof(float))); retrievePath( mGridOccupancyPathPanner, mDeviceWaypoints, &mCudaStream); cudaSafeCall(cudaMemcpy( (void*)waypointsHost, (void*)mDeviceWaypoints, 4 * mMaxWaypoints * sizeof(float), cudaMemcpyDeviceToHost)); if(fabs(waypointsHost[0]) < 0.001 && fabs(waypointsHost[1]) < 0.001 && fabs(waypointsHost[2]) < 0.001) { qDebug() << __PRETTY_FUNCTION__ << "found NO path from" << indexWayPointStart << ":" << wayPointsWithHighInformationGain.at(indexWayPointStart).toString() << "to" << indexWayPointGoal << ":" << wayPointsWithHighInformationGain.at(indexWayPointGoal).toString(); // When no path was found, we try to find a path to the next waypoint, skipping the problematic one. indexWayPointGoal++; } else { qDebug() << __PRETTY_FUNCTION__ << "found path with" << waypointsHost[0] << "waypoints"; // The first waypoint isn't one, it only contains the number of waypoints for(int i=1; i<=waypointsHost[0]; i++) { // workaround for the no-waypoint-bug, which will show values of NaN/NaN/NaN/NaN due to the 255-memset above if(isnan(waypointsHost[4*i+3])) { qDebug() << __PRETTY_FUNCTION__ << "ignoring waypoint that was skpped in retrievePath due to bug in growGrid."; continue; } WayPoint newWayPoint; if(i == 1) { newWayPoint = wayPointsWithHighInformationGain.at(indexWayPointStart); } else if(i == (int)waypointsHost[0]) { newWayPoint = wayPointsWithHighInformationGain.at(indexWayPointGoal); } else { newWayPoint = WayPoint(QVector3D(waypointsHost[4*i+0], waypointsHost[4*i+1], waypointsHost[4*i+2]), 0); } // Append all points of the first path, and then only starting at the second point of the following paths. // Otherwise, we have the end of path A and the beginning of path B in the list, although they're the same. if(pathNumber == 0 || i > 1) { computedPath.append(newWayPoint); } } qDebug() << __PRETTY_FUNCTION__ << "found path between" << wayPointsWithHighInformationGain.at(indexWayPointStart).toString() << "and" << wayPointsWithHighInformationGain.at(indexWayPointGoal).toString() << ":" << computedPath.toString(); pathNumber++; indexWayPointStart = indexWayPointGoal; indexWayPointGoal++; } } while(indexWayPointGoal < wayPointsWithHighInformationGain.size()); delete waypointsHost; if(haveToMapGridOccupancyTemplate) checkAndUnmapGridOccupancy(mCudaVboResourceGridOccupancyTemplate); // cudaGraphicsUnmapResources(1, &mCudaVboResourceGridPathFinder, 0); if(haveToMapGridOccupancyPathPlanner) checkAndUnmapGridOccupancy(mCudaVboResourceGridPathPlanner); // cudaGraphicsUnmapResources(1, &mCudaVboResourceGridOccupancy, 0); emit path(computedPath.list(), WayPointListSource::WayPointListSourceFlightPlanner); qDebug() << __PRETTY_FUNCTION__ << "took" << t.elapsed() << "ms."; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_T, *h_R, *tau, *h_work, tmp[1]; magmaFloatComplex *d_A, *d_T, *ddA, *dtau; magmaFloatComplex *d_A2, *d_T2, *ddA2, *dtau2; float *dwork, *dwork2; magma_int_t M, N, lda, ldda, lwork, n2, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; #define BLOCK_SIZE 64 magma_opts opts; parse_opts( argc, argv, &opts ); float tol = 10. * opts.tolerance * lapackf77_slamch("E"); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); printf("version %d\n", (int) opts.version ); printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||R||_F/||A||_F ||R_T||\n"); printf("=============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if (N > 128) { printf("%5d %5d skipping because cgeqr2x requires N <= 128\n", (int) M, (int) N); continue; } if (M < N) { printf("%5d %5d skipping because cgeqr2x requires M >= N\n", (int) M, (int) N); continue; } min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = (FLOPS_CGEQRF( M, N ) + FLOPS_CGEQRT( M, N )) / 1e9; /* Allocate memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_T, magmaFloatComplex, N*N ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_T, magmaFloatComplex, N*N ); TESTING_MALLOC_DEV( ddA, magmaFloatComplex, N*N ); TESTING_MALLOC_DEV( dtau, magmaFloatComplex, min_mn ); TESTING_MALLOC_DEV( d_A2, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_T2, magmaFloatComplex, N*N ); TESTING_MALLOC_DEV( ddA2, magmaFloatComplex, N*N ); TESTING_MALLOC_DEV( dtau2, magmaFloatComplex, min_mn ); TESTING_MALLOC_DEV( dwork, float, max(5*min_mn, (BLOCK_SIZE*2+2)*min_mn) ); TESTING_MALLOC_DEV( dwork2, float, max(5*min_mn, (BLOCK_SIZE*2+2)*min_mn) ); // todo replace with magma_claset cudaMemset(ddA, 0, N*N*sizeof(magmaFloatComplex)); cudaMemset(d_T, 0, N*N*sizeof(magmaFloatComplex)); cudaMemset(ddA2, 0, N*N*sizeof(magmaFloatComplex)); cudaMemset(d_T2, 0, N*N*sizeof(magmaFloatComplex)); lwork = -1; lapackf77_cgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); lwork = max(lwork, N*N); TESTING_MALLOC_CPU( h_work, magmaFloatComplex, lwork ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_csetmatrix( M, N, h_R, lda, d_A, ldda ); magma_csetmatrix( M, N, h_R, lda, d_A2, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_sync_wtime(0); if (opts.version == 1) magma_cgeqr2x_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else if (opts.version == 2) magma_cgeqr2x2_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else if (opts.version == 3) magma_cgeqr2x3_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else { printf( "call magma_cgeqr2x4_gpu\n" ); /* Going through NULL stream is faster Going through any stream is slower Doing two streams in parallel is slower than doing them sequentially Queuing happens on the NULL stream - user defined buffers are smaller? */ magma_cgeqr2x4_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info, NULL); //magma_cgeqr2x4_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info, stream[1]); //magma_cgeqr2x4_gpu(M, N, d_A2, ldda, dtau2, d_T2, ddA2, dwork2, &info, stream[0]); //magma_cgeqr2x4_gpu(M, N, d_A2, ldda, dtau2, d_T2, ddA2, dwork2, &info, NULL); //gflops *= 2; } gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_cgeqr2x_gpu version %d returned error %d: %s.\n", (int) opts.version, (int) info, magma_strerror( info )); } else { if ( opts.check ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr, &M, &N, h_A, &lda, tau, h_work, &N); //magma_cgeqr2(&M, &N, h_A, &lda, tau, h_work, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_cgetmatrix( M, N, d_A, ldda, h_R, M ); magma_cgetmatrix( N, N, ddA, N, h_T, N ); // Restore the upper triangular part of A before the check for(int col=0; col < N; col++){ for(int row=0; row <= col; row++) h_R[row + col*M] = h_T[row + col*N]; } error = lapackf77_clange("M", &M, &N, h_A, &lda, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_clange("M", &M, &N, h_R, &lda, work) / (N * error); // Check if T is the same magma_cgetmatrix( N, N, d_T, N, h_T, N ); float terr = 0.; for(int col=0; col < N; col++) for(int row=0; row <= col; row++) terr += ( MAGMA_C_ABS(h_work[row + col*N] - h_T[row + col*N])* MAGMA_C_ABS(h_work[row + col*N] - h_T[row + col*N]) ); terr = magma_ssqrt(terr); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, error, terr, (error < tol ? "ok" : "failed") ); status += ! (error < tol); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time); } } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_T ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_T ); TESTING_FREE_DEV( ddA ); TESTING_FREE_DEV( dtau ); TESTING_FREE_DEV( dwork ); TESTING_FREE_DEV( d_A2 ); TESTING_FREE_DEV( d_T2 ); TESTING_FREE_DEV( ddA2 ); TESTING_FREE_DEV( dtau2 ); TESTING_FREE_DEV( dwork2 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); TESTING_FINALIZE(); return status; }
float gpuEncode(EntropyCodingTaskInfo *infos, type_image *img, int count, int targetSize) { int codeBlocks = count; int maxOutLength = /*MAX_CODESTREAM_SIZE*/(1 << img->cblk_exp_w) * (1 << img->cblk_exp_h) * 14; // long int start_bebcot = start_measure(); int n = 0; for(int i = 0; i < codeBlocks; i++) n += infos[i].width * infos[i].height; mem_mg_t *mem_mg = img->mem_mg; CodeBlockAdditionalInfo *h_infos = (CodeBlockAdditionalInfo *)mem_mg->alloc->host(sizeof(CodeBlockAdditionalInfo) * codeBlocks, mem_mg->ctx); byte *d_cxd_pairs = (byte *)mem_mg->alloc->dev(sizeof(byte) * codeBlocks * maxOutLength, mem_mg->ctx); CodeBlockAdditionalInfo *d_infos = (CodeBlockAdditionalInfo *)mem_mg->alloc->dev(sizeof(CodeBlockAdditionalInfo) * codeBlocks, mem_mg->ctx); int magconOffset = 0; for(int i = 0; i < codeBlocks; i++) { h_infos[i].width = infos[i].width; h_infos[i].height = infos[i].height; h_infos[i].nominalWidth = infos[i].nominalWidth; h_infos[i].stripeNo = (int) ceil(infos[i].height / 4.0f); h_infos[i].subband = infos[i].subband; h_infos[i].magconOffset = magconOffset + infos[i].width; h_infos[i].magbits = infos[i].magbits; h_infos[i].coefficients = infos[i].coefficients; h_infos[i].compType = infos[i].compType; h_infos[i].dwtLevel = infos[i].dwtLevel; h_infos[i].stepSize = infos[i].stepSize; magconOffset += h_infos[i].width * (h_infos[i].stripeNo + 2); } GPU_JPEG2K::CoefficientState *d_stBuffors = (GPU_JPEG2K::CoefficientState *)mem_mg->alloc->dev(sizeof(GPU_JPEG2K::CoefficientState) * magconOffset, mem_mg->ctx); CHECK_ERRORS(cudaMemset((void *) d_stBuffors, 0, sizeof(GPU_JPEG2K::CoefficientState) * magconOffset)); cuda_memcpy_htd(h_infos, d_infos, sizeof(CodeBlockAdditionalInfo) * codeBlocks); // printf("before launch encode: %d\n", stop_measure(start_bebcot)); long int start_ebcot = start_measure(); if(targetSize == 0) { //printf("No pcrd\n"); CHECK_ERRORS(GPU_JPEG2K::launch_encode((int) ceil((float) codeBlocks / THREADS), THREADS, d_stBuffors, d_cxd_pairs, maxOutLength, d_infos, codeBlocks, mem_mg)); } else { // printf("Pcrd\n"); CHECK_ERRORS(GPU_JPEG2K::launch_encode_pcrd((int) ceil((float) codeBlocks / THREADS), THREADS, d_stBuffors, maxOutLength, d_infos, codeBlocks, targetSize, mem_mg)); } // printf("launch encode: %d\n", stop_measure(start_ebcot)); // long int start_mqc = start_measure(); cuda_memcpy_dth(d_infos, h_infos, sizeof(CodeBlockAdditionalInfo) * codeBlocks); img->codestream = mqc_gpu_encode(infos, h_infos, codeBlocks, d_cxd_pairs, maxOutLength, mem_mg); // printf("mqc: %d\n", stop_measure(start_mqc)); // long int start_aebcot = start_measure(); for(int i = 0; i < codeBlocks; i++) { infos[i].significantBits = h_infos[i].significantBits; infos[i].codingPasses = h_infos[i].codingPasses; /*if(h_infos[i].length > 0) { infos[i].length = h_infos[i].length; int len = h_infos[i].length; infos[i].codeStream = (byte *) malloc(sizeof(byte) * len); cuda_memcpy_dth(d_outbuf + i * maxOutLength, infos[i].codeStream, sizeof(byte) * len); } else { infos[i].length = 0; infos[i].codeStream = NULL; }*/ } mem_mg->dealloc->dev(d_stBuffors, mem_mg->ctx); mem_mg->dealloc->dev(d_infos, mem_mg->ctx); mem_mg->dealloc->dev(d_cxd_pairs, mem_mg->ctx); mem_mg->dealloc->host(h_infos, mem_mg->ctx); // printf("after launch encode: %d\n", stop_measure(start_aebcot)); float elapsed = 0.0f; return elapsed; }
extern "C" magma_int_t magma_sgeqp3_gpu( magma_int_t m, magma_int_t n, float *A, magma_int_t lda, magma_int_t *jpvt, float *tau, float *work, magma_int_t lwork, #if defined(PRECISION_z) || defined(PRECISION_c) float *rwork, #endif magma_int_t *info ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= SGEQP3 computes a QR factorization with column pivoting of a matrix A: A*P = Q*R using Level 3 BLAS. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) REAL array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the upper triangle of the array contains the min(M,N)-by-N upper trapezoidal matrix R; the elements below the diagonal, together with the array TAU, represent the unitary matrix Q as a product of min(M,N) elementary reflectors. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). JPVT (input/output) INTEGER array, dimension (N) On entry, if JPVT(J).ne.0, the J-th column of A is permuted to the front of A*P (a leading column); if JPVT(J)=0, the J-th column of A is a free column. On exit, if JPVT(J)=K, then the J-th column of A*P was the the K-th column of A. TAU (output) REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors. WORK (workspace/output) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO=0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. For [sd]geqp3, LWORK >= (N+1)*NB + 2*N; for [cz]geqp3, LWORK >= (N+1)*NB, where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. For [cz]geqp3 only: RWORK (workspace) DOUBLE PRECISION array, dimension (2*N) INFO (output) INTEGER = 0: successful exit. < 0: if INFO = -i, the i-th argument had an illegal value. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define A(i, j) (A + (i) + (j)*(lda )) magma_int_t ione = 1; //magma_int_t na; magma_int_t n_j; magma_int_t j, jb, nb, sm, sn, fjb, nfxd, minmn; magma_int_t topbmn, sminmn, lwkopt, lquery; *info = 0; lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } nb = magma_get_sgeqp3_nb(min(m, n)); if (*info == 0) { minmn = min(m,n); if (minmn == 0) { lwkopt = 1; } else { lwkopt = (n + 1)*nb; #if defined(PRECISION_d) || defined(PRECISION_s) lwkopt += 2*n; #endif } //work[0] = MAGMA_S_MAKE( lwkopt, 0. ); if (lwork < lwkopt && ! lquery) { *info = -8; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } if (minmn == 0) return *info; #if defined(PRECISION_d) || defined(PRECISION_s) float *rwork = work + (n + 1)*nb; #endif float *df; if (MAGMA_SUCCESS != magma_smalloc( &df, (n+1)*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } cudaMemset( df, 0, (n+1)*nb*sizeof(float) ); nfxd = 0; /* Move initial columns up front. * Note jpvt uses 1-based indices for historical compatibility. */ for (j = 0; j < n; ++j) { if (jpvt[j] != 0) { if (j != nfxd) { blasf77_sswap(&m, A(0, j), &ione, A(0, nfxd), &ione); jpvt[j] = jpvt[nfxd]; jpvt[nfxd] = j + 1; } else { jpvt[j] = j + 1; } ++nfxd; } else { jpvt[j] = j + 1; } } /* Factorize fixed columns ======================= Compute the QR factorization of fixed columns and update remaining columns. if (nfxd > 0) { na = min(m,nfxd); lapackf77_sgeqrf(&m, &na, A, &lda, tau, work, &lwork, info); if (na < n) { n_j = n - na; lapackf77_sormqr( MagmaLeftStr, MagmaTransStr, &m, &n_j, &na, A, &lda, tau, A(0, na), &lda, work, &lwork, info ); } }*/ /* Factorize free columns */ if (nfxd < minmn) { sm = m - nfxd; sn = n - nfxd; sminmn = minmn - nfxd; /*if (nb < sminmn) { j = nfxd; // Set the original matrix to the GPU magma_ssetmatrix_async( m, sn, A (0,j), lda, dA(0,j), ldda, stream[0] ); }*/ /* Initialize partial column norms. */ magmablas_snrm2_cols(sm, sn, A(nfxd,nfxd), lda, &rwork[nfxd]); #if defined(PRECISION_d) || defined(PRECISION_z) magma_dcopymatrix( sn, 1, &rwork[nfxd], sn, &rwork[n+nfxd], sn); #else magma_scopymatrix( sn, 1, &rwork[nfxd], sn, &rwork[n+nfxd], sn); #endif /*for (j = nfxd; j < n; ++j) { rwork[j] = cblas_snrm2(sm, A(nfxd, j), ione); rwork[n + j] = rwork[j]; }*/ j = nfxd; //if (nb < sminmn) { /* Use blocked code initially. */ //magma_queue_sync( stream[0] ); /* Compute factorization: while loop. */ topbmn = minmn;// - nb; while(j < topbmn) { jb = min(nb, topbmn - j); /* Factorize JB columns among columns J:N. */ n_j = n - j; /*if (j>nfxd) { // Get panel to the CPU magma_sgetmatrix( m-j, jb, dA(j,j), ldda, A (j,j), lda ); // Get the rows magma_sgetmatrix( jb, n_j - jb, dA(j,j + jb), ldda, A (j,j + jb), lda ); }*/ //magma_slaqps_gpu // this is a cpp-file magma_slaqps2_gpu // this is a cuda-file ( m, n_j, j, jb, &fjb, A (0, j), lda, &jpvt[j], &tau[j], &rwork[j], &rwork[n + j], work, &df[jb], n_j ); j += fjb; /* fjb is actual number of columns factored */ } } /* Use unblocked code to factor the last or only block. if (j < minmn) { n_j = n - j; if (j > nfxd) { magma_sgetmatrix( m-j, n_j, dA(j,j), ldda, A (j,j), lda ); } lapackf77_slaqp2(&m, &n_j, &j, A(0, j), &lda, &jpvt[j], &tau[j], &rwork[j], &rwork[n+j], work ); }*/ } //work[0] = MAGMA_S_MAKE( lwkopt, 0. ); magma_free(df); return *info; } /* sgeqp3 */