int matrixMult(float *A, float *B, float *C, int n, int m, int o, int flag) {
    // Sizes are in bytes of float (the original used sizeof(int), which only
    // works by accident because sizeof(int) == sizeof(float) on most targets).
    int sizeA = n * m * sizeof(float);
    int sizeB = m * o * sizeof(float);
    int sizeC = n * o * sizeof(float);
    float *d_A, *d_B, *d_C;

    // Allocate device memory
    cudaMalloc((void **)&d_A, sizeA);
    cudaMalloc((void **)&d_B, sizeB);
    cudaMalloc((void **)&d_C, sizeC);

    clock_t t = clock();

    // Copy the input data to the device
    cudaMemcpy(d_A, A, sizeA, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, sizeB, cudaMemcpyHostToDevice);

    dim3 dimBlock(32, 32); // maximum number of threads per block
    // C is n x o: x covers the o columns, y the n rows (the original used n for both).
    dim3 dimGrid(ceil((float)o / dimBlock.x), ceil((float)n / dimBlock.y));

    // Launch the kernel on the device
    if (flag == 1) {
        matMultParallelTiled<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, n, m, o);
        cudaMemcpy(C, d_C, sizeC, cudaMemcpyDeviceToHost);
        printf("Parallel multiplication with tiling\t: %.8f\n",
               (clock() - t) / (double)CLOCKS_PER_SEC);
    } else {
        // Assumed completion: the source is truncated here. The non-tiled path
        // presumably mirrors the branch above with a plain kernel (name hypothetical).
        matMultParallel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, n, m, o);
        cudaMemcpy(C, d_C, sizeC, cudaMemcpyDeviceToHost);
        printf("Parallel multiplication\t\t: %.8f\n",
               (clock() - t) / (double)CLOCKS_PER_SEC);
    }

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return 0;
}
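The wrapper above launches matMultParallelTiled, whose definition is not part of this excerpt. A minimal sketch of a shared-memory tiled kernel with that signature, assuming the 32x32 tile implied by dimBlock (the bounds handling and everything beyond the signature are assumptions, not the original kernel):

#define TILE 32
// Assumed sketch of the tiled kernel launched above; not the original source.
__global__ void matMultParallelTiled(float *A, float *B, float *C, int n, int m, int o) {
    __shared__ float sA[TILE][TILE];
    __shared__ float sB[TILE][TILE];
    int row = blockIdx.y * TILE + threadIdx.y;
    int col = blockIdx.x * TILE + threadIdx.x;
    float acc = 0.0f;
    for (int t = 0; t < (m + TILE - 1) / TILE; ++t) {
        // Stage one tile of A and one tile of B in shared memory, guarding the edges.
        sA[threadIdx.y][threadIdx.x] =
            (row < n && t * TILE + threadIdx.x < m) ? A[row * m + t * TILE + threadIdx.x] : 0.0f;
        sB[threadIdx.y][threadIdx.x] =
            (col < o && t * TILE + threadIdx.y < m) ? B[(t * TILE + threadIdx.y) * o + col] : 0.0f;
        __syncthreads();
        for (int k = 0; k < TILE; ++k)
            acc += sA[threadIdx.y][k] * sB[k][threadIdx.x];
        __syncthreads();
    }
    if (row < n && col < o)
        C[row * o + col] = acc;
}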
int main() {
    // Initialize variables: host array for the full 3-D domain
    double *h_u = (double*)malloc(sizeof(double) * (NX * NY * NZ));

    // Set domain initial condition and BCs
    Call_IC(h_u);

    // GPU memory arrays. The original allocated and copied only NX*NY doubles
    // here, while the grid, the kernels, and the final device-to-host copy all
    // work on NX*NY*NZ; use the full 3-D size throughout.
    double *d_u;  checkCuda(cudaMalloc((void**)&d_u,  sizeof(double) * (NX * NY * NZ)));
    double *d_un; checkCuda(cudaMalloc((void**)&d_un, sizeof(double) * (NX * NY * NZ)));

    // Copy initial condition from host to device
    checkCuda(cudaMemcpy(d_u,  h_u, sizeof(double) * (NX * NY * NZ), cudaMemcpyHostToDevice));
    checkCuda(cudaMemcpy(d_un, h_u, sizeof(double) * (NX * NY * NZ), cudaMemcpyHostToDevice));

    // GPU kernel launch parameters
    dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z);
    dim3 dimGrid (DIVIDE_INTO(NX, BLOCK_SIZE_X),
                  DIVIDE_INTO(NY, BLOCK_SIZE_Y),
                  DIVIDE_INTO(NZ, BLOCK_SIZE_Z));

    // Start the CPU timer
    clock_t t = clock();

    // Solver loop
    for (int step = 0; step < NO_STEPS; step += 2) {
        if (step % 10000 == 0) printf("Step %d of %d\n", step, (int)NO_STEPS);
        // Compute Laplace
        Call_Laplace(dimGrid, dimBlock, d_u, d_un);
        // Call_Laplace_Texture(dimGrid, dimBlock, d_u, d_un);
    }
    if (DEBUG) printf("CUDA error (Jacobi_Method) %s\n",
                      cudaGetErrorString(cudaPeekAtLastError()));

    // Measure and report computation time
    t = clock() - t;
    printf("Computing time (%f seconds).\n", ((float)t) / CLOCKS_PER_SEC);

    // Copy data from device to host
    checkCuda(cudaMemcpy(h_u, d_u, sizeof(double) * (NX * NY * NZ), cudaMemcpyDeviceToHost));

    // Print the solution to the terminal when debugging
    if (DEBUG) Print2D(h_u);

    // Write solution to file
    Save_Results(h_u);

    // Free device memory
    checkCuda(cudaFree(d_u));
    checkCuda(cudaFree(d_un));

    // Reset device
    checkCuda(cudaDeviceReset());

    // Free host memory
    free(h_u);
    return 0;
}
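checkCuda is used for every runtime call above but is not defined in this excerpt; a common definition of such a wrapper (this sketch is an assumption, not the original) is:

// Assumed definition of the checkCuda() helper used above (not in the original source):
// wraps a runtime call, prints any error, and aborts, so failures surface immediately.
#include <cstdio>
#include <cstdlib>

inline cudaError_t checkCuda(cudaError_t result) {
    if (result != cudaSuccess) {
        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
        exit(EXIT_FAILURE);
    }
    return result;
}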
bool try_convolution0_mcuda(const MCudaMatrix3D::Ptr& video,
                            const MCudaMatrix3D::Ptr& kernel,
                            MCudaMatrix3D::Ptr& output) {
    // Flatten the y and t dimensions so a 2-D grid can cover the 3-D output.
    unsigned int yt = output->dim_y * output->dim_t;
    dim3 dimBlock(16, 16);
    dim3 dimGrid((output->dim_x - 1) / 16 + 1, (yt - 1) / 16 + 1); // ceiling division
    do_convolution0(*video, *kernel, *output, dimGrid, dimBlock);
    return true;
}
void runbench_warmup(double *cd, long size) {
    const long reduced_grid_size = size / (UNROLLED_MEMORY_ACCESSES) / 32;
    const int BLOCK_SIZE = 256;
    const int TOTAL_REDUCED_BLOCKS = reduced_grid_size / BLOCK_SIZE;

    dim3 dimBlock(BLOCK_SIZE, 1, 1);
    dim3 dimReducedGrid(TOTAL_REDUCED_BLOCKS, 1, 1);

    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func<short, BLOCK_SIZE, 0>),
                    dim3(dimReducedGrid), dim3(dimBlock), 0, 0,
                    (short)1, (short*)cd);
    CUDA_SAFE_CALL(hipGetLastError());
    CUDA_SAFE_CALL(hipDeviceSynchronize());
}
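CUDA_SAFE_CALL wraps the HIP status checks above but its definition is not shown. A plausible definition (an assumption, not the original macro) is:

// Assumed definition of the CUDA_SAFE_CALL macro used above (not the original):
// checks a HIP status code and aborts with the failing call's location.
#define CUDA_SAFE_CALL(call)                                          \
    do {                                                              \
        hipError_t err = (call);                                      \
        if (err != hipSuccess) {                                      \
            fprintf(stderr, "HIP error %s at %s:%d\n",                \
                    hipGetErrorString(err), __FILE__, __LINE__);      \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)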
void runbench(double *cd, long size) {
    if (memory_ratio > UNROLL_ITERATIONS) {
        fprintf(stderr, "ERROR: memory_ratio exceeds UNROLL_ITERATIONS\n");
        exit(1);
    }

    const long compute_grid_size = size / (UNROLLED_MEMORY_ACCESSES) / 2;
    const int BLOCK_SIZE = 256;
    const int TOTAL_BLOCKS = compute_grid_size / BLOCK_SIZE;
    const long long computations = 2 * (long long)(COMP_ITERATIONS) * REGBLOCK_SIZE * compute_grid_size;
    const long long memoryoperations = (long long)(COMP_ITERATIONS) * compute_grid_size;

    dim3 dimBlock(BLOCK_SIZE, 1, 1);
    dim3 dimGrid(TOTAL_BLOCKS, 1, 1);

    hipEvent_t start, stop;

    initializeEvents(&start, &stop);
    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func<float, BLOCK_SIZE, memory_ratio>),
                    dim3(dimGrid), dim3(dimBlock), 0, 0, 1.0f, (float*)cd);
    float kernel_time_mad_sp = finalizeEvents(start, stop);

    initializeEvents(&start, &stop);
    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func<double, BLOCK_SIZE, memory_ratio>),
                    dim3(dimGrid), dim3(dimBlock), 0, 0, 1.0, cd);
    float kernel_time_mad_dp = finalizeEvents(start, stop);

    initializeEvents(&start, &stop);
    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func<int, BLOCK_SIZE, memory_ratio>),
                    dim3(dimGrid), dim3(dimBlock), 0, 0, 1, (int*)cd);
    float kernel_time_mad_int = finalizeEvents(start, stop);

    const double memaccesses_ratio  = (double)(memory_ratio) / UNROLL_ITERATIONS;
    const double computations_ratio = 1.0 - memaccesses_ratio;

    printf(" %4d, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f\n",
           UNROLL_ITERATIONS - memory_ratio,
           (computations_ratio * (double)computations) / (memaccesses_ratio * (double)memoryoperations * sizeof(float)),
           kernel_time_mad_sp,
           (computations_ratio * (double)computations) / kernel_time_mad_sp * 1000. / (double)(1000 * 1000 * 1000),
           (memaccesses_ratio * (double)memoryoperations * sizeof(float)) / kernel_time_mad_sp * 1000. / (1000. * 1000. * 1000.),
           (computations_ratio * (double)computations) / (memaccesses_ratio * (double)memoryoperations * sizeof(double)),
           kernel_time_mad_dp,
           (computations_ratio * (double)computations) / kernel_time_mad_dp * 1000. / (double)(1000 * 1000 * 1000),
           (memaccesses_ratio * (double)memoryoperations * sizeof(double)) / kernel_time_mad_dp * 1000. / (1000. * 1000. * 1000.),
           (computations_ratio * (double)computations) / (memaccesses_ratio * (double)memoryoperations * sizeof(int)),
           kernel_time_mad_int,
           (computations_ratio * (double)computations) / kernel_time_mad_int * 1000. / (double)(1000 * 1000 * 1000),
           (memaccesses_ratio * (double)memoryoperations * sizeof(int)) / kernel_time_mad_int * 1000. / (1000. * 1000. * 1000.));
}
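initializeEvents and finalizeEvents, used by both runbench variants, are also not shown; a plausible HIP implementation (assumed, not the original) brackets the launch with an event pair and returns elapsed milliseconds:

// Assumed sketch of the timing helpers used above (not the original definitions).
void initializeEvents(hipEvent_t *start, hipEvent_t *stop) {
    CUDA_SAFE_CALL(hipEventCreate(start));
    CUDA_SAFE_CALL(hipEventCreate(stop));
    CUDA_SAFE_CALL(hipEventRecord(*start, 0)); // mark the beginning of the timed region
}

float finalizeEvents(hipEvent_t start, hipEvent_t stop) {
    CUDA_SAFE_CALL(hipEventRecord(stop, 0));   // mark the end of the timed region
    CUDA_SAFE_CALL(hipEventSynchronize(stop));
    float ms;
    CUDA_SAFE_CALL(hipEventElapsedTime(&ms, start, stop));
    CUDA_SAFE_CALL(hipEventDestroy(start));
    CUDA_SAFE_CALL(hipEventDestroy(stop));
    return ms;                                 // elapsed time in milliseconds
}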
ErrorCode GpuDilate<InputPixelType, InputBandCount, OutputPixelType, OutputBandCount>::launchKernel(unsigned blockWidth, unsigned blockHeight)
{
    dim3 dimBlock(blockWidth, blockHeight);
    // Ceiling division so partial tiles at the right/bottom edges get a block.
    size_t gridWidth  = this->dataSize.width  / dimBlock.x + (((this->dataSize.width  % dimBlock.x) == 0) ? 0 : 1);
    size_t gridHeight = this->dataSize.height / dimBlock.y + (((this->dataSize.height % dimBlock.y) == 0) ? 0 : 1);
    dim3 dimGrid(gridWidth, gridHeight);

    // Bind the texture to the array and set up the access parameters
    cvt::gpu::bind_texture<InputPixelType, 0>(this->gpuInputDataArray);
    cudaError cuer = cudaGetLastError();
    if (cudaSuccess != cuer) {
        return CudaError; // needs to be changed
    }

    // ====================================================
    // Really launch, after one last error check!
    // ====================================================
    cuer = cudaGetLastError();
    if (cudaSuccess != cuer) {
        return CudaError; // needs to be changed
    }

    //TODO: Use this line when updating to use shared memory
    //const unsigned int shmem_bytes = neighbor_coordinates_.size() * sizeof(double) * blockWidth * blockHeight;

    cvt::gpu::launch_dilate<InputPixelType, OutputPixelType>(
        dimGrid, dimBlock, 0, this->stream,
        (OutputPixelType *)this->gpuOutputData,
        this->roiSize_.width, this->roiSize_.height,
        this->relativeOffsetsGpu_, this->relativeOffsets_.size(),
        this->bufferWidth_);

    cuer = cudaGetLastError();
    if (cuer != cudaSuccess) {
        std::cout << "CUDA ERROR = " << cuer << std::endl;
        throw std::runtime_error("KERNEL LAUNCH FAILURE");
    }
    return CudaError; // needs to be changed
}
void runbench(double *cd, long size) {
    const long compute_grid_size = size / ELEMENTS_PER_THREAD;
    const int BLOCK_SIZE = 256;
    const int TOTAL_BLOCKS = compute_grid_size / BLOCK_SIZE;
    const long long computations =
        ELEMENTS_PER_THREAD * (long long)compute_grid_size +
        (2 * ELEMENTS_PER_THREAD * compute_iterations) * (long long)compute_grid_size;
    const long long memoryoperations = size;

    dim3 dimBlock(BLOCK_SIZE, 1, 1);
    dim3 dimGrid(TOTAL_BLOCKS, 1, 1);

    hipEvent_t start, stop;

    initializeEvents(&start, &stop);
    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func<float, BLOCK_SIZE, ELEMENTS_PER_THREAD, compute_iterations>),
                    dim3(dimGrid), dim3(dimBlock), 0, 0, 1.0f, (float*)cd);
    float kernel_time_mad_sp = finalizeEvents(start, stop);

    initializeEvents(&start, &stop);
    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func<double, BLOCK_SIZE, ELEMENTS_PER_THREAD, compute_iterations>),
                    dim3(dimGrid), dim3(dimBlock), 0, 0, 1.0, cd);
    float kernel_time_mad_dp = finalizeEvents(start, stop);

    initializeEvents(&start, &stop);
    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func<int, BLOCK_SIZE, ELEMENTS_PER_THREAD, compute_iterations>),
                    dim3(dimGrid), dim3(dimBlock), 0, 0, 1, (int*)cd);
    float kernel_time_mad_int = finalizeEvents(start, stop);

    printf(" %4d, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f\n",
           compute_iterations,
           ((double)computations) / ((double)memoryoperations * sizeof(float)),
           kernel_time_mad_sp,
           ((double)computations) / kernel_time_mad_sp * 1000. / (double)(1000 * 1000 * 1000),
           ((double)memoryoperations * sizeof(float)) / kernel_time_mad_sp * 1000. / (1000. * 1000. * 1000.),
           ((double)computations) / ((double)memoryoperations * sizeof(double)),
           kernel_time_mad_dp,
           ((double)computations) / kernel_time_mad_dp * 1000. / (double)(1000 * 1000 * 1000),
           ((double)memoryoperations * sizeof(double)) / kernel_time_mad_dp * 1000. / (1000. * 1000. * 1000.),
           ((double)computations) / ((double)memoryoperations * sizeof(int)),
           kernel_time_mad_int,
           ((double)computations) / kernel_time_mad_int * 1000. / (double)(1000 * 1000 * 1000),
           ((double)memoryoperations * sizeof(int)) / kernel_time_mad_int * 1000. / (1000. * 1000. * 1000.));
}
int main() {
    const int size = 33;
    float* in  = new float[size];
    float* out = new float[size];
    for (int i = 0; i < size; i++) {
        in[i] = (float)(i + 1);
    }

    dim3 dimGrid (1, 1, 1);
    dim3 dimBlock(BLOCK_SIZE, 1, 1);

    // Cannot support CUDA's <<<x,y>>> syntax.
    schedule(code, in, out, size)
        .setBlockSize(dimBlock)
        .setGridSize(dimGrid)
        .run();

    for (int i = 0; i < size; i++) {
        printf("%0.2f %0.2f\n", in[i], out[i]);
    }

    delete[] in;
    delete[] out;
    return 0;
}
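For reference, on a toolchain that does support triple-chevron launches, the schedule(...) chain above corresponds roughly to the following (an assumed mapping; the kernel name `code` and its argument order are taken from the call above):

// Assumed native-CUDA equivalent of the schedule(...) chain above:
// code<<<dimGrid, dimBlock>>>(in, out, size); // launch with the same geometry
// cudaDeviceSynchronize();                    // wait before reading `out` on the host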
/* compute N time steps */
int calc_path(DATATYPE *gpuWall, DATATYPE *gpuResult[2], int rows, int cols,
              int pyramid_height, int blockCols, int borderCols)
{
    dim3 dimBlock(BLOCK_SIZE);
    dim3 dimGrid(blockCols);

    int size = rows * cols;
    int src = 1, dst = 0;

#ifdef NOC
    DATATYPE *memport = (DATATYPE*)malloc(sizeof(DATATYPE) * ((size - cols) + cols + cols));
    memcpy(memport, gpuWall, sizeof(DATATYPE) * (size - cols));
#endif

    for (int t = 0; t < rows - 1; t += pyramid_height) {
        // Ping-pong the source and destination result buffers each iteration.
        int temp = src;
        src = dst;
        dst = temp;
        dynproc_kernel(MIN(pyramid_height, rows - t - 1), gpuWall,
                       gpuResult[src], gpuResult[dst],
                       cols, rows, t, borderCols,
                       dimGrid, dimBlock, 1, 0);
    }
    return dst;
}
ErrorCode GpuAbsoluteDifference<InputPixelType, InputBandCount, OutputPixelType, OutputBandCount>::launchKernel(unsigned blockWidth, unsigned blockHeight)
{
    dim3 dimBlock(blockWidth, blockHeight);
    // Ceiling division so partial tiles at the right/bottom edges get a block.
    size_t gridWidth  = this->dataSize.width  / dimBlock.x + (((this->dataSize.width  % dimBlock.x) == 0) ? 0 : 1);
    size_t gridHeight = this->dataSize.height / dimBlock.y + (((this->dataSize.height % dimBlock.y) == 0) ? 0 : 1);
    dim3 dimGrid(gridWidth, gridHeight);

    // Bind the textures to the arrays and set up the access parameters
    bind_texture<InputPixelType, 0>(this->gpuInputDataArray);
    bind_texture<InputPixelType, 1>(this->gpuInputDataArrayTwo_);

    cvt::gpu::launch_absDifference<InputPixelType, OutputPixelType>(
        dimGrid, dimBlock, 0, this->stream,
        (OutputPixelType *)this->gpuOutputData,
        this->dataSize.width, this->dataSize.height);

    cudaError cuer = cudaGetLastError();
    if (cuer != cudaSuccess) {
        std::cout << "CUDA ERROR = " << cuer << std::endl;
        throw std::runtime_error("KERNEL LAUNCH FAILURE");
    }
    return CudaError; // needs to be changed
}
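Both launchKernel methods above compute the grid with the same explicit remainder test; a small ceiling-division helper (hypothetical, not part of the cvt::gpu library) expresses the idiom more compactly:

// Hypothetical helper capturing the grid-sizing idiom used by both launchKernel
// methods above: one block per tile, rounding up so edge pixels are covered.
static inline dim3 makeGrid(const dim3& block, size_t width, size_t height) {
    return dim3((width  + block.x - 1) / block.x,
                (height + block.y - 1) / block.y);
}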
void runTest(int argc, char** argv)
{
    int rows, cols, size_I, size_R, niter = 10, iter;
    double *I, *J, lambda, q0sqr, sum, sum2, tmp, meanROI, varROI;

#ifdef CPU
    double Jc, G2, L, num, den, qsqr;
    int *iN, *iS, *jE, *jW, k;
    double *dN, *dS, *dW, *dE;
    double cN, cS, cW, cE, D;
#endif

#ifdef GPU
    double *J_cuda;
    double *C_cuda;
    double *E_C, *W_C, *N_C, *S_C;
#endif

    unsigned int r1, r2, c1, c2;
    double *c;

    if (argc == 9) {
        rows = atoi(argv[1]);   // number of rows in the domain
        cols = atoi(argv[2]);   // number of cols in the domain
        if ((rows % 16 != 0) || (cols % 16 != 0)) {
            fprintf(stderr, "rows and cols must be multiples of 16\n");
            exit(1);
        }
        r1 = atoi(argv[3]);     // y1 position of the speckle
        r2 = atoi(argv[4]);     // y2 position of the speckle
        c1 = atoi(argv[5]);     // x1 position of the speckle
        c2 = atoi(argv[6]);     // x2 position of the speckle
        lambda = atof(argv[7]); // lambda value
        niter = atoi(argv[8]);  // number of iterations
    } else {
        usage(argc, argv);
    }

    size_I = cols * rows;
    size_R = (r2 - r1 + 1) * (c2 - c1 + 1);

    I = (double *)malloc(size_I * sizeof(double));
    J = (double *)malloc(size_I * sizeof(double));
    c = (double *)malloc(sizeof(double) * size_I);

#ifdef CPU
    // The original allocated these with sizeof(unsigned int*); the arrays hold
    // ints, so sizeof(int) is the correct element size.
    iN = (int *)malloc(sizeof(int) * rows);
    iS = (int *)malloc(sizeof(int) * rows);
    jW = (int *)malloc(sizeof(int) * cols);
    jE = (int *)malloc(sizeof(int) * cols);

    dN = (double *)malloc(sizeof(double) * size_I);
    dS = (double *)malloc(sizeof(double) * size_I);
    dW = (double *)malloc(sizeof(double) * size_I);
    dE = (double *)malloc(sizeof(double) * size_I);

    for (int i = 0; i < rows; i++) {
        iN[i] = i - 1;
        iS[i] = i + 1;
    }
    for (int j = 0; j < cols; j++) {
        jW[j] = j - 1;
        jE[j] = j + 1;
    }
    iN[0] = 0;
    iS[rows - 1] = rows - 1;
    jW[0] = 0;
    jE[cols - 1] = cols - 1;
#endif

#ifdef GPU
    printf("size_I = %d\n", size_I);
    // Allocate "device" memory; this port stages the data in host buffers,
    // and the original cudaMalloc calls are kept for reference.
    //cudaMalloc((void**)& J_cuda, sizeof(double)* size_I);
    J_cuda = (double*)malloc(sizeof(double) * size_I);
    //cudaMalloc((void**)& C_cuda, sizeof(double)* size_I);
    C_cuda = (double*)malloc(sizeof(double) * size_I);
    //cudaMalloc((void**)& E_C, sizeof(double)* size_I);
    E_C = (double*)malloc(sizeof(double) * size_I);
    //cudaMalloc((void**)& W_C, sizeof(double)* size_I);
    W_C = (double*)malloc(sizeof(double) * size_I);
    //cudaMalloc((void**)& S_C, sizeof(double)* size_I);
    S_C = (double*)malloc(sizeof(double) * size_I);
    //cudaMalloc((void**)& N_C, sizeof(double)* size_I);
    N_C = (double*)malloc(sizeof(double) * size_I);
#endif

    printf("Randomizing the input matrix\n");
    // Generate a random matrix
    random_matrix(I, rows, cols);

    for (int k = 0; k < size_I; k++) {
        J[k] = exp(I[k]);
    }

    printf("Start the SRAD main loop\n");
    for (iter = 0; iter < niter; iter++) {
        sum = 0;
        sum2 = 0;
        for (int i = r1; i <= r2; i++) {
            for (int j = c1; j <= c2; j++) {
                tmp = J[i * cols + j];
                sum += tmp;
                sum2 += tmp * tmp;
            }
        }
        meanROI = sum / size_R;
        varROI = (sum2 / size_R) - meanROI * meanROI;
        q0sqr = varROI / (meanROI * meanROI);

#ifdef CPU
        for (int i = 0; i < rows; i++) {
            for (int j = 0; j < cols; j++) {
                k = i * cols + j;
                Jc = J[k];

                // directional derivatives
                dN[k] = J[iN[i] * cols + j] - Jc;
                dS[k] = J[iS[i] * cols + j] - Jc;
                dW[k] = J[i * cols + jW[j]] - Jc;
                dE[k] = J[i * cols + jE[j]] - Jc;

                G2 = (dN[k]*dN[k] + dS[k]*dS[k] + dW[k]*dW[k] + dE[k]*dE[k]) / (Jc*Jc);
                L = (dN[k] + dS[k] + dW[k] + dE[k]) / Jc;

                num = (0.5*G2) - ((1.0/16.0)*(L*L));
                den = 1.0 + (.25*L);
                qsqr = num / (den*den);

                // diffusion coefficient (equ 33)
                den = (qsqr - q0sqr) / (q0sqr * (1.0 + q0sqr));
                c[k] = 1.0 / (1.0 + den);

                // saturate diffusion coefficient
                if (c[k] < 0) { c[k] = 0; }
                else if (c[k] > 1) { c[k] = 1; }
            }
        }

        for (int i = 0; i < rows; i++) {
            for (int j = 0; j < cols; j++) {
                // current index
                k = i * cols + j;

                // diffusion coefficient
                cN = c[k];
                cS = c[iS[i] * cols + j];
                cW = c[k];
                cE = c[i * cols + jE[j]];

                // divergence (equ 58)
                D = cN*dN[k] + cS*dS[k] + cW*dW[k] + cE*dE[k];

                // image update (equ 61)
                J[k] = J[k] + 0.25*lambda*D;
            }
        }
#endif // CPU

#ifdef GPU
        // Currently the input size must be divisible by 16 - the block size
        int block_x = cols / BLOCK_SIZE;
        int block_y = rows / BLOCK_SIZE;

        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 dimGrid(block_x, block_y);

        // Copy data from main memory to device memory
        //cudaMemcpy(J_cuda, J, sizeof(double) * size_I, cudaMemcpyHostToDevice);
        memcpy(J_cuda, J, sizeof(double) * size_I);

        // Run kernels
        //srad_cuda_1<<<dimGrid, dimBlock>>>(E_C, W_C, N_C, S_C, J_cuda, C_cuda, cols, rows, q0sqr);
        srad_cuda_1(E_C, W_C, N_C, S_C, J_cuda, C_cuda, cols, rows, q0sqr,
                    dimGrid, dimBlock, 1, 0);
        //srad_cuda_2<<<dimGrid, dimBlock>>>(E_C, W_C, N_C, S_C, J_cuda, C_cuda, cols, rows, lambda, q0sqr);
        srad_cuda_2(E_C, W_C, N_C, S_C, J_cuda, C_cuda, cols, rows, lambda, q0sqr,
                    dimGrid, dimBlock, 1, 0);

        // Copy data from device memory to main memory
        //cudaMemcpy(J, J_cuda, sizeof(double) * size_I, cudaMemcpyDeviceToHost);
        memcpy(J, J_cuda, sizeof(double) * size_I);
#endif
    }

    //cudaThreadSynchronize();

#define OUTPUT
#ifdef OUTPUT
    // Compare the result against the gold output
    printf("Printing Output:\n");
    int passed = 1;
    FILE *gp = fopen("cuda/gold_output.txt", "r");
    if (gp == NULL) {
        printf("Cannot open file.\n");
    }
    double gold_J_val;
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            fscanf(gp, "%lf", &gold_J_val);
            //printf("%.8f ", J[i * cols + j]);
            if (fabs(gold_J_val - J[i * cols + j]) > EPSILON) {
                printf("Mismatch at %d: gold = %f, calc = %f.\n",
                       i * cols + j, gold_J_val, J[i * cols + j]);
                passed = 0;
                break;
            }
        }
        if (passed == 0) break;
        //printf("\n");
    }
    fclose(gp);
    if (passed == 1) printf("PASSED.\n");
    else printf("FAILED.\n");
#endif

    printf("Computation Done\n");

    free(I);
    free(J);
#ifdef CPU
    free(iN); free(iS); free(jW); free(jE);
    free(dN); free(dS); free(dW); free(dE);
#endif
#ifdef GPU
    /*cudaFree(C_cuda);
    cudaFree(J_cuda);
    cudaFree(E_C);
    cudaFree(W_C);
    cudaFree(N_C);
    cudaFree(S_C);*/
    free(C_cuda); free(J_cuda);
    free(E_C); free(W_C); free(N_C); free(S_C);
#endif
    free(c);
}
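random_matrix is called above but not defined in this excerpt; a minimal sketch (an assumption, not the original helper) fills the image with uniform values and fixes the seed so the gold-file comparison stays deterministic:

// Assumed sketch of the random_matrix() helper used above (not the original):
// fills the input image with uniform values in [0, 1).
void random_matrix(double *I, int rows, int cols) {
    srand(7); // fixed seed so runs are comparable against the gold output
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            I[i * cols + j] = rand() / (double)RAND_MAX;
}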
QList<resType> calculateOnGPU(const char *seqLib, int seqLibLength, ScoreType *queryProfile,
                              ScoreType qProfLen, int queryLength, ScoreType gapOpen,
                              ScoreType gapExtension, ScoreType maxScore,
                              U2::SmithWatermanSettings::SWResultView resultView)
{
    //TODO: calculate maximum alignment length
    const int overlapLength = calcOverlap(queryLength);
    int partsNumber = calcPartsNumber(seqLibLength, overlapLength);

    int queryDevider = 1;
    if (queryLength > sw_cuda_cpp::MAX_SHARED_VECTOR_LENGTH) {
        queryDevider = (queryLength + sw_cuda_cpp::MAX_SHARED_VECTOR_LENGTH - 1) / sw_cuda_cpp::MAX_SHARED_VECTOR_LENGTH;
    }
    int partQuerySize = (queryLength + queryDevider - 1) / queryDevider;
    int partSeqSize = calcPartSeqSize(seqLibLength, overlapLength, partsNumber);
    int sizeRow = calcSizeRow(seqLibLength, overlapLength, partsNumber, partSeqSize);

    u2log.details(QString("partsNumber: %1 queryDevider: %2").arg(partsNumber).arg(queryDevider));
    u2log.details(QString("seqLen: %1 partSeqSize: %2 overlapSize: %3").arg(seqLibLength).arg(partSeqSize).arg(overlapLength));
    u2log.details(QString("queryLen %1 partQuerySize: %2").arg(queryLength).arg(partQuerySize));

    //************************** declare some temp variables on host
    ScoreType *tempRow = new ScoreType[sizeRow];
    ScoreType *zerroArr = new ScoreType[sizeRow];
    for (int i = 0; i < sizeRow; i++) {
        zerroArr[i] = 0;
    }
    ScoreType *directionRow = new ScoreType[sizeRow];

    size_t directionMatrixSize = 0;
    size_t backtraceBeginsSize = 0;
    int *globalMatrix = NULL;
    int *backtraceBegins = NULL;
    if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
        directionMatrixSize = seqLibLength * queryLength * sizeof(int);
        backtraceBeginsSize = 2 * sizeRow * sizeof(int);
        globalMatrix = new int[directionMatrixSize / sizeof(int)];
        backtraceBegins = new int[backtraceBeginsSize / sizeof(int)];
        memset(globalMatrix, 0, directionMatrixSize);
        memset(backtraceBegins, 0, backtraceBeginsSize);
    }

    //************************** sizes of arrays
    size_t sizeQ = sizeRow * sizeof(ScoreType);
    size_t sizeQQ = sizeRow * sizeof(ScoreType);
    size_t sizeP = qProfLen * sizeof(ScoreType);
    size_t sizeL = seqLibLength * sizeof(char);

    //************************** declare arrays on device
    char *g_seqLib;
    ScoreType *g_queryProfile;
    ScoreType *g_HdataMax;
    ScoreType *g_HdataUp;
    ScoreType *g_HdataRec;
    ScoreType *g_HdataTmp;
    ScoreType *g_FdataUp;
    ScoreType *g_directionsUp;
    ScoreType *g_directionsMax;
    ScoreType *g_directionsRec;
    int *g_directionsMatrix = NULL;
    int *g_backtraceBegins = NULL;

    //************************** allocate global memory on device
    cudaMalloc((void **)&g_seqLib, sizeL);
    cudaMalloc((void **)&g_queryProfile, sizeP);
    cudaMalloc((void **)&g_HdataMax, sizeQ);
    cudaMalloc((void **)&g_HdataUp, sizeQ);
    cudaMalloc((void **)&g_FdataUp, sizeQ);
    cudaMalloc((void **)&g_directionsUp, sizeQ);
    cudaMalloc((void **)&g_directionsMax, sizeQ);
    cudaMalloc((void **)&g_HdataRec, sizeQ);
    cudaMalloc((void **)&g_directionsRec, sizeQ);
    if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
        cudaError errorMatrix = cudaMalloc(reinterpret_cast<void **>(&g_directionsMatrix), directionMatrixSize);
        cudaError errorBacktrace = cudaMalloc(reinterpret_cast<void **>(&g_backtraceBegins), backtraceBeginsSize);
    }
    u2log.details(QString("GLOBAL MEMORY USED %1 KB").arg((sizeL + sizeP + sizeQ * 7 + directionMatrixSize + backtraceBeginsSize) / 1024));

    //************************** copy from host to device
    cudaMemcpy(g_seqLib, seqLib, sizeL, cudaMemcpyHostToDevice);
    cudaMemcpy(g_queryProfile, queryProfile, sizeP, cudaMemcpyHostToDevice);
    cudaMemcpy(g_HdataMax, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_HdataUp, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_FdataUp, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_directionsUp, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_directionsMax, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_directionsRec, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_HdataRec, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
        cudaMemcpy(g_directionsMatrix, globalMatrix, directionMatrixSize, cudaMemcpyHostToDevice);
        cudaMemcpy(g_backtraceBegins, backtraceBegins, backtraceBeginsSize, cudaMemcpyHostToDevice);
    }

    //************************** start calculation
    int BLOCK_SIZE = partsNumber;
    dim3 dimBlock(BLOCK_SIZE);
    dim3 dimGrid(partQuerySize);

    // move constant parameters to CUDA constant memory
    setConstants(partSeqSize, partsNumber, overlapLength, seqLibLength, queryLength,
                 gapOpen, gapExtension, maxScore, partQuerySize,
                 U2::SmithWatermanAlgorithm::UP, U2::SmithWatermanAlgorithm::LEFT,
                 U2::SmithWatermanAlgorithm::DIAG, U2::SmithWatermanAlgorithm::STOP);

    size_t sh_mem_size = sizeof(ScoreType) * (dimGrid.x + 1) * 3;
    u2log.details(QString("SHARED MEM SIZE USED: %1 B").arg(sh_mem_size));

    // start main loop
    for (int i = 0; i < queryDevider; i++) {
        calculateMatrix_wrap(dimBlock.x, dimGrid.x, g_seqLib, g_queryProfile,
                             g_HdataUp, g_HdataRec, g_HdataMax, g_FdataUp,
                             g_directionsUp, g_directionsRec, g_directionsMax,
                             i * partQuerySize, g_directionsMatrix, g_backtraceBegins);
        cudaError hasErrors = cudaThreadSynchronize();
        if (hasErrors != 0) {
            u2log.trace(QString("CUDA ERROR HAPPEN, errorId: ") + QString::number(hasErrors));
        }

        // swap the "up" and "recurrence" rows for the next pass
        g_HdataTmp = g_HdataRec;
        g_HdataRec = g_HdataUp;
        g_HdataUp = g_HdataTmp;

        g_HdataTmp = g_directionsRec;
        g_directionsRec = g_directionsUp;
        g_directionsUp = g_HdataTmp;
    }

    // Copy vectors to the host and find the actual results
    cudaMemcpy(tempRow, g_HdataMax, sizeQQ, cudaMemcpyDeviceToHost);
    cudaMemcpy(directionRow, g_directionsMax, sizeQQ, cudaMemcpyDeviceToHost);
    if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
        cudaMemcpy(globalMatrix, g_directionsMatrix, directionMatrixSize, cudaMemcpyDeviceToHost);
        cudaMemcpy(backtraceBegins, g_backtraceBegins, backtraceBeginsSize, cudaMemcpyDeviceToHost);
    }

    QList<resType> pas;
    resType res;
    for (int j = 0; j < sizeRow; j++) {
        if (tempRow[j] >= maxScore) {
            res.refSubseq.startPos = directionRow[j];
            res.refSubseq.length = j - res.refSubseq.startPos + 1
                                 - j / (partSeqSize + 1) * overlapLength
                                 - j / (partSeqSize + 1);
            res.score = tempRow[j];
            if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
                qint32 pairAlignOffset = 0;
                qint32 row = backtraceBegins[2 * j];
                qint32 column = backtraceBegins[2 * j + 1];
                while (U2::SmithWatermanAlgorithm::STOP != globalMatrix[seqLibLength * row + column]) {
                    if (U2::SmithWatermanAlgorithm::DIAG == globalMatrix[seqLibLength * row + column]) {
                        res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::DIAG;
                        row--;
                        column--;
                    } else if (U2::SmithWatermanAlgorithm::LEFT == globalMatrix[seqLibLength * row + column]) {
                        res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::UP;
                        column--;
                    } else if (U2::SmithWatermanAlgorithm::UP == globalMatrix[seqLibLength * row + column]) {
                        res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::LEFT;
                        row--;
                    }
                    if (0 >= row || 0 >= column) {
                        break;
                    }
                }
                res.patternSubseq.startPos = row;
                res.patternSubseq.length = backtraceBegins[2 * j] - row + 1;
            }
            pas.append(res);
        }
    }

    // free device and host memory
    cudaFree(g_seqLib);
    cudaFree(g_queryProfile);
    cudaFree(g_HdataMax);
    cudaFree(g_HdataUp);
    cudaFree(g_HdataRec);
    cudaFree(g_FdataUp);
    cudaFree(g_directionsUp);
    cudaFree(g_directionsMax);
    cudaFree(g_directionsRec);
    if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
        cudaFree(g_directionsMatrix);
        cudaFree(g_backtraceBegins);
    }
    delete[] tempRow;
    delete[] directionRow;
    delete[] zerroArr;
    delete[] globalMatrix;
    delete[] backtraceBegins;

    return pas;
}