void NLGLWidget::initializeTexture() { if (_texture >= 0) { cleanupTexture(); } // Create the texture for displaying the result: GLuint gltex; glGenTextures(1, &gltex); _texture = gltex; reportGLError("setupGLRendering() genTexture"); // Bind texture: glBindTexture(GL_TEXTURE_2D, _texture); reportGLError("setupGLRendering() bindTexture"); // Allocate texture: glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F_ARB, _img_width, _img_height, 0, GL_RGBA, GL_FLOAT, NULL); reportGLError("setupGLRendering() texImage2D"); // Set texture parameters: glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); reportGLError("setupGLRendering() glTexParameteri"); // Unbind texture: glBindTexture(GL_TEXTURE_2D, 0); reportGLError("setupGLRendering() bindTexture(0)"); // Register the buffer object: checkCUDAError("Pre gl register"); cudaGraphicsGLRegisterImage(&_graphicsResource, _texture, GL_TEXTURE_2D, cudaGraphicsMapFlagsWriteDiscard); checkCUDAError("Post gl register"); }
void CudaImagePyramidHost::unbindTexture() { const textureReference* constTexRefPtr=NULL; cudaGetTextureReference(&constTexRefPtr, _texture_name); checkCUDAError("Can't get tex ref for unbind TEXTURE_PYRAMID", _name); cudaUnbindTexture(constTexRefPtr); checkCUDAError("Unbind error", _name); }
void CudaImagePyramidHost::bindTexture() { const textureReference* constTexRefPtr=NULL; cudaGetTextureReference(&constTexRefPtr, _texture_name); checkCUDAError("Can't get tex ref for bind TEXTURE_PYRAMID", _name); cudaChannelFormatDesc formatDesc = constTexRefPtr->channelDesc; cudaBindTextureToArray(constTexRefPtr, _storage, &formatDesc); checkCUDAError("Bind error", _name); }
// initArray ------------------------------------------------------------------ // Initializes an array of floats on the host and device // @param host - The array on the host to initialize. // @param device - The array on the device to initialize // @param size - The length of the arrays to allocate. // @param initial_value - The value to initialize the arrays to. -1.0 will // cause the array to initialize to the index value. // ---------------------------------------------------------------------------- void initArray(float** host, float** device, size_t size, float initial_value){ // Allocate host memory *host = (float*) malloc( size * sizeof(float)); // Allocate device memory cudaMalloc((void **) device, size * sizeof(float)); checkCUDAError("malloc"); // Check for allocation errors // Initialize arrays ... for( size_t i = 0; i < size; ++i) (*host)[i] = initial_value == -1.0f ? i : initial_value; // ... and copy to device cudaMemcpy(*device, *host, size * sizeof(float), cudaMemcpyHostToDevice); checkCUDAError("memcpy"); // Check for initialization errors }
void mpla_generic_dgemv(struct mpla_vector* b, struct mpla_generic_matrix* A, struct mpla_vector* x, void (*mpla_dgemv_core)(struct mpla_vector*, struct mpla_generic_matrix*, struct mpla_vector*, struct mpla_instance*), struct mpla_instance* instance) { // allocate redistributed vector struct mpla_vector x_redist; mpla_init_vector_for_block_rows(&x_redist, instance, x->vec_row_count); // redistribute input vector with row-block parallel distribution to column-block parallel distribution mpla_redistribute_vector_for_generic_dgesv(&x_redist, x, A, instance); // generic computation core: matrix-vector product mpla_dgemv_core(b, A, &x_redist, instance); // create sub-communicator for each process row int remain_dims[2]; remain_dims[0]=0; remain_dims[1]=1; MPI_Comm row_comm; MPI_Cart_sub(instance->comm, remain_dims, &row_comm); // summation of block row results double* sum; cudaMalloc((void**)&sum, sizeof(double)*b->cur_proc_row_count); cudaThreadSynchronize(); checkCUDAError("cudaMalloc"); MPI_Allreduce(b->data, sum, b->cur_proc_row_count, MPI_DOUBLE, MPI_SUM, row_comm); cudaMemcpy(b->data, sum, sizeof(double)*b->cur_proc_row_count, cudaMemcpyDeviceToDevice); // cleanup cudaFree(sum); mpla_free_vector(&x_redist, instance); MPI_Comm_free(&row_comm); }
void CudaImagePyramidHost::copyFromHost(const void* source) { assert(isInitialized()); assert(_textureType == cudaTextureType2D); cudaMemcpyToArray(_storage, 0,0, source, _baseWidth*_baseHeight*_typeSize, cudaMemcpyHostToDevice); checkCUDAError("Memcpy error", _name); }
void CudaImagePyramidHost::copyTo(CudaImagePyramidHost& target) { assert(target._typeSize == _typeSize && target._textureType == _textureType); target.initialize(_baseWidth, _baseHeight, _filterMode, _numLayers); cudaMemcpyArrayToArray(target._storage,0,0,_storage,0,0, _baseWidth*_baseHeight*_numLayers*_typeSize, cudaMemcpyDeviceToDevice); checkCUDAError("Memcpy error", _name); }
// Lanseaza procesarea CUDA void runCUDA() { // Copiaza vectorii de prelucrat la device cutilSafeCall(cudaMemcpy(a_d, a_h,N*sizeof(float),cudaMemcpyHostToDevice)); checkCUDAError("cudaMemcpy"); cutilSafeCall(cudaMemcpy(b_d, b_h,N*sizeof(float),cudaMemcpyHostToDevice)); checkCUDAError("cudaMemcpy"); // Run Kernel cutilSafeCall(launch_actiune_thread(a_d,b_d,r_d,N,dimGrid,dimBlock)); cutilSafeCall(cudaThreadSynchronize()); checkCUDAError("invocare kernel"); // Copiaza rezultatul prelucrat cutilSafeCall(cudaMemcpy(r_h,r_d,N*sizeof(float),cudaMemcpyDeviceToHost)); checkCUDAError("cudaMemcpy"); }
void mpla_init_vector(struct mpla_vector* vector, struct mpla_instance* instance, int vec_row_count) { // setting global vector size vector->vec_row_count = vec_row_count; // allocating memory for process-wise vector information vector->proc_row_count = new int*[instance->proc_rows]; vector->proc_row_offset = new int*[instance->proc_rows]; for (int i=0; i<instance->proc_rows; i++) { vector->proc_row_count[i] = new int[instance->proc_cols]; vector->proc_row_offset[i] = new int[instance->proc_cols]; } // computing general row block sizes int almost_filled_row_block_size = vec_row_count / instance->proc_rows; int remaining_rows = vec_row_count % instance->proc_rows; if (almost_filled_row_block_size == 0) { printf("MPLA: There are more process block rows than vector rows. Exiting...\n"); exit(1); } // computing process-wise block row / column counts for (int i=0; i< instance->proc_rows; i++) { for (int j=0; j<instance->proc_cols; j++) { vector->proc_row_count[i][j] = almost_filled_row_block_size + ( (i<remaining_rows) ? 1 : 0 ); } } // computing process-wise block row / column offsets vector->proc_row_offset[0][0] = 0; for (int j=1; j < instance->proc_cols; j++) vector->proc_row_offset[0][j] = 0; for (int i=1; i < instance->proc_rows; i++) for (int j=0; j < instance->proc_cols; j++) vector->proc_row_offset[i][j] = vector->proc_row_offset[i-1][j] + vector->proc_row_count[i-1][j]; // retrieving local data for the current process vector->cur_proc_row_count = vector->proc_row_count[instance->cur_proc_row][instance->cur_proc_col]; vector->cur_proc_row_offset = vector->proc_row_offset[instance->cur_proc_row][instance->cur_proc_col]; // allocating matrix data storage cudaMalloc((void**)&(vector->data), sizeof(double)*vector->cur_proc_row_count); cudaThreadSynchronize(); checkCUDAError("cudaMalloc"); }
bool initCUDA(void) { #if __DEVICE_EMULATION__ return true; #else int count = 0; int i = 0; cudaGetDeviceCount(&count); if(count == 0) { fprintf(stderr, "Nu exista nici un device.\n"); return false; } printf("Exista %d device-uri.\n",count); for(i = 0; i < count; i++) { cudaDeviceProp prop; if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) { if(prop.major >= 1) { break; } } if(!prop.canMapHostMemory) { fprintf(stderr, "Device %d cannot map host memory!\n",0); exit(EXIT_FAILURE); } } if(i == count) { fprintf(stderr, "Nu exista nici un device care suporta CUDA.\n"); return false; } cudaSetDevice(cutGetMaxGflopsDeviceId()); cudaSetDeviceFlags(cudaDeviceMapHost); checkCUDAError("cudaSetDeviceFlags"); printf("CUDA initializat cu succes\n"); // Create the CUTIL timer cutilCheckError( cutCreateTimer( &timer)); return true; #endif }
// finishTest ------------------------------------------------------------------ // Initializes the cuda timer events and starts the timer. // @param start - Start time evet // @param end - End time evet // @returns the elapsed time in seconds. //----------------------------------------------------------------------------- float finishTest(cudaEvent_t &start, cudaEvent_t &stop){ float time; cudaEventRecord( stop, 0 ); cudaEventSynchronize( stop ); cudaEventElapsedTime( &time, start, stop ); cudaEventDestroy( start ); cudaEventDestroy( stop ); printf("Finished Test in %f s\n\n", time/1000.0f); // Check for errors checkCUDAError("test finished"); // Return elapsed time return time/1000.0f; }
void mpla_save_vector(struct mpla_vector* x, char* filename, struct mpla_instance* instance) { // create sub-communicator for each process column int remain_dims[2]; remain_dims[0]=1; remain_dims[1]=0; MPI_Comm column_comm; MPI_Cart_sub(instance->comm, remain_dims, &column_comm); int column_rank; MPI_Comm_rank(column_comm, &column_rank); // columnwise creation of the full vector double* full_vector; int* recvcounts = new int[instance->proc_rows]; int* displs = new int[instance->proc_rows]; for (int i=0; i<instance->proc_rows; i++) { recvcounts[i] = x->proc_row_count[i][instance->cur_proc_col]; displs[i] = x->proc_row_offset[i][instance->cur_proc_col]; } cudaMalloc((void**)&full_vector, sizeof(double)*x->vec_row_count); cudaThreadSynchronize(); checkCUDAError("cudaMalloc"); MPI_Allgatherv(x->data, x->cur_proc_row_count, MPI_DOUBLE, full_vector, recvcounts, displs, MPI_DOUBLE, column_comm); // writing full vector to file on parent process if (instance->is_parent) { FILE* f = fopen(filename, "wb"); double* full_vector_host = new double[x->vec_row_count]; cudaMemcpy(full_vector_host, full_vector, x->vec_row_count*sizeof(double), cudaMemcpyDeviceToHost); fwrite(&(x->vec_row_count), sizeof(int), 1, f); fwrite(full_vector_host, sizeof(double), x->vec_row_count, f); fclose(f); delete [] full_vector_host; } // memory cleanup cudaFree(full_vector); MPI_Comm_free(&column_comm); MPI_Barrier(instance->comm); }
void cleanup() { free(a_h);free(b_h);free(r_h); free(control); cutilCheckError(cutStopTimer(timer)); cutilCheckError(cutDeleteTimer( timer)); cudaFree(a_d);cudaFree(b_d);cudaFree(r_d); cutilSafeCall(release()); checkCUDAError("release"); cudaThreadExit(); }
void CudaImagePyramidHost::copyFromHost(int width, int height, const void* source, int layer) { assert(isInitialized()); assert(_textureType == cudaTextureType2DLayered); cudaMemcpy3DParms myParms = {0}; myParms.srcPtr = make_cudaPitchedPtr((void*)source,width*_typeSize,width,height); myParms.srcPos = make_cudaPos(0,0,0); myParms.dstArray = _storage; myParms.dstPos = make_cudaPos(0,0,layer); myParms.extent = make_cudaExtent(width,height,1); myParms.kind = cudaMemcpyHostToDevice; cudaMemcpy3D(&myParms); checkCUDAError("Memcpy error", _name); }
void CudaImagePyramidHost::clear() { if (!isInitialized()) { return; } // Don't bother unbinding the texture if everything is getting destroyed, // because it's likely that CUDA has already destroyed the texture. if (!_in_destructor) { unbindTexture(); } cudaFreeArray(_storage); checkCUDAError("Free error", _name); _storage = NULL; _baseWidth = 0; _baseHeight = 0; _baseWidth = 0; _baseHeight = 0; }
void mpla_redistribute_vector_for_generic_dgesv(struct mpla_vector* b_redist, struct mpla_vector* b, struct mpla_generic_matrix* A, struct mpla_instance* instance) { // attention: this code does no correctness check for the input data // WARNING: The following code is not efficient for a strong parallelization !!!!! // create sub-communicator for each process column int remain_dims[2]; remain_dims[0]=1; remain_dims[1]=0; MPI_Comm column_comm; MPI_Cart_sub(instance->comm, remain_dims, &column_comm); int column_rank; MPI_Comm_rank(column_comm, &column_rank); // columnwise creation of the full vector double* full_vector; int* recvcounts = new int[instance->proc_rows]; int* displs = new int[instance->proc_rows]; for (int i=0; i<instance->proc_rows; i++) { recvcounts[i] = b->proc_row_count[i][instance->cur_proc_col]; displs[i] = b->proc_row_offset[i][instance->cur_proc_col]; } cudaMalloc((void**)&full_vector, sizeof(double)*b->vec_row_count); cudaThreadSynchronize(); checkCUDAError("cudaMalloc"); MPI_Allgatherv(b->data, b->cur_proc_row_count, MPI_DOUBLE, full_vector, recvcounts, displs, MPI_DOUBLE, column_comm); // extract column-wise local part of full vector cudaMemcpy(b_redist->data, &(full_vector[b_redist->cur_proc_row_offset]), sizeof(double)*b_redist->cur_proc_row_count, cudaMemcpyDeviceToDevice); // memory cleanup cudaFree(full_vector); MPI_Comm_free(&column_comm); }
void init() { // Aloca memorie - local //a_h = (float *)malloc(N*sizeof(float)); //b_h = (float *)malloc(N*sizeof(float)); //r_h = (float *)malloc(N*sizeof(float)); int size = N*sizeof(float); cudaHostAlloc((void **)&a_h, size, 0); checkCUDAError("cudaHostAllocMapped"); cudaHostAlloc((void **)&b_h, size, 0); checkCUDAError("cudaHostAllocMapped"); cudaHostAlloc((void **)&r_h, size, 0); checkCUDAError("cudaHostAllocMapped"); // Aloca memorie - CUDA //cutilSafeCall(cudaMalloc((void **) &a_d, N*sizeof(float))); //cutilSafeCall(cudaMalloc((void **) &b_d, N*sizeof(float))); //cutilSafeCall(cudaMalloc((void **) &r_d, N*sizeof(float))); cudaHostGetDevicePointer((void **)&a_d, (void *)a_h, 0); checkCUDAError("cudaHostGetDevicePointer"); cudaHostGetDevicePointer((void **)&b_d, (void *)b_h, 0); checkCUDAError("cudaHostGetDevicePointer"); cudaHostGetDevicePointer((void **)&r_d, (void *)r_h, 0); checkCUDAError("cudaHostGetDevicePointer"); control = (float *)malloc(N*sizeof(float)); // Initializeaza vectori for(int i=0;i<N;i++) { a_h[i] = (float)(i % 13)+1; b_h[i] = (float)(i % 3)+1; } }
void CudaImagePyramidHost::initialize(int width, int height, cudaTextureFilterMode filter_mode, int depth) { qDebug() << "pyramid host initializing with params: " << width << height << filter_mode << depth; if (isInitialized() && width == _baseWidth && height == _baseHeight && filter_mode == _filterMode) { return; } clear(); qDebug() << "Clear done."; _baseWidth = width; _baseHeight = height; _filterMode = filter_mode; _numLayers = depth; // Get the texture and its channel descriptor to allocate the storage. const textureReference* constTexRefPtr=NULL; cudaGetTextureReference(&constTexRefPtr, _texture_name); qDebug() << "Texture Ref got:" << _name; if (constTexRefPtr == 0) { qDebug() << "constTexRefPtr==0"; } checkCUDAError("Can't get tex ref for init TEXTURE_PYRAMID", _name); cudaChannelFormatDesc formatDesc = constTexRefPtr->channelDesc; if(_textureType == cudaTextureType2DLayered){ cudaDeviceProp prop; qDebug() << "to get CUDA device prop"; cudaGetDeviceProperties(&prop,0); qDebug() << "CUDA Device Prop got"; if(prop.maxTexture2DLayered[0] < _baseWidth || prop.maxTexture2DLayered[1] < _baseHeight || prop.maxTexture2DLayered[2] < _numLayers){ qDebug()<< "Max layered texture size:" << prop.maxTexture2DLayered[0] << " x " << prop.maxTexture2DLayered[1] << " x " << prop.maxTexture2DLayered[2]; assert(0); } cudaExtent extent = make_cudaExtent(_baseWidth, _baseHeight, _numLayers); cudaMalloc3DArray(&_storage, &formatDesc, extent, cudaArrayLayered); }else{ cudaMallocArray(&_storage, &formatDesc, _baseWidth, _baseHeight); } checkCUDAError("Failure to allocate", _name); qDebug() << "allocate done"; // Set texture parameters. // Evil hack to get around an apparent bug in the cuda api: // cudaGetTextureReference only returns a const reference, and // there is no way to set the parameters with a reference other // than cast it to non-const. textureReference* texRefPtr=NULL; texRefPtr = const_cast<textureReference*>( constTexRefPtr ); texRefPtr->addressMode[0] = cudaAddressModeClamp; texRefPtr->addressMode[1] = cudaAddressModeClamp; texRefPtr->filterMode = filter_mode; texRefPtr->normalized = false; // Use unnormalized (pixel) coordinates for addressing. This forbids texture mode wrap. bindTexture(); qDebug() << "texture binded"; bool found = false; for (size_t i = 0; i < _instances.size(); i++) { if (_instances[i] == this) found = true; } if (!found) { qDebug() << "Not found"; _instances.push_back(this); } qDebug() << "paramid host initialized."; }
void PointerFreeHashGrid::updateLookupTable() { // __BENCH.LOOP_STAGE_START("Process Iterations > Iterations > Build Point Free lookup"); if (hashGridLists) delete[] hashGridLists; hashGridLists = new uint[hashGridEntryCount]; if (hashGridLenghts) memset(hashGridLenghts, 0, hashGridSize * sizeof(uint)); else hashGridLenghts = new uint[hashGridSize]; if (hashGridListsIndex) memset(hashGridListsIndex, 0, hashGridSize * sizeof(uint)); else hashGridListsIndex = new uint[hashGridSize]; uint listIndex = 0; for (unsigned int i = 0; i < hashGridSize; ++i) { std::list<uint> *hps = hashGrid[i]; hashGridListsIndex[i] = listIndex; if (hps) { hashGridLenghts[i] = hps->size(); std::list<uint>::iterator iter = hps->begin(); while (iter != hps->end()) { hashGridLists[listIndex++] = *iter++; } } else { hashGridLenghts[i] = 0; } } // __BENCH.LOOP_STAGE_STOP("Process Iterations > Iterations > Build Point Free lookup"); // __BENCH.LOOP_STAGE_START("Process Iterations > Iterations > Copy lookup to device"); //checkCUDAmemory("before updateLookupTable"); uint size1 = sizeof(uint) * hashGridEntryCount; if (hashGridListsBuff) cudaFree(hashGridListsBuff); cudaMalloc((void**) (&hashGridListsBuff), size1); cudaMemset(hashGridListsBuff, 0, size1); cudaMemcpy(hashGridListsBuff, hashGridLists, size1, cudaMemcpyHostToDevice); uint size2 = sizeof(uint) * hashGridSize; if (!hashGridListsIndexBuff) cudaMalloc((void**) (&hashGridListsIndexBuff), size2); cudaMemset(hashGridListsIndexBuff, 0, size2); cudaMemcpy(hashGridListsIndexBuff, hashGridListsIndex, size2, cudaMemcpyHostToDevice); if (!hashGridLenghtsBuff) cudaMalloc((void**) (&hashGridLenghtsBuff), size2); cudaMemset(hashGridLenghtsBuff, 0, size2); cudaMemcpy(hashGridLenghtsBuff, hashGridLenghts, size2, cudaMemcpyHostToDevice); checkCUDAError(); // __BENCH.LOOP_STAGE_STOP("Process Iterations > Iterations > Copy lookup to device"); //checkCUDAmemory("After updateLookupTable"); }
void mpla_init_matrix(struct mpla_matrix* matrix, struct mpla_instance* instance, int mat_row_count, int mat_col_count) { // setting global matrix dimensions matrix->mat_row_count = mat_row_count; matrix->mat_col_count = mat_col_count; // allocating memory for process-wise matrix information matrix->proc_row_count = new int*[instance->proc_rows]; matrix->proc_col_count = new int*[instance->proc_rows]; matrix->proc_row_offset = new int*[instance->proc_rows]; matrix->proc_col_offset = new int*[instance->proc_rows]; for (int i=0; i<instance->proc_rows; i++) { matrix->proc_row_count[i] = new int[instance->proc_cols]; matrix->proc_col_count[i] = new int[instance->proc_cols]; matrix->proc_row_offset[i] = new int[instance->proc_cols]; matrix->proc_col_offset[i] = new int[instance->proc_cols]; } /* // computing general row block sizes int filled_row_block_size = ceil((float)mat_row_count / (float)(instance->proc_rows)); // int filled_row_block_count = mat_row_count / filled_row_block_size; int last_row_block_size = mat_row_count % filled_row_block_size; // computing general column block sizes int filled_col_block_size = ceil((float)mat_col_count / (float)(instance->proc_cols)); // int filled_col_block_count = mat_col_count / filled_col_block_size; int last_col_block_size = mat_col_count % filled_col_block_size; // computing process-wise block row / column counts for (int i=0; i < instance->proc_rows; i++) { for (int j=0; j < instance->proc_cols; j++) { if ((i==(instance->proc_rows-1)) && (last_row_block_size>0)) // handling last row block which is only partially filled matrix->proc_row_count[i][j] = last_row_block_size; else matrix->proc_row_count[i][j] = filled_row_block_size; if ((j==(instance->proc_cols-1)) && (last_col_block_size>0)) // handling last column block which is only partially filled matrix->proc_col_count[i][j] = last_col_block_size; else matrix->proc_col_count[i][j] = filled_col_block_size; } } */ // computing general row block sizes int almost_filled_row_block_size = mat_row_count / instance->proc_rows; int remaining_rows = mat_row_count % instance->proc_rows; if (almost_filled_row_block_size == 0) { printf("MPLA: There are more process block rows than matrix rows. Exiting...\n"); exit(1); } // computing general column block sizes int almost_filled_col_block_size = mat_col_count / instance->proc_cols; int remaining_cols = mat_col_count % instance->proc_cols; if (almost_filled_row_block_size == 0) { printf("MPLA: There are more process block columns than matrix columns. Exiting...\n"); exit(1); } // computing process-wise block row / column counts for (int i=0; i< instance->proc_rows; i++) { for (int j=0; j<instance->proc_cols; j++) { matrix->proc_row_count[i][j] = almost_filled_row_block_size + ( (i<remaining_rows) ? 1 : 0 ); matrix->proc_col_count[i][j] = almost_filled_col_block_size + ( (j<remaining_cols) ? 1 : 0 ); } } // computing process-wise block row / column offsets matrix->proc_row_offset[0][0] = 0; matrix->proc_col_offset[0][0] = 0; for (int i=1; i<instance->proc_rows; i++) matrix->proc_col_offset[i][0] = 0; for (int j=1; j<instance->proc_cols; j++) matrix->proc_row_offset[0][j] = 0; for (int i=1; i < instance->proc_rows; i++) for (int j=0; j < instance->proc_cols; j++) matrix->proc_row_offset[i][j] = matrix->proc_row_offset[i-1][j] + matrix->proc_row_count[i-1][j]; for (int j=1; j < instance->proc_cols; j++) for (int i=0; i < instance->proc_rows; i++) matrix->proc_col_offset[i][j] = matrix->proc_col_offset[i][j-1] + matrix->proc_col_count[i][j-1]; // retrieving local data for the current process matrix->cur_proc_row_count = matrix->proc_row_count[instance->cur_proc_row][instance->cur_proc_col]; matrix->cur_proc_col_count = matrix->proc_col_count[instance->cur_proc_row][instance->cur_proc_col]; matrix->cur_proc_row_offset = matrix->proc_row_offset[instance->cur_proc_row][instance->cur_proc_col]; matrix->cur_proc_col_offset = matrix->proc_col_offset[instance->cur_proc_row][instance->cur_proc_col]; // allocating matrix data storage cudaMalloc((void**)&(matrix->data), sizeof(double)*matrix->cur_proc_row_count*matrix->cur_proc_col_count); cudaThreadSynchronize(); checkCUDAError("cudaMalloc"); }
void mpla_init_vector_for_block_rows(struct mpla_vector* vector, struct mpla_instance* instance, int vec_row_count) { // setting global vector size vector->vec_row_count = vec_row_count; // allocating memory for process-wise vector information vector->proc_row_count = new int*[instance->proc_rows]; vector->proc_row_offset = new int*[instance->proc_rows]; for (int i=0; i<instance->proc_rows; i++) { vector->proc_row_count[i] = new int[instance->proc_cols]; vector->proc_row_offset[i] = new int[instance->proc_cols]; } // computing general row block sizes int almost_filled_row_block_size = vec_row_count / instance->proc_cols; int remaining_rows = vec_row_count % instance->proc_cols; if (almost_filled_row_block_size == 0) { printf("MPLA: There are more process block columns than matrix columns. Exiting...\n"); exit(1); } // computing process-wise block row / column counts for (int i=0; i< instance->proc_rows; i++) { for (int j=0; j<instance->proc_cols; j++) { vector->proc_row_count[i][j] = almost_filled_row_block_size + ( (j<remaining_rows) ? 1 : 0 ); } } /* // computing general row block sizes int filled_row_block_size = ceil((float)vec_row_count / (float)(instance->proc_cols)); // int filled_row_block_count = vec_row_count / filled_row_block_size; int last_row_block_size = vec_row_count % filled_row_block_size; // computing process-wise block row / column counts for (int i=0; i < instance->proc_rows; i++) { for (int j=0; j < instance->proc_cols; j++) { if ((j==(instance->proc_cols-1)) && (last_row_block_size>0)) // handling last row block which is only partially filled vector->proc_row_count[i][j] = last_row_block_size; else vector->proc_row_count[i][j] = filled_row_block_size; } } */ // computing process-wise block row / column offsets vector->proc_row_offset[0][0] = 0; for (int i=1; i < instance->proc_rows; i++) vector->proc_row_offset[i][0] = 0; for (int j=1; j < instance->proc_cols; j++) for (int i=0; i < instance->proc_rows; i++) vector->proc_row_offset[i][j] = vector->proc_row_offset[i][j-1] + vector->proc_row_count[i][j-1]; // retrieving local data for the current process vector->cur_proc_row_count = vector->proc_row_count[instance->cur_proc_row][instance->cur_proc_col]; vector->cur_proc_row_offset = vector->proc_row_offset[instance->cur_proc_row][instance->cur_proc_col]; // allocating matrix data storage cudaMalloc((void**)&(vector->data), sizeof(double)*vector->cur_proc_row_count); cudaThreadSynchronize(); checkCUDAError("cudaMalloc"); }