コード例 #1
0
ファイル: nlGLWidget.cpp プロジェクト: Epsirom/Stylization
void NLGLWidget::initializeTexture()
{
    if (_texture >= 0) {
        cleanupTexture();
    }

    // Create the texture for displaying the result:
    GLuint gltex;
    glGenTextures(1, &gltex);
    _texture = gltex;
    reportGLError("setupGLRendering() genTexture");

    // Bind texture:
    glBindTexture(GL_TEXTURE_2D, _texture);
    reportGLError("setupGLRendering() bindTexture");

    // Allocate texture:
    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F_ARB, _img_width, _img_height, 0, GL_RGBA, GL_FLOAT, NULL);
    reportGLError("setupGLRendering() texImage2D");

    // Set texture parameters:
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    reportGLError("setupGLRendering() glTexParameteri");

    // Unbind texture:
    glBindTexture(GL_TEXTURE_2D, 0);
    reportGLError("setupGLRendering() bindTexture(0)");

    // Register the buffer object:
    checkCUDAError("Pre gl register");
    cudaGraphicsGLRegisterImage(&_graphicsResource, _texture, GL_TEXTURE_2D, cudaGraphicsMapFlagsWriteDiscard);
    checkCUDAError("Post gl register");
}
コード例 #2
0
void CudaImagePyramidHost::unbindTexture()
{
    const textureReference* constTexRefPtr=NULL;
    cudaGetTextureReference(&constTexRefPtr, _texture_name);
    checkCUDAError("Can't get tex ref for unbind TEXTURE_PYRAMID", _name);

    cudaUnbindTexture(constTexRefPtr);
    checkCUDAError("Unbind error", _name);
}
コード例 #3
0
void CudaImagePyramidHost::bindTexture()
{
    const textureReference* constTexRefPtr=NULL;
    cudaGetTextureReference(&constTexRefPtr, _texture_name);
    checkCUDAError("Can't get tex ref for bind TEXTURE_PYRAMID", _name);
    cudaChannelFormatDesc formatDesc = constTexRefPtr->channelDesc;

    cudaBindTextureToArray(constTexRefPtr, _storage, &formatDesc);
    checkCUDAError("Bind error", _name);
}
コード例 #4
0
// initArray ------------------------------------------------------------------
//		Initializes an array of floats on the host and device
//    @param host				- The array on the host to initialize.
//		@param device			- The array on the device to initialize
//		@param size				- The length of the arrays to allocate.
//		@param initial_value	- The value to initialize the arrays to. -1.0 will
//										cause the array to initialize to the index value.
// ----------------------------------------------------------------------------
void initArray(float** host, float** device, size_t size, float initial_value){
	// Allocate host memory
	*host  = (float*) malloc( size * sizeof(float));

	// Allocate device memory
	cudaMalloc((void **) device, size * sizeof(float));
	checkCUDAError("malloc");	// Check for allocation errors

	// Initialize arrays ...
	for( size_t i = 0; i < size; ++i)
		(*host)[i] = initial_value == -1.0f ? i : initial_value;

	// ... and copy to device
	cudaMemcpy(*device, *host, size * sizeof(float), cudaMemcpyHostToDevice);
	checkCUDAError("memcpy"); // Check for initialization errors
}
コード例 #5
0
ファイル: mpla.cpp プロジェクト: zaspel/MPLA
void mpla_generic_dgemv(struct mpla_vector* b, struct mpla_generic_matrix* A, struct mpla_vector* x, void (*mpla_dgemv_core)(struct mpla_vector*, struct mpla_generic_matrix*, struct mpla_vector*, struct mpla_instance*), struct mpla_instance* instance)
{
	// allocate redistributed vector
	struct mpla_vector x_redist;
	mpla_init_vector_for_block_rows(&x_redist, instance, x->vec_row_count);

	// redistribute input vector with row-block parallel distribution to column-block parallel distribution
	mpla_redistribute_vector_for_generic_dgesv(&x_redist, x, A, instance);
	
	// generic computation core: matrix-vector product
	mpla_dgemv_core(b, A, &x_redist, instance);

	// create sub-communicator for each process row
	int remain_dims[2];
	remain_dims[0]=0;
	remain_dims[1]=1;
	MPI_Comm row_comm;
	MPI_Cart_sub(instance->comm, remain_dims, &row_comm);

	// summation of block row results
	double* sum;
	cudaMalloc((void**)&sum, sizeof(double)*b->cur_proc_row_count);
	cudaThreadSynchronize();
	checkCUDAError("cudaMalloc");
	MPI_Allreduce(b->data, sum, b->cur_proc_row_count, MPI_DOUBLE, MPI_SUM, row_comm);
	cudaMemcpy(b->data, sum, sizeof(double)*b->cur_proc_row_count, cudaMemcpyDeviceToDevice);

	// cleanup
	cudaFree(sum);
	mpla_free_vector(&x_redist, instance);

	MPI_Comm_free(&row_comm);
}
コード例 #6
0
void CudaImagePyramidHost::copyFromHost(const void* source)
{
    assert(isInitialized());
    assert(_textureType == cudaTextureType2D);

    cudaMemcpyToArray(_storage, 0,0, source, _baseWidth*_baseHeight*_typeSize, cudaMemcpyHostToDevice);

    checkCUDAError("Memcpy error", _name);
}
コード例 #7
0
void CudaImagePyramidHost::copyTo(CudaImagePyramidHost& target)
{
    assert(target._typeSize == _typeSize && target._textureType == _textureType);
    target.initialize(_baseWidth, _baseHeight, _filterMode, _numLayers);

    cudaMemcpyArrayToArray(target._storage,0,0,_storage,0,0,
                           _baseWidth*_baseHeight*_numLayers*_typeSize, cudaMemcpyDeviceToDevice);
    checkCUDAError("Memcpy error", _name);
}
コード例 #8
0
ファイル: main.cpp プロジェクト: elf11/GPGPU
// Lanseaza procesarea CUDA
void runCUDA()
{
	// Copiaza vectorii de prelucrat la device
	cutilSafeCall(cudaMemcpy(a_d, a_h,N*sizeof(float),cudaMemcpyHostToDevice));
	checkCUDAError("cudaMemcpy");

	cutilSafeCall(cudaMemcpy(b_d, b_h,N*sizeof(float),cudaMemcpyHostToDevice));
	checkCUDAError("cudaMemcpy");

	// Run Kernel
	cutilSafeCall(launch_actiune_thread(a_d,b_d,r_d,N,dimGrid,dimBlock));
	cutilSafeCall(cudaThreadSynchronize());
	checkCUDAError("invocare kernel");

	// Copiaza rezultatul prelucrat
	cutilSafeCall(cudaMemcpy(r_h,r_d,N*sizeof(float),cudaMemcpyDeviceToHost));
	checkCUDAError("cudaMemcpy");
	
	
}
コード例 #9
0
ファイル: mpla.cpp プロジェクト: zaspel/MPLA
void mpla_init_vector(struct mpla_vector* vector, struct mpla_instance* instance, int vec_row_count)
{
	// setting global vector size
	vector->vec_row_count = vec_row_count;

	// allocating memory for process-wise vector information
	vector->proc_row_count = new int*[instance->proc_rows];
	vector->proc_row_offset = new int*[instance->proc_rows];
	for (int i=0; i<instance->proc_rows; i++)
	{
		vector->proc_row_count[i] = new int[instance->proc_cols];
		vector->proc_row_offset[i] = new int[instance->proc_cols];
	}

	// computing general row block sizes
	int almost_filled_row_block_size = vec_row_count / instance->proc_rows;
	int remaining_rows = vec_row_count % instance->proc_rows;

	if (almost_filled_row_block_size == 0)
	{
		printf("MPLA: There are more process block rows than vector rows. Exiting...\n");
		exit(1);
	}


	// computing process-wise block row / column counts
	for (int i=0; i< instance->proc_rows; i++)
	{
		for (int j=0; j<instance->proc_cols; j++)
		{
			vector->proc_row_count[i][j] = almost_filled_row_block_size + ( (i<remaining_rows) ? 1 : 0 );
		}
	}

	// computing process-wise block row / column offsets
	vector->proc_row_offset[0][0] = 0;
	for (int j=1; j < instance->proc_cols; j++)
		vector->proc_row_offset[0][j] = 0;	
	for (int i=1; i < instance->proc_rows; i++)
		for (int j=0; j < instance->proc_cols; j++)
			vector->proc_row_offset[i][j] = vector->proc_row_offset[i-1][j] + vector->proc_row_count[i-1][j];
		
	// retrieving local data for the current process
	vector->cur_proc_row_count = vector->proc_row_count[instance->cur_proc_row][instance->cur_proc_col];
	vector->cur_proc_row_offset = vector->proc_row_offset[instance->cur_proc_row][instance->cur_proc_col];

	// allocating matrix data storage
	cudaMalloc((void**)&(vector->data), sizeof(double)*vector->cur_proc_row_count);
	cudaThreadSynchronize();
	checkCUDAError("cudaMalloc");
}
コード例 #10
0
ファイル: main.cpp プロジェクト: elf11/GPGPU
bool initCUDA(void)
{
#if __DEVICE_EMULATION__
	return true;
#else
	int count = 0;
	int i = 0;

	cudaGetDeviceCount(&count);
	if(count == 0) {
		fprintf(stderr, "Nu exista nici un device.\n");
		return false;
	}

	printf("Exista %d device-uri.\n",count);
	
	for(i = 0; i < count; i++) {
		cudaDeviceProp prop;
		if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
			if(prop.major >= 1) {
				break;
			}
		}
		
		if(!prop.canMapHostMemory) {
		
			fprintf(stderr, "Device %d cannot map host memory!\n",0);
			exit(EXIT_FAILURE);
		
		}

	}

	if(i == count) {
		fprintf(stderr, "Nu exista nici un device care suporta CUDA.\n");
		return false;
	}
	cudaSetDevice(cutGetMaxGflopsDeviceId());

	cudaSetDeviceFlags(cudaDeviceMapHost);
	checkCUDAError("cudaSetDeviceFlags");
	
	
	printf("CUDA initializat cu succes\n");

	// Create the CUTIL timer
    cutilCheckError( cutCreateTimer( &timer));

	return true;
#endif
}
コード例 #11
0
// finishTest ------------------------------------------------------------------
//		Initializes the cuda timer events and starts the timer.
//		@param start - Start time evet
//		@param end   - End time evet
//		@returns the elapsed time in seconds.
//-----------------------------------------------------------------------------
float finishTest(cudaEvent_t &start, cudaEvent_t &stop){
	float time;
	cudaEventRecord( stop, 0 );
	cudaEventSynchronize( stop );
	cudaEventElapsedTime( &time, start, stop );
	cudaEventDestroy( start );
	cudaEventDestroy( stop );
	printf("Finished Test in %f s\n\n", time/1000.0f);
	
	// Check for errors
	checkCUDAError("test finished");
	
	// Return elapsed time
	return time/1000.0f;
}
コード例 #12
0
ファイル: mpla.cpp プロジェクト: zaspel/MPLA
void mpla_save_vector(struct mpla_vector* x, char* filename, struct mpla_instance* instance)
{
        // create sub-communicator for each process column
        int remain_dims[2];
        remain_dims[0]=1;
        remain_dims[1]=0;
        MPI_Comm column_comm;
        MPI_Cart_sub(instance->comm, remain_dims, &column_comm);
        int column_rank;
        MPI_Comm_rank(column_comm, &column_rank);

        // columnwise creation of the full vector
        double* full_vector;
        int* recvcounts = new int[instance->proc_rows];
        int* displs = new int[instance->proc_rows];
        for (int i=0; i<instance->proc_rows; i++)
        {
                recvcounts[i] = x->proc_row_count[i][instance->cur_proc_col];
                displs[i] = x->proc_row_offset[i][instance->cur_proc_col];
        }
        cudaMalloc((void**)&full_vector, sizeof(double)*x->vec_row_count);
        cudaThreadSynchronize();
        checkCUDAError("cudaMalloc");
        MPI_Allgatherv(x->data, x->cur_proc_row_count, MPI_DOUBLE, full_vector, recvcounts, displs, MPI_DOUBLE, column_comm);

	// writing full vector to file on parent process	
	if (instance->is_parent)
	{
		FILE* f = fopen(filename, "wb");

		double* full_vector_host = new double[x->vec_row_count];
		cudaMemcpy(full_vector_host, full_vector, x->vec_row_count*sizeof(double), cudaMemcpyDeviceToHost);
	
		fwrite(&(x->vec_row_count), sizeof(int), 1, f);
			
		fwrite(full_vector_host, sizeof(double), x->vec_row_count, f);

		fclose(f);	

		delete [] full_vector_host;
	}

        // memory cleanup
        cudaFree(full_vector);
	MPI_Comm_free(&column_comm);

	MPI_Barrier(instance->comm);
}
コード例 #13
0
ファイル: main.cpp プロジェクト: elf11/GPGPU
void cleanup()
{
	free(a_h);free(b_h);free(r_h);
	free(control);

	cutilCheckError(cutStopTimer(timer));
	cutilCheckError(cutDeleteTimer( timer));

	cudaFree(a_d);cudaFree(b_d);cudaFree(r_d);

	
	cutilSafeCall(release());
	checkCUDAError("release");
	cudaThreadExit();
      
}
コード例 #14
0
void CudaImagePyramidHost::copyFromHost(int width, int height, const void* source, int layer)
{
    assert(isInitialized());
    assert(_textureType == cudaTextureType2DLayered);

    cudaMemcpy3DParms myParms = {0};
    myParms.srcPtr = make_cudaPitchedPtr((void*)source,width*_typeSize,width,height);
    myParms.srcPos = make_cudaPos(0,0,0);
    myParms.dstArray = _storage;
    myParms.dstPos = make_cudaPos(0,0,layer);
    myParms.extent = make_cudaExtent(width,height,1);
    myParms.kind = cudaMemcpyHostToDevice;
    cudaMemcpy3D(&myParms);

    checkCUDAError("Memcpy error", _name);
}
コード例 #15
0
void CudaImagePyramidHost::clear() 
{
    if (!isInitialized()) {
        return;
    }

    // Don't bother unbinding the texture if everything is getting destroyed,
    // because it's likely that CUDA has already destroyed the texture.
    if (!_in_destructor) {
        unbindTexture();
    }

    cudaFreeArray(_storage);
    checkCUDAError("Free error", _name);

    _storage = NULL;
    _baseWidth = 0; _baseHeight = 0; _baseWidth = 0; _baseHeight = 0;
}
コード例 #16
0
ファイル: mpla.cpp プロジェクト: zaspel/MPLA
void mpla_redistribute_vector_for_generic_dgesv(struct mpla_vector* b_redist, struct mpla_vector* b, struct mpla_generic_matrix* A, struct mpla_instance* instance)
{
	// attention: this code does no correctness check for the input data


	// WARNING: The following code is not efficient for a strong parallelization !!!!!


	// create sub-communicator for each process column
	int remain_dims[2];
	remain_dims[0]=1;
	remain_dims[1]=0;
	MPI_Comm column_comm;
	MPI_Cart_sub(instance->comm, remain_dims, &column_comm);
	int column_rank;
	MPI_Comm_rank(column_comm, &column_rank);
	
	// columnwise creation of the full vector
	double* full_vector;
	int* recvcounts = new int[instance->proc_rows];
	int* displs = new int[instance->proc_rows];
	for (int i=0; i<instance->proc_rows; i++)
	{
		recvcounts[i] = b->proc_row_count[i][instance->cur_proc_col];
		displs[i] = b->proc_row_offset[i][instance->cur_proc_col];
	}
	cudaMalloc((void**)&full_vector, sizeof(double)*b->vec_row_count);
	cudaThreadSynchronize();
	checkCUDAError("cudaMalloc");
	MPI_Allgatherv(b->data, b->cur_proc_row_count, MPI_DOUBLE, full_vector, recvcounts, displs, MPI_DOUBLE, column_comm);

	// extract column-wise local part of full vector
	cudaMemcpy(b_redist->data, &(full_vector[b_redist->cur_proc_row_offset]), sizeof(double)*b_redist->cur_proc_row_count, cudaMemcpyDeviceToDevice);

	// memory cleanup
	cudaFree(full_vector);


	MPI_Comm_free(&column_comm);
}
コード例 #17
0
ファイル: main.cpp プロジェクト: elf11/GPGPU
void init()
{
	
	// Aloca memorie - local
	//a_h = (float *)malloc(N*sizeof(float));
	//b_h = (float *)malloc(N*sizeof(float));
	//r_h = (float *)malloc(N*sizeof(float));
	
	int size = N*sizeof(float);

	cudaHostAlloc((void **)&a_h, size, 0);
	checkCUDAError("cudaHostAllocMapped");
	cudaHostAlloc((void **)&b_h, size, 0);
	checkCUDAError("cudaHostAllocMapped");
	cudaHostAlloc((void **)&r_h, size, 0);
	checkCUDAError("cudaHostAllocMapped");

	// Aloca memorie - CUDA
	//cutilSafeCall(cudaMalloc((void **) &a_d, N*sizeof(float)));
	//cutilSafeCall(cudaMalloc((void **) &b_d, N*sizeof(float)));
	//cutilSafeCall(cudaMalloc((void **) &r_d, N*sizeof(float)));
	
	cudaHostGetDevicePointer((void **)&a_d, (void *)a_h, 0);
	checkCUDAError("cudaHostGetDevicePointer");
	cudaHostGetDevicePointer((void **)&b_d, (void *)b_h, 0);
	checkCUDAError("cudaHostGetDevicePointer");
	cudaHostGetDevicePointer((void **)&r_d, (void *)r_h, 0);
	checkCUDAError("cudaHostGetDevicePointer");
	
	control = (float *)malloc(N*sizeof(float));

	// Initializeaza vectori
	for(int i=0;i<N;i++)
	{
		a_h[i] = (float)(i % 13)+1;
		b_h[i] = (float)(i % 3)+1;
	}
}
コード例 #18
0
void CudaImagePyramidHost::initialize(int width, int height, cudaTextureFilterMode filter_mode, int depth)
{
    qDebug() << "pyramid host initializing with params: " << width << height << filter_mode << depth;
    if (isInitialized() && width == _baseWidth && height == _baseHeight && filter_mode == _filterMode) {
        return;
    }

    clear();
    qDebug() << "Clear done.";

    _baseWidth = width;
    _baseHeight = height;
    _filterMode = filter_mode;
    _numLayers = depth;

    // Get the texture and its channel descriptor to allocate the storage.
    const textureReference* constTexRefPtr=NULL;
    cudaGetTextureReference(&constTexRefPtr, _texture_name);
    qDebug() << "Texture Ref got:" << _name;
    if (constTexRefPtr == 0)
    {
        qDebug() << "constTexRefPtr==0";
    }
    checkCUDAError("Can't get tex ref for init TEXTURE_PYRAMID", _name);
    cudaChannelFormatDesc formatDesc = constTexRefPtr->channelDesc;

    if(_textureType == cudaTextureType2DLayered){
        cudaDeviceProp prop;
        qDebug() << "to get CUDA device prop";
        cudaGetDeviceProperties(&prop,0);
        qDebug() << "CUDA Device Prop got";
        if(prop.maxTexture2DLayered[0] < _baseWidth || prop.maxTexture2DLayered[1] < _baseHeight || prop.maxTexture2DLayered[2] < _numLayers){
            qDebug()<< "Max layered texture size:" << prop.maxTexture2DLayered[0] << " x " << prop.maxTexture2DLayered[1] << " x " << prop.maxTexture2DLayered[2];
            assert(0);
        }
        cudaExtent extent = make_cudaExtent(_baseWidth, _baseHeight, _numLayers);
        cudaMalloc3DArray(&_storage, &formatDesc, extent, cudaArrayLayered);
    }else{
        cudaMallocArray(&_storage, &formatDesc, _baseWidth, _baseHeight);
    }
    checkCUDAError("Failure to allocate", _name);

    qDebug() << "allocate done";

    // Set texture parameters.
    // Evil hack to get around an apparent bug in the cuda api:
    // cudaGetTextureReference only returns a const reference, and
    // there is no way to set the parameters with a reference other
    // than cast it to non-const.
    textureReference* texRefPtr=NULL;
    texRefPtr = const_cast<textureReference*>( constTexRefPtr );
    texRefPtr->addressMode[0] = cudaAddressModeClamp;
    texRefPtr->addressMode[1] = cudaAddressModeClamp;
    texRefPtr->filterMode = filter_mode;
    texRefPtr->normalized = false; // Use unnormalized (pixel) coordinates for addressing. This forbids texture mode wrap.

    bindTexture();

    qDebug() << "texture binded";

    bool found = false;
    for (size_t i = 0; i < _instances.size(); i++) {
        if (_instances[i] == this)
            found = true;
    }
    if (!found) {
        qDebug() << "Not found";
        _instances.push_back(this);
    }
    qDebug() << "paramid host initialized.";
}
コード例 #19
0
void PointerFreeHashGrid::updateLookupTable() {

//  __BENCH.LOOP_STAGE_START("Process Iterations > Iterations > Build Point Free lookup");


  if (hashGridLists)
    delete[] hashGridLists;

  hashGridLists = new uint[hashGridEntryCount];

  if (hashGridLenghts)
    memset(hashGridLenghts, 0, hashGridSize * sizeof(uint));
  else
    hashGridLenghts = new uint[hashGridSize];

  if (hashGridListsIndex)
    memset(hashGridListsIndex, 0, hashGridSize * sizeof(uint));
  else
    hashGridListsIndex = new uint[hashGridSize];

  uint listIndex = 0;
  for (unsigned int i = 0; i < hashGridSize; ++i) {

    std::list<uint> *hps = hashGrid[i];

    hashGridListsIndex[i] = listIndex;

    if (hps) {
      hashGridLenghts[i] = hps->size();
      std::list<uint>::iterator iter = hps->begin();
      while (iter != hps->end()) {
        hashGridLists[listIndex++] = *iter++;
      }
    } else {
      hashGridLenghts[i] = 0;
    }

  }

//  __BENCH.LOOP_STAGE_STOP("Process Iterations > Iterations > Build Point Free lookup");

//  __BENCH.LOOP_STAGE_START("Process Iterations > Iterations > Copy lookup to device");

  //checkCUDAmemory("before updateLookupTable");

  uint size1 = sizeof(uint) * hashGridEntryCount;

  if (hashGridListsBuff)
    cudaFree(hashGridListsBuff);
  cudaMalloc((void**) (&hashGridListsBuff), size1);

  cudaMemset(hashGridListsBuff, 0, size1);
  cudaMemcpy(hashGridListsBuff, hashGridLists, size1, cudaMemcpyHostToDevice);

  uint size2 = sizeof(uint) * hashGridSize;

  if (!hashGridListsIndexBuff)
    cudaMalloc((void**) (&hashGridListsIndexBuff), size2);

  cudaMemset(hashGridListsIndexBuff, 0, size2);

  cudaMemcpy(hashGridListsIndexBuff, hashGridListsIndex, size2, cudaMemcpyHostToDevice);

  if (!hashGridLenghtsBuff)
    cudaMalloc((void**) (&hashGridLenghtsBuff), size2);

  cudaMemset(hashGridLenghtsBuff, 0, size2);

  cudaMemcpy(hashGridLenghtsBuff, hashGridLenghts, size2, cudaMemcpyHostToDevice);

  checkCUDAError();

//  __BENCH.LOOP_STAGE_STOP("Process Iterations > Iterations > Copy lookup to device");


  //checkCUDAmemory("After updateLookupTable");

}
コード例 #20
0
ファイル: mpla.cpp プロジェクト: zaspel/MPLA
void mpla_init_matrix(struct mpla_matrix* matrix, struct mpla_instance* instance, int mat_row_count, int mat_col_count)
{
	// setting global matrix dimensions
	matrix->mat_row_count = mat_row_count;
	matrix->mat_col_count = mat_col_count;

	// allocating memory for process-wise matrix information
	matrix->proc_row_count = new int*[instance->proc_rows];
	matrix->proc_col_count = new int*[instance->proc_rows];
	matrix->proc_row_offset = new int*[instance->proc_rows];
	matrix->proc_col_offset = new int*[instance->proc_rows];
	for (int i=0; i<instance->proc_rows; i++)
	{
		matrix->proc_row_count[i] = new int[instance->proc_cols];
		matrix->proc_col_count[i] = new int[instance->proc_cols];
		matrix->proc_row_offset[i] = new int[instance->proc_cols];
		matrix->proc_col_offset[i] = new int[instance->proc_cols];
	}

/*
	// computing general row block sizes
	int filled_row_block_size = ceil((float)mat_row_count / (float)(instance->proc_rows));
//	int filled_row_block_count = mat_row_count / filled_row_block_size;
	int last_row_block_size =  mat_row_count % filled_row_block_size;

	// computing general column block sizes
	int filled_col_block_size = ceil((float)mat_col_count / (float)(instance->proc_cols));
//	int filled_col_block_count = mat_col_count / filled_col_block_size;
	int last_col_block_size =  mat_col_count % filled_col_block_size;


	// computing process-wise block row / column counts
	for (int i=0; i < instance->proc_rows; i++)
	{
		for (int j=0; j < instance->proc_cols; j++)
		{
			if ((i==(instance->proc_rows-1)) && (last_row_block_size>0)) // handling last row block which is only partially filled
				matrix->proc_row_count[i][j] = last_row_block_size;
			else
				matrix->proc_row_count[i][j] = filled_row_block_size;
	
			if ((j==(instance->proc_cols-1)) && (last_col_block_size>0)) // handling last column block which is only partially filled
				matrix->proc_col_count[i][j] = last_col_block_size;
			else
				matrix->proc_col_count[i][j] = filled_col_block_size;
		}
	}
*/
	
	// computing general row block sizes
	int almost_filled_row_block_size = mat_row_count / instance->proc_rows;
	int remaining_rows = mat_row_count % instance->proc_rows;
	if (almost_filled_row_block_size == 0)
	{
		printf("MPLA: There are more process block rows than matrix rows. Exiting...\n");
		exit(1);
	}


	// computing general column block sizes
	int almost_filled_col_block_size = mat_col_count / instance->proc_cols;
	int remaining_cols = mat_col_count % instance->proc_cols;

	if (almost_filled_row_block_size == 0)
	{
		printf("MPLA: There are more process block columns than matrix columns. Exiting...\n");
		exit(1);
	}


	// computing process-wise block row / column counts
	for (int i=0; i< instance->proc_rows; i++)
	{
		for (int j=0; j<instance->proc_cols; j++)
		{
			matrix->proc_row_count[i][j] = almost_filled_row_block_size + ( (i<remaining_rows) ? 1 : 0 );
			matrix->proc_col_count[i][j] = almost_filled_col_block_size + ( (j<remaining_cols) ? 1 : 0 );
		}
	}
	



	// computing process-wise block row / column offsets
	matrix->proc_row_offset[0][0] = 0;
	matrix->proc_col_offset[0][0] = 0;
	for (int i=1; i<instance->proc_rows; i++)
		matrix->proc_col_offset[i][0] = 0;
	for (int j=1; j<instance->proc_cols; j++)
		matrix->proc_row_offset[0][j] = 0;
	for (int i=1; i < instance->proc_rows; i++)
		for (int j=0; j < instance->proc_cols; j++)
			matrix->proc_row_offset[i][j] = matrix->proc_row_offset[i-1][j] + matrix->proc_row_count[i-1][j];
	for (int j=1; j < instance->proc_cols; j++)
		for (int i=0; i < instance->proc_rows; i++)
			matrix->proc_col_offset[i][j] = matrix->proc_col_offset[i][j-1] + matrix->proc_col_count[i][j-1];
		
	// retrieving local data for the current process
	matrix->cur_proc_row_count = matrix->proc_row_count[instance->cur_proc_row][instance->cur_proc_col];
	matrix->cur_proc_col_count = matrix->proc_col_count[instance->cur_proc_row][instance->cur_proc_col];
	matrix->cur_proc_row_offset = matrix->proc_row_offset[instance->cur_proc_row][instance->cur_proc_col];
	matrix->cur_proc_col_offset = matrix->proc_col_offset[instance->cur_proc_row][instance->cur_proc_col];

	// allocating matrix data storage
	cudaMalloc((void**)&(matrix->data), sizeof(double)*matrix->cur_proc_row_count*matrix->cur_proc_col_count);
	cudaThreadSynchronize();
	checkCUDAError("cudaMalloc");
}
コード例 #21
0
ファイル: mpla.cpp プロジェクト: zaspel/MPLA
void mpla_init_vector_for_block_rows(struct mpla_vector* vector, struct mpla_instance* instance, int vec_row_count)
{
	// setting global vector size
	vector->vec_row_count = vec_row_count;

	// allocating memory for process-wise vector information
	vector->proc_row_count = new int*[instance->proc_rows];
	vector->proc_row_offset = new int*[instance->proc_rows];
	for (int i=0; i<instance->proc_rows; i++)
	{
		vector->proc_row_count[i] = new int[instance->proc_cols];
		vector->proc_row_offset[i] = new int[instance->proc_cols];
	}

	// computing general row block sizes
	int almost_filled_row_block_size = vec_row_count / instance->proc_cols;
	int remaining_rows = vec_row_count % instance->proc_cols;

	if (almost_filled_row_block_size == 0)
	{
		printf("MPLA: There are more process block columns than matrix columns. Exiting...\n");
		exit(1);
	}


	// computing process-wise block row / column counts
	for (int i=0; i< instance->proc_rows; i++)
	{
		for (int j=0; j<instance->proc_cols; j++)
		{
			vector->proc_row_count[i][j] = almost_filled_row_block_size + ( (j<remaining_rows) ? 1 : 0 );
		}
	}
	

/*

	// computing general row block sizes
	int filled_row_block_size = ceil((float)vec_row_count / (float)(instance->proc_cols));
//	int filled_row_block_count = vec_row_count / filled_row_block_size;
	int last_row_block_size =  vec_row_count % filled_row_block_size;

	// computing process-wise block row / column counts
	for (int i=0; i < instance->proc_rows; i++)
	{
		for (int j=0; j < instance->proc_cols; j++)
		{
			if ((j==(instance->proc_cols-1)) && (last_row_block_size>0)) // handling last row block which is only partially filled
				vector->proc_row_count[i][j] = last_row_block_size;
			else
				vector->proc_row_count[i][j] = filled_row_block_size;
		}
	}
*/
	// computing process-wise block row / column offsets
	vector->proc_row_offset[0][0] = 0;
	for (int i=1; i < instance->proc_rows; i++)
		vector->proc_row_offset[i][0] = 0;
	for (int j=1; j < instance->proc_cols; j++)
		for (int i=0; i < instance->proc_rows; i++)
			vector->proc_row_offset[i][j] = vector->proc_row_offset[i][j-1] + vector->proc_row_count[i][j-1];
	
	// retrieving local data for the current process
	vector->cur_proc_row_count = vector->proc_row_count[instance->cur_proc_row][instance->cur_proc_col];
	vector->cur_proc_row_offset = vector->proc_row_offset[instance->cur_proc_row][instance->cur_proc_col];

	// allocating matrix data storage
	cudaMalloc((void**)&(vector->data), sizeof(double)*vector->cur_proc_row_count);
	cudaThreadSynchronize();
	checkCUDAError("cudaMalloc");
}