void solve_lower_system_on_gpu(gpu_symm_band_matrix gpu_matrix, double * d_b, cublasHandle_t handle)
{
	int num_tiles;
	int bs = gpu_matrix.block_size;
	int order = gpu_matrix.order;
	int cur_bs = gpu_matrix.block_size;
	double alpha_1 = 1;
	double alpha_m1 = -1;

	num_tiles = (order + bs - 1) / bs;
	for(int i = 0; i < num_tiles; i++)
	{
		if(cur_bs > gpu_matrix.tile_len[i])
		{
			cur_bs = gpu_matrix.tile_len[i];
		}

		checkCublasErrors(cublasDtrsv_v2(handle, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, cur_bs, gpu_matrix.gpu_matrix_tiles[i], cur_bs, &d_b[i*bs], 1));
		if(i < num_tiles -1)
		{
			checkCublasErrors(cublasDgemv_v2(handle, CUBLAS_OP_T, bs, gpu_matrix.tile_len[i] - bs, &alpha_m1, 
				&gpu_matrix.gpu_matrix_tiles[i][bs*bs], bs, &d_b[i*bs], 1, &alpha_1, &d_b[(i+1)*bs], 1));
		}
	}
}
void solve_system_on_gpu(gpu_symm_band_matrix gpu_matrix, double * b, cublasHandle_t handle)
{
	double * d_b;
	checkCudaErrors(cudaMalloc(&d_b, gpu_matrix.order*sizeof(double)));
	checkCublasErrors(cublasSetVector(gpu_matrix.order, sizeof(double), b, 1, d_b, 1));
	
	solve_lower_system_on_gpu(gpu_matrix, d_b, handle);
	solve_upper_system_on_gpu(gpu_matrix, d_b, handle);

	checkCublasErrors(cublasGetVector(gpu_matrix.order, sizeof(double), d_b, 1, b, 1));

	checkCudaErrors(cudaFree(d_b));
}
コード例 #3
0
void copyBandMatrixFromDevice(gpu_symm_band_matrix gpu_matrix, double* h_matrix,
	cublasHandle_t handle)
{
	int num_tiles;
	int bs = gpu_matrix.block_size;
	int order = gpu_matrix.order;
	int hb = gpu_matrix.half_bandwith;
	int i;
	int cur_row;
	int cur_bs;
	double* temp_tile;

	num_tiles = (order + bs - 1) / bs;
	temp_tile = (double*) malloc( (bs + hb) * bs *sizeof(double) );

	for (i = 0, cur_row = 0; i < num_tiles; i++, cur_row += bs)
	{
		cur_bs = bs;
		if (cur_row + cur_bs > order)
		{
			cur_bs = order - cur_row;
		}

		checkCublasErrors(cublasGetMatrix(cur_bs, gpu_matrix.tile_len[i], sizeof(double), gpu_matrix.gpu_matrix_tiles[i], cur_bs, temp_tile, bs));

		set_host_matrix_tile( cur_bs, cur_row, hb, order, h_matrix, temp_tile, bs, gpu_matrix.tile_len[i]);

	}
	free(temp_tile);
}
void solve_upper_system_on_gpu(gpu_symm_band_matrix gpu_matrix, double * d_b, cublasHandle_t handle)
{
	int num_tiles;
	int bs = gpu_matrix.block_size;
	int order = gpu_matrix.order;
	int cur_bs = gpu_matrix.block_size;
	double alpha_1 = 1;
	double alpha_m1 = -1;
	double b[14];
	for(int i = 0; i < 14; i++)
	{
		b[i] = -55;
	}

	num_tiles = (order + bs - 1) / bs;
	for(int i = num_tiles - 1; i >= 0; i--)
	{

		if(cur_bs > gpu_matrix.tile_len[i])
		{
			cur_bs = gpu_matrix.tile_len[i];
		}
		else
		{
			cur_bs = bs;
		}

		checkCublasErrors(cublasDtrsv_v2(handle, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, cur_bs, gpu_matrix.gpu_matrix_tiles[i], cur_bs, &d_b[i*bs], 1));

		int pt = i-1;
		int el = bs;
		for(; pt >= 0 && gpu_matrix.tile_len[pt] > el; pt--, el+=bs)
		{
			int cur_col_num = min(bs, gpu_matrix.tile_len[pt] - el);
			checkCublasErrors(cublasDgemv_v2(handle, CUBLAS_OP_N, bs, cur_col_num, &alpha_m1, 
				&gpu_matrix.gpu_matrix_tiles[pt][el*bs], bs, &d_b[i*bs], 1, &alpha_1, &d_b[pt*bs], 1));
		}
		
	}
}
コード例 #5
0
void matrixMulti(cublasHandle_t cublasHandle, 
                                       int m, 
                                       int n,
                               int batchSize, 
                                 float alpha,
		                       const float*A, 
                               const float*x, 
                                  float beta, 
                                    float *y)
{
#ifdef DISABLE_GEMV
	checkCublasErrors(cublasSgemm(cublasHandle,
			          CUBLAS_OP_T,
			          CUBLAS_OP_T,
			          n,
			          batchSize,
			          m,
			          &alpha,
			          x,
			          m,
			          A,
			          batchSize,
			          &beta,
			          y,
			          n));

#else

	checkCublasErrors(cublasSgemv(cublasHandle, CUBLAS_OP_T,
			                     m, n,
			                     &alpha,
			                     A, m,
			                     x, 1,
			                     &beta,
			                     y, 1));


#endif

}