/*
 * Forward-substitution pass: solves L * y = b on the device, tile by tile.
 *
 * Each diagonal block is stored as an upper-triangular factor, so the
 * lower solve is expressed as CUBLAS_FILL_MODE_UPPER + CUBLAS_OP_T.
 * After a tile's sub-vector is solved, the off-diagonal band columns of
 * that tile are applied (via gemv) to the following rhs entries.
 *
 * gpu_matrix - tiled banded factor already resident on the device
 * d_b        - device rhs vector of length gpu_matrix.order; overwritten
 *              in place with the intermediate solution y
 * handle     - cuBLAS handle used for all calls
 */
void solve_lower_system_on_gpu(gpu_symm_band_matrix gpu_matrix, double * d_b, cublasHandle_t handle) {
    const int bs = gpu_matrix.block_size;
    const int order = gpu_matrix.order;
    const int num_tiles = (order + bs - 1) / bs;   /* ceil-div */
    double one = 1;
    double minus_one = -1;
    int diag_size = bs;

    for (int tile = 0; tile < num_tiles; tile++) {
        /* Trailing tiles can be shorter than a full block; shrink once
         * reached (tile_len only dips below bs at the end of the band). */
        if (diag_size > gpu_matrix.tile_len[tile]) {
            diag_size = gpu_matrix.tile_len[tile];
        }

        /* Solve the diagonal block against its slice of the rhs. */
        checkCublasErrors(cublasDtrsv_v2(handle, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_T,
                                         CUBLAS_DIAG_NON_UNIT, diag_size,
                                         gpu_matrix.gpu_matrix_tiles[tile], diag_size,
                                         &d_b[tile * bs], 1));

        /* Push the solved sub-vector into the rhs entries covered by this
         * tile's band columns (skipped for the last tile: nothing follows). */
        if (tile < num_tiles - 1) {
            checkCublasErrors(cublasDgemv_v2(handle, CUBLAS_OP_T, bs,
                                             gpu_matrix.tile_len[tile] - bs, &minus_one,
                                             &gpu_matrix.gpu_matrix_tiles[tile][bs * bs], bs,
                                             &d_b[tile * bs], 1, &one,
                                             &d_b[(tile + 1) * bs], 1));
        }
    }
}
/*
 * Solves the full system A * x = b using the factored banded matrix on the GPU.
 *
 * Stages the host rhs onto the device, runs the lower then upper
 * triangular solves in place, and copies the solution back into b.
 *
 * gpu_matrix - device-resident tiled factor
 * b          - host vector of length gpu_matrix.order; holds the rhs on
 *              entry and the solution on return
 * handle     - cuBLAS handle used for the vector transfers and solves
 */
void solve_system_on_gpu(gpu_symm_band_matrix gpu_matrix, double * b, cublasHandle_t handle) {
    const int n = gpu_matrix.order;
    double * d_rhs;

    checkCudaErrors(cudaMalloc(&d_rhs, n * sizeof(double)));
    checkCublasErrors(cublasSetVector(n, sizeof(double), b, 1, d_rhs, 1));

    /* Two-stage triangular solve, both operating on d_rhs in place. */
    solve_lower_system_on_gpu(gpu_matrix, d_rhs, handle);
    solve_upper_system_on_gpu(gpu_matrix, d_rhs, handle);

    checkCublasErrors(cublasGetVector(n, sizeof(double), d_rhs, 1, b, 1));
    checkCudaErrors(cudaFree(d_rhs));
}
/*
 * Copies every device tile of the banded matrix back into a host-side matrix.
 *
 * Tiles are fetched one at a time through a host staging buffer sized for the
 * largest possible tile (bs rows by at most bs + hb columns), then written
 * into h_matrix by set_host_matrix_tile.
 *
 * gpu_matrix - device-resident tiled matrix to read back
 * h_matrix   - destination host matrix (layout handled by set_host_matrix_tile)
 * handle     - cuBLAS handle (unused here; kept for interface consistency
 *              with the other *_on_gpu routines)
 *
 * Fix: the malloc result was previously passed to cublasGetMatrix unchecked;
 * on allocation failure we now bail out without touching the device tiles.
 */
void copyBandMatrixFromDevice(gpu_symm_band_matrix gpu_matrix, double* h_matrix, cublasHandle_t handle) {
    int bs = gpu_matrix.block_size;
    int order = gpu_matrix.order;
    int hb = gpu_matrix.half_bandwith;
    int num_tiles = (order + bs - 1) / bs;   /* ceil-div */
    int i;
    int cur_row;
    int cur_bs;
    double* temp_tile;

    /* Staging buffer large enough for any tile: bs rows x (bs + hb) columns. */
    temp_tile = (double*) malloc( (bs + hb) * bs * sizeof(double) );
    if (temp_tile == NULL) {
        /* Out of host memory: leave h_matrix untouched rather than handing a
         * NULL destination to cublasGetMatrix. */
        return;
    }

    for (i = 0, cur_row = 0; i < num_tiles; i++, cur_row += bs) {
        cur_bs = bs;
        if (cur_row + cur_bs > order) {
            cur_bs = order - cur_row;   /* final tile may be short */
        }
        /* Device tile is stored with leading dimension cur_bs; restage into
         * the host buffer with leading dimension bs. */
        checkCublasErrors(cublasGetMatrix(cur_bs, gpu_matrix.tile_len[i], sizeof(double),
                                          gpu_matrix.gpu_matrix_tiles[i], cur_bs,
                                          temp_tile, bs));
        set_host_matrix_tile(cur_bs, cur_row, hb, order, h_matrix,
                             temp_tile, bs, gpu_matrix.tile_len[i]);
    }

    free(temp_tile);
}
/*
 * Backward-substitution pass: solves U * x = y on the device, iterating the
 * tiles in reverse. For each solved tile, every earlier tile whose band
 * columns reach it gets its rhs slice updated via gemv.
 *
 * gpu_matrix - device-resident tiled factor
 * d_b        - device vector of length gpu_matrix.order holding the forward
 *              solve's result; overwritten in place with the solution x
 * handle     - cuBLAS handle used for all calls
 *
 * Fix: removed a leftover debug array (double b[14] filled with -55) that was
 * written but never read.
 */
void solve_upper_system_on_gpu(gpu_symm_band_matrix gpu_matrix, double * d_b, cublasHandle_t handle) {
    int num_tiles;
    int bs = gpu_matrix.block_size;
    int order = gpu_matrix.order;
    int cur_bs = gpu_matrix.block_size;
    double alpha_1 = 1;
    double alpha_m1 = -1;

    num_tiles = (order + bs - 1) / bs;   /* ceil-div */

    for (int i = num_tiles - 1; i >= 0; i--) {
        /* The last tile may be shorter than a full block; reset to bs for the
         * full-size tiles encountered afterwards. */
        if (cur_bs > gpu_matrix.tile_len[i]) {
            cur_bs = gpu_matrix.tile_len[i];
        } else {
            cur_bs = bs;
        }

        /* Solve the diagonal block (stored upper, no transpose). */
        checkCublasErrors(cublasDtrsv_v2(handle, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
                                         CUBLAS_DIAG_NON_UNIT, cur_bs,
                                         gpu_matrix.gpu_matrix_tiles[i], cur_bs,
                                         &d_b[i * bs], 1));

        /* Walk backwards over earlier tiles whose band columns overlap tile i:
         * pt is the earlier tile, el the column offset of the overlap within
         * that tile's storage. */
        int pt = i - 1;
        int el = bs;
        for (; pt >= 0 && gpu_matrix.tile_len[pt] > el; pt--, el += bs) {
            int cur_col_num = min(bs, gpu_matrix.tile_len[pt] - el);
            checkCublasErrors(cublasDgemv_v2(handle, CUBLAS_OP_N, bs, cur_col_num,
                                             &alpha_m1,
                                             &gpu_matrix.gpu_matrix_tiles[pt][el * bs], bs,
                                             &d_b[i * bs], 1, &alpha_1,
                                             &d_b[pt * bs], 1));
        }
    }
}
/*
 * Single-precision matrix-vector multiply dispatched through cuBLAS.
 *
 * Default path: y = alpha * A^T * x + beta * y via cublasSgemv, with A an
 * m x n column-major matrix (lda = m), x of length m, y of length n.
 *
 * DISABLE_GEMV path: the same product is formed through cublasSgemm with
 * both operands transposed and batchSize as the second output dimension.
 * NOTE(review): in this path the roles/leading dimensions differ (x with
 * lda = m, A with lda = batchSize) — presumably a batched formulation where
 * A packs batchSize row-vectors; verify against the callers before relying
 * on it, and note that `alpha`/`beta` are taken by value and passed by
 * address as required by the cuBLAS host-pointer mode.
 */
void matrixMulti(cublasHandle_t cublasHandle, int m, int n, int batchSize, float alpha, const float*A, const float*x, float beta, float *y) {
#ifdef DISABLE_GEMV
    /* gemm formulation: C(n x batchSize) = op(x)(n x m) * op(A)(m x batchSize). */
    checkCublasErrors(cublasSgemm(cublasHandle, CUBLAS_OP_T, CUBLAS_OP_T, n, batchSize, m, &alpha, x, m, A, batchSize, &beta, y, n));
#else
    /* gemv formulation: y = alpha * A^T x + beta * y, A is m x n (lda = m). */
    checkCublasErrors(cublasSgemv(cublasHandle, CUBLAS_OP_T, m, n, &alpha, A, m, x, 1, &beta, y, 1));
#endif
}