// single-precision dot product on the GPU (legacy cuBLAS API, no handle argument)
static double cuda_sdot(long size, const float* src1, const float* src2)
{
	assert(cuda_ondevice(src1));
	assert(cuda_ondevice(src2));
//	printf("SDOT %x %x %ld\n", src1, src2, size);
	return cublasSdot(size, src1, 1, src2, 1);
}
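A hedged sketch of how such a routine is typically reached: the caller checks where the data lives and falls back to a CPU BLAS call otherwise, mirroring the dispatch in blas_cgemm further down. The wrapper name vec_sdot is an illustrative assumption, not BART code.

#include <cblas.h>

// hypothetical dispatch wrapper (sketch only)
static double vec_sdot(long size, const float* src1, const float* src2)
{
#ifdef USE_CUDA
	if (cuda_ondevice(src1))
		return cuda_sdot(size, src1, src2);	// GPU path from above
#endif
	return cblas_sdot(size, src1, 1, src2, 1);	// CPU fallback
}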
void fwt1(unsigned int N, unsigned int d, const long dims[N], const long ostr[N],
		complex float* low, complex float* hgh,
		const long istr[N], const complex float* in,
		const long flen, const float filter[2][2][flen])
{
	debug_printf(DP_DEBUG4, "fwt1: %d/%d\n", d, N);
	debug_print_dims(DP_DEBUG4, N, dims);

	assert(dims[d] >= 2);

	long odims[N];
	md_copy_dims(N, odims, dims);
	odims[d] = bandsize(dims[d], flen);

	debug_print_dims(DP_DEBUG4, N, odims);

	long o = d + 1;
	long u = N - o;

	// 0 1 2 3 4 5 6|7
	// --d-- * --u--|N
	// ---o---

	assert(d == md_calc_blockdim(d, dims + 0, istr + 0, CFL_SIZE));
	assert(u == md_calc_blockdim(u, dims + o, istr + o, CFL_SIZE * md_calc_size(o, dims)));

	assert(d == md_calc_blockdim(d, odims + 0, ostr + 0, CFL_SIZE));
	assert(u == md_calc_blockdim(u, odims + o, ostr + o, CFL_SIZE * md_calc_size(o, odims)));

	// merge dims
	long wdims[3] = { md_calc_size(d, dims), dims[d], md_calc_size(u, dims + o) };
	long wistr[3] = { CFL_SIZE, istr[d], CFL_SIZE * md_calc_size(o, dims) };
	long wostr[3] = { CFL_SIZE, ostr[d], CFL_SIZE * md_calc_size(o, odims) };

#ifdef USE_CUDA
	if (cuda_ondevice(in)) {

		assert(cuda_ondevice(low));
		assert(cuda_ondevice(hgh));

		float* flow = md_gpu_move(1, MD_DIMS(flen), filter[0][0], FL_SIZE);
		float* fhgh = md_gpu_move(1, MD_DIMS(flen), filter[0][1], FL_SIZE);

		wl3_cuda_down3(wdims, wostr, low, wistr, in, flen, flow);
		wl3_cuda_down3(wdims, wostr, hgh, wistr, in, flen, fhgh);

		md_free(flow);
		md_free(fhgh);

		return;
	}
#endif

	// no clear needed
	wavelet_down3(wdims, wostr, low, wistr, in, flen, filter[0][0]);
	wavelet_down3(wdims, wostr, hgh, wistr, in, flen, filter[0][1]);
}
void iwt1(unsigned int N, unsigned int d, const long dims[N], const long ostr[N],
		complex float* out, const long istr[N],
		const complex float* low, const complex float* hgh,
		const long flen, const float filter[2][2][flen])
{
	debug_printf(DP_DEBUG4, "ifwt1: %d/%d\n", d, N);
	debug_print_dims(DP_DEBUG4, N, dims);

	assert(dims[d] >= 2);

	long idims[N];
	md_copy_dims(N, idims, dims);
	idims[d] = bandsize(dims[d], flen);

	debug_print_dims(DP_DEBUG4, N, idims);

	long o = d + 1;
	long u = N - o;

	// 0 1 2 3 4 5 6|7
	// --d-- * --u--|N
	// ---o---

	assert(d == md_calc_blockdim(d, dims + 0, ostr + 0, CFL_SIZE));
	assert(u == md_calc_blockdim(u, dims + o, ostr + o, CFL_SIZE * md_calc_size(o, dims)));

	assert(d == md_calc_blockdim(d, idims + 0, istr + 0, CFL_SIZE));
	assert(u == md_calc_blockdim(u, idims + o, istr + o, CFL_SIZE * md_calc_size(o, idims)));

	long wdims[3] = { md_calc_size(d, dims), dims[d], md_calc_size(u, dims + o) };
	long wistr[3] = { CFL_SIZE, istr[d], CFL_SIZE * md_calc_size(o, idims) };
	long wostr[3] = { CFL_SIZE, ostr[d], CFL_SIZE * md_calc_size(o, dims) };

	md_clear(3, wdims, out, CFL_SIZE);	// clear here - we cannot clear inside the up3 calls because both bands are merged into out

#ifdef USE_CUDA
	if (cuda_ondevice(out)) {

		assert(cuda_ondevice(low));
		assert(cuda_ondevice(hgh));

		float* flow = md_gpu_move(1, MD_DIMS(flen), filter[1][0], FL_SIZE);
		float* fhgh = md_gpu_move(1, MD_DIMS(flen), filter[1][1], FL_SIZE);

		wl3_cuda_up3(wdims, wostr, out, wistr, low, flen, flow);
		wl3_cuda_up3(wdims, wostr, out, wistr, hgh, flen, fhgh);

		md_free(flow);
		md_free(fhgh);

		return;
	}
#endif

	wavelet_up3(wdims, wostr, out, wistr, low, flen, filter[1][0]);
	wavelet_up3(wdims, wostr, out, wistr, hgh, flen, filter[1][1]);
}
static void linop_matrix_apply_normal(const linop_data_t* _data, complex float* dst, const complex float* src)
{
	struct operator_matrix_s* data = CAST_DOWN(operator_matrix_s, _data);

	if (NULL == data->mat_gram) {

		// no precomputed Gram matrix: apply forward followed by adjoint
		complex float* tmp = md_alloc_sameplace(data->N, data->out_dims, CFL_SIZE, src);

		linop_matrix_apply(_data, tmp, src);
		linop_matrix_apply_adjoint(_data, dst, tmp);

		md_free(tmp);

	} else {

		const complex float* mat_gram = data->mat_gram;

#ifdef USE_CUDA
		if (cuda_ondevice(src)) {

			if (NULL == data->mat_gram_gpu)
				data->mat_gram_gpu = md_gpu_move(2 * data->N, data->grm_dims, data->mat_gram, CFL_SIZE);

			mat_gram = data->mat_gram_gpu;
		}
#endif
		md_ztenmul(2 * data->N, data->gout_dims, dst, data->gin_dims, src, data->grm_dims, mat_gram);
	}
}
const struct operator_s* fft_create2(unsigned int D, const long dimensions[D], unsigned long flags,
		const long ostrides[D], complex float* dst,
		const long istrides[D], const complex float* src, bool backwards)
{
	PTR_ALLOC(struct fft_plan_s, plan);
	SET_TYPEID(fft_plan_s, plan);

	plan->fftw = fft_fftwf_plan(D, dimensions, flags, ostrides, dst, istrides, src, backwards, false);

#ifdef USE_CUDA
	plan->cuplan = NULL;
#ifndef LAZY_CUDA
	if (cuda_ondevice(src))
		plan->cuplan = fft_cuda_plan(D, dimensions, flags, ostrides, istrides, backwards);
#else
	// with LAZY_CUDA, only record what is needed to create the cuFFT plan
	// on first GPU use (see fft_apply)
	plan->D = D;
	plan->flags = flags;
	plan->backwards = backwards;

	PTR_ALLOC(long[D], dims);
	md_copy_dims(D, *dims, dimensions);
	plan->dims = *PTR_PASS(dims);

	PTR_ALLOC(long[D], istrs);
	md_copy_strides(D, *istrs, istrides);
	plan->istrs = *PTR_PASS(istrs);

	PTR_ALLOC(long[D], ostrs);
	md_copy_strides(D, *ostrs, ostrides);
	plan->ostrs = *PTR_PASS(ostrs);
#endif
#endif

	return operator_create2(D, dimensions, ostrides, D, dimensions, istrides,
			CAST_UP(PTR_PASS(plan)), fft_apply, fft_free_plan);
}
static void sampling_apply(const linop_data_t* _data, complex float* dst, const complex float* src)
{
	const auto data = CAST_DOWN(sampling_data_s, _data);

#ifdef USE_CUDA
	const complex float* pattern = get_pat(data, cuda_ondevice(src));
#else
	const complex float* pattern = data->pattern;
#endif

	md_zmul2(DIMS, data->dims, data->strs, dst, data->strs, src, data->pat_strs, pattern);
}
static void fmac_apply(const linop_data_t* _data, complex float* dst, const complex float* src)
{
	auto data = CAST_DOWN(fmac_data, _data);

#ifdef USE_CUDA
	const complex float* tensor = get_tensor(data, cuda_ondevice(src));
#else
	const complex float* tensor = data->tensor;
#endif

	md_clear2(data->N, data->odims, data->ostrs, dst, CFL_SIZE);
	md_zfmac2(data->N, data->dims, data->ostrs, dst, data->istrs, src, data->tstrs, tensor);
}
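get_tensor itself is not shown in this collection. A hedged sketch of what it presumably does, following the lazy-copy pattern written out inline in cdiag_adjoint and linop_matrix_apply_adjoint further down; the field names gpu_tensor and tdims are assumptions, not the actual struct members.

#ifdef USE_CUDA
// hypothetical sketch of get_tensor (field names gpu_tensor/tdims are assumed)
static const complex float* get_tensor(const struct fmac_data* data, bool gpu)
{
	const complex float* tensor = data->tensor;

	if (gpu) {

		// cache a GPU copy of the constant tensor on first use
		if (NULL == data->gpu_tensor)
			((struct fmac_data*)data)->gpu_tensor = md_gpu_move(data->N, data->tdims, data->tensor, CFL_SIZE);

		tensor = data->gpu_tensor;
	}

	return tensor;
}
#endif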
void blas_cgemm(char transa, char transb, long M, long N, long K,
		const complex float alpha, long lda, const complex float A[K][lda],
		long ldb, const complex float B[N][ldb],
		const complex float beta, long ldc, complex float C[N][ldc])
{
#ifdef USE_CUDA
#define CUCOMPLEX(x) (((union { cuComplex cu; complex float std; }){ .std = (x) }).cu)
	if (cuda_ondevice(A)) {

		cublasCgemm(transa, transb, M, N, K, CUCOMPLEX(alpha),
				(const cuComplex*)A, lda, (const cuComplex*)B, ldb,
				CUCOMPLEX(beta), (cuComplex*)C, ldc);
	} else
#endif
	cblas_cgemm(CblasColMajor, transa, transb, M, N, K,
			(void*)&alpha, (void*)A, lda, (void*)B, ldb,
			(void*)&beta, (void*)C, ldc);
}
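The CUCOMPLEX macro above converts between the C99 and cuBLAS complex types by type punning through a union rather than a pointer cast. A minimal sketch of the same trick as a standalone helper; the name to_cucomplex is illustrative only.

#include <complex.h>
#include <cuComplex.h>

// reinterpret a C99 complex float (re, im) as a cuComplex (.x, .y)
static cuComplex to_cucomplex(complex float z)
{
	return (union { cuComplex cu; complex float std; }){ .std = z }.cu;
}

// to_cucomplex(1.f + 2.f * I) yields (cuComplex){ .x = 1.f, .y = 2.f }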
const struct operator_s* fft_create2(unsigned int D, const long dimensions[D], unsigned long flags,
		const long ostrides[D], complex float* dst,
		const long istrides[D], const complex float* src, bool backwards)
{
	PTR_ALLOC(struct fft_plan_s, plan);

	plan->fftw = fft_fftwf_plan(D, dimensions, flags, ostrides, dst, istrides, src, backwards);

#ifdef USE_CUDA
	plan->cuplan = NULL;

	if (cuda_ondevice(src))
		plan->cuplan = fft_cuda_plan(D, dimensions, flags, ostrides, istrides, backwards);
#endif

	return operator_create2(D, dimensions, ostrides, D, dimensions, istrides,
			&PTR_PASS(plan)->base, fft_apply, fft_free_plan);
}
static void cdiag_adjoint(const linop_data_t* _data, complex float* dst, const complex float* src)
{
	const struct cdiag_s* data = CAST_DOWN(cdiag_s, _data);

	const complex float* diag = data->diag;

#ifdef USE_CUDA
	if (cuda_ondevice(src)) {

		// lazily move the diagonal to the GPU on first use
		if (NULL == data->gpu_diag)
			((struct cdiag_s*)data)->gpu_diag = md_gpu_move(data->N, data->dims, data->diag, CFL_SIZE);

		diag = data->gpu_diag;
	}
#endif
	(data->rmul ? md_zrmul2 : md_zmulc2)(data->N, data->dims, data->strs, dst, data->strs, src, data->dstrs, diag);
}
void md_gaussian_rand(unsigned int D, const long dims[D], complex float* dst)
{
#ifdef USE_CUDA
	if (cuda_ondevice(dst)) {

		// generate on the CPU, then copy to the device
		complex float* tmp = md_alloc(D, dims, sizeof(complex float));

		md_gaussian_rand(D, dims, tmp);
		md_copy(D, dims, dst, tmp, sizeof(complex float));

		md_free(tmp);
		return;
	}
#endif
//#pragma omp parallel for
	for (long i = 0; i < md_calc_size(D, dims); i++)
		dst[i] = (float)gaussian_rand();
}
static void linop_matrix_apply_adjoint(const linop_data_t* _data, complex float* dst, const complex float* src)
{
	struct operator_matrix_s* data = CAST_DOWN(operator_matrix_s, _data);

	const complex float* mat = data->mat;

#ifdef USE_CUDA
	if (cuda_ondevice(src)) {

		if (NULL == data->mat_gpu)
			data->mat_gpu = md_gpu_move(data->N, data->mat_dims, data->mat, CFL_SIZE);

		mat = data->mat_gpu;
	}
#endif
	md_ztenmulc(data->N, data->in_dims, dst, data->out_dims, src, data->mat_dims, mat);
}
const struct operator_s* fft_measure_create(unsigned int D, const long dimensions[D], unsigned long flags, bool inplace, bool backwards)
{
	PTR_ALLOC(struct fft_plan_s, plan);
	SET_TYPEID(fft_plan_s, plan);

	complex float* src = md_alloc(D, dimensions, CFL_SIZE);
	complex float* dst = inplace ? src : md_alloc(D, dimensions, CFL_SIZE);

	long strides[D];
	md_calc_strides(D, strides, dimensions, CFL_SIZE);

	plan->fftw = fft_fftwf_plan(D, dimensions, flags, strides, dst, strides, src, backwards, true);

	md_free(src);

	if (!inplace)
		md_free(dst);

#ifdef USE_CUDA
	plan->cuplan = NULL;
#ifndef LAZY_CUDA
	if (cuda_ondevice(src))
		plan->cuplan = fft_cuda_plan(D, dimensions, flags, strides, strides, backwards);
#else
	plan->D = D;
	plan->flags = flags;
	plan->backwards = backwards;

	PTR_ALLOC(long[D], dims);
	md_copy_dims(D, *dims, dimensions);
	plan->dims = *PTR_PASS(dims);

	PTR_ALLOC(long[D], istrs);
	md_copy_strides(D, *istrs, strides);
	plan->istrs = *PTR_PASS(istrs);

	PTR_ALLOC(long[D], ostrs);
	md_copy_strides(D, *ostrs, strides);
	plan->ostrs = *PTR_PASS(ostrs);
#endif
#endif

	return operator_create2(D, dimensions, strides, D, dimensions, strides,
			CAST_UP(PTR_PASS(plan)), fft_apply, fft_free_plan);
}
static void fft_apply(const operator_data_t* _plan, unsigned int N, void* args[N])
{
	complex float* dst = args[0];
	const complex float* src = args[1];
	const struct fft_plan_s* plan = CONTAINER_OF(_plan, const struct fft_plan_s, base);

	assert(2 == N);

#ifdef USE_CUDA
	if (cuda_ondevice(src)) {

		assert(NULL != plan->cuplan);
		fft_cuda_exec(plan->cuplan, dst, src);

	} else
#endif
	{
		assert(NULL != plan->fftw);
		fftwf_execute_dft(plan->fftw, (complex float*)src, dst);
	}
}
static void fft_apply(const operator_data_t* _plan, unsigned int N, void* args[N])
{
	complex float* dst = args[0];
	const complex float* src = args[1];
	const struct fft_plan_s* plan = CAST_DOWN(fft_plan_s, _plan);

	assert(2 == N);

#ifdef USE_CUDA
	if (cuda_ondevice(src)) {
#ifdef LAZY_CUDA
		// create the cuFFT plan on first application to GPU data
		if (NULL == plan->cuplan)
			((struct fft_plan_s*)plan)->cuplan = fft_cuda_plan(plan->D, plan->dims, plan->flags, plan->ostrs, plan->istrs, plan->backwards);
#endif
		assert(NULL != plan->cuplan);
		fft_cuda_exec(plan->cuplan, dst, src);

	} else
#endif
	{
		assert(NULL != plan->fftw);
		fftwf_execute_dft(plan->fftw, (complex float*)src, dst);
	}
}
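A hedged usage sketch tying fft_create2 and fft_apply together. The helpers operator_apply, operator_free, and md_alloc_gpu, and their exact argument order, are assumptions and have not been checked against the BART headers.

	long dims[1] = { 128 };
	long strs[1];
	md_calc_strides(1, strs, dims, CFL_SIZE);

	complex float* in = md_alloc_gpu(1, dims, CFL_SIZE);	// assumed allocator
	complex float* out = md_alloc_gpu(1, dims, CFL_SIZE);

	// FFT along dimension 0; with LAZY_CUDA the cuFFT plan is created
	// inside the first operator application (see fft_apply above)
	const struct operator_s* op = fft_create2(1, dims, 1ul, strs, out, strs, in, false);
	operator_apply(op, 1, dims, out, 1, dims, in);	// signature assumed

	operator_free(op);
	md_free(in);
	md_free(out);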