float svthresh_nomeanv(long M, long N, float lambda, complex float* dst, const complex float* src)
{
	long MN = M * N;

	complex float* basis = md_alloc(1, &N, CFL_SIZE);
	complex float* coeff = md_alloc(1, &M, CFL_SIZE);
	complex float* tmp = md_alloc(1, &MN, CFL_SIZE);

	for (int i = 0; i < N; i++)
		basis[i] = 1. / sqrtf(N);

	md_clear(1, &M, coeff, CFL_SIZE);
	md_clear(1, &MN, tmp, CFL_SIZE);

	// project each row onto the normalized constant basis
	for (int j = 0; j < N; j++)
		for (int i = 0; i < M; i++)
			coeff[i] += basis[j] * src[i + j * M];

	// subtract the row means
	for (int j = 0; j < N; j++)
		for (int i = 0; i < M; i++)
			tmp[i + j * M] = src[i + j * M] - coeff[i] * basis[j];

	svthresh(M, N, lambda, dst, tmp);

	// add the row means back
	for (int j = 0; j < N; j++)
		for (int i = 0; i < M; i++)
			dst[i + j * M] += coeff[i] * basis[j];

	md_free(basis);
	md_free(coeff);
	md_free(tmp);

	return 0;
}
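/*
 * Illustrative sketch (standalone, not part of the library): shows that
 * projecting onto the normalized constant basis, as done above, removes the
 * mean of each row, so that svthresh() only acts on the mean-free residual.
 * The matrix layout and values here are made up for the example.
 */
#include <stdio.h>
#include <math.h>
#include <complex.h>

int main(void)
{
	enum { M = 2, N = 3 };
	complex float src[M * N] = { 1., 2., 3., 4., 5., 6. };	// column-major M x N

	for (int i = 0; i < M; i++) {

		complex float coeff = 0.;

		for (int j = 0; j < N; j++)
			coeff += src[i + j * M] / sqrtf(N);	// <row_i, basis>

		// residual row entries sum to zero: the row mean has been removed
		for (int j = 0; j < N; j++)
			printf("residual[%d,%d] = %f\n", i, j, crealf(src[i + j * M] - coeff / sqrtf(N)));
	}

	return 0;
}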
void tv_adjoint(unsigned int D, const long dims[D], unsigned int flags, complex float* out, const complex float* in)
{
	unsigned int N = bitcount(flags);

	assert(N == dims[D - 1]);	// we use the highest dim to store our different partial derivatives

	unsigned int flags2 = flags;

	complex float* tmp = md_alloc_sameplace(D - 1, dims, CFL_SIZE, out);

	md_clear(D - 1, dims, out, CFL_SIZE);
	md_clear(D - 1, dims, tmp, CFL_SIZE);

	for (unsigned int i = 0; i < N; i++) {

		unsigned int lsb = ffs(flags2) - 1;
		flags2 = MD_CLEAR(flags2, lsb);

		md_zfdiff_backwards(D - 1, dims, lsb, tmp, in + i * md_calc_size(D - 1, dims));

		md_zadd(D - 1, dims, out, out, tmp);
	}

	md_free(tmp);

	assert(0 == flags2);
}
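/*
 * Illustrative sketch (standalone, no BART calls): verifies the adjoint
 * relation behind tv_adjoint() in one dimension. With cyclic forward
 * differences (Dx)_k = x_{k+1} - x_k, the adjoint is the backward difference
 * (D^H y)_k = y_{k-1} - y_k, so <Dx, y> == <x, D^H y>. The exact boundary
 * convention of md_zfdiff_backwards may differ; this is only a model.
 */
#include <stdio.h>

int main(void)
{
	enum { N = 4 };
	float x[N] = { 1., -2., 3., 0.5 };
	float y[N] = { 0.5, 1., -1., 2. };

	float lhs = 0.;	// <Dx, y>
	float rhs = 0.;	// <x, D^H y>

	for (int k = 0; k < N; k++) {

		float dx = x[(k + 1) % N] - x[k];
		float dhy = y[(k + N - 1) % N] - y[k];

		lhs += dx * y[k];
		rhs += x[k] * dhy;
	}

	printf("<Dx,y> = %f, <x,D^H y> = %f\n", lhs, rhs);	// identical

	return 0;
}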
void noir_recon(const struct noir_conf_s* conf, const long dims[DIMS], complex float* outbuf, complex float* sensout, const complex float* psf, const complex float* mask, const complex float* kspace)
{
	long imgs_dims[DIMS];
	long coil_dims[DIMS];
	long data_dims[DIMS];
	long img1_dims[DIMS];

	md_select_dims(DIMS, FFT_FLAGS|MAPS_FLAG|CSHIFT_FLAG, imgs_dims, dims);
	md_select_dims(DIMS, FFT_FLAGS|COIL_FLAG|MAPS_FLAG, coil_dims, dims);
	md_select_dims(DIMS, FFT_FLAGS|COIL_FLAG, data_dims, dims);
	md_select_dims(DIMS, FFT_FLAGS, img1_dims, dims);

	long skip = md_calc_size(DIMS, imgs_dims);
	long size = skip + md_calc_size(DIMS, coil_dims);
	long data_size = md_calc_size(DIMS, data_dims);

	long d1[1] = { size };

	complex float* img = md_alloc_sameplace(1, d1, CFL_SIZE, kspace);
	complex float* imgH = md_alloc_sameplace(1, d1, CFL_SIZE, kspace);

	md_clear(DIMS, imgs_dims, img, CFL_SIZE);
	md_zfill(DIMS, img1_dims, outbuf, 1.);	// initialize only the first image

	md_copy(DIMS, img1_dims, img, outbuf, CFL_SIZE);
	md_clear(DIMS, coil_dims, img + skip, CFL_SIZE);
	md_clear(DIMS, imgs_dims, imgH, CFL_SIZE);
	md_clear(DIMS, coil_dims, imgH + skip, CFL_SIZE);

	struct noir_data* ndata = noir_init(dims, mask, psf, conf->rvc, conf->usegpu);
	struct data data = { ndata };

	struct iter3_irgnm_conf irgnm_conf = {
		.iter = conf->iter,
		.alpha = conf->alpha,
		.redu = conf->redu,
	};

	iter3_irgnm(&irgnm_conf.base, frw, der, adj, &data, size * 2, (float*)img, data_size * 2, (const float*)kspace);

	md_copy(DIMS, imgs_dims, outbuf, img, CFL_SIZE);

	if (NULL != sensout) {

		assert(!conf->usegpu);
		noir_forw_coils(ndata, sensout, img + skip);
	}

	noir_free(ndata);

	md_free(img);
	md_free(imgH);
}
/**
 * Compute Strang's circulant preconditioner
 *
 * Strang's preconditioner is simply the cropped psf in the image domain
 */
static complex float* compute_precond(unsigned int N, const long* pre_dims, const long* pre_strs, const long* psf_dims, const long* psf_strs, const complex float* psf, const complex float* linphase)
{
	int ND = N + 1;
	unsigned long flags = FFT_FLAGS;

	complex float* pre = md_alloc(ND, pre_dims, CFL_SIZE);
	complex float* psft = md_alloc(ND, psf_dims, CFL_SIZE);

	// Transform psf to image domain
	ifftuc(ND, psf_dims, flags, psft, psf);

	// Compensate for linear phase to get cropped psf
	md_clear(ND, pre_dims, pre, CFL_SIZE);
	md_zfmacc2(ND, psf_dims, pre_strs, pre, psf_strs, psft, psf_strs, linphase);

	md_free(psft);

	// Transform to Fourier domain
	fftuc(N, pre_dims, flags, pre, pre);

	md_zabs(N, pre_dims, pre, pre);
	md_zsadd(N, pre_dims, pre, pre, 1e-3);	// keep entries away from zero

	return pre;
}
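/*
 * Illustrative sketch: a circulant preconditioner like the one computed above
 * is diagonal in the Fourier domain, so applying M^-1 reduces to a pointwise
 * division of the k-space data by `pre`. The helper below is a hypothetical
 * standalone model of that step, not a function of the library.
 */
#include <complex.h>

static void precond_apply(long n, complex float* dst, const complex float* src, const complex float* pre)
{
	for (long i = 0; i < n; i++)
		dst[i] = src[i] / pre[i];	// safe: pre >= 1e-3 by construction above
}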
static void maps_apply(const void* _data, complex float* dst, const complex float* src)
{
	const struct maps_data* data = _data;

	md_clear(DIMS, data->ksp_dims, dst, CFL_SIZE);
	md_zfmac2(DIMS, data->max_dims, data->strs_ksp, dst, data->strs_img, src, data->strs_mps, data->sens);
}
static void admm_normaleq(void* _data, float* _dst, const float* _src)
{
	struct admm_normaleq_data* data = _data;

	long dims[1] = { data->N };

//	float* tmp = md_alloc_sameplace(1, dims, FL_SIZE, _src);

	md_clear(1, dims, _dst, sizeof(float));

	for (unsigned int i = 0; i < data->num_funs; i++) {

		data->ops[i].normal(data->ops[i].data, data->tmp, _src);

		if ((NULL != data->Aop) && (NULL != data->Aop_data))
			md_axpy(1, dims, _dst, data->rho, data->tmp);
		else
			md_add(1, dims, _dst, _dst, data->tmp);
	}

	if ((NULL != data->Aop) && (NULL != data->Aop_data)) {

		data->Aop(data->Aop_data, data->tmp, _src);
		md_add(1, dims, _dst, _dst, data->tmp);
	}

//	md_free(tmp);
}
// Forward: from image to kspace
static void nufft_apply(const void* _data, complex float* dst, const complex float* src)
{
	struct nufft_data* data = (struct nufft_data*)_data;

	assert(!data->conf.toeplitz);	// in the toeplitz case, linphase has no roll, so it would need to be added

	unsigned int ND = data->N + 3;

	md_zmul2(ND, data->cml_dims, data->cml_strs, data->grid, data->cim_strs, src, data->lph_strs, data->linphase);
	linop_forward(data->fft_op, ND, data->cml_dims, data->grid, ND, data->cml_dims, data->grid);
	md_zmul2(ND, data->cml_dims, data->cml_strs, data->grid, data->cml_strs, data->grid, data->img_strs, data->fftmod);

	md_clear(ND, data->ksp_dims, dst, CFL_SIZE);

	complex float* gridX = md_alloc(data->N, data->cm2_dims, CFL_SIZE);

	long factors[data->N];

	for (unsigned int i = 0; i < data->N; i++)
		factors[i] = ((data->img_dims[i] > 1) && (i < 3)) ? 2 : 1;

	md_recompose(data->N, factors, data->cm2_dims, gridX, data->cml_dims, data->grid, CFL_SIZE);

	grid2H(2., data->width, data->beta, ND, data->trj_dims, data->traj, data->ksp_dims, dst, data->cm2_dims, gridX);

	md_free(gridX);

	if (NULL != data->weights)
		md_zmul2(data->N, data->ksp_dims, data->ksp_strs, dst, data->ksp_strs, dst, data->wgh_strs, data->weights);
}
/**
 * x = (ATA + uI)^-1 b
 */
void sum_apply_pinverse(const void* _data, float rho, complex float* dst, const complex float* src)
{
	struct sum_data* data = (struct sum_data*)_data;

	if (NULL == data->tmp) {
#ifdef USE_CUDA
		data->tmp = (data->use_gpu ? md_alloc_gpu : md_alloc)(DIMS, data->img_dims, CFL_SIZE);
#else
		data->tmp = md_alloc(DIMS, data->img_dims, CFL_SIZE);
#endif
	}

	// get average
	md_clear(DIMS, data->img_dims, data->tmp, CFL_SIZE);
	md_zadd2(DIMS, data->imgd_dims, data->img_strs, data->tmp, data->img_strs, data->tmp, data->imgd_strs, src);
	md_zsmul(DIMS, data->img_dims, data->tmp, data->tmp, 1. / data->levels);

	// get non-average
	md_zsub2(DIMS, data->imgd_dims, data->imgd_strs, dst, data->imgd_strs, src, data->img_strs, data->tmp);

	// avg = avg / (1 + rho)
	md_zsmul(DIMS, data->img_dims, data->tmp, data->tmp, 1. / (1. + rho));

	// nonavg = nonavg / rho
	md_zsmul(DIMS, data->imgd_dims, dst, dst, 1. / rho);

	// dst = avg + nonavg
	md_zadd2(DIMS, data->imgd_dims, data->imgd_strs, dst, data->imgd_strs, dst, data->img_strs, data->tmp);
}
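/*
 * Illustrative sketch (standalone): when A^T A equals P, the orthogonal
 * projection onto the across-levels average (as the 1/sqrt(levels) scaling in
 * sum_apply_adjoint below suggests), the inverse above follows from the
 * eigendecomposition (A^T A + rho I)^-1 = P / (1 + rho) + (I - P) / rho.
 * The toy check below uses levels = 2 and real values; names are made up.
 */
#include <stdio.h>

int main(void)
{
	float rho = 0.5;
	float b[2] = { 3., 1. };	// two "levels" of a single pixel
	float x[2];

	float avg = (b[0] + b[1]) / 2.;

	for (int l = 0; l < 2; l++)
		x[l] = avg / (1. + rho) + (b[l] - avg) / rho;

	float xavg = (x[0] + x[1]) / 2.;	// P x

	for (int l = 0; l < 2; l++)		// (P + rho I) x reproduces b
		printf("b[%d] = %f, (P + rho I)x = %f\n", l, b[l], xavg + rho * x[l]);

	return 0;
}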
static void maps_apply_adjoint(const void* _data, complex float* dst, const complex float* src)
{
	const struct maps_data* data = _data;

	// dst = sum(conj(sens) .* src)
	md_clear(DIMS, data->img_dims, dst, CFL_SIZE);
	md_zfmacc2(DIMS, data->max_dims, data->strs_img, dst, data->strs_ksp, src, data->strs_mps, data->sens);
}
void overlapandsave_exec(struct conv_plan* plan, int N, const long dims[N], const long blk[N], complex float* dst, complex float* src1, const long dim2[N])
{
	// NOTE: the declarations of the block geometry (L, ndim3), the strides
	// (str1, str2) and the buffer tmp were missing from this excerpt; they
	// are reconstructed below to mirror overlapandsave() and may differ
	// from the original.
	long ndims[2 * N];
	long L[2 * N];
	long ndim3[2 * N];

	for (int i = 0; i < N; i++) {

		ndims[i * 2 + 1] = dims[i] / blk[i];
		ndims[i * 2 + 0] = blk[i];

		L[i * 2 + 1] = dims[i] / blk[i];
		L[i * 2 + 0] = blk[i] + dim2[i] - 1;

		ndim3[i * 2 + 1] = dims[i] / blk[i];
		ndim3[i * 2 + 0] = blk[i];
	}

	long str1[2 * N];
	long str2[2 * N];

	md_calc_strides(2 * N, str1, ndims, CFL_SIZE);
	md_calc_strides(2 * N, str2, L, CFL_SIZE);

	complex float* tmp = xmalloc(md_calc_size(2 * N, L) * CFL_SIZE);

	md_clear(2 * N, L, tmp, CFL_SIZE);
	md_copy2(2 * N, ndim3, str2, tmp, str1, src1, CFL_SIZE);

	conv_exec(plan, dst, tmp);

	free(tmp);
}
void sum_apply_adjoint(const void* _data, complex float* dst, const complex float* src)
{
	const struct sum_data* data = _data;

	md_clear(DIMS, data->imgd_dims, dst, CFL_SIZE);
	md_zaxpy2(DIMS, data->imgd_dims, data->imgd_strs, dst, 1. / sqrtf(data->levels), data->img_strs, src);
}
void noir_fun(struct noir_data* data, complex float* dst, const complex float* src)
{
	long split = md_calc_size(DIMS, data->imgs_dims);

	md_copy(DIMS, data->imgs_dims, data->xn, src, CFL_SIZE);

	noir_forw_coils(data, data->sens, src + split);

	md_clear(DIMS, data->sign_dims, data->tmp, CFL_SIZE);
	md_zfmac2(DIMS, data->sign_dims, data->sign_strs, data->tmp, data->imgs_strs, src, data->coil_strs, data->sens);

	// could be moved to the beginning, but see comment below
	md_zmul2(DIMS, data->sign_dims, data->sign_strs, data->tmp, data->sign_strs, data->tmp, data->mask_strs, data->mask);

	fft(DIMS, data->sign_dims, FFT_FLAGS, data->tmp, data->tmp);

	md_clear(DIMS, data->data_dims, dst, CFL_SIZE);
	md_zfmac2(DIMS, data->sign_dims, data->data_strs, dst, data->sign_strs, data->tmp, data->ptrn_strs, data->pattern);
}
static void inverse(void* _data, float alpha, float* dst, const float* src)
{
	struct irgnm_s* data = _data;

	md_clear(1, MD_DIMS(data->size), dst, FL_SIZE);

	float eps = md_norm(1, MD_DIMS(data->size), src);

	conjgrad(100, alpha, 0.1f * eps, data->size, (void*)data, select_vecops(src), normal, dst, src, NULL, NULL, NULL);
}
static void sense_adjoint(const void* _data, complex float* imgs, const complex float* out)
{
	const struct sense_data* data = _data;

	md_zmulc2(DIMS, data->data_dims, data->data_strs, data->tmp, data->data_strs, out, data->mask_strs, data->pattern);

	ifftc(DIMS, data->data_dims, FFT_FLAGS, data->tmp, data->tmp);
	fftscale(DIMS, data->data_dims, FFT_FLAGS, data->tmp, data->tmp);

	md_clear(DIMS, data->imgs_dims, imgs, CFL_SIZE);
	md_zfmacc2(DIMS, data->sens_dims, data->imgs_strs, imgs, data->data_strs, data->tmp, data->sens_strs, data->sens);
}
static void sense_forward(const void* _data, complex float* out, const complex float* imgs)
{
	const struct sense_data* data = _data;

	md_clear(DIMS, data->data_dims, out, CFL_SIZE);
	md_zfmac2(DIMS, data->sens_dims, data->data_strs, out, data->sens_strs, data->sens, data->imgs_strs, imgs);

	fftc(DIMS, data->data_dims, FFT_FLAGS, out, out);
	fftscale(DIMS, data->data_dims, FFT_FLAGS, out, out);

	md_zmul2(DIMS, data->data_dims, data->data_strs, out, data->data_strs, out, data->mask_strs, data->pattern);
}
void iwt1(unsigned int N, unsigned int d, const long dims[N], const long ostr[N], complex float* out, const long istr[N], const complex float* low, const complex float* hgh, const long flen, const float filter[2][2][flen])
{
	debug_printf(DP_DEBUG4, "ifwt1: %d/%d\n", d, N);
	debug_print_dims(DP_DEBUG4, N, dims);

	assert(dims[d] >= 2);

	long idims[N];
	md_copy_dims(N, idims, dims);
	idims[d] = bandsize(dims[d], flen);

	debug_print_dims(DP_DEBUG4, N, idims);

	long o = d + 1;
	long u = N - o;

	// 0 1 2 3 4 5 6|7
	// --d-- * --u--|N
	// ---o---

	assert(d == md_calc_blockdim(d, dims + 0, ostr + 0, CFL_SIZE));
	assert(u == md_calc_blockdim(u, dims + o, ostr + o, CFL_SIZE * md_calc_size(o, dims)));

	assert(d == md_calc_blockdim(d, idims + 0, istr + 0, CFL_SIZE));
	assert(u == md_calc_blockdim(u, idims + o, istr + o, CFL_SIZE * md_calc_size(o, idims)));

	long wdims[3] = { md_calc_size(d, dims), dims[d], md_calc_size(u, dims + o) };

	long wistr[3] = { CFL_SIZE, istr[d], CFL_SIZE * md_calc_size(o, idims) };
	long wostr[3] = { CFL_SIZE, ostr[d], CFL_SIZE * md_calc_size(o, dims) };

	// clear with explicit strides: the output may be non-contiguous, and
	// both wavelet_up3 calls below accumulate (merge) their bands into it
	md_clear2(3, wdims, wostr, out, CFL_SIZE);

#ifdef USE_CUDA
	if (cuda_ondevice(out)) {

		assert(cuda_ondevice(low));
		assert(cuda_ondevice(hgh));

		float* flow = md_gpu_move(1, MD_DIMS(flen), filter[1][0], FL_SIZE);
		float* fhgh = md_gpu_move(1, MD_DIMS(flen), filter[1][1], FL_SIZE);

		wl3_cuda_up3(wdims, wostr, out, wistr, low, flen, flow);
		wl3_cuda_up3(wdims, wostr, out, wistr, hgh, flen, fhgh);

		md_free(flow);
		md_free(fhgh);

		return;
	}
#endif

	wavelet_up3(wdims, wostr, out, wistr, low, flen, filter[1][0]);
	wavelet_up3(wdims, wostr, out, wistr, hgh, flen, filter[1][1]);
}
static void toeplitz_mult(const struct nufft_data* data, complex float* dst, const complex float* src)
{
	unsigned int ND = data->N + 3;

	md_zmul2(ND, data->cml_dims, data->cml_strs, data->grid, data->cim_strs, src, data->lph_strs, data->linphase);
	linop_forward(data->fft_op, ND, data->cml_dims, data->grid, ND, data->cml_dims, data->grid);
	md_zmul2(ND, data->cml_dims, data->cml_strs, data->grid, data->cml_strs, data->grid, data->psf_strs, data->psf);
	linop_adjoint(data->fft_op, ND, data->cml_dims, data->grid, ND, data->cml_dims, data->grid);

	md_clear(ND, data->cim_dims, dst, CFL_SIZE);
	md_zfmacc2(ND, data->cml_dims, data->cim_strs, dst, data->cml_strs, data->grid, data->lph_strs, data->linphase);
}
void noir_adj(struct noir_data* data, complex float* dst, const complex float* src)
{
	long split = md_calc_size(DIMS, data->imgs_dims);

	md_zmulc2(DIMS, data->sign_dims, data->sign_strs, data->tmp, data->data_strs, src, data->ptrn_strs, data->pattern);

	ifft(DIMS, data->sign_dims, FFT_FLAGS, data->tmp, data->tmp);

	// we should move this to the end, but fft scaling is applied, so it would need to be moved into data->xn or the weights, maybe?
	md_zmulc2(DIMS, data->sign_dims, data->sign_strs, data->tmp, data->sign_strs, data->tmp, data->mask_strs, data->mask);

	md_clear(DIMS, data->coil_dims, dst + split, CFL_SIZE);
	md_zfmacc2(DIMS, data->sign_dims, data->coil_strs, dst + split, data->sign_strs, data->tmp, data->imgs_strs, data->xn);

	noir_back_coils(data, dst + split, dst + split);

	md_clear(DIMS, data->imgs_dims, dst, CFL_SIZE);
	md_zfmacc2(DIMS, data->sign_dims, data->imgs_strs, dst, data->sign_strs, data->tmp, data->coil_strs, data->sens);

	if (data->rvc)
		md_zreal(DIMS, data->imgs_dims, dst, dst);
}
// Adjoint: from kspace to image
static void nufft_apply_adjoint(const void* _data, complex float* dst, const complex float* src)
{
	const struct nufft_data* data = _data;

	unsigned int ND = data->N + 3;

	complex float* gridX = md_alloc(data->N, data->cm2_dims, CFL_SIZE);
	md_clear(data->N, data->cm2_dims, gridX, CFL_SIZE);

	complex float* wdat = NULL;

	if (NULL != data->weights) {

		wdat = md_alloc(data->N, data->ksp_dims, CFL_SIZE);
		md_zmulc2(data->N, data->ksp_dims, data->ksp_strs, wdat, data->ksp_strs, src, data->wgh_strs, data->weights);
		src = wdat;
	}

	grid2(2., data->width, data->beta, ND, data->trj_dims, data->traj, data->cm2_dims, gridX, data->ksp_dims, src);

	md_free(wdat);

	long factors[data->N];

	for (unsigned int i = 0; i < data->N; i++)
		factors[i] = ((data->img_dims[i] > 1) && (i < 3)) ? 2 : 1;

	md_decompose(data->N, factors, data->cml_dims, data->grid, data->cm2_dims, gridX, CFL_SIZE);

	md_free(gridX);

	md_zmulc2(ND, data->cml_dims, data->cml_strs, data->grid, data->cml_strs, data->grid, data->img_strs, data->fftmod);

	linop_adjoint(data->fft_op, ND, data->cml_dims, data->grid, ND, data->cml_dims, data->grid);

	md_clear(ND, data->cim_dims, dst, CFL_SIZE);
	md_zfmacc2(ND, data->cml_dims, data->cim_strs, dst, data->cml_strs, data->grid, data->lph_strs, data->linphase);

	if (data->conf.toeplitz)
		md_zmul2(ND, data->cim_dims, data->cim_strs, dst, data->cim_strs, dst, data->img_strs, data->roll);
}
/**
 * Proximal function for f(z) = lambda || z ||_2.
 * Solution is z = (1 - lambda * mu / norm(x_plus_u))_+ * x_plus_u,
 * i.e. block soft thresholding
 *
 * @param prox_data should be of type prox_l2norm_data
 * @param mu proximal penalty
 * @param z output
 * @param x_plus_u input
 */
static void prox_l2norm_fun(const operator_data_t* prox_data, float mu, float* z, const float* x_plus_u)
{
	struct prox_l2norm_data* pdata = CAST_DOWN(prox_l2norm_data, prox_data);

	md_clear(1, MD_DIMS(pdata->size), z, FL_SIZE);

	double q1 = md_norm(1, MD_DIMS(pdata->size), x_plus_u);

	if (q1 != 0) {

		double q2 = 1. - pdata->lambda * mu / q1;

		if (q2 > 0.)
			md_smul(1, MD_DIMS(pdata->size), z, x_plus_u, q2);
	}
}
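/*
 * Illustrative sketch (standalone): block soft-thresholding as implemented
 * above. The whole vector is scaled by (1 - lambda * mu / ||v||_2)_+, so it
 * shrinks toward zero and becomes exactly zero once ||v||_2 <= lambda * mu.
 * Values below are made up for the example.
 */
#include <stdio.h>
#include <math.h>

int main(void)
{
	float lambda = 1., mu = 1.;
	float v[2] = { 3., 4. };	// ||v||_2 = 5

	float norm = sqrtf(v[0] * v[0] + v[1] * v[1]);
	float scale = 1. - lambda * mu / norm;

	if (scale < 0.)
		scale = 0.;

	printf("z = (%f, %f)\n", scale * v[0], scale * v[1]);	// (2.4, 3.2)

	return 0;
}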
static void sense_reco(struct sense_data* data, complex float* imgs, const complex float* kspace)
{
	complex float* adj = md_alloc(DIMS, data->imgs_dims, CFL_SIZE);

	md_clear(DIMS, data->imgs_dims, imgs, CFL_SIZE);

	sense_adjoint(data, adj, kspace);

	long size = md_calc_size(DIMS, data->imgs_dims);

	conjgrad(100, data->alpha, 1.E-3, 2 * size, data, &cpu_iter_ops, sense_normal, (float*)imgs, (const float*)adj, NULL, NULL, NULL);

	md_free(adj);
}
static double bench_generic_sum(long dims[DIMS], unsigned int flags, bool forloop)
{
	long dimsX[DIMS];
	long dimsY[DIMS];
	long dimsC[DIMS];

	md_select_dims(DIMS, ~0u, dimsX, dims);
	md_select_dims(DIMS, flags, dimsY, dims);
	md_select_dims(DIMS, ~flags, dimsC, dims);

	long strsX[DIMS];
	long strsY[DIMS];

	md_calc_strides(DIMS, strsX, dimsX, CFL_SIZE);
	md_calc_strides(DIMS, strsY, dimsY, CFL_SIZE);

	complex float* x = md_alloc(DIMS, dimsX, CFL_SIZE);
	complex float* y = md_alloc(DIMS, dimsY, CFL_SIZE);

	md_gaussian_rand(DIMS, dimsX, x);
	md_clear(DIMS, dimsY, y, CFL_SIZE);

	long L = md_calc_size(DIMS, dimsC);
	long T = md_calc_size(DIMS, dimsY);

	double tic = timestamp();

	if (forloop) {

		for (long i = 0; i < L; i++)
			for (long j = 0; j < T; j++)
				y[j] = y[j] + x[i + j * L];

	} else {

		md_zaxpy2(DIMS, dims, strsY, y, 1., strsX, x);
	}

	double toc = timestamp();

	md_free(x);
	md_free(y);

	return toc - tic;
}
int main_mip(int argc, char* argv[argc])
{
	bool mIP = false;

	const struct opt_s opts[] = {

		OPT_SET('m', &mIP, "minimum"),
	};

	cmdline(&argc, argv, 3, 3, usage_str, help_str, ARRAY_SIZE(opts), opts);

	unsigned int flags = atoi(argv[1]);

	long idims[DIMS];
	complex float* in = load_cfl(argv[2], DIMS, idims);

	long odims[DIMS];
	md_select_dims(DIMS, ~flags, odims, idims);

	complex float* out = create_cfl(argv[3], DIMS, odims);

	complex float* tmp = md_alloc(DIMS, idims, CFL_SIZE);
	md_zabs(DIMS, idims, tmp, in);

	long istr[DIMS];
	long ostr[DIMS];

	md_calc_strides(DIMS, istr, idims, CFL_SIZE);
	md_calc_strides(DIMS, ostr, odims, CFL_SIZE);

	md_clear(DIMS, odims, out, CFL_SIZE);
	md_max2(DIMS, idims, ostr, (float*)out, ostr, (const float*)out, istr, (const float*)tmp);

	if (mIP) {

		// need result of max in output
		md_min2(DIMS, idims, ostr, (float*)out, ostr, (const float*)out, istr, (const float*)tmp);
	}

	md_free(tmp);

	unmap_cfl(DIMS, idims, in);
	unmap_cfl(DIMS, odims, out);

	exit(0);
}
void casorati_matrixH(unsigned int N, const long dimk[N], const long dim[N], const long str[N], complex float* optr, const long odim[2], const complex float* iptr)
{
	long str2[2 * N];
	long strc[2 * N];
	long dimc[2 * N];

	calc_casorati_geom(N, dimc, str2, dimk, dim, str);

	assert(odim[0] == md_calc_size(N, dimc));
	assert(odim[1] == md_calc_size(N, dimc + N));

	md_clear(N, dim, optr, CFL_SIZE);
	md_calc_strides(2 * N, strc, dimc, CFL_SIZE);
	md_zadd2(2 * N, dimc, str2, optr, str2, optr, strc, iptr);
}
void overlapandsave(int N, const long dims[N], const long blk[N], complex float* dst, complex float* src1, const long dim2[N], complex float* src2)
{
	// [------++++
	// [------

	long ndims[2 * N];
	long L[2 * N];
	long ndim2[2 * N];
	long ndim3[2 * N];

	for (int i = 0; i < N; i++) {

		assert(0 == dims[i] % blk[i]);
		assert(dim2[i] <= blk[i]);

		ndims[i * 2 + 1] = dims[i] / blk[i];
		ndims[i * 2 + 0] = blk[i];

		L[i * 2 + 1] = dims[i] / blk[i];
		L[i * 2 + 0] = blk[i] + dim2[i] - 1;

		ndim2[i * 2 + 1] = 1;
		ndim2[i * 2 + 0] = dim2[i];

		ndim3[i * 2 + 1] = dims[i] / blk[i];
		ndim3[i * 2 + 0] = blk[i];
	}

	long T = md_calc_size(2 * N, L);
	complex float* tmp = xmalloc(T * CFL_SIZE);

	long str1[2 * N];
	long str2[2 * N];
	long str3[2 * N];

	md_calc_strides(2 * N, str1, ndims, CFL_SIZE);
	md_calc_strides(2 * N, str2, L, CFL_SIZE);
	md_calc_strides(2 * N, str3, ndim3, CFL_SIZE);

	md_clear(2 * N, L, tmp, CFL_SIZE);
	md_copy2(2 * N, ndim3, str2, tmp, str1, src1, CFL_SIZE);

	conv(2 * N, ~0, CONV_VALID, CONV_CAUSAL, ndims, dst, L, tmp, ndim2, src2);

	free(tmp);
}
void overlapandadd(int N, const long dims[N], const long blk[N], complex float* dst, complex float* src1, const long dim2[N], complex float* src2)
{
	long ndims[2 * N];
	long L[2 * N];
	long ndim2[2 * N];
	long ndim3[2 * N];

	for (int i = 0; i < N; i++) {

		assert(0 == dims[i] % blk[i]);
		assert(dim2[i] <= blk[i]);

		ndims[i * 2 + 1] = dims[i] / blk[i];
		ndims[i * 2 + 0] = blk[i];

		L[i * 2 + 1] = dims[i] / blk[i];
		L[i * 2 + 0] = blk[i] + dim2[i] - 1;

		ndim2[i * 2 + 1] = 1;
		ndim2[i * 2 + 0] = dim2[i];

		ndim3[i * 2 + 1] = dims[i] / blk[i] + 1;
		ndim3[i * 2 + 0] = blk[i];
	}

	long T = md_calc_size(2 * N, L);
	complex float* tmp = xmalloc(T * CFL_SIZE);

//	conv_causal_extend(2 * N, L, tmp, ndims, src1, ndim2, src2);
	conv(2 * N, ~0, CONV_EXTENDED, CONV_CAUSAL, L, tmp, ndims, src1, ndim2, src2);

	// [------++++||||||||

//	long str1[2 * N];
	long str2[2 * N];
	long str3[2 * N];

//	md_calc_strides(2 * N, str1, ndims, CFL_SIZE);
	md_calc_strides(2 * N, str2, L, CFL_SIZE);
	md_calc_strides(2 * N, str3, ndim3, CFL_SIZE);

	md_clear(2 * N, ndim3, dst, CFL_SIZE);
	md_zadd2(2 * N, L, str3, dst, str3, dst, str2, tmp);

	free(tmp);
}
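/*
 * Illustrative sketch (standalone, no BART calls): 1-D overlap-and-add. The
 * signal is cut into blocks, each block is convolved with the kernel, and the
 * tail of length (kernel length - 1) that spills past each block boundary is
 * accumulated into the shared output, so the result matches direct linear
 * convolution. Sizes and values below are made up for the example.
 */
#include <stdio.h>

enum { LEN = 6, BLK = 3, KLEN = 2 };

int main(void)
{
	float x[LEN] = { 1., 2., 3., 4., 5., 6. };
	float h[KLEN] = { 1., -1. };

	float direct[LEN + KLEN - 1] = { 0. };
	float oa[LEN + KLEN - 1] = { 0. };

	// direct linear convolution
	for (int n = 0; n < LEN; n++)
		for (int k = 0; k < KLEN; k++)
			direct[n + k] += x[n] * h[k];

	// blockwise convolution; each block's tail overlaps into the next block
	for (int b = 0; b < LEN; b += BLK)
		for (int n = 0; n < BLK; n++)
			for (int k = 0; k < KLEN; k++)
				oa[b + n + k] += x[b + n] * h[k];

	for (int i = 0; i < LEN + KLEN - 1; i++)
		printf("%f %f\n", direct[i], oa[i]);	// columns agree

	return 0;
}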
static double bench_generic_matrix_multiply(long dims[DIMS])
{
	long dimsX[DIMS];
	long dimsY[DIMS];
	long dimsZ[DIMS];

	md_select_dims(DIMS, 2 * 3 + 17, dimsX, dims);	// 1 110 1
	md_select_dims(DIMS, 2 * 6 + 17, dimsY, dims);	// 1 011 1
	md_select_dims(DIMS, 2 * 5 + 17, dimsZ, dims);	// 1 101 1

	long strsX[DIMS];
	long strsY[DIMS];
	long strsZ[DIMS];

	md_calc_strides(DIMS, strsX, dimsX, CFL_SIZE);
	md_calc_strides(DIMS, strsY, dimsY, CFL_SIZE);
	md_calc_strides(DIMS, strsZ, dimsZ, CFL_SIZE);

	complex float* x = md_alloc(DIMS, dimsX, CFL_SIZE);
	complex float* y = md_alloc(DIMS, dimsY, CFL_SIZE);
	complex float* z = md_alloc(DIMS, dimsZ, CFL_SIZE);

	md_gaussian_rand(DIMS, dimsX, x);
	md_gaussian_rand(DIMS, dimsY, y);
	md_clear(DIMS, dimsZ, z, CFL_SIZE);

	double tic = timestamp();

	md_zfmac2(DIMS, dims, strsZ, z, strsX, x, strsY, y);

	double toc = timestamp();

	md_free(x);
	md_free(y);
	md_free(z);

	return toc - tic;
}
static double bench_transpose(long scale)
{
	long dims[DIMS] = { 2000 * scale, 2000 * scale, 1, 1, 1, 1, 1, 1 };

	complex float* x = md_alloc(DIMS, dims, CFL_SIZE);
	complex float* y = md_alloc(DIMS, dims, CFL_SIZE);

	md_gaussian_rand(DIMS, dims, x);
	md_clear(DIMS, dims, y, CFL_SIZE);

	double tic = timestamp();

	md_transpose(DIMS, 0, 1, dims, y, dims, x, CFL_SIZE);

	double toc = timestamp();

	md_free(x);
	md_free(y);

	return toc - tic;
}
static void linop_matrix_apply_normal(const linop_data_t* _data, complex float* dst, const complex float* src)
{
	const struct operator_matrix_s* data = CAST_DOWN(operator_matrix_s, _data);

	unsigned int N = data->mat_iovec->N;

	// FIXME check all the cases where computation can be done with blas
	//debug_printf(DP_DEBUG1, "compute normal\n");

	if (cgemm_forward_standard(data)) {

		long max_dims_gram[N];
		md_copy_dims(N, max_dims_gram, data->domain_iovec->dims);
		max_dims_gram[data->T_dim] = data->K;

		long tmp_dims[N];
		long tmp_str[N];
		md_copy_dims(N, tmp_dims, max_dims_gram);
		tmp_dims[data->K_dim] = 1;
		md_calc_strides(N, tmp_str, tmp_dims, CFL_SIZE);

		complex float* tmp = md_alloc_sameplace(N, data->domain_iovec->dims, CFL_SIZE, dst);

		md_clear(N, data->domain_iovec->dims, tmp, CFL_SIZE);
		md_zfmac2(N, max_dims_gram, tmp_str, tmp, data->domain_iovec->strs, src, data->mat_gram_iovec->strs, data->mat_gram);
		md_transpose(N, data->T_dim, data->K_dim, data->domain_iovec->dims, dst, tmp_dims, tmp, CFL_SIZE);

		md_free(tmp);

	} else {

		long L = md_calc_size(data->T_dim, data->domain_iovec->dims);

		blas_cgemm('N', 'T', L, data->K, data->K, 1., L, (const complex float (*)[])src, data->K, (const complex float (*)[])data->mat_gram, 0., L, (complex float (*)[])dst);
	}
}
int main_zeros(int argc, char* argv[])
{
	mini_cmdline(argc, argv, -3, usage_str, help_str);

	int N = atoi(argv[1]);

	assert(N >= 0);
	assert(argc == 3 + N);

	long dims[N];

	for (int i = 0; i < N; i++) {

		dims[i] = atoi(argv[2 + i]);
		assert(dims[i] >= 1);
	}

	complex float* x = create_cfl(argv[2 + N], N, dims);

	md_clear(N, dims, x, sizeof(complex float));

	unmap_cfl(N, dims, x);

	exit(0);
}