예제 #1
0
파일: gpuops.c 프로젝트: frankong/bart
static double cuda_sdot(long size, const float* src1, const float* src2)
{
	assert(cuda_ondevice(src1));
	assert(cuda_ondevice(src2));
//	printf("SDOT %x %x %ld\n", src1, src2, size);
	return cublasSdot(size, src1, 1, src2, 1);
}
예제 #2
0
파일: wavelet.c 프로젝트: hcmh/bart
void fwt1(unsigned int N, unsigned int d, const long dims[N], const long ostr[N], complex float* low, complex float* hgh, const long istr[N], const complex float* in, const long flen, const float filter[2][2][flen])
{
	debug_printf(DP_DEBUG4, "fwt1: %d/%d\n", d, N);
	debug_print_dims(DP_DEBUG4, N, dims);

	assert(dims[d] >= 2);

	long odims[N];
	md_copy_dims(N, odims, dims);
	odims[d] = bandsize(dims[d], flen);

	debug_print_dims(DP_DEBUG4, N, odims);

	long o = d + 1;
	long u = N - o;

	// 0 1 2 3 4 5 6|7
	// --d-- * --u--|N
	// ---o---

	assert(d == md_calc_blockdim(d, dims + 0, istr + 0, CFL_SIZE));
	assert(u == md_calc_blockdim(u, dims + o, istr + o, CFL_SIZE * md_calc_size(o, dims)));

	assert(d == md_calc_blockdim(d, odims + 0, ostr + 0, CFL_SIZE));
	assert(u == md_calc_blockdim(u, odims + o, ostr + o, CFL_SIZE * md_calc_size(o, odims)));

	// merge dims

	long wdims[3] = { md_calc_size(d, dims), dims[d], md_calc_size(u, dims + o) };
	long wistr[3] = { CFL_SIZE, istr[d], CFL_SIZE * md_calc_size(o, dims) };
	long wostr[3] = { CFL_SIZE, ostr[d], CFL_SIZE * md_calc_size(o, odims) };

#ifdef  USE_CUDA
	if (cuda_ondevice(in)) {

		assert(cuda_ondevice(low));
		assert(cuda_ondevice(hgh));

		float* flow = md_gpu_move(1, MD_DIMS(flen), filter[0][0], FL_SIZE);
		float* fhgh = md_gpu_move(1, MD_DIMS(flen), filter[0][1], FL_SIZE);

		wl3_cuda_down3(wdims, wostr, low, wistr, in, flen, flow);
		wl3_cuda_down3(wdims, wostr, hgh, wistr, in, flen, fhgh);

		md_free(flow);
		md_free(fhgh);
		return;
	}
#endif

	// no clear needed
	wavelet_down3(wdims, wostr, low, wistr, in, flen, filter[0][0]);
	wavelet_down3(wdims, wostr, hgh, wistr, in, flen, filter[0][1]);
}
예제 #3
0
파일: wavelet.c 프로젝트: hcmh/bart
void iwt1(unsigned int N, unsigned int d, const long dims[N], const long ostr[N], complex float* out, const long istr[N], const complex float* low, const complex float* hgh, const long flen, const float filter[2][2][flen])
{
	debug_printf(DP_DEBUG4, "ifwt1: %d/%d\n", d, N);
	debug_print_dims(DP_DEBUG4, N, dims);

	assert(dims[d] >= 2);

	long idims[N];
	md_copy_dims(N, idims, dims);
	idims[d] = bandsize(dims[d], flen);

	debug_print_dims(DP_DEBUG4, N, idims);

	long o = d + 1;
	long u = N - o;

	// 0 1 2 3 4 5 6|7
	// --d-- * --u--|N
	// ---o---

	assert(d == md_calc_blockdim(d, dims + 0, ostr + 0, CFL_SIZE));
	assert(u == md_calc_blockdim(u, dims + o, ostr + o, CFL_SIZE * md_calc_size(o, dims)));
	assert(d == md_calc_blockdim(d, idims + 0, istr + 0, CFL_SIZE));
	assert(u == md_calc_blockdim(u, idims + o, istr + o, CFL_SIZE * md_calc_size(o, idims)));

	long wdims[3] = { md_calc_size(d, dims), dims[d], md_calc_size(u, dims + o) };
	long wistr[3] = { CFL_SIZE, istr[d], CFL_SIZE * md_calc_size(o, idims) };
	long wostr[3] = { CFL_SIZE, ostr[d], CFL_SIZE * md_calc_size(o, dims) };

	md_clear(3, wdims, out, CFL_SIZE);	// we cannot clear because we merge outputs

#ifdef  USE_CUDA
	if (cuda_ondevice(out)) {

		assert(cuda_ondevice(low));
		assert(cuda_ondevice(hgh));

		float* flow = md_gpu_move(1, MD_DIMS(flen), filter[1][0], FL_SIZE);
		float* fhgh = md_gpu_move(1, MD_DIMS(flen), filter[1][1], FL_SIZE);

		wl3_cuda_up3(wdims, wostr, out, wistr, low, flen, flow);
		wl3_cuda_up3(wdims, wostr, out, wistr, hgh, flen, fhgh);

		md_free(flow);
		md_free(fhgh);
		return;
	}
#endif

	wavelet_up3(wdims, wostr, out, wistr, low, flen, filter[1][0]);
	wavelet_up3(wdims, wostr, out, wistr, hgh, flen, filter[1][1]);
}
예제 #4
0
파일: someops.c 프로젝트: hcmh/bart
static void linop_matrix_apply_normal(const linop_data_t* _data, complex float* dst, const complex float* src)
{
	struct operator_matrix_s* data = CAST_DOWN(operator_matrix_s, _data);

	if (NULL == data->mat_gram) {

		complex float* tmp = md_alloc_sameplace(data->N, data->out_dims, CFL_SIZE, src);

		linop_matrix_apply(_data, tmp, src);
		linop_matrix_apply_adjoint(_data, dst, tmp);

		md_free(tmp);

	} else {

		const complex float* mat_gram = data->mat_gram;
#ifdef USE_CUDA
		if (cuda_ondevice(src)) {

			if (NULL == data->mat_gram_gpu)
				data->mat_gram_gpu = md_gpu_move(2 * data->N, data->grm_dims, data->mat_gram, CFL_SIZE);

			mat_gram = data->mat_gram_gpu;
		}
#endif
		md_ztenmul(2 * data->N, data->gout_dims, dst, data->gin_dims, src, data->grm_dims, mat_gram);
	}
}
예제 #5
0
파일: fft.c 프로젝트: hcmh/bart
const struct operator_s* fft_create2(unsigned int D, const long dimensions[D], unsigned long flags, const long ostrides[D], complex float* dst, const long istrides[D], const complex float* src, bool backwards)
{
	PTR_ALLOC(struct fft_plan_s, plan);
	SET_TYPEID(fft_plan_s, plan);

	plan->fftw = fft_fftwf_plan(D, dimensions, flags, ostrides, dst, istrides, src, backwards, false);

#ifdef  USE_CUDA
	plan->cuplan = NULL;
#ifndef LAZY_CUDA
	if (cuda_ondevice(src))
		plan->cuplan = fft_cuda_plan(D, dimensions, flags, ostrides, istrides, backwards);
#else
	plan->D = D;
	plan->flags = flags;
	plan->backwards = backwards;

	PTR_ALLOC(long[D], dims);
	md_copy_dims(D, *dims, dimensions);
	plan->dims = *PTR_PASS(dims);

	PTR_ALLOC(long[D], istrs);
	md_copy_strides(D, *istrs, istrides);
	plan->istrs = *PTR_PASS(istrs);

	PTR_ALLOC(long[D], ostrs);
	md_copy_strides(D, *ostrs, ostrides);
	plan->ostrs = *PTR_PASS(ostrs);
#endif
#endif

	return operator_create2(D, dimensions, ostrides, D, dimensions, istrides, CAST_UP(PTR_PASS(plan)), fft_apply, fft_free_plan);
}
예제 #6
0
파일: sampling.c 프로젝트: mrirecon/bart
static void sampling_apply(const linop_data_t* _data, complex float* dst, const complex float* src)
{
	const auto data = CAST_DOWN(sampling_data_s, _data);

#ifdef USE_CUDA
	const complex float* pattern = get_pat(data, cuda_ondevice(src));
#else
	const complex float* pattern = data->pattern;
#endif

	md_zmul2(DIMS, data->dims, data->strs, dst, data->strs, src, data->pat_strs, pattern);
}
예제 #7
0
파일: fmac.c 프로젝트: mrirecon/bart
static void fmac_apply(const linop_data_t* _data, complex float* dst, const complex float* src)
{
        auto data = CAST_DOWN(fmac_data, _data);

#ifdef USE_CUDA
	const complex float* tensor = get_tensor(data, cuda_ondevice(src));
#else
	const complex float* tensor = data->tensor;
#endif

	md_clear2(data->N, data->odims, data->ostrs, dst, CFL_SIZE);
	md_zfmac2(data->N, data->dims, data->ostrs, dst, data->istrs, src, data->tstrs, tensor);
}
예제 #8
0
파일: blas.c 프로젝트: Markusjsommer/bart
void blas_cgemm(char transa, char transb, long M, long N,  long K, const complex float alpha, long lda, const complex float A[K][lda], long ldb, const complex float B[N][ldb], const complex float beta, long ldc, complex float C[N][ldc])
{
#ifdef USE_CUDA
#define CUCOMPLEX(x) (((union { cuComplex cu; complex float std; }){ .std = (x) }).cu)
        if (cuda_ondevice(A)) {

                cublasCgemm(transa, transb, M, N, K,  CUCOMPLEX(alpha),
                                (const cuComplex*)A, lda,
                                (const cuComplex*)B, ldb, CUCOMPLEX(beta),
                                (cuComplex*)C, ldc);
        } else
#endif
        cblas_cgemm(CblasColMajor, transa, transb, M, N, K, (void*)&alpha, (void*)A, lda, (void*)B, ldb, (void*)&beta, (void*)C, ldc);
}
예제 #9
0
파일: fft.c 프로젝트: Markusjsommer/bart
const struct operator_s* fft_create2(unsigned int D, const long dimensions[D], unsigned long flags, const long ostrides[D], complex float* dst, const long istrides[D], const complex float* src, bool backwards)
{
	PTR_ALLOC(struct fft_plan_s, plan);

	plan->fftw = fft_fftwf_plan(D, dimensions, flags, ostrides, dst, istrides, src, backwards);

#ifdef  USE_CUDA
	plan->cuplan = NULL;

	if (cuda_ondevice(src))
		plan->cuplan = fft_cuda_plan(D, dimensions, flags, ostrides, istrides, backwards);
#endif

	return operator_create2(D, dimensions, ostrides, D, dimensions, istrides, &PTR_PASS(plan)->base, fft_apply, fft_free_plan);
}
예제 #10
0
파일: someops.c 프로젝트: welcheb/bart
static void cdiag_adjoint(const linop_data_t* _data, complex float* dst, const complex float* src)
{
	const struct cdiag_s* data = CAST_DOWN(cdiag_s, _data);

	const complex float* diag = data->diag;
#ifdef USE_CUDA
	if (cuda_ondevice(src)) {

		if (NULL == data->gpu_diag)
			((struct cdiag_s*)data)->gpu_diag = md_gpu_move(data->N, data->dims, data->diag, CFL_SIZE);

		diag = data->gpu_diag;
	}
#endif
	(data->rmul ? md_zrmul2 : md_zmulc2)(data->N, data->dims, data->strs, dst, data->strs, src, data->dstrs, diag);
}
예제 #11
0
파일: rand.c 프로젝트: Markusjsommer/bart
void md_gaussian_rand(unsigned int D, const long dims[D], complex float* dst)
{
#ifdef  USE_CUDA
	if (cuda_ondevice(dst)) {
	
		complex float* tmp = md_alloc(D, dims, sizeof(complex float));
		md_gaussian_rand(D, dims, tmp);
		md_copy(D, dims, dst, tmp, sizeof(complex float));
		md_free(tmp);
		return;
	}
#endif
//#pragma omp parallel for
	for (long i = 0; i < md_calc_size(D, dims); i++)
		dst[i] = (float)gaussian_rand();
}
예제 #12
0
파일: someops.c 프로젝트: hcmh/bart
static void linop_matrix_apply_adjoint(const linop_data_t* _data, complex float* dst, const complex float* src)
{
	struct operator_matrix_s* data = CAST_DOWN(operator_matrix_s, _data);
	const complex float* mat = data->mat;

#ifdef USE_CUDA
	if (cuda_ondevice(src)) {

		if (NULL == data->mat_gpu)
			data->mat_gpu = md_gpu_move(data->N, data->mat_dims, data->mat, CFL_SIZE);

		mat = data->mat_gpu;
	}
#endif

	md_ztenmulc(data->N, data->in_dims, dst, data->out_dims, src, data->mat_dims, mat);
}
예제 #13
0
파일: fft.c 프로젝트: hcmh/bart
const struct operator_s* fft_measure_create(unsigned int D, const long dimensions[D], unsigned long flags, bool inplace, bool backwards)
{
	PTR_ALLOC(struct fft_plan_s, plan);
	SET_TYPEID(fft_plan_s, plan);

	complex float* src = md_alloc(D, dimensions, CFL_SIZE);
	complex float* dst = inplace ? src : md_alloc(D, dimensions, CFL_SIZE);

	long strides[D];
	md_calc_strides(D, strides, dimensions, CFL_SIZE);

	plan->fftw = fft_fftwf_plan(D, dimensions, flags, strides, dst, strides, src, backwards, true);

	md_free(src);

	if (!inplace)
		md_free(dst);

#ifdef  USE_CUDA
	plan->cuplan = NULL;
#ifndef LAZY_CUDA
	if (cuda_ondevice(src))
          plan->cuplan = fft_cuda_plan(D, dimensions, flags, strides, strides, backwards);
#else
	plan->D = D;
	plan->flags = flags;
	plan->backwards = backwards;

	PTR_ALLOC(long[D], dims);
	md_copy_dims(D, *dims, dimensions);
	plan->dims = *PTR_PASS(dims);

	PTR_ALLOC(long[D], istrs);
	md_copy_strides(D, *istrs, strides);
	plan->istrs = *PTR_PASS(istrs);

	PTR_ALLOC(long[D], ostrs);
	md_copy_strides(D, *ostrs, strides);
	plan->ostrs = *PTR_PASS(ostrs);
#endif
#endif
	return operator_create2(D, dimensions, strides, D, dimensions, strides, CAST_UP(PTR_PASS(plan)), fft_apply, fft_free_plan);
}
예제 #14
0
파일: fft.c 프로젝트: Markusjsommer/bart
static void fft_apply(const operator_data_t* _plan, unsigned int N, void* args[N])
{
	complex float* dst = args[0];
	const complex float* src = args[1];
	const struct fft_plan_s* plan = CONTAINER_OF(_plan, const struct fft_plan_s, base);

	assert(2 == N);

#ifdef  USE_CUDA
	if (cuda_ondevice(src)) {

		assert(NULL != plan->cuplan);
		fft_cuda_exec(plan->cuplan, dst, src);

	} else 
#endif
	{
		assert(NULL != plan->fftw);
		fftwf_execute_dft(plan->fftw, (complex float*)src, dst);
	}
}
예제 #15
0
파일: fft.c 프로젝트: hcmh/bart
static void fft_apply(const operator_data_t* _plan, unsigned int N, void* args[N])
{
	complex float* dst = args[0];
	const complex float* src = args[1];
	const struct fft_plan_s* plan = CAST_DOWN(fft_plan_s, _plan);

	assert(2 == N);

#ifdef  USE_CUDA
	if (cuda_ondevice(src)) {
#ifdef	LAZY_CUDA
          if (NULL == plan->cuplan)
		((struct fft_plan_s*)plan)->cuplan = fft_cuda_plan(plan->D, plan->dims, plan->flags, plan->ostrs, plan->istrs, plan->backwards);
#endif
		assert(NULL != plan->cuplan);
		fft_cuda_exec(plan->cuplan, dst, src);

	} else 
#endif
	{
		assert(NULL != plan->fftw);
		fftwf_execute_dft(plan->fftw, (complex float*)src, dst);
	}
}