/**
 * Generic function which loops over all dimensions of a set of
 * multi-dimensional arrays and calls a given function for each position.
 * This function tries to parallelize over the dimensions indicated
 * with flags.
 */
void md_parallel_nary(unsigned int C, unsigned int D, const long dim[D], unsigned long flags, const long* str[C], void* ptr[C], void* data, md_nary_fun_t fun)
{
    if (0 == flags) {

        md_nary(C, D, dim, str, ptr, data, fun);
        return;
    }

    int b = ffsl(flags & -flags) - 1;   // lowest set bit: the dimension to parallelize over
    assert(MD_IS_SET(flags, b));
    flags = MD_CLEAR(flags, b);

    long dimc[D];
    md_select_dims(D, ~MD_BIT(b), dimc, dim);

    debug_printf(DP_DEBUG4, "Parallelize: %ld\n", dim[b]);

    // FIXME: this probably doesn't nest
    // (maybe collect all parallelizable dims into one giant loop?)
    #pragma omp parallel for
    for (long i = 0; i < dim[b]; i++) {

        void* moving_ptr[C];

        for (unsigned int j = 0; j < C; j++)
            moving_ptr[j] = ptr[j] + i * str[j][b];

        md_parallel_nary(C, D, dimc, flags, str, moving_ptr, data, fun);
    }
}
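/* The recursion above peels off one dimension per call by isolating the lowest
 * set bit of 'flags'. Below is a minimal standalone sketch of just that bit
 * manipulation; it is not part of the original source, and MD_BIT/MD_IS_SET/
 * MD_CLEAR are local stand-ins assumed to match the library's macros. */
#define _GNU_SOURCE
#include <assert.h>
#include <stdio.h>
#include <string.h>     // ffsl()

#define MD_BIT(x)       (1ul << (x))
#define MD_IS_SET(x, y) ((x) & MD_BIT(y))
#define MD_CLEAR(x, y)  ((x) & ~MD_BIT(y))

int main(void)
{
    unsigned long flags = MD_BIT(1) | MD_BIT(3) | MD_BIT(6);   // dims 1, 3 and 6 selected

    while (0 != flags) {

        int b = ffsl(flags & -flags) - 1;   // index of the lowest set bit

        assert(MD_IS_SET(flags, b));
        flags = MD_CLEAR(flags, b);

        printf("parallelize dim %d, remaining flags: 0x%lx\n", b, flags);
    }

    return 0;   // prints dims 1, 3, 6 with remaining flags 0x48, 0x40, 0x0
}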
void tv_adjoint(unsigned int D, const long dims[D], unsigned int flags, complex float* out, const complex float* in)
{
    unsigned int N = bitcount(flags);

    assert(N == dims[D - 1]);   // we use the highest dim to store our different partial derivatives

    unsigned int flags2 = flags;

    complex float* tmp = md_alloc_sameplace(D - 1, dims, CFL_SIZE, out);

    md_clear(D - 1, dims, out, CFL_SIZE);
    md_clear(D - 1, dims, tmp, CFL_SIZE);

    for (unsigned int i = 0; i < N; i++) {

        unsigned int lsb = ffs(flags2) - 1;
        flags2 = MD_CLEAR(flags2, lsb);

        md_zfdiff_backwards(D - 1, dims, lsb, tmp, in + i * md_calc_size(D - 1, dims));
        md_zadd(D - 1, dims, out, out, tmp);
    }

    md_free(tmp);

    assert(0 == flags2);
}
static long wavelet_filter_flags(unsigned int N, long flags, const long dims[N], const long min[N])
{
    for (unsigned int i = 0; i < N; i++)
        if (dims[i] < min[i])   // CHECK
            flags = MD_CLEAR(flags, i);

    return flags;
}
/**
 * compute set of parallelizable dimensions
 */
static unsigned int parallelizable(unsigned int D, unsigned int io, unsigned int N, const long dims[N], long (*strs[D])[N], size_t size[D])
{
    // we assume no input / output overlap
    // (i.e. inputs which are also outputs have to be marked as output)

    // a dimension is parallelizable if all output operations
    // for that dimension are independent

    // for all output operations:
    // check - all other dimensions have strides greater than or equal to
    // the extent of this dimension, or have an extent smaller than or
    // equal to the stride of this dimension

    // no overlap: [222]
    //                  [111111111111]
    //                                [333333333]
    //
    // overlap:    [222]
    //                  [1111111111111111]
    //                                [333333333]

    unsigned int flags = (1 << N) - 1;

    for (unsigned int d = 0; d < D; d++) {

        if (MD_IS_SET(io, d)) {

            bool m[N][N];
            compute_enclosures(N, m, dims, *strs[d]);

            // print_dims(N, dims);
            // print_dims(N, *strs[d]);

            for (unsigned int i = 0; i < N; i++) {

                unsigned int a = 0;

                for (unsigned int j = 0; j < N; j++)
                    if (m[i][j] || m[j][i])
                        a++;

                // printf("%d %d %d\n", d, i, a);

                if ((a != N - 1) || ((size_t)labs((*strs[d])[i]) < size[d]))
                    flags = MD_CLEAR(flags, i);
            }
        }
    }

    return flags;
}
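/* Hypothetical worked example of the non-overlap rule stated in the comment
 * above (an assumed reading of that rule, not the library's
 * compute_enclosures()): taking the extent of a dimension to be
 * dims[i] * |strs[i]|, another dimension cannot interleave with it if its
 * stride jumps over that whole extent, or if its own extent fits inside a
 * single stride step. Standalone sketch, not part of the original source: */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static bool no_overlap(long dim_p, long str_p, long dim_q, long str_q)
{
    long ext_p = dim_p * labs(str_p);
    long ext_q = dim_q * labs(str_q);

    return (labs(str_q) >= ext_p) || (ext_q <= labs(str_p));
}

int main(void)
{
    // contiguous 3x4 array of complex floats: strides { 8, 24 } bytes
    printf("%d\n", no_overlap(4, 24, 3, 8));    // 1: dim 1 can be parallelized

    // same dims, but dim 1 strides by only 16 bytes: outputs alias,
    // e.g. positions (2,0) and (0,1) both land at byte offset 16
    printf("%d\n", no_overlap(4, 16, 3, 8));    // 0: dim 1 must stay sequential

    return 0;
}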
static void fftmod2_r(unsigned int N, const long dims[N], unsigned long flags, const long ostrs[N], complex float* dst, const long istrs[N], const complex float* src, bool inv, double phase)
{
    if (0 == flags) {

        md_zsmul2(N, dims, ostrs, dst, istrs, src, cexp(M_PI * 2.i * (inv ? -phase : phase)));
        return;
    }

    /* this will also currently be slow on the GPU because we do not
     * support strides there on the lowest level */

    unsigned int i = N - 1;
    while (!MD_IS_SET(flags, i))
        i--;

#if 1
    // If only one dimension is left and it is the innermost and
    // contiguous, optimize using md_zfftmod2

    if ((0u == MD_CLEAR(flags, i)) && (1 == md_calc_size(i, dims))
        && (CFL_SIZE == ostrs[i]) && (CFL_SIZE == istrs[i])) {

        md_zfftmod2(N - i, dims + i, ostrs + i, dst, istrs + i, src, inv, phase);
        return;
    }
#endif

    long tdims[N];
    md_select_dims(N, ~MD_BIT(i), tdims, dims);

    #pragma omp parallel for
    for (int j = 0; j < dims[i]; j++)
        fftmod2_r(N, tdims, MD_CLEAR(flags, i), ostrs, (void*)dst + j * ostrs[i], istrs, (void*)src + j * istrs[i], inv, phase + fftmod_phase(dims[i], j));
}
void tv_op(unsigned int D, const long dims[D], unsigned int flags, complex float* out, const complex float* in)
{
    unsigned int N = bitcount(flags);

    assert(N == dims[D - 1]);   // we use the highest dim to store our different partial derivatives

    unsigned int flags2 = flags;

    for (unsigned int i = 0; i < N; i++) {

        unsigned int lsb = ffs(flags2) - 1;
        flags2 = MD_CLEAR(flags2, lsb);

        md_zfdiff(D - 1, dims, lsb, out + i * md_calc_size(D - 1, dims), in);
    }

    assert(0 == flags2);
}
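/* Hypothetical usage sketch for tv_op()/tv_adjoint() (not part of the
 * original source; tv_example, the 64x64 size and flags = 3 are illustrative
 * choices): the highest dimension of 'dims' must equal bitcount(flags) and
 * holds the stacked partial derivatives. */
static void tv_example(void)
{
    long dims[3] = { 64, 64, 2 };   // dims[2] == bitcount(3)

    complex float* img = md_alloc(2, dims, CFL_SIZE);    // 64x64 image
    complex float* grad = md_alloc(3, dims, CFL_SIZE);   // two stacked difference images

    md_clear(2, dims, img, CFL_SIZE);   // placeholder input

    tv_op(3, dims, 3, grad, img);       // forward: one finite difference per flagged dim
    tv_adjoint(3, dims, 3, img, grad);  // adjoint: sum of backward differences into img

    md_free(img);
    md_free(grad);
}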
// FIXME: consider moving this to a more accessible location?
static void wthresh(unsigned int D, const long dims[D], float lambda, unsigned int flags, complex float* out, const complex float* in)
{
    long minsize[D];
    md_singleton_dims(D, minsize);

    long coarse_scale[3] = MD_INIT_ARRAY(3, 16);
    md_copy_dims(3, minsize, coarse_scale);

    unsigned int wflags = 7; // FIXME

    for (unsigned int i = 0; i < 3; i++)
        if (dims[i] < minsize[i])
            wflags = MD_CLEAR(wflags, i);

    long strs[D];
    md_calc_strides(D, strs, dims, CFL_SIZE);

    const struct linop_s* w = linop_wavelet_create(D, wflags, dims, strs, minsize, false);
    const struct operator_p_s* p = prox_unithresh_create(D, w, lambda, flags);

    operator_p_apply(p, 1., D, dims, out, D, dims, in);

    operator_p_free(p);
}
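/* Hypothetical call site (a sketch, not from the original file; the name
 * wthresh_example, the 128x128x32 size and the 0.01 threshold are made-up
 * values): applies the wavelet soft-thresholding above to a single 3D
 * volume, with no joint-thresholding dimensions selected. */
static void wthresh_example(void)
{
    long dims[3] = { 128, 128, 32 };

    complex float* in = md_alloc(3, dims, CFL_SIZE);
    complex float* out = md_alloc(3, dims, CFL_SIZE);

    md_clear(3, dims, in, CFL_SIZE);    // placeholder input

    wthresh(3, dims, 0.01f, 0u, out, in);

    md_free(in);
    md_free(out);
}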
static unsigned long clear_singletons(unsigned int N, const long dims[N], unsigned long flags)
{
    return (0 == N) ? flags
            : clear_singletons(N - 1, dims, (1 == dims[N - 1]) ? MD_CLEAR(flags, N - 1) : flags);
}
int main_homodyne(int argc, char* argv[])
{
    bool clear = false;
    const char* phase_ref = NULL;

    int com;
    while (-1 != (com = getopt(argc, argv, "hCP:"))) {

        switch (com) {

        case 'C':
            clear = true;
            break;

        case 'P':
            phase_ref = strdup(optarg);
            break;

        case 'h':
            help(argv[0], stdout);
            exit(0);

        default:
            help(argv[0], stderr);
            exit(1);
        }
    }

    if (argc - optind != 4) {

        usage(argv[0], stderr);
        exit(1);
    }

    const int N = DIMS;
    long dims[N];
    complex float* idata = load_cfl(argv[optind + 2], N, dims);
    complex float* data = create_cfl(argv[optind + 3], N, dims);

    int pfdim = atoi(argv[optind + 0]);
    float frac = atof(argv[optind + 1]);

    assert((0 <= pfdim) && (pfdim < N));
    assert(frac > 0.);

    long strs[N];
    md_calc_strides(N, strs, dims, CFL_SIZE);

    struct wdata wdata;
    wdata.frac = frac;
    wdata.pfdim = pfdim;
    md_select_dims(N, MD_BIT(pfdim), wdata.wdims, dims);
    md_calc_strides(N, wdata.wstrs, wdata.wdims, CFL_SIZE);
    wdata.weights = md_alloc(N, wdata.wdims, CFL_SIZE);
    md_loop(N, wdata.wdims, &wdata, comp_weights);

    long pstrs[N];
    long pdims[N];
    complex float* phase = NULL;

    if (NULL == phase_ref) {

        phase = estimate_phase(wdata, FFT_FLAGS, N, dims, idata);
        md_copy_dims(N, pdims, dims);

    } else
        phase = load_cfl(phase_ref, N, pdims);

    md_calc_strides(N, pstrs, pdims, CFL_SIZE);

    complex float* cdata = NULL;
    complex float* idata2 = NULL;

    if (clear) {

        long cdims[N];
        md_select_dims(N, ~MD_BIT(pfdim), cdims, dims);
        cdims[pfdim] = (int)(dims[pfdim] * frac);
        cdata = md_alloc(N, cdims, CFL_SIZE);
        idata2 = anon_cfl(NULL, N, dims);

        md_resize(N, cdims, cdata, dims, idata, CFL_SIZE);
        md_resize(N, dims, idata2, cdims, cdata, CFL_SIZE);

        md_free(cdata);

        unmap_cfl(N, dims, idata);
        idata = idata2;
    }

    if ((1 == dims[PHS2_DIM]) || (PHS2_DIM == pfdim)) {

        homodyne(wdata, FFT_FLAGS, N, dims, strs, data, idata, pstrs, phase);

    } else {

        unsigned int pardim = PHS2_DIM;

        ifftuc(N, dims, MD_CLEAR(FFT_FLAGS, pfdim), data, idata);

        long rdims[N];
        md_select_dims(N, ~MD_BIT(pardim), rdims, dims);

        long rstrs[N];
        md_calc_strides(N, rstrs, rdims, CFL_SIZE);

        #pragma omp parallel for
        for (unsigned int i = 0; i < dims[pardim]; i++) {

            complex float* tmp = md_alloc(N, rdims, CFL_SIZE);

            long pos[N];
            md_set_dims(N, pos, 0);
            pos[pardim] = i;

            md_copy_block(N, pos, rdims, tmp, dims, data, CFL_SIZE);

            homodyne(wdata, MD_BIT(pfdim), N, rdims, rstrs, tmp, tmp, pstrs, phase);

            md_copy_block(N, pos, dims, data, rdims, tmp, CFL_SIZE);

            md_free(tmp);
        }
    }

    md_free(wdata.weights);

    if (NULL == phase_ref)
        md_free(phase);
    else {

        unmap_cfl(N, pdims, phase);
        free((void*)phase_ref);
    }

    unmap_cfl(N, dims, idata);
    unmap_cfl(N, dims, data);

    exit(0);
}
int main_homodyne(int argc, char* argv[])
{
    mini_cmdline(argc, argv, 4, usage_str, help_str);

    const int N = DIMS;
    long dims[N];
    complex float* idata = load_cfl(argv[3], N, dims);
    complex float* data = create_cfl(argv[4], N, dims);

    int pfdim = atoi(argv[1]);
    float frac = atof(argv[2]);

    assert((0 <= pfdim) && (pfdim < N));
    assert(frac > 0.);

    long strs[N];
    md_calc_strides(N, strs, dims, CFL_SIZE);

    struct wdata wdata;
    wdata.frac = frac;
    wdata.pfdim = pfdim;
    md_select_dims(N, MD_BIT(pfdim), wdata.wdims, dims);
    md_calc_strides(N, wdata.wstrs, wdata.wdims, CFL_SIZE);
    wdata.weights = md_alloc(N, wdata.wdims, CFL_SIZE);
    md_loop(N, wdata.wdims, &wdata, comp_weights);

    if ((1 == dims[PHS2_DIM]) || (PHS2_DIM == pfdim)) {

        homodyne(wdata, FFT_FLAGS, N, dims, strs, data, idata);

    } else {

        unsigned int pardim = PHS2_DIM;

        ifftuc(N, dims, MD_CLEAR(FFT_FLAGS, pfdim), data, idata);

        long rdims[N];
        md_select_dims(N, ~MD_BIT(pardim), rdims, dims);

        long rstrs[N];
        md_calc_strides(N, rstrs, rdims, CFL_SIZE);

        #pragma omp parallel for
        for (unsigned int i = 0; i < dims[pardim]; i++) {

            complex float* tmp = md_alloc(N, rdims, CFL_SIZE);

            long pos[N];
            md_set_dims(N, pos, 0);
            pos[pardim] = i;

            md_copy_block(N, pos, rdims, tmp, dims, data, CFL_SIZE);

            homodyne(wdata, MD_BIT(pfdim), N, rdims, rstrs, tmp, tmp);

            md_copy_block(N, pos, dims, data, rdims, tmp, CFL_SIZE);

            md_free(tmp);
        }
    }

    md_free(wdata.weights);

    unmap_cfl(N, dims, idata);
    unmap_cfl(N, dims, data);

    exit(0);
}