/**
 * Command-line entry point: transpose two dimensions of a cfl array.
 *
 * Expected arguments (after mini_cmdline has checked the count):
 *   argv[1], argv[2]  indices of the two dimensions to swap
 *   argv[3]           input file name
 *   argv[4]           output file name
 *
 * The dimension arguments must be plain decimal integers in [0, DIMS).
 * Malformed or out-of-range values trip an assertion instead of being
 * silently interpreted (atoi would map "abc" to 0 and "1x" to 1).
 *
 * This function does not return: it terminates the process via exit(0).
 */
int main_transpose(int argc, char* argv[])
{
	mini_cmdline(argc, argv, 4, usage_str, help_str);

	int N = DIMS;

	long idims[N];

	// Parse and validate both dimension indices the same way.
	int dim[2];

	for (int i = 0; i < 2; i++) {

		const char* arg = argv[1 + i];
		char* end = NULL;

		long val = strtol(arg, &end, 10);

		// The whole argument has to be consumed and at least one digit read.
		assert((end != arg) && ('\0' == *end));

		// Range check on the long value before narrowing to int; this also
		// rejects the LONG_MIN / LONG_MAX values strtol returns on overflow.
		assert((0 <= val) && (val < N));

		dim[i] = (int)val;
	}

	int dim1 = dim[0];
	int dim2 = dim[1];

	// load_cfl fills idims with the dimensions of the input.
	complex float* idata = load_cfl(argv[3], N, idims);

	long odims[N];
	md_transpose_dims(N, dim1, dim2, odims, idims);

	complex float* odata = create_cfl(argv[4], N, odims);

	md_transpose(N, dim1, dim2, odims, odata, idims, idata, sizeof(complex float));

	unmap_cfl(N, idims, idata);
	unmap_cfl(N, odims, odata);

	exit(0);
}
/**
 * Round-trip check for md_transpose.
 *
 * Fills a 10x10x10x10 array with random values, swaps dimensions 0 and 2,
 * swaps them again, and returns the result of comparing the twice-transposed
 * array with the original via md_compare.
 */
static bool test_md_transpose(void)
{
	enum { N = 4 };

	long dims[N] = { 10, 10, 10, 10 };

	// Reference data.
	complex float* orig = md_alloc(N, dims, sizeof(complex float));
	md_gaussian_rand(N, dims, orig);

	// Buffers for the result after one and after two transposes.
	complex float* once = md_alloc(N, dims, sizeof(complex float));
	complex float* twice = md_alloc(N, dims, sizeof(complex float));

	// All extents are equal, so the same dims describe input and output.
	md_transpose(N, 0, 2, dims, once, dims, orig, sizeof(complex float));
	md_transpose(N, 0, 2, dims, twice, dims, once, sizeof(complex float));

	bool same = md_compare(N, dims, orig, twice, sizeof(complex float));

	md_free(orig);
	md_free(once);
	md_free(twice);

	return same;
}
/**
 * Time a single md_transpose of dimensions 0 and 1 on a square array.
 *
 * @param scale multiplier for the edge length; the array is
 *              (2000 * scale) x (2000 * scale), all other dimensions are 1
 * @return elapsed time between the two timestamp() calls around md_transpose
 *
 * Allocation, random fill and clearing of the buffers happen outside the
 * timed region.
 */
static double bench_transpose(long scale)
{
	long dims[DIMS];

	// Set every entry explicitly. A brace initializer with a fixed number of
	// values would leave any remaining entries zero-initialized whenever DIMS
	// exceeds that number, which would describe an empty array.
	for (unsigned int i = 0; i < DIMS; i++) {

		dims[i] = 1;
	}

	dims[0] = 2000 * scale;
	dims[1] = 2000 * scale;

	complex float* x = md_alloc(DIMS, dims, CFL_SIZE);
	complex float* y = md_alloc(DIMS, dims, CFL_SIZE);

	md_gaussian_rand(DIMS, dims, x);
	md_clear(DIMS, dims, y, CFL_SIZE);

	// Only the transpose itself is timed. dims[0] == dims[1], so the same
	// dims are valid for both the output and the input.
	double tic = timestamp();

	md_transpose(DIMS, 0, 1, dims, y, dims, x, CFL_SIZE);

	double toc = timestamp();

	md_free(x);
	md_free(y);

	return toc - tic;
}
static void linop_matrix_apply_normal(const linop_data_t* _data, complex float* dst, const complex float* src) { const struct operator_matrix_s* data = CAST_DOWN(operator_matrix_s, _data); unsigned int N = data->mat_iovec->N; // FIXME check all the cases where computation can be done with blas //debug_printf(DP_DEBUG1, "compute normal\n"); if (cgemm_forward_standard(data)) { long max_dims_gram[N]; md_copy_dims(N, max_dims_gram, data->domain_iovec->dims); max_dims_gram[data->T_dim] = data->K; long tmp_dims[N]; long tmp_str[N]; md_copy_dims(N, tmp_dims, max_dims_gram); tmp_dims[data->K_dim] = 1; md_calc_strides(N, tmp_str, tmp_dims, CFL_SIZE); complex float* tmp = md_alloc_sameplace(N, data->domain_iovec->dims, CFL_SIZE, dst); md_clear(N, data->domain_iovec->dims, tmp, CFL_SIZE); md_zfmac2(N, max_dims_gram, tmp_str, tmp, data->domain_iovec->strs, src, data->mat_gram_iovec->strs, data->mat_gram); md_transpose(N, data->T_dim, data->K_dim, data->domain_iovec->dims, dst, tmp_dims, tmp, CFL_SIZE); md_free(tmp); } else { long L = md_calc_size(data->T_dim, data->domain_iovec->dims); blas_cgemm('N', 'T', L, data->K, data->K, 1., L, (const complex float (*)[])src, data->K, (const complex float (*)[])data->mat_gram, 0., L, (complex float (*)[])dst); } }
/**
 * Compute the Gram matrix, A^H A, of a matrix embedded in an N-dimensional array.
 *
 * The result is written to a buffer allocated by this function (next to
 * @p matrix, via md_alloc_sameplace) and handed back through @p gram.
 *
 * @param N number of dimensions
 * @param T_dim dimension corresponding to the rows of A
 * @param T number of rows of A (codomain)
 * @param K_dim dimension corresponding to the columns of A
 * @param K number of columns of A (domain)
 * @param gram receives the newly allocated Gram matrix
 * @param matrix_dims dimensions of A
 * @param matrix matrix data
 *
 * @return iovec_s describing the N-dimensional Gram matrix (size K at both
 *         T_dim and K_dim, 1 elsewhere)
 */
const struct iovec_s* compute_gram_matrix(unsigned int N, unsigned int T_dim, unsigned int T, unsigned int K_dim, unsigned int K, complex float** gram, const long matrix_dims[N], const complex float* matrix)
{
	// FIXME this can certainly be simplified...
	// Just be careful to consider the case where the data passed to the
	// operator is a subset of a bigger array

	// All intermediate arrays carry one extra trailing dimension (index N)
	// that holds the second K axis of the Gram matrix.
	const unsigned int D = N + 1;
	const unsigned int X = N;

	// Layouts, written as [T_dim K_dim extra] or [K_dim T_dim extra]:
	//   mat   = [T K 1] or [K T 1]   (copy of A)
	//   matX  = [T 1 K] or [1 T K]   (A with its K axis moved to the extra dim)
	//   acc   = [1 K K] or [K 1 K]   (accumulated product)
	//   out   = K at T_dim and K_dim, 1 in the extra dim (final Gram layout)
	long acc_dims[D];
	long mat_dims[D];
	long matX_dims[D];
	long out_dims[D];
	long loop_dims[D];

	long acc_strs[D];
	long mat_strs[D];
	long matX_strs[D];

	md_singleton_dims(D, acc_dims);
	md_singleton_dims(D, mat_dims);
	md_singleton_dims(D, matX_dims);
	md_singleton_dims(D, out_dims);
	md_singleton_dims(D, loop_dims);

	acc_dims[K_dim] = K;
	acc_dims[X] = K;

	mat_dims[T_dim] = T;
	mat_dims[K_dim] = K;

	matX_dims[T_dim] = T;
	matX_dims[X] = K;

	loop_dims[T_dim] = T;
	loop_dims[K_dim] = K;
	loop_dims[X] = K;

	out_dims[T_dim] = K;
	out_dims[K_dim] = K;

	md_calc_strides(D, acc_strs, acc_dims, CFL_SIZE);
	md_calc_strides(D, mat_strs, mat_dims, CFL_SIZE);
	md_calc_strides(D, matX_strs, matX_dims, CFL_SIZE);

	complex float* acc = md_alloc_sameplace(D, acc_dims, CFL_SIZE, matrix);
	complex float* mat = md_alloc_sameplace(D, mat_dims, CFL_SIZE, matrix);
	complex float* matX = md_alloc_sameplace(D, matX_dims, CFL_SIZE, matrix);

	// Bring A into the working buffer, then build the copy whose K axis
	// lives in the extra dimension.
	md_copy(N, matrix_dims, mat, matrix, CFL_SIZE);
	md_transpose(D, K_dim, X, matX_dims, matX, mat_dims, mat, CFL_SIZE);

	// Accumulate the product over T_dim (acc has size 1 there).
	// md_zfmacc2 presumably conjugates its last operand; confirm against its
	// definition.
	md_clear(D, acc_dims, acc, CFL_SIZE);
	md_zfmacc2(D, loop_dims, acc_strs, acc, mat_strs, mat, matX_strs, matX);

	// Fold the extra dimension back into T_dim:
	// [1 K1 K2] --> [K2 K1 1] or [K1 1 K2] --> [K1 K2 1]
	*gram = md_alloc_sameplace(N, out_dims, CFL_SIZE, matrix);
	md_transpose(D, T_dim, X, out_dims, *gram, acc_dims, acc, CFL_SIZE);

	const struct iovec_s* result = iovec_create(N, out_dims, CFL_SIZE);

	md_free(acc);
	md_free(mat);
	md_free(matX);

	return result;
}