/* Print the four lanes of V (indices 0 through 3) on one line, separated by
   tabs and each formatted with "%4.3f". */
void printvec(vector4double v)
{
    printf("%4.3f\t%4.3f\t%4.3f\t%4.3f\n",
           vec_extract(v, 0),
           vec_extract(v, 1),
           vec_extract(v, 2),
           vec_extract(v, 3));
}
/* Check element access on two-lane vectors, once through vec_extract and
   once through the subscript operator.  Each check carries a label naming
   the access form, the vector and the lane being read.  */
static void test()
{
  vector long long vl = {0, 1};
  vector double vd = {0.0, 1.0};

  check (vec_extract (vl, 0) == 0, "vec_extract, vl, 0");
  check (vec_extract (vd, 1) == 1.0, "vec_extract, vd, 1");
  check (vl[0] == 0, "[], vl, 0");
  /* The label names lane 1, the lane this check actually reads.  */
  check (vd[1] == 1.0, "[], vd, 1");
}
/* Return lane 3 of A.  The lane number is passed to vec_extract through a
   local variable rather than as a literal.  */
unsigned int
extract_uint_3 (vector unsigned int a)
{
  int lane = 3;

  return vec_extract (a, lane);
}
/* Return lane 7 of A.  The lane number is passed to vec_extract through a
   local variable rather than as a literal.  */
short
extract_short_7 (vector short a)
{
  int lane = 7;

  return vec_extract (a, lane);
}
/* Read lane 3 of V (index held in a local variable) and return it converted
   to TYPE.  TYPE is defined outside this block.  */
TYPE
foo_3s (vector int v)
{
  int lane = 3;

  return (TYPE) vec_extract (v, lane);
}
/* Return lane 0 of the boolean-int vector A as an unsigned int.  The lane
   number is passed to vec_extract through a local variable.  */
unsigned int
extract_bool_int_0 (vector bool int a)
{
  int lane = 0;

  return vec_extract (a, lane);
}
/* Return lane 0 of the boolean-short vector A as an unsigned short.  The
   lane number is passed to vec_extract through a local variable.  */
unsigned short int
extract_bool_short_int_0 (vector bool short int a)
{
  int lane = 0;

  return vec_extract (a, lane);
}
/* Read lane 3 of V (index held in a local variable) and return it converted
   to TYPE.  TYPE is defined outside this block.  */
TYPE
foo_3u (vector unsigned int v)
{
  int lane = 3;

  return (TYPE) vec_extract (v, lane);
}
/* Return lane 0 of the boolean-char vector A as an unsigned char.  The lane
   number is passed to vec_extract through a local variable.  */
unsigned char
extract_bool_char_0 (vector bool char a)
{
  int lane = 0;

  return vec_extract (a, lane);
}
/* Return lane 15 of A.  The lane number is passed to vec_extract through a
   local variable rather than as a literal.  */
signed char
extract_schar_15 (vector signed char a)
{
  int lane = 15;

  return vec_extract (a, lane);
}
/* Return lane 0 of A.  The lane number is passed to vec_extract through a
   local variable rather than as a literal.  */
unsigned char
extract_uchar_0 (vector unsigned char a)
{
  int lane = 0;

  return vec_extract (a, lane);
}
/* Return lane 3 of A.  The lane number is passed to vec_extract through a
   local variable rather than as a literal.  */
int
extract_int_3 (vector int a)
{
  int lane = 3;

  return vec_extract (a, lane);
}
/* Return lane 7 of A.  The lane number is passed to vec_extract through a
   local variable rather than as a literal.  */
unsigned short
extract_ushort_7 (vector unsigned short a)
{
  int lane = 7;

  return vec_extract (a, lane);
}
inline int v_signmask(const v_int32x4& a) { static const vec_uint4 slm = {0, 1, 2, 3}; vec_int4 sv = vec_sr(a.val, vec_uint4_sp(31)); sv = vec_sl(sv, slm); sv = vec_sums(sv, vec_int4_z); return vec_extract(sv, 3); }
/* Return element i of the double-precision SIMD operand a.
   With USE_IBM defined the value comes from vec_extract; otherwise the
   operand's storage is indexed as an array of double. */
double _SIMD_extract_pd(__SIMDd a, int32_t i)
{
#ifdef USE_IBM
    return vec_extract(a, i);
#else
    double *lanes = (double *)&a;
    return lanes[i];
#endif
}
// extract scalar from SIMD operand float _SIMD_extract_ps(__SIMD a, int32_t i) { #if defined USE_IBM return vec_extract(a,i); #else return *(((float*)&a)+i); #endif }
/* Return element i of the 32-bit integer SIMD operand a.
   With USE_IBM defined the value comes from vec_extract; otherwise the
   operand's storage is indexed as an array of int32_t. */
int32_t _SIMD_extract_epi32(__SIMDi a, int32_t i)
{
#ifdef USE_IBM
    return vec_extract(a, i);
#else
    int32_t *lanes = (int32_t *)&a;
    return lanes[i];
#endif
}
static void test () { vector unsigned char vuc; vector signed char vsc; vector unsigned short vus; vector signed short vss; vector unsigned int vui; vector signed int vsi; vector float vf; init (); vuc = vec_lde (9*1, (unsigned char *)svuc); vsc = vec_lde (14*1, (signed char *)svsc); vus = vec_lde (7*2, (unsigned short *)svus); vss = vec_lde (1*2, (signed short *)svss); vui = vec_lde (3*4, (unsigned int *)svui); vsi = vec_lde (2*4, (signed int *)svsi); vf = vec_lde (0*4, (float *)svf); check (vec_extract (vuc, 9) == 9, "vuc"); check (vec_extract (vsc, 14) == 6, "vsc"); check (vec_extract (vus, 7) == 7, "vus"); check (vec_extract (vss, 1) == -3, "vss"); check (vec_extract (vui, 3) == 3, "vui"); check (vec_extract (vsi, 2) == 0, "vsi"); check (vec_extract (vf, 0) == 0.0, "vf"); }
/* Argument-count diagnostics test for vec_promote, vec_extract, vec_splats
   and vec_insert.  Each call made with the wrong number of arguments is
   annotated with the error text expected for it; the single call with no
   annotation, vec_extract (t, 2), carries no expected diagnostic.  */
int main(int argc, char **argv)
{
  vector float t;
  vec_promote(); /* { dg-error "vec_promote only accepts 2" } */
  vec_promote(1.0f); /* { dg-error "vec_promote only accepts 2" } */
  vec_promote(1.0f, 2, 3); /* { dg-error "vec_promote only accepts 2" } */
  vec_extract (); /* { dg-error "vec_extract only accepts 2" } */
  vec_extract (t); /* { dg-error "vec_extract only accepts 2" } */
  vec_extract (t, 2);
  vec_extract (t, 2, 5, 6); /* { dg-error "vec_extract only accepts 2" } */
  vec_splats (); /* { dg-error "vec_splats only accepts 1" } */
  vec_splats (t, 3); /* { dg-error "vec_splats only accepts 1" } */
  vec_insert (); /* { dg-error "vec_insert only accepts 3" } */
  vec_insert (t); /* { dg-error "vec_insert only accepts 3" } */
  vec_insert (t, 3); /* { dg-error "vec_insert only accepts 3" } */
  vec_insert (t, 3, 2, 4, 6, 6); /* { dg-error "vec_insert only accepts 3" } */
  return 0;
}
inline int v_signmask(const v_int16x8& a) { static const vec_ushort8 slm = {0, 1, 2, 3, 4, 5, 6, 7}; vec_short8 sv = vec_sr(a.val, vec_ushort8_sp(15)); sv = vec_sl(sv, slm); vec_int4 svi = vec_int4_z; svi = vec_sums(vec_sum4s(sv, svi), svi); return vec_extract(svi, 3); }
/** Mask **/ inline int v_signmask(const v_uint8x16& a) { vec_uchar16 sv = vec_sr(a.val, vec_uchar16_sp(7)); static const vec_uchar16 slm = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7}; sv = vec_sl(sv, slm); vec_uint4 sv4 = vec_sum4s(sv, vec_uint4_z); static const vec_uint4 slm4 = {0, 0, 8, 8}; sv4 = vec_sl(sv4, slm4); return vec_extract(vec_sums((vec_int4) sv4, vec_int4_z), 3); }
static void test() { vector signed int va = {-7,11,-13,17}; #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ vector signed int vb = {128,0,0,0}; #else vector signed int vb = {0,0,0,128}; #endif vector signed int vd = vec_sums (va, vb); signed int r = vec_extract (vd, 3); check (r == 136, "sums"); }
__attribute__((noinline)) void foo () { int i; vector long long va, vb, vc, vd, tmp; volatile unsigned long long three = 3; vector unsigned long long threes = vec_splats (three); for (i = 0; i < N; i+=2) { vb = vec_vsx_ld (0, (vector long long *)&cb[i]); vc = vec_vsx_ld (0, (vector long long *)&cc[i]); vd = vec_vsx_ld (0, (vector long long *)&cd[i]); tmp = vec_add (vb, vc); tmp = vec_sub (tmp, vd); tmp = vec_sra (tmp, threes); x = vec_extract (tmp, 0); vec_vsx_st (tmp, 0, (vector long long *)&ca[i]); } }
// Orthogonal Matching Pursuit using an incrementally updated Cholesky
// factorization.
//
// For each of the S signals, atoms are greedily selected from the dictionary
// until either K atoms are chosen or the squared residual norm drops to
// res_norm_bnd^2 or below. Coefficients are obtained by solving
// L L' c = p_I after each selection.
//
// Parameters:
//   m_dict        - dictionary; atom j is read from m_dict + j*M (M entries).
//   m_x           - signals; signal s is read from m_x + M*s (M entries).
//   M             - length of each atom / signal.
//   N             - number of atoms.
//   S             - number of signals.
//   K             - maximum number of atoms per signal (reset to M if K > M).
//   res_norm_bnd  - bound on the residual norm used as a stopping criterion.
//   sparse_output - 0 for a dense N x S result, otherwise a sparse N x S
//                   matrix created with mxCreateSparse.
//   verbose       - non-zero enables a diagnostic message and the profile
//                   print-out.
//
// Returns the newly created mxArray holding the coefficients. All work
// buffers allocated here are released with mxFree before returning.
//
// NOTE(review): the vec_extract called below takes four arguments
// (source, index list, destination, count); it is a helper defined elsewhere,
// presumably gathering the selected entries of the proxy - confirm.
mxArray* omp_chol(const double m_dict[], const double m_x[], mwSize M, mwSize N, mwSize S, mwSize K, double res_norm_bnd, int sparse_output, int verbose){
    // List of indices of selected atoms
    mwIndex *selected_atoms = 0;
    // Simple binary mask of selected atoms
    int* selected_atoms_mask = 0;
    // Storage for the Cholesky decomposition of D_I' D_I
    double *m_lt = 0;
    // The submatrix of selected atoms
    double* m_subdict = 0;
    // The proxy D' x
    double* v_proxy = 0;
    // The inner product of residual with atoms
    double* v_h = 0;
    // The residual
    double* v_r = 0;
    // b = D_I' d_k in the Cholesky decomposition updates
    double* v_b = 0;
    // New vector in the Cholesky decomposition updates
    double* v_w = 0;
    // Result of orthogonal projection LL' c = p_I
    double* v_c = 0;
    // Some temporary vectors
    // NOTE(review): v_t2 is allocated and freed below but not otherwise
    // referenced in this function.
    double *v_t1 = 0, *v_t2 = 0;
    // Pointer to new atom
    const double* wv_new_atom;
    // residual norm squared
    double res_norm_sqr;
    // square of upper bound on residual norm
    double res_norm_bnd_sqr = SQR(res_norm_bnd);
    // Pointer to current signal
    const double *wv_x = 0;
    /// Output array
    mxArray* p_alpha;
    double* m_alpha;
    // row indices for non-zero entries in Alpha
    mwIndex *ir_alpha;
    // indices for first non-zero entry in column
    mwIndex *jc_alpha;
    /// Index for non-zero entries in alpha (only initialized and used in
    /// the sparse-output path)
    mwIndex nz_index;
    // counters
    int i, j , k, s;
    // index of new atom
    mwIndex new_atom_index;
    // misc variables
    // NOTE(review): d1 is not referenced anywhere in this function.
    double d1, d2;
    // Maximum number of columns to be used in representations
    mwSize max_cols;
    // structure for tracking time spent.
    omp_profile profile;
    // NOTE(review): if mwSize is an unsigned type, the K < 0 test can never
    // be true - confirm against the mwSize definition in use.
    if (K < 0 || K > M) {
        // K cannot be greater than M.
        K = M;
    }
    // max_cols is at least K, so the buffers sized by it can hold every
    // atom the selection loop may pick.
    max_cols = (mwSize)(ceil(sqrt((double)M)/2.0) + 1.01);
    if(max_cols < K){
        max_cols = K;
    }
    // Memory allocations
    // Number of selected atoms cannot exceed M
    selected_atoms = (mwIndex*) mxMalloc(M*sizeof(mwIndex));
    // Total number of atoms is N
    selected_atoms_mask = (int*) mxMalloc(N*sizeof(int));
    // Number of rows in L cannot exceed M. Number of columns
    // cannot exceed max_cols.
    m_lt = (double*) mxMalloc(M*max_cols*sizeof (double));
    // Number of entries in new line for L cannot exceed N.
    v_b = (double*)mxMalloc(N*sizeof(double));
    v_w = (double*)mxMalloc(N*sizeof(double));
    v_c = (double*)mxMalloc(M*sizeof(double));
    // Giving enough space for temporary vectors
    v_t1 = (double*)mxMalloc(N*sizeof(double));
    v_t2 = (double*)mxMalloc(N*sizeof(double));
    // Keeping max_cols space for subdictionary.
    m_subdict = (double*)mxMalloc(max_cols*M*sizeof(double));
    // Proxy vector is in R^N
    v_proxy = (double*)mxMalloc(N*sizeof(double));
    // h is in R^N.
    v_h = (double*)mxMalloc(N*sizeof(double));
    // Residual is in signal space R^M.
    v_r = (double*)mxMalloc(M*sizeof(double));
    // Create the output array: dense N x S, or sparse N x S with room for
    // max_cols non-zeros per column.
    if (sparse_output == 0){
        p_alpha = mxCreateDoubleMatrix(N, S, mxREAL);
        m_alpha = mxGetPr(p_alpha);
        ir_alpha = 0;
        jc_alpha = 0;
    }else{
        p_alpha = mxCreateSparse(N, S, max_cols*S, mxREAL);
        m_alpha = mxGetPr(p_alpha);
        ir_alpha = mxGetIr(p_alpha);
        jc_alpha = mxGetJc(p_alpha);
        nz_index = 0;
        jc_alpha[0] = 0;
    }
    omp_profile_init(&profile);
    // Process the signals one at a time.
    for(s=0; s<S; ++s){
        wv_x = m_x + M*s;
        // Initialization: the residual starts out equal to the signal.
        res_norm_sqr = inner_product(wv_x, wv_x, M);
        //Compute proxy p = D' * x
        mult_mat_t_vec(1, m_dict, wv_x, v_proxy, M, N);
        omp_profile_toctic(&profile, TIME_DtR);
        // h = p = D' * r
        copy_vec_vec(v_proxy, v_h, N);
        // Clear the selection mask for this signal.
        for (i=0; i<N; ++i){
            selected_atoms_mask[i] = 0;
        }
        // Number of atoms selected so far.
        k = 0;
        // Iterate for each atom
        while (k < K && res_norm_sqr > res_norm_bnd_sqr){
            omp_profile_tic(&profile);
            // Pick the index of (k+1)-th atom
            new_atom_index = abs_max_index(v_h, N);
            omp_profile_toctic(&profile, TIME_MaxAbs);
            // If this atom is already selected, we will break
            if (selected_atoms_mask[new_atom_index]){
                // This is unlikely due to orthogonal structure of OMP
                if (verbose){
                    mexPrintf("This atom is already selected.");
                }
                break;
            }
            // Check for small values
            d2 = v_h[new_atom_index];
            if (SQR(d2) < 1e-14){
                // The inner product of residual with new atom is way too small.
                break;
            }
            // Store the index of new atom
            selected_atoms[k] = new_atom_index;
            selected_atoms_mask[new_atom_index] = 1;
            // Copy the new atom to the sub-dictionary
            wv_new_atom = m_dict + new_atom_index*M;
            copy_vec_vec(wv_new_atom, m_subdict+k*M, M);
            omp_profile_toctic(&profile, TIME_DictSubMatrixUpdate);
            // Cholesky update
            if (k == 0){
                // Simply initialize the L matrix
                // NOTE(review): presumably relies on atoms having unit
                // norm - confirm with the caller.
                *m_lt = 1;
            }else{
                // Incremental Cholesky decomposition; a non-zero return
                // stops the selection for this signal.
                if (chol_update(m_subdict, wv_new_atom, m_lt, v_b, v_w, M, k) != 0){
                    break;
                }
            }
            omp_profile_toctic(&profile, TIME_LCholUpdate);
            // It is time to increase the count of selected atoms
            ++k;
            // We will now solve the equation L L' alpha_I = p_I
            vec_extract(v_proxy, selected_atoms, v_t1, k);
            spd_chol_lt_solve(m_lt, v_t1, v_c, M, k);
            omp_profile_toctic(&profile, TIME_LLtSolve);
            // Compute residual
            // r = x - D_I c
            mult_mat_vec(-1, m_subdict, v_c, v_r, M, k);
            sum_vec_vec(1, wv_x, v_r, M);
            omp_profile_toctic(&profile, TIME_RUpdate);
            // Update h = D' r
            mult_mat_t_vec(1, m_dict, v_r, v_h, M, N);
            // Update residual norm squared
            res_norm_sqr = inner_product(v_r, v_r, M);
            omp_profile_toctic(&profile, TIME_DtR);
            //mexPrintf(".\n");
        }
        // Write the output vector
        if(sparse_output == 0){
            // Write the output vector into column s of the dense result.
            double* wv_alpha = m_alpha + N*s;
            fill_vec_sparse_vals(v_c, selected_atoms, wv_alpha, N, k);
        }
        else{
            // Sort the row indices
            quicksort_indices(selected_atoms, v_c, k);
            // add the non-zero entries for this column
            for(j=0; j <k; ++j){
                m_alpha[nz_index] = v_c[j];
                ir_alpha[nz_index] = selected_atoms[j];
                ++nz_index;
            }
            // fill in the total number of nonzero entries in the end.
            jc_alpha[s+1] = jc_alpha[s] + k;
        }
    }
    if(verbose){
        omp_profile_print(&profile);
    }
    // Memory cleanup
    mxFree(selected_atoms);
    mxFree(selected_atoms_mask);
    mxFree(m_lt);
    mxFree(v_b);
    mxFree(v_w);
    mxFree(v_c);
    mxFree(v_t1);
    mxFree(v_t2);
    mxFree(m_subdict);
    mxFree(v_proxy);
    mxFree(v_h);
    mxFree(v_r);
    // Return the result
    return p_alpha;
}
/* Apply vec_vbpermq to A and B and return lane OFFSET of the result as a
   long.  OFFSET is defined outside this block.  */
long
foou (vector unsigned char a, vector unsigned char b)
{
  long picked = vec_extract (vec_vbpermq (a, b), OFFSET);

  return picked;
}
/* Read lane 3 of V and return it converted to TYPE.  TYPE is defined
   outside this block.  */
TYPE
foo_3u (vector unsigned int v)
{
  return (TYPE) vec_extract (v, 3);
}
/* Read lane 2 of V and return it converted to TYPE.  TYPE is defined
   outside this block.  */
TYPE
foo_2s (vector int v)
{
  return (TYPE) vec_extract (v, 2);
}
/* Return lane OFFSET of V.  OFFSET is defined outside this block.  */
long
get_value (vector long v)
{
  long lane_value = vec_extract (v, OFFSET);

  return lane_value;
}
/* Return lane N of the vector that P points to.  */
short
extract_hi_n_mem (vector short *p, int n)
{
  short lane_value = vec_extract (*p, n);

  return lane_value;
}
/* Return lane 0 of the vector that P points to.  */
double
get_value (vector double *p)
{
  double lane0 = vec_extract (*p, 0);

  return lane0;
}