// fills vscale and vshift with values so that algorithm performs // vdst = vscale * vsrc + vbeta next; void compute_vscaleshift(const Vmm &vscale, const Vmm &vshift, const Vmm &vmean, const Vmm &vsqrtvar, size_t offt, bool need_tail = false) { if (need_tail) { uni_vmovups_tail(vmean, mean_ptr(offt)); uni_vmovups_tail(vsqrtvar, var_ptr(offt)); } else { uni_vmovups(vmean, mean_ptr(offt)); uni_vmovups(vsqrtvar, var_ptr(offt)); } uni_vaddps(vsqrtvar, vsqrtvar, veps); uni_vsqrtps(vsqrtvar, vsqrtvar); if (bdesc_->use_scaleshift()) { if (need_tail) { uni_vmovups_tail(vscale, scale_ptr(offt)); uni_vmovups_tail(vshift, shift_ptr(offt)); } else { uni_vmovups(vscale, scale_ptr(offt)); uni_vmovups(vshift, shift_ptr(offt)); } vdivps(vscale, vscale, vsqrtvar); uni_vfnmadd231ps(vshift, vmean, vscale); } else { vdivps(vscale, vone, vsqrtvar); uni_vmulps(vmean, vmean, vscale); uni_vsubps(vshift, vzero, vmean); } };
int main(int argc, char *argv[]) { int i,j,trials=1, verbose=0,n = 3; data_t **a,**b,mask=1023,c = 7; v_vptr scale_ptr; /* get options */ for (i=1; i<argc; i++) if (strcmp(argv[i],"-n") == 0) n = atoi(argv[++i]); else if (strcmp(argv[i],"-t") == 0) trials = atoi(argv[++i]); else if (strcmp(argv[i],"-m") == 0) mask = atoi(argv[++i]); else if (strcmp(argv[i],"-c") == 0) c = atoi(argv[++i]); else if (strcmp(argv[i],"-v") == 0) verbose = 1; else if (strcmp(argv[i],"-d") == 0) disass = 1; else if (strcmp(argv[i],"-p") == 0) pixie = 1; else if (strcmp(argv[i],"-s") == 0) strength_reduce = 0; /* allocate and initialize */ a = new_matrix(n,n); b = new_matrix(n,n); for(i=0;i<n;i++) { for(j=0;j<n;j++) { b[i][j] = (random() & mask) + 1; } } if(verbose) { printf("constant = %u\n",c); m_print("B", b, n); } scale_ptr = scale(n, a, b); if(verbose) printf("doing constants 1 to %d\n",c); for(i=0;i<trials;i++) { int c1; if(!verbose) startwatch(0); for(c1 = 0; c1 < c; c1++) scale_ptr(a, b, c1); if(!verbose) stopwatch(0); } if(verbose) m_print("A", a, n); return 0; }