__v8sf test_mm(void) { __v8sf x[128]; /* 4kB */ __v8sf y[128]; /* 4kB */ __v8sf z= {0,0,0,0,0,0,0,0}, z2= {0,0,0,0,0,0,0,0}, z3= {0,0,0,0,0,0,0,0}, z4= {0,0,0,0,0,0,0,0}, z5= {0,0,0,0,0,0,0,0}, z6= {0,0,0,0,0,0,0,0}, z7= {0,0,0,0,0,0,0,0}, z8= {0,0,0,0,0,0,0,0}; struct timespec start, end; double iv; long long int i; printf("Testing memory-memory arithmetic speed..."); fflush(stdout); for(i= 0; i < 128*8; i++) ((float *)x)[i]= 1.0; for(i= 0; i < 128*8; i++) ((float *)y)[i]= 1.0; if(clock_gettime(CLOCK_REALTIME, &start)) { perror("clock_gettime"); exit(EXIT_FAILURE); } asm volatile("foo_mm:"); for(i= 0; i < REPS; i++) { z= __builtin_ia32_addps256(z, __builtin_ia32_mulps256(y[(8*i)%128],x[(8*i)%128])); z2= __builtin_ia32_addps256(z2, __builtin_ia32_mulps256(y[(8*i+1)%128],x[(8*i+1)%128])); z3= __builtin_ia32_addps256(z3, __builtin_ia32_mulps256(y[(8*i+2)%128],x[(8*i+2)%128])); z4= __builtin_ia32_addps256(z4, __builtin_ia32_mulps256(y[(8*i+3)%128],x[(8*i+3)%128])); z5= __builtin_ia32_addps256(z5, __builtin_ia32_mulps256(y[(8*i+4)%128],x[(8*i+4)%128])); z6= __builtin_ia32_addps256(z6, __builtin_ia32_mulps256(y[(8*i+5)%128],x[(8*i+5)%128])); z7= __builtin_ia32_addps256(z7, __builtin_ia32_mulps256(y[(8*i+6)%128],x[(8*i+6)%128])); z8= __builtin_ia32_addps256(z8, __builtin_ia32_mulps256(y[(8*i+7)%128],x[(8*i+7)%128])); } asm volatile("bar_mm:"); if(clock_gettime(CLOCK_REALTIME, &end)) { perror("clock_gettime"); exit(EXIT_FAILURE); } iv= end.tv_sec + end.tv_nsec*1e-9 - start.tv_sec - start.tv_nsec*1e-9; printf("%.3e REPS %.3es %.3fGFLOPS\n", (double)REPS, iv, (8 * 2 * 8 * REPS) / iv / 1e9); return z + z2 + z3 + z4 + z5 + z6 + z7 + z8; }
__v8sf test_big(long long int ymask) { long long int i, j; struct timespec start, end; __v8sf *x, *y; __v8sf z[STRIDE], zf= {0,0,0,0,0,0,0,0}; double iv; int r; printf("Testing memory-memory arithmetic speed with large vectors...\n"); printf("\tone working set restricted to %lldkB\n", ((ymask+1)*32)/1024); r= posix_memalign((void **)&x, sizeof(__v8sf), LENGTH * sizeof(__v8sf)); r|= posix_memalign((void **)&y, sizeof(__v8sf), LENGTH * sizeof(__v8sf)); if(r) { fprintf(stderr, "posix_memalign failed"); abort(); } for(j= 0; j < LENGTH * 8ULL; j++) { ((float *)x)[j]= 1.0; } for(j= 0; j < LENGTH * 8ULL; j++) { ((float *)y)[j]= 0.5; } for(j= 0; j < STRIDE * 8; j++) { ((float *)z)[j]= 0.0; } clock_gettime(CLOCK_REALTIME, &start); asm volatile("foo_big:"); for(r= 0; r < BIGREPS; r++) { for(i= 0; i < LENGTH; i+= STRIDE) { for(j=0; j < STRIDE; j++) { z[j]= __builtin_ia32_addps256(z[j], __builtin_ia32_mulps256(y[(i+j)&ymask],x[i+j])); } } } asm volatile("end_big:"); clock_gettime(CLOCK_REALTIME, &end); iv= end.tv_sec + end.tv_nsec*1e-9 - start.tv_sec - start.tv_nsec*1e-9; printf("\t%.3e REPS %.3es %.3fGFLOPS\n", BIGREPS * (double)LENGTH, iv, (BIGREPS * 2 * 8ULL * LENGTH) / iv / 1e9); free(x); free(y); for(j= 0; j < STRIDE; j++) zf+= z[j]; return zf; }
int main(int argv,char** args){ double start,stop; double a,b,c,d; srand48(-1); gettime(now);start=time_dbl(now); init(); gettime(now);stop=time_dbl(now); a=stop-start; /*loops w/out simd*/ gettime(now);start=time_dbl(now); for(i=0;i<N;i++){ for(j=0;j<M;j++){ /* *standard matrix multiplication loop, used to compare *times with the simd loop */ *(X.double_c+(i*N)+j)+= *(X.double_a+(i*M)+j)* *(X.double_b+(j*N)+i); *(X.float_c+(i*N)+j)+= *(X.float_a+(i*M)+j)* *(X.float_b+(j*N)+i); } } gettime(now);stop=time_dbl(now); b=stop-start; /*loops w/simd*/ gettime(now);start=time_dbl(now); for(i=0;i<N;i++){ for(j=0;j<M;j+=4){ /* *Store 4 doubles into the ymm registers *and multiply them */ __v4df vecd_a={*(X.dbl_a+(i*M)+j),*(X.dbl_a+(i*M)+j+1),\ *(X.dbl_a+(i*M)+j+2),*(X.dbl_a+(i*M)+j+3)}; __v4df vecd_b={*(X.dbl_b+(j*N)+i),*(X.dbl_b+(j*N)+i+1),\ *(X.dbl_b+(j*N)+i+2),*(X.dbl_b+(j*N)+i+3)}; __v4df vecd_c=__builtin_ia32_mulpd256(vecd_a,vecd_b); for(k=0;k<4;k++){ /* *Store the results from the above calculations *into memory */ *(X.dbl_c+(i*M)+j+k)+=vecd_c[k]; } } } gettime(now);stop=time_dbl(now); c=stop-start; gettime(now);start=time_dbl(now); for(i=0;i<N;i++){ for(j=0;j<M;j+=8){ /* *move 2 sets of 8 floats into ymm registers and multiply them */ __v8sf vecf_a={*(X.flt_a+(i*N)+j),*(X.flt_a+(i*N)+j+1), *(X.flt_a+(i*N)+j+2),*(X.flt_a+(i*N)+j+3), *(X.flt_a+(i*N)+j+4),*(X.flt_a+(i*N)+j+5), *(X.flt_a+(i*N)+j+6),*(X.flt_a+(i*N)+j+7),}; __v8sf vecf_b={*(X.flt_b+(i*N)+j),*(X.flt_b+(i*N)+j+1), *(X.flt_b+(i*N)+j+2),*(X.flt_b+(i*N)+j+3), *(X.flt_b+(i*N)+j+4),*(X.flt_b+(i*N)+j+5), *(X.flt_b+(i*N)+j+6),*(X.flt_b+(i*N)+j+7),}; __v8sf vecf_c=__builtin_ia32_mulps256(vecf_a,vecf_b); for(k=0;k<8;k++){ /* *Store the results from the above calculations *into memory */ *(X.flt_c+(i*M)+j+k)+=vecf_c[k]; } } } gettime(now);stop=time_dbl(now); d=stop-start; printf("Time to init: %f sec\n",a); printf("Without Simd: %f sec\n",b); printf("With Simd:\nDoubles: %f sec\n"\ "Floats: %f sec\nTotal: %f sec\n",c,d,c+d); return 0; }
__v8sf test_reg(void) { struct timespec start, end; double iv; long long int i; __v8sf x= {1,1,1,1,1,1,1,1}, y= {0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5}, z= {0,0,0,0,0,0,0,0}; __v8sf x2= {1,1,1,1,1,1,1,1}, y2= {0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5}, z2= {0,0,0,0,0,0,0,0}; __v8sf x3= {1,1,1,1,1,1,1,1}, y3= {0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5}, z3= {0,0,0,0,0,0,0,0}; __v8sf x4= {1,1,1,1,1,1,1,1}, y4= {0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5}, z4= {0,0,0,0,0,0,0,0}; __v8sf x5= {1,1,1,1,1,1,1,1}, y5= {0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5}, z5= {0,0,0,0,0,0,0,0}; __v8sf x6= {1,1,1,1,1,1,1,1}, y6= {0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5}, z6= {0,0,0,0,0,0,0,0}; __v8sf x7= {1,1,1,1,1,1,1,1}, y7= {0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5}, z7= {0,0,0,0,0,0,0,0}; __v8sf x8= {1,1,1,1,1,1,1,1}, y8= {0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5}, z8= {0,0,0,0,0,0,0,0}; __v8sf x9= {1,1,1,1,1,1,1,1}, y9= {0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5}, z9= {0,0,0,0,0,0,0,0}; __v8sf x10= {1,1,1,1,1,1,1,1}, y10= {0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5}, z10= {0,0,0,0,0,0,0,0}; __v8sf x11= {1,1,1,1,1,1,1,1}, y11= {0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5}, z11= {0,0,0,0,0,0,0,0}; __v8sf x12= {1,1,1,1,1,1,1,1}, y12= {0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5}, z12= {0,0,0,0,0,0,0,0}; printf("Testing register-register arithmetic speed..."); fflush(stdout); if(clock_gettime(CLOCK_REALTIME, &start)) { perror("clock_gettime"); exit(EXIT_FAILURE); } asm volatile("foo:nop"); for(i= 0; i < REPS; i++) { z= __builtin_ia32_addps256(z, __builtin_ia32_mulps256(y,x)); z2= __builtin_ia32_addps256(z2, __builtin_ia32_mulps256(y2,x2)); z3= __builtin_ia32_addps256(z3, __builtin_ia32_mulps256(y3,x3)); z4= __builtin_ia32_addps256(z4, __builtin_ia32_mulps256(y4,x4)); z5= __builtin_ia32_addps256(z5, __builtin_ia32_mulps256(y5,x5)); z6= __builtin_ia32_addps256(z6, __builtin_ia32_mulps256(y6,x6)); z7= __builtin_ia32_addps256(z7, __builtin_ia32_mulps256(y7,x7)); z8= __builtin_ia32_addps256(z8, __builtin_ia32_mulps256(y8,x8)); z9= __builtin_ia32_addps256(z9, __builtin_ia32_mulps256(y9,x9)); z10= __builtin_ia32_addps256(z10, __builtin_ia32_mulps256(y10,x10)); z11= __builtin_ia32_addps256(z11, __builtin_ia32_mulps256(y11,x11)); z12= __builtin_ia32_addps256(z12, __builtin_ia32_mulps256(y12,x12)); } asm volatile("bar:nop"); if(clock_gettime(CLOCK_REALTIME, &end)) { perror("clock_gettime"); exit(EXIT_FAILURE); } iv= end.tv_sec + end.tv_nsec*1e-9 - start.tv_sec - start.tv_nsec*1e-9; printf("%.3e REPS %.3es %.3fGFLOPS\n", (double)REPS, iv, (12 * 2 * 8 * REPS) / iv / 1e9); return z + z2 + z3 + z4 + z5 + z6 + z7 + z8 + z9 + z10 + z11 + z12; }