int main(int argc, char* argv[]) { unsigned int m=8, n=8, lda=8, ldb=8, nerrs, num, nmat, nmats, nmatd, ntest; unsigned int layout, asize, VLEND=4, VLENS=8, bsize; unsigned int ncorr; int i, j; char side, uplo, trans, diag; unsigned int typesize8 = 8; unsigned int typesize4 = 4; float *sa, *sb, *sc, *sd; double *da, *db, *dc, *dd, *tmpbuf; double dalpha = 1.0; float salpha; double dtmp; const unsigned char *cptr; unsigned long op_count; const libxsmm_trsm_descriptor* desc8 = NULL; const libxsmm_trsm_descriptor* desc4 = NULL; libxsmm_descriptor_blob blob; union { libxsmm_xtrsmfunction dp; libxsmm_xtrsmfunction sp; const void* pv; } mykernel = { 0 }; #ifdef USE_KERNEL_GENERATION_DIRECTLY void (*opcode_routine)(); #endif #ifdef USE_KERNEL_GENERATION_DIRECTLY #include <unistd.h> #include <signal.h> #include <malloc.h> #include <sys/mman.h> #include "../../src/generator_packed_trsm_avx_avx512.h" unsigned char *routine_output; libxsmm_generated_code io_generated_code; int pagesize = sysconf(_SC_PAGE_SIZE); if (pagesize == -1) fprintf(stderr,"sysconf pagesize\n"); routine_output = (unsigned char *) mmap(NULL, BUFSIZE2, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0,0); if (mprotect(routine_output, BUFSIZE2, PROT_EXEC | PROT_READ | PROT_WRITE ) == -1) fprintf(stderr,"mprotect\n"); printf("Routine ready\n"); io_generated_code.generated_code = &routine_output[0]; io_generated_code.buffer_size = BUFSIZE2; io_generated_code.code_size = 0; io_generated_code.code_type = 2; io_generated_code.last_error = 0; #endif if ( argc <= 3 ) { printf("\nUSAGE: %s m n lda ldb nmat side uplo trans diag layout ntest alpha\n",argv[0]); printf("Compact TRSM a mxn matrix of leading dimension ldb\n"); printf("This will test the jit of 1 VLEN work of nmat at a time\n"); printf("Defaults: m=n=lda=ldb=nmat=8, alpha=1.0, side=uplo='L',trans=diag='N',layout=102,ntest=1\n"); } if ( argc > 1 ) m = atoi(argv[1]); else m = 8; if ( argc > 2 ) n = atoi(argv[2]); else n = 8; if ( argc > 3 ) lda= atoi(argv[3]); else lda = 8; if ( argc > 4 ) ldb = atoi(argv[4]); else ldb = 8; if ( argc > 5 ) nmat = atoi(argv[5]); else nmat = 8; if ( argc > 6 ) side = argv[6][0]; else side = 'L'; if ( argc > 7 ) uplo = argv[7][0]; else uplo = 'L'; if ( argc > 8 ) trans = argv[8][0]; else trans = 'N'; if ( argc > 9 ) diag = argv[9][0]; else diag = 'N'; if ( argc > 10 ) layout = atoi(argv[10]); else layout=102; if ( argc > 11 ) ntest = atoi(argv[11]); else ntest = 1; if ( argc > 12 ) dalpha = atof(argv[12]); else dalpha = 1.0; salpha = (float)dalpha; m = LIBXSMM_MAX(m,1); n = LIBXSMM_MAX(n,1); /* A is either mxm or nxn depending on side */ if ( (side == 'L') || (side=='l') ) asize = m; else asize = n; lda = LIBXSMM_MAX(lda,asize); if ( layout == 102 ) { /* Column major: B is mxn, and stored in B format */ ldb = LIBXSMM_MAX(ldb,m); bsize = ldb*n; } else { /* Row major: B is mxn, and stored in B^T format */ ldb = LIBXSMM_MAX(ldb,n); bsize = ldb*m; } nmats = LIBXSMM_MAX(VLENS,nmat - (nmat%VLENS)); nmatd = LIBXSMM_MAX(VLEND,nmat - (nmat%VLEND)); nmat = LIBXSMM_MAX(nmats,nmatd); op_count = n * m * asize; printf("This is a real*%u tester for JIT compact TRSM kernels! (%c%c%c%c m=%u n=%u lda=%u ldb=%u layout=%u nmat=%u)\n",typesize8,side,uplo,trans,diag,m,n,lda,ldb,layout,nmat); #ifdef USE_XSMM_GENERATED printf("This code tests the LIBXSMM generated kernels\n"); #endif #ifdef USE_PREDEFINED_ASSEMBLY printf("This code tests some predefined assembly kenrel\n"); #endif #ifdef USE_KERNEL_GENERATION_DIRECTLY printf("This code tests kernel generation directly\n"); #endif #ifdef TIME_MKL printf("This code tests MKL compact batch directly\n"); #endif desc8 = libxsmm_trsm_descriptor_init(&blob, typesize8, m, n, lda, ldb, &dalpha, trans, diag, side, uplo, layout); desc4 = libxsmm_trsm_descriptor_init(&blob, typesize4, m, n, lda, ldb, &salpha, trans, diag, side, uplo, layout); #ifdef USE_XSMM_GENERATED printf("calling libxsmm_dispatch_trsm: typesize8=%u\n",typesize8); mykernel.dp = libxsmm_dispatch_trsm(desc8); printf("done calling libxsmm_dispatch_trsm: typesize8=%u\n",typesize8); if ( mykernel.dp == NULL ) printf("R8 Kernel after the create call is null\n"); mykernel.sp = libxsmm_dispatch_trsm(desc4); if ( mykernel.sp == NULL ) printf("R4 kernel after the create call is null\n"); #endif #ifdef USE_KERNEL_GENERATION_DIRECTLY libxsmm_generator_trsm_kernel ( &io_generated_code, &desc8, "hsw" ); #endif #ifndef NO_ACCURACY_CHECK printf("mallocing matrices\n"); #endif sa = (float *) malloc ( lda*asize*nmats*sizeof(float) ); da = (double *) malloc ( lda*asize*nmatd*sizeof(double) ); sb = (float *) malloc ( bsize*nmats*sizeof(float) ); db = (double *) malloc ( bsize*nmatd*sizeof(double) ); sc = (float *) malloc ( bsize*nmats*sizeof(float) ); dc = (double *) malloc ( bsize*nmatd*sizeof(double) ); sd = (float *) malloc ( bsize*nmats*sizeof(float) ); dd = (double *) malloc ( bsize*nmatd*sizeof(double) ); tmpbuf = (double *) malloc ( asize*VLEND*sizeof(double) ); #ifndef NO_ACCURACY_CHECK printf("filling matrices\n"); #endif sfill_matrix ( sa, lda, asize, asize*nmats ); #ifdef TRIANGLE_IS_IDENTITY printf("Warning: setting triangular matrix to identity. Not good for accuracy testing\n"); dfill_identity ( da, lda, asize, asize, VLEND, nmatd/VLEND ); #else dfill_matrix ( da, lda, asize, asize*nmatd ); #endif sfill_matrix ( sb, bsize, bsize, nmats ); dfill_matrix ( db, bsize, bsize, nmatd ); #ifndef NO_ACCURACY_CHECK for ( i = 0 ; i < (int)(bsize*nmats) ; i++ ) sc[i]=sb[i]; for ( i = 0 ; i < (int)(bsize*nmatd) ; i++ ) dc[i]=db[i]; for ( i = 0 ; i < (int)(bsize*nmats) ; i++ ) sd[i]=sb[i]; for ( i = 0 ; i < (int)(bsize*nmatd) ; i++ ) dd[i]=db[i]; printf("Pointing at the kernel now\n"); #endif #ifdef USE_XSMM_GENERATED cptr = (const unsigned char*) mykernel.pv; #endif #ifdef USE_PREDEFINED_ASSEMBLY cptr = (const unsigned char*) trsm_xct_; #endif #ifdef USE_KERNEL_GENERATION_DIRECTLY cptr = (const unsigned char*) &routine_output[0]; opcode_routine = (void *) &cptr[0]; #endif #ifndef TIME_MKL #define DUMP_ASSEMBLY_FILE #endif #ifdef DUMP_ASSEMBLY_FILE printf("Dumping assembly file\n"); FILE *fp = fopen("foo.s","w"); char buffer[80]; fputs("\t.text\n",fp); fputs("\t.align 256\n",fp); fputs("\t.globl trsm_xct_\n",fp); fputs("trsm_xct_:\n",fp); for (i = 0 ; i < 4000; i+=4 ) { sprintf(buffer,".byte 0x%02x, 0x%02x, 0x%02x, 0x%02x\n",cptr[i],cptr[i+1],cptr[i+2],cptr[i+3]); fputs(buffer,fp); } fputs("\tretq\n",fp); fputs("\t.type trsm_xct_,@function\n",fp); fputs("\t.size trsm_xct_,.-trsm_xct_\n",fp); fclose(fp); #endif #if defined(USE_MKL_FOR_REFERENCE) || defined(TIME_MKL) #include "mkl.h" MKL_LAYOUT CLAYOUT = (layout == 101) ? MKL_ROW_MAJOR : MKL_COL_MAJOR; MKL_SIDE SIDE = (side == 'R' || side == 'r') ? MKL_RIGHT : MKL_LEFT; MKL_UPLO UPLO = (uplo == 'U' || uplo == 'u') ? MKL_UPPER : MKL_LOWER; MKL_TRANSPOSE TRANSA = (trans == 'N' || trans == 'n') ? MKL_NOTRANS : MKL_TRANS; MKL_DIAG DIAG = (diag == 'N' || diag == 'n') ? MKL_NONUNIT : MKL_UNIT; MKL_COMPACT_PACK CMP_FORMAT = mkl_get_format_compact(); #if 0 MKL_COMPACT_PACK CMP_FORMAT = MKL_COMPACT_AVX; #endif #endif #ifndef NO_ACCURACY_CHECK printf("Before routine, initial B(1,1)=%g B[256]=%g\n",db[0],db[256]); #endif #ifdef USE_PREDEFINED_ASSEMBLY double one = 1.0; #endif double timer; #ifdef MKL_TIMER double tmptimer; tmptimer = dsecnd_(); #else unsigned long long l_start, l_end; #endif timer = 0.0; for ( j = 0 ; j < (int)ntest ; j++ ) { #ifndef TRIANGLE_IS_IDENTITY for ( i = 0 ; i < (int)(bsize*nmatd) ; i++ ) db[i]=dd[i]; #endif for ( i = 0 , num = 0; i < (int)nmatd ; i+= (int)VLEND, num++ ) { double *Ap = &da[num*lda*asize*VLEND]; double *Bp = &db[num*bsize*VLEND]; #ifdef MKL_TIMER tmptimer = dsecnd_(); #else l_start = libxsmm_timer_tick(); #endif #ifdef USE_XSMM_GENERATED mykernel.dp ( Ap, Bp, tmpbuf ); #endif #ifdef USE_PREDEFINED_ASSEMBLY trsm_xct_ ( Ap, Bp, &one ); #endif #ifdef USE_KERNEL_GENERATION_DIRECTLY (*opcode_routine)( Ap, Bp ); #endif #ifdef TIME_MKL mkl_dtrsm_compact ( CLAYOUT, SIDE, UPLO, TRANSA, DIAG, m, n, dalpha, da, lda, db, ldb, CMP_FORMAT, nmatd ); i+=nmatd; /* Because MKL will do everything */ #endif #ifdef MKL_TIMER timer += dsecnd_() - tmptimer; #else l_end = libxsmm_timer_tick(); timer += libxsmm_timer_duration(l_start,l_end); #endif } } timer /= ((double)ntest); #ifndef NO_ACCURACY_CHECK printf("Average time to get through %u matrices: %g\n",nmatd,timer); printf("Gflops: %g\n",(double)(op_count*nmatd)/(timer*1.0e9)); printf("after routine, new B(1,1)=%g B[256]=%g\n",db[0],db[256]); #endif #ifdef TEST_SINGLE printf("Before r4 routine, initial B(1,1)=%g B[256]=%g\n",sb[0],sb[256]); for ( i = 0 , num = 0; i < nmats ; i+= VLENS, num++ ) { float *Ap = &sa[num*lda*asize*VLENS]; float *Bp = &sb[num*bsize*VLENS]; #ifdef USE_XSMM_GENERATED mykernel.sp ( Ap, Bp, NULL ); #endif } printf("after r4 routine, new B(1,1)=%g B]256]=%g\n",db[0],db[256]); #endif #ifndef NO_ACCURACY_CHECK /* Call some reference code now on a copy of the B matrix (C) */ double timer2 = 0.0; for ( j = 0 ; j < (int)ntest ; j++ ) { #ifndef TRIANGLE_IS_IDENTITY for ( i = 0 ; i < (int)(bsize*nmatd) ; i++ ) dc[i]=dd[i]; #endif #ifdef MKL_TIMER tmptimer = dsecnd_(); #else l_start = libxsmm_timer_tick(); #endif #ifdef USE_MKL_FOR_REFERENCE mkl_dtrsm_compact ( CLAYOUT, SIDE, UPLO, TRANSA, DIAG, m, n, dalpha, da, lda, dc, ldb, CMP_FORMAT, nmatd ); #elif !defined(LIBXSMM_NOFORTRAN) if ( (layout == 101) && (nmatd!=VLEND) ) { unsigned int lay = 102, m1 = n, n1 = m; char side1='L', uplo1='L'; if ( side == 'L' || side == 'l' ) side1 = 'R'; if ( uplo == 'L' || uplo == 'l' ) uplo1 = 'U'; compact_dtrsm_ ( &lay, &side1, &uplo1, &trans, &diag, &m1, &n1, &dalpha, da, &lda, dc, &ldb, &nmatd, &VLEND ); } else { compact_dtrsm_ ( &layout, &side, &uplo, &trans, &diag, &m, &n, &dalpha, da, &lda, dc, &ldb, &nmatd, &VLEND ); } #endif #ifdef MKL_TIMER timer2 += dsecnd_() - tmptimer; #else l_end = libxsmm_timer_tick(); timer2 += libxsmm_timer_duration(l_start,l_end); #endif } timer2 /= ((double)ntest); printf("Reference time=%g Reference Gflops=%g\n",timer2,(op_count*nmatd)/(timer2*1.0e9)); /* Compute the residual between B and C */ dtmp = residual_d ( dc, bsize, bsize, nmatd, db, bsize, &nerrs, &ncorr ); printf("R8 %c%c%c%c m=%u n=%u lda=%u ldb=%u error: %g number of errors: %u corrects: %u",side,uplo,trans,diag,m,n,lda,ldb,dtmp,nerrs,ncorr); if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*8 %u case",m,n,layout); printf("\n"); #ifdef TEST_SINGLE /* Call some reference code now on a copy of the B matrix (C) */ compact_strsm_ ( &layout, &side, &uplo, &trans, &diag, &m, &n, &salpha, sa, &lda, sc, &ldb, &nmats, &VLENS ); /* Compute the residual between B and C */ dtmp = residual_s ( sc, bsize, bsize, nmats, sb, bsize, &nerrs, &ncorr ); printf("R4 %c%c%c%c m=%u n=%u lda=%u ldb=%u error: %g number of errors: %u corrects: %u\n",side,uplo,trans,diag,m,n,lda,ldb,dtmp,nerrs,ncorr); if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*4 case",m,n); printf("\n"); #endif #else for ( j = 0, nerrs = 0 ; j < bsize*nmatd ; j++ ) { if ( isnan(db[j]) || isinf(db[j]) ) { if ( ++nerrs < 10 ) { printf("WARNING: db[%d]=%g\n",j,db[j]); } } } printf("%g,real*8 %c%c%c%c m=%u n=%u lda=%u ldb=%u Denormals=%u Time=%g Gflops=%g",(op_count*nmatd)/(timer*1.0e9),side,uplo,trans,diag,m,n,lda,ldb,nerrs,timer,(op_count*nmatd)/(timer*1.0e9)); if ( nerrs > 0 ) printf(" -> FAILED at %ux%u real*8 case",m,n); printf("\n"); #endif free(dd); free(sd); free(dc); free(sc); free(db); free(sb); free(da); free(sa); return 0; }
int main(int argc, char* argv[]) { unsigned int m=8, n=8, k=8, lda=8, ldb=8, ldc=8, nerrs, num, nmat; unsigned int layout, asize, bsize, ntest, ncorr; #ifdef AVX512_TESTING unsigned int VLEND=8, VLENS=16; int arch=LIBXSMM_X86_AVX512_CORE; #else unsigned int VLEND=4, VLENS=8; int arch=LIBXSMM_X86_AVX2; #endif unsigned int nmats, nmatd; unsigned int i, j, l, iunroll, junroll, loopi, loopj; char side='L', uplo='U', transa='N', transb='N', diag='N'; unsigned int typesize8 = 8; unsigned int typesize4 = 4; float *sa, *sb, *sc, *sd, *sc1; double *da, *db, *dc, *dd, *dc1; double dalpha = 1.0; float salpha = (float)dalpha; double dbeta = 1.0; float sbeta = (float)dbeta; double dtmp; const unsigned char *cptr = NULL; unsigned long op_count; const libxsmm_pgemm_descriptor* desc8 = NULL; const libxsmm_pgemm_descriptor* desc4 = NULL; #ifdef USE_XSMM_GENERATED libxsmm_descriptor_blob blob; libxsmm_pgemm_xfunction mykernel = NULL; #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) void (*opcode_routine)(); unsigned char *routine_output; libxsmm_generated_code io_generated_code; int pagesize = sysconf(_SC_PAGE_SIZE); if (pagesize == -1) fprintf(stderr,"sysconf pagesize\n"); routine_output = (unsigned char *) mmap(NULL, BUFSIZE2, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0,0); if (mprotect(routine_output, BUFSIZE2, PROT_EXEC | PROT_READ | PROT_WRITE ) == -1) fprintf(stderr,"mprotect\n"); printf("Routine ready\n"); io_generated_code.generated_code = &routine_output[0]; io_generated_code.buffer_size = BUFSIZE2; io_generated_code.code_size = 0; io_generated_code.code_type = 2; io_generated_code.last_error = 0; #endif printf("\nUSAGE: %s m n k lda ldb ldc nmat layout ntest transa transb iunroll junroll loopj loopi\n",argv[0]); if ( argc <= 3 ) { #ifdef TEST_SINGLE printf("Compact SGEMM a C_mxn<-C_mxn+A_mxk*B_kxn matrix of leading dims lda/b/c\n"); printf("This will test the jit of 1 VLEN=%d ",VLENS); if ( VLENS==8 ) printf("(AVX2)"); else printf("(AVX512)"); #else printf("Compact DGEMM a C_mxn<-C_mxn+A_mxk*B_kxn matrix of leading dims lda/b/c\n"); printf("This will test the jit of 1 VLEN=%d ",VLEND); if ( VLEND==4 ) printf("(AVX2)"); else printf("(AVX512)"); #endif printf(" work of nmat at a time\n"); printf("Configurable: M-loop controlled by iunroll & loopi. N-loop by junroll & loopj\n"); printf("Defaults: m=n=k=lda=ldb=ldc=nmat=8, layout=102 (col major), transa=/b='N', ntest=1\n"); } if ( argc > 1 ) m = atoi(argv[1]); else m = 8; if ( argc > 2 ) n = atoi(argv[2]); else n = 8; if ( argc > 3 ) k = atoi(argv[3]); else k = 8; if ( argc > 4 ) lda= atoi(argv[4]); else lda = 8; if ( argc > 5 ) ldb= atoi(argv[5]); else ldb = 8; if ( argc > 6 ) ldc= atoi(argv[6]); else ldc = 8; if ( argc > 7 ) nmat = atoi(argv[7]); else nmat = 8; if ( argc > 8 ) layout = atoi(argv[8]); else layout=102; if ( argc > 9 ) ntest = atoi(argv[9]); else ntest = 1; if ( argc > 10 ) transa = argv[10][0]; else transa = 'N'; if ( argc > 11 ) transb = argv[11][0]; else transb = 'N'; if ( argc > 12 ) iunroll=atoi(argv[12]); else iunroll=0; if ( argc > 13 ) junroll=atoi(argv[13]); else junroll=0; if ( argc > 14 ) loopj=atoi(argv[14]); else loopj=0; if ( argc > 15 ) loopi=atoi(argv[15]); else loopi=0; salpha = (float)dalpha; m = LIBXSMM_MAX(m,1); n = LIBXSMM_MAX(n,1); k = LIBXSMM_MAX(k,1); ntest = LIBXSMM_MAX(ntest,1); nmat = LIBXSMM_MAX(nmat,VLEND); layout = LIBXSMM_MAX(LIBXSMM_MIN(layout,102),101); if ( transa!='N' && transa!='n' && transa!='T' && transa!='t' ) transa='N'; if ( transb!='N' && transb!='n' && transb!='T' && transb!='t' ) transb='N'; lda = LIBXSMM_MAX(lda,m); ldb = LIBXSMM_MAX(ldb,k); ldc = LIBXSMM_MAX(ldc,m); nmats = LIBXSMM_MAX(VLENS,nmat - (nmat%VLENS)); nmatd = LIBXSMM_MAX(VLEND,nmat - (nmat%VLEND)); #ifdef TEST_SINGLE nmat = nmats; #else nmat = nmatd; #endif op_count = (unsigned long)(nmat * 2.0 * (double)m * (double)n * (double)k); #ifdef TEST_SINGLE printf("This is a real*%d tester for JIT compact SGEMM %c%c kernels! (m=%u n=%u k=%u lda=%u ldb=%u ldc=%u layout=%d nmat=%d alpha=%g beta=%g iun=%d jun=%d loopi=%d loopj=%d VLEN=%d)\n",typesize4,transa,transb,m,n,k,lda,ldb,ldc,layout,nmat,dalpha,dbeta,iunroll,junroll,loopi,loopj,VLENS); #else printf("This is a real*%d tester for JIT compact DGEMM %c%c kernels! (m=%u n=%u k=%u lda=%u ldb=%u ldc=%u layout=%d nmat=%d alpha=%g beta=%g iun=%d jun=%d loopi=%d loopj=%d VLEN=%d)\n",typesize8,transa,transb,m,n,k,lda,ldb,ldc,layout,nmat,dalpha,dbeta,iunroll,junroll,loopi,loopj,VLEND); #endif #ifdef USE_XSMM_GENERATED printf("This code tests the LIBXSMM generated kernels\n"); #endif #ifdef USE_PREDEFINED_ASSEMBLY printf("This code tests some predefined assembly kernel\n"); #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) printf("This code tests kernel generation directly\n"); #endif #ifdef TIME_MKL printf("This code tests MKL compact batch directly\n"); #endif #ifdef AVX512_TESTING printf("This tests AVX512 binaries\n"); #endif #ifdef AVX2_TESTING printf("This tests AVX2 binaries\n"); #endif desc8 = libxsmm_pgemm_descriptor_init(&blob, typesize8, m, n, k, lda, ldb, ldc, &dalpha, transa, transb, layout ); #ifdef TEST_SINGLE desc4 = libxsmm_pgemm_descriptor_init(&blob, typesize4, m, n, k, lda, ldb, ldc, &dalpha, transa, transb, layout ); #endif printf("Descriptor set\n"); #ifdef USE_XSMM_GENERATED printf("calling libxsmm_dispatch_pgemm: typesize8=%u\n",typesize8); mykernel = libxsmm_dispatch_pgemm(desc8); printf("done calling libxsmm_dispatch_pgemm: typesize8=%u\n",typesize8); if ( mykernel == NULL ) printf("R8 Kernel after the create call is null\n"); #ifdef TEST_SINGLE mykernel = libxsmm_dispatch_pgemm(desc4); if ( mykernel == NULL ) printf("R4 kernel after the create call is null\n"); #endif #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) libxsmm_generator_pgemm_kernel( &io_generated_code, desc8, arch, iunroll, junroll, loopi, loopj ); #endif #ifndef NO_ACCURACY_CHECK printf("mallocing matrices\n"); #endif sa = (float *) malloc ( lda*k*nmat*sizeof(float) ); da = (double *) malloc ( lda*k*nmat*sizeof(double) ); sb = (float *) malloc ( ldb*n*nmat*sizeof(float) ); db = (double *) malloc ( ldb*n*nmat*sizeof(double) ); sc1 = (float *) malloc ( ldc*n*nmat*sizeof(float) ); dc1 = (double *) malloc ( ldc*n*nmat*sizeof(double) ); sc = (float *) malloc ( ldc*n*nmat*sizeof(float) ); dc = (double *) malloc ( ldc*n*nmat*sizeof(double) ); sd = (float *) malloc ( ldc*n*nmat*sizeof(float) ); dd = (double *) malloc ( ldc*n*nmat*sizeof(double) ); #ifndef NO_ACCURACY_CHECK printf("filling matrices\n"); #endif sfill_matrix ( sa, lda, m, k*nmat ); sfill_matrix ( sb, ldb, k, n*nmat ); sfill_matrix ( sc, ldc, m, n*nmat ); dfill_matrix ( da, lda, m, k*nmat ); dfill_matrix ( db, ldb, k, n*nmat ); dfill_matrix ( dc, ldc, m, n*nmat ); #ifndef NO_ACCURACY_CHECK for ( i = 0 ; i < ldc*n*nmat ; i++ ) sd[i]=sc[i]; for ( i = 0 ; i < ldc*n*nmat ; i++ ) dd[i]=dc[i]; for ( i = 0 ; i < ldc*n*nmat ; i++ ) sc1[i]=sc[i]; for ( i = 0 ; i < ldc*n*nmat ; i++ ) dc1[i]=dc[i]; printf("Pointing at the kernel now\n"); #endif #ifdef USE_XSMM_GENERATED cptr = (const unsigned char*) mykernel; #endif #ifdef USE_PREDEFINED_ASSEMBLY cptr = (const unsigned char*) gemm_; #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) cptr = (const unsigned char*) &routine_output[0]; opcode_routine = (void *) &cptr[0]; #endif #ifndef TIME_MKL # define DUMP_ASSEMBLY_FILE #endif #ifdef DUMP_ASSEMBLY_FILE printf("Dumping assembly file\n"); FILE *fp = fopen("foo.s","w"); char buffer[80]; fputs("\t.text\n",fp); fputs("\t.align 256\n",fp); fputs("\t.globl gemm_\n",fp); fputs("gemm_:\n",fp); for (i = 0 ; i < 7000; i+=4 ) { sprintf(buffer,".byte 0x%02x, 0x%02x, 0x%02x, 0x%02x\n",cptr[i],cptr[i+1],cptr[i+2],cptr[i+3]); fputs(buffer,fp); } fputs("\tretq\n",fp); fputs("\t.type gemm_,@function\n",fp); fputs("\t.size gemm_,.-gemm_\n",fp); fclose(fp); #endif #if defined(USE_MKL_FOR_REFERENCE) || defined(TIME_MKL) # include <mkl.h> MKL_LAYOUT CLAYOUT = (layout == 101) ? MKL_ROW_MAJOR : MKL_COL_MAJOR; MKL_SIDE SIDE = (side == 'R' || side == 'r') ? MKL_RIGHT : MKL_LEFT; MKL_UPLO UPLO = (uplo == 'U' || uplo == 'u') ? MKL_UPPER : MKL_LOWER; MKL_TRANSPOSE TRANSA = (transa == 'N' || transa == 'n') ? MKL_NOTRANS : MKL_TRANS; MKL_TRANSPOSE TRANSB = (transb == 'N' || transb == 'n') ? MKL_NOTRANS : MKL_TRANS; MKL_DIAG DIAG = (diag == 'N' || diag == 'n') ? MKL_NONUNIT : MKL_UNIT; MKL_COMPACT_PACK CMP_FORMAT = mkl_get_format_compact(); #if 0 MKL_COMPACT_PACK CMP_FORMAT = MKL_COMPACT_AVX; #endif #endif #ifndef NO_ACCURACY_CHECK printf("Before routine, initial A(1,1)=%g A[256]=%g\n",da[0],da[256]); #endif #ifdef USE_PREDEFINED_ASSEMBLY double one = 1.0; #endif double timer, firsttime = 0; #ifdef MKL_TIMER double tmptimer; tmptimer = dsecnd_(); #else unsigned long long l_start, l_end; #endif timer = 0.0; for ( j = 0 ; j < (int)ntest ; j++ ) { for ( i = 0 ; i < ldc*n*nmat ; i++ ) dc[i]=dc1[i]; for ( i = 0 , num = 0; i < (int)nmat ; i+= (int)VLEND, num++ ) { double *Ap = &da[num*lda*k*VLEND]; double *Bp = &db[num*ldb*n*VLEND]; double *Cp = &dc[num*ldc*n*VLEND]; #ifdef MKL_TIMER tmptimer = dsecnd_(); #else l_start = libxsmm_timer_tick(); #endif #if !defined(USE_XSMM_GENERATED) && !defined(USE_PREDEFINED_ASSEMBLY) && !defined(USE_KERNEL_GENERATION_DIRECTLY) && !defined(TIME_MKL) && !defined(USE_PREDEFINED_ASSEMBLY_XCT) gen_compact_dgemm_ ( &layout, &m, &n, &k, &dalpha, Ap, &lda, Bp, &ldb, &dbeta, Cp, &ldc, &VLEND ); #endif #ifdef USE_XSMM_GENERATED mykernel ( Ap, Bp, Cp ); #endif #ifdef USE_PREDEFINED_ASSEMBLY gemm_ ( Ap, Bp, Cp ); #endif #ifdef USE_KERNEL_GENERATION_DIRECTLY (*opcode_routine)( Ap, Bp, Cp ); #endif #ifdef TIME_MKL mkl_dgemm_compact ( CLAYOUT, TRANSA, TRANSB, m, n, k, dalpha, da, lda, db, ldb, dbeta, dc, ldc, CMP_FORMAT, nmat ); i+=nmatd; /* Because MKL will do everything */ #endif #ifdef MKL_TIMER dtmp = dsecnd_() - tmptimer; #else l_end = libxsmm_timer_tick(); dtmp = libxsmm_timer_duration(l_start,l_end); #endif if ( j == 0 ) firsttime=dtmp; timer += dtmp; } } if ( ntest >= 100 ) { /* Skip the first timing: super necessary if using MKL */ timer = (timer-firsttime)/((double)(ntest-1)); } else { timer /= ((double)ntest); } #ifndef NO_ACCURACY_CHECK printf("Average time to get through %u matrices: %g\n",nmat,timer); printf("Gflops: %g\n",(double)op_count/(timer*1.0e9)); printf("after routine, new C(1,1)=%g C[256]=%g\n",dc[0],dc[256]); #endif #ifdef TEST_SINGLE printf("Before r4 routine, initial C(1,1)=%g C[256]=%g\n",sc[0],sc[256]); for ( i = 0 , num = 0; i < nmats ; i+= VLENS, num++ ) { float *Ap = &sa[num*lda*k*VLENS]; float *Bp = &sb[num*ldb*n*VLENS]; float *Cp = &sc[num*ldc*n*VLENS]; #ifdef USE_XSMM_GENERATED mykernel ( Ap, Bp, Cp ); #endif } printf("after r4 routine, new C(1,1)=%g C]256]=%g\n",dc[0],dc[256]); #endif #ifndef NO_ACCURACY_CHECK /* Call some reference code now on a copy of the B matrix (C) */ double timer2 = 0.0; for ( j = 0 ; j < (int)ntest ; j++ ) { for ( i = 0 ; i < ldc*n*nmat ; i++ ) dd[i]=dc1[i]; #ifdef MKL_TIMER tmptimer = dsecnd_(); #else l_start = libxsmm_timer_tick(); #endif #ifndef USE_MKL_FOR_REFERENCE compact_dgemm_ ( &layout, &transa, &transb, &m, &n, &k, &dalpha, da, &lda, db, &ldb, &dbeta, dd, &ldc, &nmat, &VLEND ); #else mkl_dgemm_compact ( CLAYOUT, TRANSA, TRANSB, m, n, k, dalpha, da, lda, db, ldb, dbeta, dd, ldc, CMP_FORMAT, nmat ); #endif #ifdef MKL_TIMER timer2 += dsecnd_() - tmptimer; #else l_end = libxsmm_timer_tick(); timer2 += libxsmm_timer_duration(l_start,l_end); #endif } timer2 /= ((double)ntest); printf("Reference time=%g Reference Gflops=%g\n",timer2,op_count/(timer2*1.0e9)); /* Compute the residual between B and C */ dtmp = residual_d ( dc, ldc, m, n*nmat, dd, ldc, &nerrs, &ncorr ); printf("R8 mnk=%u %u %u ldabc=%u %u %u error: %g number of errors: %u corrects: %u",m,n,k,lda,ldb,ldc,dtmp,nerrs,ncorr); if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*8 %u case",m,n,layout); printf("\n"); #ifdef TEST_SINGLE /* Call some reference code now on a copy of the B matrix (C) */ compact_dgemm_ ( &layout, &transa, &transb, &m, &n, &k, &salpha, sa, &lda, sb, &ldb, &sbeta, sd, &ldc, &nmat, &VLENS ); /* Compute the residual between C and D */ dtmp = residual_s ( sc, ldc, m, n*nmat, sd, ldc, &nerrs, &ncorr ); printf("R4 mnk=%u %u %u ldabc=%u %u %u error: %g number of errors: %u corrects: %u",m,n,k,lda,ldb,ldc,dtmp,nerrs,ncorr); if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*4 case",m,n); printf("\n"); #endif #else for ( j = 0, nerrs = 0 ; j < lda*n*nmat; j++ ) { if ( isnan(dc[j]) || isinf(dc[j]) ) { if ( ++nerrs < 10 ) { printf("WARNING: dc[%d]=%g\n",j,dc[j]); } } } printf("%g,real*8 m/n/k=%u %u %u lda-c=%u %u %u Denormals=%u Time=%g Gflops=%g",op_count/(timer*1.0e9),m,n,k,lda,ldb,ldc,nerrs,timer,op_count/(timer*1.0e9)); if ( nerrs > 0 ) printf(" -> FAILED at %ux%u real*8 case",m,n); printf("\n"); #endif free(dd); free(sd); free(dc); free(sc); free(dc1); free(sc1); free(db); free(sb); free(da); free(sa); return 0; }