LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csc_bsparse_soa_avx256_512( libxsmm_generated_code* io_generated_code,
                                                                             const libxsmm_gemm_descriptor* i_xgemm_desc,
                                                                             const char* i_arch,
                                                                             const unsigned int* i_row_idx,
                                                                             const unsigned int* i_column_idx,
                                                                             const void* i_values ) {
  unsigned int l_n = 0;
  unsigned int l_k = 0;
  unsigned int l_soa_width = 0;
  unsigned int l_max_cols = 0;
  unsigned int l_n_processed = 0;
  unsigned int l_n_limit = 0;
  unsigned int l_n_chunks = 0;
  unsigned int l_n_chunksize = 0;
  unsigned int l_found_mul = 0;
  unsigned int l_max_reg_block = 0;

  libxsmm_micro_kernel_config l_micro_kernel_config;
  libxsmm_loop_label_tracker l_loop_label_tracker;
  libxsmm_gp_reg_mapping l_gp_reg_mapping;

  LIBXSMM_UNUSED(i_values);

  /* select SoA width */
  if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) {
    if ( strcmp(i_arch, "knl") == 0 || strcmp(i_arch, "knm") == 0 ||
         strcmp(i_arch, "skx") == 0 || strcmp(i_arch, "clx") == 0 ||
         strcmp(i_arch, "cpx") == 0 ) {
      l_soa_width = 8;
      l_max_reg_block = 28;
    } else {
      l_soa_width = 4;
      l_max_reg_block = 14;
    }
  } else {
    if ( strcmp(i_arch, "knl") == 0 || strcmp(i_arch, "knm") == 0 ||
         strcmp(i_arch, "skx") == 0 || strcmp(i_arch, "clx") == 0 ||
         strcmp(i_arch, "cpx") == 0 ) {
      l_soa_width = 16;
      l_max_reg_block = 28;
    } else {
      l_soa_width = 8;
      l_max_reg_block = 14;
    }
  }

  /* define gp register mapping */
  libxsmm_reset_x86_gp_reg_mapping( &l_gp_reg_mapping );
#if defined(_WIN32) || defined(__CYGWIN__)
  /* match calling convention on Windows */
  l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RCX;
  l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RDX;
  l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_R8;
  /* TODO: full support for Windows calling convention */
  l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_RDI;
  l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_RSI;
#else
  /* match calling convention on Linux */
  l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RDI;
  l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RSI;
  l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_RDX;
  l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_RCX;
  l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_R8;
#endif
  l_gp_reg_mapping.gp_reg_c_prefetch = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_mloop = LIBXSMM_X86_GP_REG_R12;
  l_gp_reg_mapping.gp_reg_nloop = LIBXSMM_X86_GP_REG_R13;
  l_gp_reg_mapping.gp_reg_kloop = LIBXSMM_X86_GP_REG_R14;
  l_gp_reg_mapping.gp_reg_help_0 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_1 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_2 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_3 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_4 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_5 = LIBXSMM_X86_GP_REG_UNDEF;

  /* define loop_label_tracker */
  libxsmm_reset_loop_label_tracker( &l_loop_label_tracker );

  /* define the micro kernel code gen properties */
  libxsmm_generator_gemm_init_micro_kernel_config_fullvector( &l_micro_kernel_config, i_xgemm_desc, i_arch, 0 );

  /* get max column in C */
  l_max_cols = i_xgemm_desc->n;
  for ( l_n = 0; l_n < i_xgemm_desc->n; l_n++ ) {
    if ( i_column_idx[l_n] == i_column_idx[i_xgemm_desc->n] ) {
      l_max_cols = l_n+1;
    }
  }

  /* calculate the chunk size of current columns to work on */
  l_n_chunks = ( (l_max_cols % l_max_reg_block) == 0 ) ? (l_max_cols / l_max_reg_block) : (l_max_cols / l_max_reg_block) + 1;
  assert(0 != l_n_chunks); /* mute static analysis (division-by-zero); such invalid input must be caught upfront */
  l_n_chunksize = ( (l_max_cols % l_n_chunks) == 0 ) ? (l_max_cols / l_n_chunks) : (l_max_cols / l_n_chunks) + 1;

  /* open asm */
  libxsmm_x86_instruction_open_stream( io_generated_code, &l_gp_reg_mapping, i_arch, i_xgemm_desc->prefetch );

  /* m loop */
  libxsmm_x86_instruction_register_jump_back_label( io_generated_code, &l_loop_label_tracker );
  libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_mloop, 1 );

  /* loop over n-blocks */
  l_n_processed = 0;
  l_n_limit = l_n_chunksize;
  while ( l_n_processed < l_max_cols ) {
#if 0
    printf("l_max_cols: %i, l_n_processed: %i, l_n_limit: %i\n", l_max_cols, l_n_processed, l_n_limit);
#endif
    /* load C accumulator */
    for ( l_n = 0; l_n < l_n_limit - l_n_processed; l_n++ ) {
      if (0 != (LIBXSMM_GEMM_FLAG_BETA_0 & i_xgemm_desc->flags)) { /* Beta=0 */
        libxsmm_x86_instruction_vec_compute_reg( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.vxor_instruction, l_micro_kernel_config.vector_name, l_n, l_n, l_n );
      } else {
        libxsmm_x86_instruction_vec_move( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.c_vmove_instruction, l_gp_reg_mapping.gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, (l_n_processed + l_n)*l_soa_width*l_micro_kernel_config.datatype_size, l_micro_kernel_config.vector_name, l_n, 0, 1, 0 );
      }
    }

    /* do dense SoA times sparse multiplication */
    for ( l_k = 0; l_k < (unsigned int)i_xgemm_desc->k; l_k++ ) {
      unsigned int l_found_qmadd = 0;
      unsigned int l_col_k = 0;
      unsigned int l_column_active[28];
      int l_nnz_idx[28][4];

      /* reset helpers */
      for ( l_n = 0; l_n < l_n_limit - l_n_processed; l_n++ ) {
        l_column_active[l_n] = 0;
        l_nnz_idx[l_n][0] = -1; l_nnz_idx[l_n][1] = -1; l_nnz_idx[l_n][2] = -1; l_nnz_idx[l_n][3] = -1;
      }
      l_found_mul = 0;

      /* let's figure out if we can apply qmadd when in an F32 setting and on KNM */
      if ( (l_k < ((unsigned int)i_xgemm_desc->k - 3)) &&
           (l_micro_kernel_config.instruction_set == LIBXSMM_X86_AVX512_KNM) &&
           (LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype )) ) {
        /* loop over the columns of B/C */
        for ( l_n = 0; l_n < l_n_limit - l_n_processed; l_n++ ) {
          unsigned int l_found = 0;
          unsigned int l_acol_k = 0;
          unsigned int l_col_elements = i_column_idx[l_n_processed+l_n+1] - i_column_idx[l_n_processed+l_n];
          unsigned int l_cur_column = i_column_idx[l_n_processed+l_n];

          for ( l_col_k = 0; l_col_k < l_col_elements; l_col_k++ ) {
            for ( l_acol_k = l_found; l_acol_k < 4; l_acol_k++ ) {
              if ( (l_k + l_acol_k) == i_row_idx[l_cur_column + l_col_k] ) {
                l_nnz_idx[l_n][l_acol_k] = l_cur_column + l_col_k;
                l_found = l_acol_k+1;
              }
              if (l_found == 4) {
                l_col_k = l_col_elements;
              }
            }
          }

          /* let's check if we can apply qmadd in col l_n */
          if ( (l_nnz_idx[l_n][0] != -1) && (l_nnz_idx[l_n][1] != -1) &&
               (l_nnz_idx[l_n][2] != -1) && (l_nnz_idx[l_n][3] != -1) ) {
            l_column_active[l_n] = 2;
            l_found_qmadd = 1;
            l_found_mul = 1;
          } else {
            /* let's check if we have at least one entry in the column that matches one of the four entries */
            if ( (l_nnz_idx[l_n][0] != -1) || (l_nnz_idx[l_n][1] != -1) ||
                 (l_nnz_idx[l_n][2] != -1) || (l_nnz_idx[l_n][3] != -1) ) {
              l_column_active[l_n] = 1;
              l_found_mul = 1;
            } else {
              l_column_active[l_n] = 0;
            }
          }
        }
      }

      if ( l_found_qmadd == 0 ) {
        /* loop over the columns of B/C */
        for ( l_n = 0; l_n < l_n_limit - l_n_processed; l_n++ ) {
          unsigned int l_col_elements = i_column_idx[l_n_processed+l_n+1] - i_column_idx[l_n_processed+l_n];
          unsigned int l_cur_column = i_column_idx[l_n_processed+l_n];
          /* search for entries matching that k */
          for ( l_col_k = 0; l_col_k < l_col_elements; l_col_k++ ) {
            if ( l_k == i_row_idx[l_cur_column + l_col_k] ) {
              l_nnz_idx[l_n][0] = l_cur_column + l_col_k;
              l_col_k = l_col_elements;
            }
          }
          /* let's check if we have an entry in the column that matches the k from A */
          if ( (l_nnz_idx[l_n][0] != -1) ) {
            l_column_active[l_n] = 1;
            l_found_mul = 1;
          } else {
            l_column_active[l_n] = 0;
          }
        }
      }

      /* First case: we can use qmadd */
      if ( l_found_qmadd != 0 ) {
        unsigned int l_lcl_k = 0;
        for ( l_lcl_k = 0; l_lcl_k < 4; l_lcl_k++ ) {
          libxsmm_x86_instruction_vec_move( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.a_vmove_instruction, l_gp_reg_mapping.gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, (l_k+l_lcl_k)*l_soa_width*l_micro_kernel_config.datatype_size, l_micro_kernel_config.vector_name, l_max_reg_block+l_lcl_k, 0, 1, 0 );
        }
        /* loop over the columns of B/C */
        for ( l_n = 0; l_n < l_n_limit - l_n_processed; l_n++ ) {
          /* issue a qmadd */
          if ( l_column_active[l_n] == 2 ) {
            libxsmm_x86_instruction_vec_compute_qfma( io_generated_code, l_micro_kernel_config.instruction_set, LIBXSMM_X86_INSTR_V4FMADDPS, l_gp_reg_mapping.gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_nnz_idx[l_n][0] * l_micro_kernel_config.datatype_size, l_micro_kernel_config.vector_name, l_max_reg_block, l_n );
          } else if ( l_column_active[l_n] == 1 ) {
            for ( l_lcl_k = 0; l_lcl_k < 4; l_lcl_k++ ) {
              if ( l_nnz_idx[l_n][l_lcl_k] != -1 ) {
                libxsmm_x86_instruction_vec_compute_mem( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.vmul_instruction, 1, l_gp_reg_mapping.gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_nnz_idx[l_n][l_lcl_k] * l_micro_kernel_config.datatype_size, l_micro_kernel_config.vector_name, l_max_reg_block+l_lcl_k, l_n );
              }
            }
          }
        }
        /* increment by additional 3 columns */
        l_k += 3;
      } else if ( l_found_mul != 0 ) {
        libxsmm_x86_instruction_vec_move( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.a_vmove_instruction, l_gp_reg_mapping.gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, l_k*l_soa_width*l_micro_kernel_config.datatype_size, l_micro_kernel_config.vector_name, l_max_reg_block, 0, 1, 0 );
        /* loop over the columns of B/C */
        for ( l_n = 0; l_n < l_n_limit - l_n_processed; l_n++ ) {
          if ( l_nnz_idx[l_n][0] != -1 ) {
            if ( strcmp(i_arch, "knl") == 0 || strcmp(i_arch, "knm") == 0 ||
                 strcmp(i_arch, "skx") == 0 || strcmp(i_arch, "clx") == 0 ||
                 strcmp(i_arch, "cpx") == 0 ) {
              libxsmm_x86_instruction_vec_compute_mem( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.vmul_instruction, 1, l_gp_reg_mapping.gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_nnz_idx[l_n][0] * l_micro_kernel_config.datatype_size, l_micro_kernel_config.vector_name, l_max_reg_block, l_n );
            } else if ( strcmp(i_arch, "hsw") == 0 ) {
              libxsmm_x86_instruction_vec_move( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.b_vmove_instruction, l_gp_reg_mapping.gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_nnz_idx[l_n][0] * l_micro_kernel_config.datatype_size, l_micro_kernel_config.vector_name, 15, 0, 1, 0 );
              libxsmm_x86_instruction_vec_compute_reg( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.vmul_instruction, l_micro_kernel_config.vector_name, l_max_reg_block, 15, l_n );
            } else {
              libxsmm_x86_instruction_vec_move( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.b_vmove_instruction, l_gp_reg_mapping.gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_nnz_idx[l_n][0] * l_micro_kernel_config.datatype_size, l_micro_kernel_config.vector_name, 15, 0, 1, 0 );
              libxsmm_x86_instruction_vec_compute_reg( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.vmul_instruction, l_micro_kernel_config.vector_name, l_max_reg_block, 15, 15 );
              libxsmm_x86_instruction_vec_compute_reg( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.vadd_instruction, l_micro_kernel_config.vector_name, 15, l_n, l_n );
            }
          }
        }
      } else {
        /* shouldn't happen */
      }
    }

    /* store C accumulator */
    for ( l_n = 0; l_n < l_n_limit - l_n_processed; l_n++ ) {
      libxsmm_x86_instruction_vec_move( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.c_vmove_instruction, l_gp_reg_mapping.gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, (l_n_processed + l_n)*l_soa_width*l_micro_kernel_config.datatype_size, l_micro_kernel_config.vector_name, l_n, 0, 0, 1 );
    }

    /* adjust n progression */
    l_n_processed += l_n_chunksize;
    l_n_limit = LIBXSMM_MIN(l_n_processed + l_n_chunksize, l_max_cols);
  }

  /* advance C pointer */
  libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_c, l_micro_kernel_config.datatype_size*l_soa_width*i_xgemm_desc->ldc );

  /* advance A pointer */
  libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_a, l_micro_kernel_config.datatype_size*l_soa_width*i_xgemm_desc->lda );

  /* close m loop */
  libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_cmp_instruction, l_gp_reg_mapping.gp_reg_mloop, i_xgemm_desc->m );
  libxsmm_x86_instruction_jump_back_to_label( io_generated_code, l_micro_kernel_config.alu_jmp_instruction, &l_loop_label_tracker );

  /* close asm */
  libxsmm_x86_instruction_close_stream( io_generated_code, &l_gp_reg_mapping, i_arch, i_xgemm_desc->prefetch );
}
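/* A minimal standalone sketch of the column-chunking arithmetic used by the
 * generator above: l_max_cols columns of C are split into nearly equal chunks
 * of at most max_reg_block columns, so each chunk fits the accumulator
 * register budget. This is a hypothetical helper for illustration only, not
 * part of LIBXSMM's API. */
#include <assert.h>

static unsigned int sketch_chunksize(unsigned int max_cols, unsigned int max_reg_block) {
  /* ceiling division: number of chunks needed to cover all columns */
  const unsigned int n_chunks = (max_cols + max_reg_block - 1) / max_reg_block;
  assert(0 != n_chunks); /* max_cols > 0 expected, as in the generator */
  /* ceiling division again: balanced chunk size, never above max_reg_block */
  return (max_cols + n_chunks - 1) / n_chunks;
}
/* e.g., sketch_chunksize(30, 28) == 15: two balanced chunks of 15 columns
 * instead of an unbalanced 28 + 2 split. */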
LIBXSMM_API
#if defined(_WIN32)
/*TODO: no inline*/
#elif defined(__GNUC__)
/*LIBXSMM_ATTRIBUTE(noinline)*/
#endif
const char* libxsmm_trace_info(unsigned int* depth, unsigned int* threadid, const int* filter_threadid,
  const int* filter_mindepth, const int* filter_maxnsyms)
{
  const char *fname = NULL;
#if defined(LIBXSMM_TRACE)
  const int max_n = (0 != depth ? (LIBXSMM_TRACE_MAXDEPTH) : 2);
  const int min_n = (0 != depth ? (LIBXSMM_TRACE_MINDEPTH + *depth) : 2);
  void *stacktrace[LIBXSMM_TRACE_MAXDEPTH], **symbol = stacktrace + LIBXSMM_MIN(0 != depth ? ((int)(*depth + 1)) : 1, max_n - 1);
  static LIBXSMM_TLS int cerberus = 0;
  int i;
  /* check against entering a recursion (recursion should not happen due to
   * attribute "no_instrument_function" but better prevent this in any case) */
  if (0 == cerberus) {
    ++cerberus;
# if defined(__GNUC__)
    __asm__("");
# endif
    i = LIBXSMM_ATOMIC_LOAD(&internal_trace_initialized, LIBXSMM_ATOMIC_RELAXED);
    if (0 <= i) { /* do nothing if not yet initialized */
      const int mindepth = (0 != filter_mindepth ? *filter_mindepth : internal_trace_mindepth);
      const int maxnsyms = (0 != filter_maxnsyms ? *filter_maxnsyms : internal_trace_maxnsyms);
      i = libxsmm_backtrace(stacktrace, max_n);
      /* filter depth against filter_mindepth and filter_maxnsyms */
      if ((0 >= mindepth || (min_n + mindepth) <= i) && (0 > maxnsyms || i <= (min_n + mindepth + maxnsyms - 1))) {
        if (min_n <= i) { /* check against min. depth */
          const int filter = (0 != filter_threadid ? *filter_threadid : internal_trace_threadid);
          int abs_tid = 0;
# if defined(_WIN32) || defined(__CYGWIN__)
          static LIBXSMM_TLS char buffer[sizeof(SYMBOL_INFO)+LIBXSMM_TRACE_SYMBOLSIZE];
          static LIBXSMM_TLS int tid = 0;
          PSYMBOL_INFO value = (PSYMBOL_INFO)buffer;
          value->SizeOfStruct = sizeof(SYMBOL_INFO);
          value->MaxNameLen = LIBXSMM_TRACE_SYMBOLSIZE - 1;
          if (0 != tid) {
            abs_tid = (0 <= tid ? tid : -tid);
          }
          else {
            abs_tid = LIBXSMM_ATOMIC_ADD_FETCH(&internal_trace_initialized, 1, LIBXSMM_ATOMIC_RELAXED);
            /* use sign bit to flag enabled fall-back for symbol resolution */
            tid = -abs_tid;
          }
          assert(0 < abs_tid);
          if (0 > filter || filter == abs_tid - 1) {
            if (FALSE != SymFromAddr(GetCurrentProcess(), (DWORD64)*symbol, NULL, value) && 0 < value->NameLen) {
              /* disable fall-back allowing unresolved symbol names */
              tid = abs_tid; /* make unsigned */
              fname = value->Name;
            }
            else if (0 > tid) { /* fall-back allowing unresolved symbol names */
# if defined(__MINGW32__)
              sprintf(buffer, "%p", *symbol);
# else
              sprintf(buffer, "0x%" PRIxPTR, (uintptr_t)*symbol);
# endif
              fname = buffer;
            }
            if (depth) *depth = i - min_n;
            if (threadid) *threadid = abs_tid - 1;
          }
# else
#   if defined(LIBXSMM_NO_SYNC)
          static char raw_c;
          char */*const*/ raw_value = &raw_c; /* const: avoid warning (below / constant control-flow) */
#   else
          char *const raw_value = (char*)pthread_getspecific(internal_trace_key);
#   endif
          int* ivalue = 0, fd = -1;
          char* value = 0;
          if (raw_value) {
            ivalue = (int*)raw_value;
            abs_tid = (0 <= ivalue[1] ? ivalue[1] : -ivalue[1]);
            if (0 > filter || filter == abs_tid - 1) {
              fd = ivalue[0];
              if (0 <= fd && (sizeof(int) * 2) == lseek(fd, sizeof(int) * 2, SEEK_SET)) {
                value = raw_value + sizeof(int) * 2;
              }
#   if !defined(NDEBUG) /* library code is expected to be mute */
              else {
                fprintf(stderr, "LIBXSMM ERROR: failed to get buffer\n");
              }
#   endif
            }
          }
          else {
            char filename[] = "/tmp/.libxsmm_XXXXXX.map";
#   if defined(__GLIBC__) && defined(__GLIBC_MINOR__) && LIBXSMM_VERSION2(2, 19) <= LIBXSMM_VERSION2(__GLIBC__, __GLIBC_MINOR__)
            fd = mkstemps(filename, 4/*.map*/);
#   else
            char *const xpos = strrchr(filename, 'X');
            const char c = (char)(NULL != xpos ? *(xpos + 1) : 0);
            if (0 != c) {
              xpos[1] = 0;
              fd = mkstemp(filename);
              xpos[1] = c;
            }
            else {
              fd = -1;
            }
#   endif
            if (0 <= fd && 0 == posix_fallocate(fd, 0, LIBXSMM_TRACE_SYMBOLSIZE)) {
              char *const buffer = (char*)mmap(NULL, LIBXSMM_TRACE_SYMBOLSIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
              if (MAP_FAILED != buffer) {
                int check = -1;
                ivalue = (int*)buffer;
                ivalue[0] = fd; /* valid file descriptor for internal_delete */
                if (
#   if !defined(LIBXSMM_NO_SYNC)
                  0 == pthread_setspecific(internal_trace_key, buffer) &&
#   endif
                  (sizeof(int) * 1) == read(fd, &check, sizeof(int)) &&
                  (sizeof(int) * 2) == lseek(fd, sizeof(int), SEEK_CUR) &&
                  check == fd)
                {
                  abs_tid = LIBXSMM_ATOMIC_ADD_FETCH(&internal_trace_initialized, 1, LIBXSMM_ATOMIC_RELAXED);
                  assert(0 < abs_tid);
                  /* use sign bit to flag enabled fall-back for symbol resolution */
                  ivalue[1] = -abs_tid;
                  if (0 > filter || filter == abs_tid - 1) {
                    value = buffer + sizeof(int) * 2;
                  }
                }
                else {
#   if !defined(NDEBUG) /* library code is expected to be mute */
                  fprintf(stderr, "LIBXSMM ERROR: failed to setup buffer\n");
#   endif
                  internal_delete(buffer);
                }
              }
#   if !defined(NDEBUG)
              else {
                const int error = errno;
                fprintf(stderr, "LIBXSMM ERROR: %s (mmap allocation error #%i)\n", strerror(error), error);
              }
#   endif
            }
#   if !defined(NDEBUG) /* library code is expected to be mute */
            else {
              fprintf(stderr, "LIBXSMM ERROR: failed to setup file descriptor (%i)\n", fd);
            }
#   endif
          }
          if (value) {
            backtrace_symbols_fd(symbol, 1, fd);
            /* attempt to parse symbol name */
            if (1 == sscanf(value, "%*[^(](%s0x", value)) {
              char* c;
              for (c = value; '+' != *c && *c; ++c);
              if ('+' == *c) {
                /* disable fall-back allowing unresolved symbol names */
                ivalue[1] = abs_tid; /* make unsigned */
                fname = value;
                *c = 0;
              }
            }
            /* fall-back to symbol address */
            if (0 > ivalue[1] && 0 == fname) {
              sprintf(value, "0x%llx", (unsigned long long)*symbol);
              fname = value;
            }
            if (depth) *depth = i - min_n;
            if (threadid) *threadid = abs_tid - 1;
          }
# endif
        }
      }
    }
    --cerberus;
  }
#else
  LIBXSMM_UNUSED(depth); LIBXSMM_UNUSED(threadid);
  LIBXSMM_UNUSED(filter_threadid);
  LIBXSMM_UNUSED(filter_mindepth);
  LIBXSMM_UNUSED(filter_maxnsyms);
#endif
  return fname;
}
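/* Hedged usage sketch for the function above: query the symbol name of the
 * calling frame with all filters disabled (NULL filter pointers). The depth
 * semantics are taken from the code above: on input, *depth selects how many
 * frames to skip beyond the minimum; on output it reports the resolved depth.
 * Assumes LIBXSMM_TRACE was enabled at build time; otherwise NULL is returned. */
#include <stdio.h>

static void sketch_print_caller(void) {
  unsigned int depth = 0, tid = 0;
  const char *const name = libxsmm_trace_info(&depth, &tid, NULL, NULL, NULL);
  if (NULL != name) {
    printf("frame %u of thread %u: %s\n", depth, tid, name);
  }
}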
int main(int argc, char* argv[])
{
  int result = EXIT_SUCCESS;
  try {
    const libxsmm_blasint benchmark = 1 < argc ? std::atoi(argv[1]) : 0;
    LIBXSMM_GEMM_CONST libxsmm_blasint m = (2 < argc ? std::atoi(argv[2]) : 23);
    LIBXSMM_GEMM_CONST libxsmm_blasint k = (4 < argc ? std::atoi(argv[4]) : m);
    LIBXSMM_GEMM_CONST libxsmm_blasint n = (3 < argc ? std::atoi(argv[3]) : k);
    const libxsmm_blasint q = (5 < argc ? std::atoi(argv[5]) : 0/*auto*/);
    const libxsmm_blasint nrepeat = (6 < argc ? std::atoi(argv[6]) : (0 >= q ? 13 : 1));
    LIBXSMM_GEMM_CONST libxsmm_blasint lda = m, ldb = k, ldc = m;
    LIBXSMM_GEMM_CONST char transa = 'N', transb = 'N';
    LIBXSMM_GEMM_CONST OTYPE alpha = 1, beta = 1;
    const libxsmm_blasint asize = PAD(ITYPE, lda * k), bsize = PAD(ITYPE, ldb * n), csize = PAD(OTYPE, ldc * n);
    const libxsmm_blasint max_size = ((2ULL << 30/*2 GB*/) / ((asize + bsize) * sizeof(ITYPE) + csize * sizeof(OTYPE)));
    const libxsmm_blasint s = LIBXSMM_MIN(0 < q ? q : max_size, max_size);
    const libxsmm_blasint aspace = LIBXSMM_ALIGNMENT / sizeof(ITYPE);
    const size_t bwsize = static_cast<size_t>((asize/*load*/ + bsize/*load*/) * sizeof(ITYPE) + 2/*RFO*/ * csize * sizeof(OTYPE));
    const double gflops = 2E-9 * s * m * n * k;
#if LIBXSMM_TYPEINFO(ITYPE, FP)
    const char *const ops = "FLOPS";
    const double scale = 1.0 / s;
#else
    const char *const ops = "OPS";
    const double scale = 1;
#endif
#if !defined(_DEBUG)
    const char *const env_check = getenv("CHECK");
    const int check = (0 == env_check ? 0 : atoi(env_check));
#else
    /*const*/ int check = 1;
#endif
#if defined(LIBXSMM_OFFLOAD_TARGET)
#   pragma offload target(LIBXSMM_OFFLOAD_TARGET)
#endif
    {
#if defined(_OPENMP)
      const libxsmm_blasint chunksize = s / omp_get_max_threads();
#endif
      struct raii { // avoid std::vector (first-touch init. causes NUMA issue)
        ITYPE *a, *b;
        OTYPE *c, *d;
        libxsmm_blasint *m_shuffle;
        raii(libxsmm_blasint asize_, libxsmm_blasint bsize_, libxsmm_blasint csize_, libxsmm_blasint size_)
          : a(new ITYPE[static_cast<size_t>(asize_)]), b(new ITYPE[static_cast<size_t>(bsize_)])
          , c(new OTYPE[static_cast<size_t>(csize_)]), d(new OTYPE[static_cast<size_t>(csize_)])
          , m_shuffle(new libxsmm_blasint[size_])
        {
# if defined(_OPENMP)
#   pragma omp parallel for schedule(static)
# endif
          for (libxsmm_blasint i = 0; i < size_; ++i) m_shuffle[i] = libxsmm_rand_u32(size_);
        }
        ~raii() { delete[] a; delete[] b; delete[] c; delete[] d; delete[] m_shuffle; }
#if defined(RANDOMIZED)
        libxsmm_blasint shuffle(libxsmm_blasint i) const { return m_shuffle[i]; }
#else
        libxsmm_blasint shuffle(libxsmm_blasint i) const { return i; }
#endif
      } helper(s * asize + aspace - 1, s * bsize + aspace - 1, s * csize + aspace - 1, s);
      ITYPE *const a = LIBXSMM_ALIGN(helper.a, LIBXSMM_ALIGNMENT);
      ITYPE *const b = LIBXSMM_ALIGN(helper.b, LIBXSMM_ALIGNMENT);
      OTYPE *const c = LIBXSMM_ALIGN(helper.c, LIBXSMM_ALIGNMENT);
      OTYPE *const d = LIBXSMM_ALIGN(helper.d, LIBXSMM_ALIGNMENT);
#if defined(_OPENMP)
#   pragma omp parallel for schedule(static)
#endif
      for (libxsmm_blasint i = 0; i < s; ++i) {
        LIBXSMM_MATINIT(ITYPE, 42 + helper.shuffle(i), a + helper.shuffle(i) * asize, m, k, lda, scale);
        LIBXSMM_MATINIT(ITYPE, 24 + helper.shuffle(i), b + helper.shuffle(i) * bsize, k, n, ldb, scale);
        LIBXSMM_MATINIT(OTYPE, 22 + i, c + i * csize, m, n, ldc, scale);
        LIBXSMM_MATINIT(OTYPE, 22 + i, d + i * csize, m, n, ldc, scale);
      }
#if defined(MKL_ENABLE_AVX512)
      mkl_enable_instructions(MKL_ENABLE_AVX512);
#endif
      // initialize LIBXSMM
      libxsmm_init();
      fprintf(stdout, "m=%lli n=%lli k=%lli size=%lli memory=%.1f MB (input=%s output=%s)\n\n",
        static_cast<long long>(m), static_cast<long long>(n), static_cast<long long>(k), static_cast<long long>(s),
        1.0 * (s * ((asize + bsize) * sizeof(ITYPE) + csize * sizeof(OTYPE))) / (1 << 20),
        LIBXSMM_TYPENAME(ITYPE), LIBXSMM_TYPENAME(OTYPE));

      // LAPACK/BLAS3 (warm-up BLAS Library)
#if defined(_OPENMP)
#   pragma omp parallel for schedule(static)
#endif
      for (libxsmm_blasint i = 0; i < s; ++i) {
        LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k,
          &alpha, a + helper.shuffle(i) * asize, &lda, b + helper.shuffle(i) * bsize, &ldb,
          &beta, c + i * csize, &ldc);
      }
#if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION)
      std::vector<const ITYPE*> va_array(static_cast<size_t>(s)), vb_array(static_cast<size_t>(s));
      std::vector<OTYPE*> vc_array(static_cast<size_t>(s));
      const ITYPE* *const a_array = &va_array[0];
      const ITYPE* *const b_array = &vb_array[0];
      OTYPE* *const c_array = &vc_array[0];
      const libxsmm_blasint group_count = 1;
      for (libxsmm_blasint i = 0; i < s; ++i) { // setup batched (A,B,C)
        a_array[i] = a + helper.shuffle(i) * asize;
        b_array[i] = b + helper.shuffle(i) * bsize;
        c_array[i] = d + i * csize;
      }
      // additional warm-up (also to eventually match the Gold result)
      LIBXSMM_TPREFIX(ITYPE,gemm_batch)(&transa, &transb, &m, &n, &k,
        &alpha, &a_array[0], &lda, &b_array[0], &ldb,
        &beta, &c_array[0], &ldc, &group_count, &s);
#endif
      switch (benchmark) {
      case 0: { // batched
        fprintf(stdout, "Batched (A,B,C)...\n");
        const unsigned long long start = libxsmm_timer_tick();
        for (libxsmm_blasint r = 0; r < nrepeat; ++r) {
#if defined(_OPENMP)
#   pragma omp parallel for schedule(static)
#endif
          for (libxsmm_blasint i = 0; i < s; ++i) {
            LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k,
              &alpha, a + helper.shuffle(i) * asize, &lda, b + helper.shuffle(i) * bsize, &ldb,
              &beta, c + i * csize, &ldc);
          }
        }
        const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick());
        const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat;
        if (0 < duration && 0 != ncycles) {
          fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops);
          fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops);
          fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * bwsize / (duration * (1 << 30)));
        }
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration);
      } /* fallthrough */
#if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION)
      case 1: { // batched indirect
        fprintf(stdout, "Indirect (A,B,C)...\n");
        const unsigned long long start = libxsmm_timer_tick();
        for (libxsmm_blasint r = 0; r < nrepeat; ++r) {
          LIBXSMM_TPREFIX(ITYPE,gemm_batch)(&transa, &transb, &m, &n, &k,
            &alpha, &a_array[0], &lda, &b_array[0], &ldb,
            &beta, &c_array[0], &ldc, &group_count, &s);
        }
        const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick());
        const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat;
        if (0 < duration && 0 != ncycles) {
          fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops);
          fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops);
          fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * bwsize / (duration * (1 << 30)));
        }
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration);
        if (0 == benchmark) { /* Gold result is available */
          libxsmm_matdiff_info diff;
          memset(&diff, 0, sizeof(diff));
          for (libxsmm_blasint h = 0; h < s; ++h) {
            const OTYPE *const u = c + h * csize, *const v = c_array[h];
            libxsmm_matdiff_info dv;
            if (EXIT_SUCCESS == libxsmm_matdiff(LIBXSMM_DATATYPE(OTYPE), m, n, u, v, &ldc, &ldc, &dv)) {
              libxsmm_matdiff_reduce(&diff, &dv);
            }
          }
          if (0 < diff.normf_rel) fprintf(stdout, "\tdiff: %.0f%%\n", 100.0 * diff.normf_rel);
        }
      }
#endif
      break;
      case 2: { // streaming A and C
        fprintf(stdout, "Streamed (A,C)...\n");
        const unsigned long long start = libxsmm_timer_tick();
        for (libxsmm_blasint r = 0; r < nrepeat; ++r) {
#if defined(_OPENMP)
#   pragma omp parallel for schedule(static)
#endif
          for (libxsmm_blasint i = 0; i < s; ++i) {
            LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k,
              &alpha, a + helper.shuffle(i) * asize, &lda, b, &ldb,
              &beta, c + i * csize, &ldc);
          }
        }
        const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick());
        const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat;
        if (0 < duration && 0 != ncycles) {
          fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops);
          fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops);
          fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - bsize * sizeof(ITYPE)) / (duration * (1 << 30)));
        }
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration);
      } /* fallthrough */
#if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION)
      case 3: { // indirect A and C
        fprintf(stdout, "Indirect (A,C)...\n");
        for (libxsmm_blasint i = 0; i < s; ++i) {
          a_array[i] = a + helper.shuffle(i) * asize;
          b_array[i] = b;
          c_array[i] = d + i * csize;
        }
        const unsigned long long start = libxsmm_timer_tick();
        for (libxsmm_blasint r = 0; r < nrepeat; ++r) {
          LIBXSMM_TPREFIX(ITYPE, gemm_batch)(&transa, &transb, &m, &n, &k,
            &alpha, &a_array[0], &lda, &b_array[0], &ldb,
            &beta, &c_array[0], &ldc, &group_count, &s);
        }
        const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick());
        const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat;
        if (0 < duration && 0 != ncycles) {
          fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops);
          fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops);
          fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - bsize * sizeof(ITYPE)) / (duration * (1 << 30)));
        }
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration);
      }
#endif
      break;
      case 4: { // streaming B and C
        fprintf(stdout, "Streamed (B,C)...\n");
        const unsigned long long start = libxsmm_timer_tick();
        for (libxsmm_blasint r = 0; r < nrepeat; ++r) {
#if defined(_OPENMP)
#   pragma omp parallel for schedule(static)
#endif
          for (libxsmm_blasint i = 0; i < s; ++i) {
            LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k,
              &alpha, a, &lda, b + helper.shuffle(i) * bsize, &ldb,
              &beta, c + i * csize, &ldc);
          }
        }
        const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick());
        const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat;
        if (0 < duration && 0 != ncycles) {
          fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops);
          fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops);
          fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - asize * sizeof(ITYPE)) / (duration * (1 << 30)));
        }
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration);
      } /* fallthrough */
#if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION)
      case 5: { // indirect B and C
        fprintf(stdout, "Indirect (B,C)...\n");
        for (libxsmm_blasint i = 0; i < s; ++i) {
          a_array[i] = a;
          b_array[i] = b + helper.shuffle(i) * bsize;
          c_array[i] = d + i * csize;
        }
        const unsigned long long start = libxsmm_timer_tick();
        for (libxsmm_blasint r = 0; r < nrepeat; ++r) {
          LIBXSMM_TPREFIX(ITYPE, gemm_batch)(&transa, &transb, &m, &n, &k,
            &alpha, &a_array[0], &lda, &b_array[0], &ldb,
            &beta, &c_array[0], &ldc, &group_count, &s);
        }
        const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick());
        const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat;
        if (0 < duration && 0 != ncycles) {
          fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops);
          fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops);
          fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - asize * sizeof(ITYPE)) / (duration * (1 << 30)));
        }
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration);
      }
#endif
      break;
      case 6: { // streaming A and B
        fprintf(stdout, "Streamed (A,B)...\n");
        const unsigned long long start = libxsmm_timer_tick();
        for (libxsmm_blasint r = 0; r < nrepeat; ++r) {
#if defined(_OPENMP)
#   pragma omp parallel for schedule(static)
#endif
          for (libxsmm_blasint i = 0; i < s; ++i) {
#if defined(_OPENMP) /* attempt to write to disjoint cachelines */
            const libxsmm_blasint j = omp_get_thread_num() * chunksize * csize;
#else
            const libxsmm_blasint j = 0;
#endif
            LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k,
              &alpha, a + helper.shuffle(i) * asize, &lda, b + helper.shuffle(i) * bsize, &ldb,
              &beta, c + j, &ldc);
          }
        }
        const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick());
        const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat;
        if (0 < duration && 0 != ncycles) {
          fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops);
          fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops);
          fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - 2 * csize * sizeof(OTYPE)) / (duration * (1 << 30)));
        }
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration);
      } /* fallthrough */
#if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION)
      case 7: { // indirect A and B
        fprintf(stdout, "Indirect (A,B)...\n");
#if defined(_OPENMP)
#   pragma omp parallel for schedule(static)
#endif
        for (libxsmm_blasint i = 0; i < s; ++i) {
          a_array[i] = a + helper.shuffle(i) * asize;
          b_array[i] = b + helper.shuffle(i) * bsize;
#if defined(_OPENMP) /* attempt to write to disjoint cachelines */
          c_array[i] = d + omp_get_thread_num() * chunksize * csize;
#else
          c_array[i] = d;
#endif
        }
        const unsigned long long start = libxsmm_timer_tick();
        for (libxsmm_blasint r = 0; r < nrepeat; ++r) {
          LIBXSMM_TPREFIX(ITYPE, gemm_batch)(&transa, &transb, &m, &n, &k,
            &alpha, &a_array[0], &lda, &b_array[0], &ldb,
            &beta, &c_array[0], &ldc, &group_count, &s);
        }
        const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick());
        const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat;
        if (0 < duration && 0 != ncycles) {
          fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops);
          fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops);
          fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - 2 * csize * sizeof(OTYPE)) / (duration * (1 << 30)));
        }
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration);
      }
#endif
      break;
      case 8: { // cached
        fprintf(stdout, "Cached...\n");
        const unsigned long long start = libxsmm_timer_tick();
        for (libxsmm_blasint r = 0; r < nrepeat; ++r) {
#if defined(_OPENMP)
#   pragma omp parallel for schedule(static)
#endif
          for (libxsmm_blasint i = 0; i < s; ++i) {
#if defined(_OPENMP) /* attempt to write to disjoint cachelines */
            const libxsmm_blasint j = omp_get_thread_num() * chunksize * csize;
#else
            const libxsmm_blasint j = 0;
#endif
            LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k,
              &alpha, a, &lda, b, &ldb, &beta, c + j, &ldc);
          }
        }
        const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick());
        const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat;
        if (0 < duration && 0 != ncycles) {
          fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops);
          fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops);
        }
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration);
      } /* fallthrough */
#if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION)
      case 9: { // indirect cached
        fprintf(stdout, "Indirect cached...\n");
#if defined(_OPENMP)
#   pragma omp parallel for schedule(static)
#endif
        for (libxsmm_blasint i = 0; i < s; ++i) {
          a_array[i] = a;
          b_array[i] = b;
#if defined(_OPENMP) /* attempt to write to disjoint cachelines */
          c_array[i] = d + omp_get_thread_num() * chunksize * csize;
#else
          c_array[i] = d;
#endif
        }
        const unsigned long long start = libxsmm_timer_tick();
        for (libxsmm_blasint r = 0; r < nrepeat; ++r) {
          LIBXSMM_TPREFIX(ITYPE, gemm_batch)(&transa, &transb, &m, &n, &k,
            &alpha, &a_array[0], &lda, &b_array[0], &ldb,
            &beta, &c_array[0], &ldc, &group_count, &s);
        }
        const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick());
        const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat;
        if (0 < duration && 0 != ncycles) {
          fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops);
          fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops);
        }
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration);
      }
#endif
      break;
      default: throw "invalid case selected!";
      } /*switch*/

      if (0 != check) {
        libxsmm_matdiff_info diff;
        if (EXIT_SUCCESS == libxsmm_matdiff(LIBXSMM_DATATYPE(OTYPE), m, n, 0 == (benchmark & 1) ? c : d, NULL, &ldc, &ldc, &diff)) {
          fprintf(stdout, "\tcheck: %f\n", diff.l1_ref);
        }
      }
      // finalize LIBXSMM
      libxsmm_finalize();
      fprintf(stdout, "Finished\n");
    }
  }
  catch(const std::exception& e) {
    fprintf(stderr, "Error: %s\n", e.what());
    result = EXIT_FAILURE;
  }
  catch(const char* message) {
    fprintf(stderr, "Error: %s\n", message);
    result = EXIT_FAILURE;
  }
  catch(...) {
    fprintf(stderr, "Error: unknown exception caught!\n");
    result = EXIT_FAILURE;
  }
  return result;
}
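/* The reporting arithmetic shared by every case above, extracted as a hedged
 * sketch: the FLOP count assumes one multiply and one add per inner-product
 * term (2*m*n*k per GEMM), and the bandwidth estimate charges the C stream
 * twice (RFO: the destination is read before it is written), mirroring the
 * bwsize computation above. All names here are local to the sketch. */
#include <stdio.h>
#include <stddef.h>

static void sketch_report(double duration_sec, size_t s, size_t m, size_t n, size_t k,
                          size_t a_bytes, size_t b_bytes, size_t c_bytes) {
  if (0 < duration_sec) {
    const double gflops = 2E-9 * (double)(s * m * n * k);
    const size_t bwsize = a_bytes + b_bytes + 2/*RFO*/ * c_bytes; /* bytes per GEMM */
    printf("\tperformance: %.1f GFLOPS/s\n", gflops / duration_sec);
    printf("\tbandwidth: %.1f GB/s\n", (double)(s * bwsize) / (duration_sec * (1ULL << 30)));
  }
}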
int main(int argc, char* argv[])
{
  const int insize = (1 < argc ? atoi(argv[1]) : 0);
  const int incrmt = (2 < argc ? atoi(argv[2]) : 0);
  const int nelems = (3 < argc ? atoi(argv[3]) : 0);
  const int niters = (4 < argc ? atoi(argv[4]) : 1);
  const int elsize = (0 >= insize ? LIBXSMM_DESCRIPTOR_SIGSIZE : insize);
  const int stride = (0 >= incrmt ? LIBXSMM_MAX(LIBXSMM_DESCRIPTOR_MAXSIZE, elsize) : LIBXSMM_MAX(incrmt, elsize));
  const size_t n = (0 >= nelems ? (((size_t)2 << 30/*2 GB*/) / stride) : ((size_t)nelems));
  unsigned char *input, *icopy = NULL, *ilast = NULL;
  int result = EXIT_SUCCESS;
  size_t nbytes, size, nrpt;

  if (0 < niters) {
    size = n;
    nrpt = niters;
  }
  else {
    size = LIBXSMM_MAX(LIBXSMM_ABS(niters), 1);
    nrpt = n;
  }
  nbytes = size * stride;
  input = (unsigned char*)(0 != nbytes ? malloc(nbytes) : NULL);

  if (NULL != input) {
    unsigned char *const ref = input + (size - 1) * stride; /* last item */
    libxsmm_timer_tickint start;
    size_t i, j = 0;

    /* initialize the input data */
    for (i = 0; i < nbytes; ++i) input[i] = LIBXSMM_MOD2(i, 128);
    for (i = 0; i < (size_t)elsize; ++i) ref[i] = 255;

    { /* benchmark libxsmm_diff_n */
#if defined(USE_HASH)
      const unsigned int hashref = libxsmm_hash(ref, elsize, 0/*seed*/);
#endif
      start = libxsmm_timer_tick();
      for (i = 0; i < nrpt; ++i) {
#if !defined(USE_HASH)
        j = libxsmm_diff_n(ref, input, (unsigned char)elsize, (unsigned char)stride,
          (unsigned int)LIBXSMM_MIN(i, size)/*hint*/, (unsigned int)size);
#else
        const unsigned char* tst = input;
        for (j = 0; j < size; ++j) {
          const unsigned int hashtst = libxsmm_hash(tst, elsize, 0/*seed*/);
          if (hashref == hashtst && 0 == libxsmm_diff(ref, tst, (unsigned char)elsize)) {
            break;
          }
          tst += stride;
        }
#endif
      }
      printf("libxsmm_diff_n:\t\t%.8f s\n", libxsmm_timer_duration(start, libxsmm_timer_tick()));
    }

    if (size == (j + 1) && 0 == memcmp(ref, input + j * stride, elsize)) {
      /* benchmark libxsmm_memcmp */
      icopy = (unsigned char*)(elsize == stride ? malloc(nbytes) : NULL);
      if (NULL != icopy) {
        ilast = icopy + (size - 1) * stride; /* last item */
        memcpy(icopy, input, nbytes);
        start = libxsmm_timer_tick();
        for (i = 0; i < nrpt; ++i) {
          j += libxsmm_memcmp(input, icopy, nbytes); /* take result of every execution */
          /* libxsmm_memcmp may be considered pure: touch the input,
           * otherwise the compiler could hoist the call out of the (nrpt) loop */
          ilast[i%elsize] = 255;
        }
        printf("libxsmm_memcmp:\t\t%.8f s\n", libxsmm_timer_duration(start, libxsmm_timer_tick()));
        result += (int)j * ((int)stride / ((int)stride + 1)); /* consume j; the factor is always zero */
      }
    }
    else {
      result = EXIT_FAILURE;
    }

    if (NULL != icopy) {
      /* benchmark stdlib's memcmp */
      LIBXSMM_ASSERT(NULL != ilast);
      start = libxsmm_timer_tick();
      for (i = 0; i < nrpt; ++i) {
        j += memcmp(input, icopy, nbytes); /* take result of every execution */
        /* memcmp is likely pure: touch the input,
         * otherwise the compiler could hoist the call out of the (nrpt) loop */
        ilast[i%elsize] = 255;
      }
      printf("stdlib memcmp:\t\t%.8f s\n", libxsmm_timer_duration(start, libxsmm_timer_tick()));
      result += (int)j * ((int)stride / ((int)stride + 1)); /* consume j; the factor is always zero */
      free(icopy);
    }
    free(input);
  }
  else {
    result = EXIT_FAILURE;
  }
  return result;
}
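/* Reference logic that the libxsmm_diff_n benchmark above exercises, written
 * as a plain-C sketch: linearly scan `size` records of `elsize` bytes laid
 * out with a fixed `stride`, and return the index of the first record equal
 * to `ref` (or `size` if none matches). libxsmm_diff_n additionally accepts a
 * start hint and uses vectorized comparisons; this helper is only a semantic
 * model under that assumption, not the library implementation. */
#include <string.h>
#include <stddef.h>

static size_t sketch_find(const void* ref, const void* data,
                          size_t elsize, size_t stride, size_t size) {
  const unsigned char* tst = (const unsigned char*)data;
  size_t i;
  for (i = 0; i < size; ++i, tst += stride) {
    if (0 == memcmp(ref, tst, elsize)) break; /* first match wins */
  }
  return i; /* the benchmark above expects the match at index size - 1 */
}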
LIBXSMM_API int libxsmm_matcopy_thread(void* out, const void* in, unsigned int typesize,
  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo,
  const int* prefetch, int tid, int nthreads)
{
  int result = EXIT_SUCCESS;
  static int error_once = 0;
  assert(typesize <= 255);
  if (0 != out && out != in && 0 < typesize && 0 < m && 0 < n && m <= ldi && m <= ldo &&
    /* use (signed) integer types, but check sanity of input */
    0 <= tid && tid < nthreads)
  {
    const unsigned int uldi = (unsigned int)ldi, uldo = (unsigned int)ldo;
    unsigned int tm = (unsigned int)m, tn = (unsigned int)n;
    const int iprefetch = (0 == prefetch ? 0 : *prefetch);
    libxsmm_xmcopyfunction xmatcopy = 0;
    LIBXSMM_INIT /* before leading tile sizes */
    if (1 < nthreads) {
      libxsmm_blasint m0 = 0, n0 = 0, m1 = m, n1 = n;
      const unsigned int size = tm * tn, size2 = LIBXSMM_SQRT2(size);
      const unsigned int indx = LIBXSMM_MIN(size2 >> 10, 7);
      const unsigned int tidx = (4 < typesize ? 0 : 1);
      int mtasks;
      tm = LIBXSMM_MIN(tm, libxsmm_trans_tile[tidx][0/*M*/][indx]);
      tn = LIBXSMM_MIN(tn, libxsmm_trans_tile[tidx][1/*N*/][indx]);
      mtasks = ((1 < nthreads) ? ((int)((m + tm - 1) / tm)) : 1);
      if (1 < mtasks && nthreads <= mtasks) { /* only parallelized over M */
        const int mc = (mtasks + nthreads - 1) / nthreads * tm;
        m0 = tid * mc;
        m1 = LIBXSMM_MIN(m0 + mc, m);
      }
      else if (1 < nthreads) {
        const int ntasks = nthreads / mtasks, mtid = tid / ntasks, ntid = tid - mtid * ntasks;
        const libxsmm_blasint nc = (((n + ntasks - 1) / ntasks + tn - 1) / tn) * tn;
        const libxsmm_blasint mc = tm;
        m0 = mtid * mc; m1 = LIBXSMM_MIN(m0 + mc, m);
        n0 = ntid * nc; n1 = LIBXSMM_MIN(n0 + nc, n);
      }
      if (0 != (1 & libxsmm_trans_jit) /* libxsmm_trans_jit: JIT'ted matrix-copy permitted? */
        && (1 == typesize || 2 == typesize || 4 == typesize) /* TODO: support multiples */
        /* avoid code-dispatch if task does not need the kernel for inner tiles */
        && tm + m0 <= (unsigned int)(m1 - m0) && tn <= (unsigned int)(n1 - n0)
        /* TODO: investigate issue with Byte-element copy/MT on pre-AVX512 */
        && (1 < typesize || LIBXSMM_X86_AVX2 < libxsmm_target_archid))
      {
        libxsmm_descriptor_blob blob;
        const libxsmm_mcopy_descriptor *const desc = libxsmm_mcopy_descriptor_init(&blob,
          typesize, tm, tn, uldo, uldi, 0 != in ? 0 : LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE,
          iprefetch, NULL/*default unroll*/);
        xmatcopy = libxsmm_dispatch_mcopy(desc);
      }
      if (0 != prefetch && 0 != *prefetch) { /* prefetch */
        LIBXSMM_XCOPY(LIBXSMM_NOOP, LIBXSMM_NOOP_ARGS, LIBXSMM_NOOP_ARGS, LIBXSMM_NOOP,
          LIBXSMM_MCOPY_KERNEL, LIBXSMM_MCOPY_CALL, xmatcopy, out, in,
          typesize, uldi, uldo, tm, tn, m0, m1, n0, n1);
      }
      else { /* no prefetch */
        LIBXSMM_XCOPY(LIBXSMM_NOOP, LIBXSMM_NOOP_ARGS, LIBXSMM_NOOP_ARGS, LIBXSMM_NOOP,
          LIBXSMM_MCOPY_KERNEL, LIBXSMM_MCOPY_CALL_NOPF, xmatcopy, out, in,
          typesize, uldi, uldo, tm, tn, m0, m1, n0, n1);
      }
    }
    else {
  /* transpose, copy and reduce work-related variables */
  const int reduce_work = BLOCKSOFM * BLOCKSIFM * handle->desc.R * handle->desc.S * handle->ifmblock;
  const int reduce_chunksize = (reduce_work % handle->desc.threads == 0) ? (reduce_work / handle->desc.threads) : (reduce_work / handle->desc.threads) + 1;
  const int reduce_thr_begin = (ltid * reduce_chunksize < reduce_work) ? (ltid * reduce_chunksize) : reduce_work;
  const int reduce_thr_end = ((ltid + 1) * reduce_chunksize < reduce_work) ? ((ltid + 1) * reduce_chunksize) : reduce_work;
  const int copywork = handle->desc.N * BLOCKSIFM;
  const int copychunksize = (copywork % handle->desc.threads == 0) ? (copywork / handle->desc.threads) : (copywork / handle->desc.threads) + 1;
  const int copy_thr_begin = (ltid * copychunksize < copywork) ? (ltid * copychunksize) : copywork;
  const int copy_thr_end = ((ltid + 1) * copychunksize < copywork) ? ((ltid + 1) * copychunksize) : copywork;

  /* Pointer related variables for output and weight */
  element_output_type *const out = ((element_output_type*)handle->grad_output->data) + (handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock;
  LIBXSMM_VLA_DECL(5, element_output_type, output, out, BLOCKSOFM, handle->ofhp, handle->ofwp, handle->ofmblock);
  element_filter_type* weight_ptr = (element_filter_type*)handle->grad_filter->data;
  element_filter_type* per_thread_weight_ptr = ((element_filter_type*)handle->scratch4) + (ltid * LIBXSMM_MIN(handle->block_upd_ofm, BLOCKSOFM) * LIBXSMM_MIN(handle->block_upd_ifm, BLOCKSIFM) * handle->desc.R * handle->desc.S * handle->ifmblock * handle->ofmblock);
  LIBXSMM_VLA_DECL(2, element_filter_type, per_thread_weight, per_thread_weight_ptr, handle->ofmblock);
  element_filter_type* reduction_weight_ptr = ((element_filter_type*)handle->scratch4) + (handle->desc.threads * LIBXSMM_MIN(handle->block_upd_ofm, BLOCKSOFM) * LIBXSMM_MIN(handle->block_upd_ifm, BLOCKSIFM) * handle->desc.R * handle->desc.S * handle->ifmblock * handle->ofmblock);
  LIBXSMM_VLA_DECL(3, element_filter_type, reduction_weight, reduction_weight_ptr, handle->desc.threads, handle->ofmblock);

  /* Pointer related variables for input */
  element_input_type *LIBXSMM_RESTRICT input_ptr;
  element_input_type *LIBXSMM_RESTRICT copy_ptr;
  element_input_type *prefetch_ptr;
  int padded_h = (handle->padding_flag == 1) ? handle->ifhp + 2 * handle->desc.pad_h : handle->ifhp;
  int padded_w = (handle->padding_flag == 1) ? handle->ifwp + 2 * handle->desc.pad_w : handle->ifwp;
  int ifwp_extended = (handle->resize_input == 1 ? (handle->ifwp_resized + handle->qfma_input_pad) : (padded_w + handle->qfma_input_pad));
  int dst_ifhp = (handle->resize_input == 1 ? handle->ifhp_resized : handle->ifhp);
  LIBXSMM_VLA_DECL(5, const element_input_type, input_nopad, (element_input_type*)handle->reg_input->data, BLOCKSIFM, handle->ifhp, handle->ifwp, handle->ifmblock);
  LIBXSMM_VLA_DECL(5, element_input_type, tr_input_padded, (element_input_type*)handle->scratch5, BLOCKSIFM, padded_h, handle->ifmblock, ifwp_extended);
LIBXSMM_INLINE LIBXSMM_RETARGETABLE libxsmm_blasint randstart(libxsmm_blasint start, libxsmm_blasint value)
{
  const libxsmm_blasint s = (start < value ? start : 0);
  const libxsmm_blasint r = LIBXSMM_MIN(s + (rand() % (value - s)) + 1, value);
  assert(0 < r && s <= r && r <= value);
  return r;
}
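/* Hedged usage sketch for randstart above: draw a random extent r with
 * s < r <= value (where s is start if start < value, else 0), e.g. to
 * randomize matrix shapes in a test. Assumes srand() was seeded by the
 * caller; the helper below is hypothetical and only for illustration. */
static void sketch_random_shape(libxsmm_blasint maxdim,
                                libxsmm_blasint* m, libxsmm_blasint* n) {
  *m = randstart(1, maxdim); /* 1 < *m <= maxdim when maxdim > 1 */
  *n = randstart(1, maxdim);
}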
int main(int argc, char* argv[]) {
  unsigned int m=8, n=8, k=8, lda=8, ldb=8, ldc=8, nerrs, num, nmat;
  unsigned int layout, asize, bsize, ntest, ncorr;
#ifdef AVX512_TESTING
  unsigned int VLEND=8, VLENS=16;
  int arch=LIBXSMM_X86_AVX512_CORE;
#else
  unsigned int VLEND=4, VLENS=8;
  int arch=LIBXSMM_X86_AVX2;
#endif
  unsigned int nmats, nmatd;
  unsigned int i, j, l, iunroll, junroll, loopi, loopj;
  char side='L', uplo='U', transa='N', transb='N', diag='N';
  unsigned int typesize8 = 8;
  unsigned int typesize4 = 4;
  float *sa, *sb, *sc, *sd, *sc1;
  double *da, *db, *dc, *dd, *dc1;
  double dalpha = 1.0;
  float salpha = (float)dalpha;
  double dbeta = 1.0;
  float sbeta = (float)dbeta;
  double dtmp;
  const unsigned char *cptr = NULL;
  unsigned long op_count;
  const libxsmm_pgemm_descriptor* desc8 = NULL;
  const libxsmm_pgemm_descriptor* desc4 = NULL;
  libxsmm_descriptor_blob blob; /* used unconditionally by libxsmm_pgemm_descriptor_init below */
#ifdef USE_XSMM_GENERATED
  libxsmm_pgemm_xfunction mykernel = NULL;
#endif
#if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__)
  void (*opcode_routine)();
  unsigned char *routine_output;
  libxsmm_generated_code io_generated_code;
  int pagesize = sysconf(_SC_PAGE_SIZE);
  if (pagesize == -1) fprintf(stderr,"sysconf pagesize\n");
  routine_output = (unsigned char *) mmap(NULL, BUFSIZE2, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
  if (mprotect(routine_output, BUFSIZE2, PROT_EXEC | PROT_READ | PROT_WRITE ) == -1) fprintf(stderr,"mprotect\n");
  printf("Routine ready\n");
  io_generated_code.generated_code = &routine_output[0];
  io_generated_code.buffer_size = BUFSIZE2;
  io_generated_code.code_size = 0;
  io_generated_code.code_type = 2;
  io_generated_code.last_error = 0;
#endif

  printf("\nUSAGE: %s m n k lda ldb ldc nmat layout ntest transa transb iunroll junroll loopj loopi\n",argv[0]);
  if ( argc <= 3 ) {
#ifdef TEST_SINGLE
    printf("Compact SGEMM: C_mxn <- C_mxn + A_mxk*B_kxn with leading dims lda/b/c\n");
    printf("This will test the jit of 1 VLEN=%d ",VLENS);
    if ( VLENS==8 ) printf("(AVX2)");
    else printf("(AVX512)");
#else
    printf("Compact DGEMM: C_mxn <- C_mxn + A_mxk*B_kxn with leading dims lda/b/c\n");
    printf("This will test the jit of 1 VLEN=%d ",VLEND);
    if ( VLEND==4 ) printf("(AVX2)");
    else printf("(AVX512)");
#endif
    printf(" work of nmat at a time\n");
    printf("Configurable: M-loop controlled by iunroll & loopi. N-loop by junroll & loopj\n");
    printf("Defaults: m=n=k=lda=ldb=ldc=nmat=8, layout=102 (col major), transa=/b='N', ntest=1\n");
  }
  if ( argc > 1 ) m = atoi(argv[1]); else m = 8;
  if ( argc > 2 ) n = atoi(argv[2]); else n = 8;
  if ( argc > 3 ) k = atoi(argv[3]); else k = 8;
  if ( argc > 4 ) lda= atoi(argv[4]); else lda = 8;
  if ( argc > 5 ) ldb= atoi(argv[5]); else ldb = 8;
  if ( argc > 6 ) ldc= atoi(argv[6]); else ldc = 8;
  if ( argc > 7 ) nmat = atoi(argv[7]); else nmat = 8;
  if ( argc > 8 ) layout = atoi(argv[8]); else layout=102;
  if ( argc > 9 ) ntest = atoi(argv[9]); else ntest = 1;
  if ( argc > 10 ) transa = argv[10][0]; else transa = 'N';
  if ( argc > 11 ) transb = argv[11][0]; else transb = 'N';
  if ( argc > 12 ) iunroll=atoi(argv[12]); else iunroll=0;
  if ( argc > 13 ) junroll=atoi(argv[13]); else junroll=0;
  if ( argc > 14 ) loopj=atoi(argv[14]); else loopj=0;
  if ( argc > 15 ) loopi=atoi(argv[15]); else loopi=0;

  salpha = (float)dalpha;
  m = LIBXSMM_MAX(m,1);
  n = LIBXSMM_MAX(n,1);
  k = LIBXSMM_MAX(k,1);
  ntest = LIBXSMM_MAX(ntest,1);
  nmat = LIBXSMM_MAX(nmat,VLEND);
  layout = LIBXSMM_MAX(LIBXSMM_MIN(layout,102),101);
  if ( transa!='N' && transa!='n' && transa!='T' && transa!='t' ) transa='N';
  if ( transb!='N' && transb!='n' && transb!='T' && transb!='t' ) transb='N';
  lda = LIBXSMM_MAX(lda,m);
  ldb = LIBXSMM_MAX(ldb,k);
  ldc = LIBXSMM_MAX(ldc,m);
  nmats = LIBXSMM_MAX(VLENS,nmat - (nmat%VLENS));
  nmatd = LIBXSMM_MAX(VLEND,nmat - (nmat%VLEND));
#ifdef TEST_SINGLE
  nmat = nmats;
#else
  nmat = nmatd;
#endif
  op_count = (unsigned long)(nmat * 2.0 * (double)m * (double)n * (double)k);

#ifdef TEST_SINGLE
  printf("This is a real*%d tester for JIT compact SGEMM %c%c kernels! (m=%u n=%u k=%u lda=%u ldb=%u ldc=%u layout=%d nmat=%d alpha=%g beta=%g iun=%d jun=%d loopi=%d loopj=%d VLEN=%d)\n",typesize4,transa,transb,m,n,k,lda,ldb,ldc,layout,nmat,dalpha,dbeta,iunroll,junroll,loopi,loopj,VLENS);
#else
  printf("This is a real*%d tester for JIT compact DGEMM %c%c kernels! (m=%u n=%u k=%u lda=%u ldb=%u ldc=%u layout=%d nmat=%d alpha=%g beta=%g iun=%d jun=%d loopi=%d loopj=%d VLEN=%d)\n",typesize8,transa,transb,m,n,k,lda,ldb,ldc,layout,nmat,dalpha,dbeta,iunroll,junroll,loopi,loopj,VLEND);
#endif
#ifdef USE_XSMM_GENERATED
  printf("This code tests the LIBXSMM generated kernels\n");
#endif
#ifdef USE_PREDEFINED_ASSEMBLY
  printf("This code tests some predefined assembly kernel\n");
#endif
#if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__)
  printf("This code tests kernel generation directly\n");
#endif
#ifdef TIME_MKL
  printf("This code tests MKL compact batch directly\n");
#endif
#ifdef AVX512_TESTING
  printf("This tests AVX512 binaries\n");
#endif
#ifdef AVX2_TESTING
  printf("This tests AVX2 binaries\n");
#endif

  desc8 = libxsmm_pgemm_descriptor_init(&blob, typesize8, m, n, k, lda, ldb, ldc, &dalpha, transa, transb, layout );
#ifdef TEST_SINGLE
  desc4 = libxsmm_pgemm_descriptor_init(&blob, typesize4, m, n, k, lda, ldb, ldc, &dalpha, transa, transb, layout );
#endif
  printf("Descriptor set\n");

#ifdef USE_XSMM_GENERATED
  printf("calling libxsmm_dispatch_pgemm: typesize8=%u\n",typesize8);
  mykernel = libxsmm_dispatch_pgemm(desc8);
  printf("done calling libxsmm_dispatch_pgemm: typesize8=%u\n",typesize8);
  if ( mykernel == NULL ) printf("R8 Kernel after the create call is null\n");
#ifdef TEST_SINGLE
  mykernel = libxsmm_dispatch_pgemm(desc4);
  if ( mykernel == NULL ) printf("R4 kernel after the create call is null\n");
#endif
#endif
#if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__)
  libxsmm_generator_pgemm_kernel( &io_generated_code, desc8, arch, iunroll, junroll, loopi, loopj );
#endif

#ifndef NO_ACCURACY_CHECK
  printf("mallocing matrices\n");
#endif
  sa  = (float  *) malloc ( lda*k*nmat*sizeof(float) );
  da  = (double *) malloc ( lda*k*nmat*sizeof(double) );
  sb  = (float  *) malloc ( ldb*n*nmat*sizeof(float) );
  db  = (double *) malloc ( ldb*n*nmat*sizeof(double) );
  sc1 = (float  *) malloc ( ldc*n*nmat*sizeof(float) );
  dc1 = (double *) malloc ( ldc*n*nmat*sizeof(double) );
  sc  = (float  *) malloc ( ldc*n*nmat*sizeof(float) );
  dc  = (double *) malloc ( ldc*n*nmat*sizeof(double) );
  sd  = (float  *) malloc ( ldc*n*nmat*sizeof(float) );
  dd  = (double *) malloc ( ldc*n*nmat*sizeof(double) );

#ifndef NO_ACCURACY_CHECK
  printf("filling matrices\n");
#endif
  sfill_matrix ( sa, lda, m, k*nmat );
  sfill_matrix ( sb, ldb, k, n*nmat );
  sfill_matrix ( sc, ldc, m, n*nmat );
  dfill_matrix ( da, lda, m, k*nmat );
  dfill_matrix ( db, ldb, k, n*nmat );
  dfill_matrix ( dc, ldc, m, n*nmat );

#ifndef NO_ACCURACY_CHECK
  for ( i = 0 ; i < ldc*n*nmat ; i++ ) sd[i]=sc[i];
  for ( i = 0 ; i < ldc*n*nmat ; i++ ) dd[i]=dc[i];
  for ( i = 0 ; i < ldc*n*nmat ; i++ ) sc1[i]=sc[i];
  for ( i = 0 ; i < ldc*n*nmat ; i++ ) dc1[i]=dc[i];
  printf("Pointing at the kernel now\n");
#endif

#ifdef USE_XSMM_GENERATED
  cptr = (const unsigned char*) mykernel;
#endif
#ifdef USE_PREDEFINED_ASSEMBLY
  cptr = (const unsigned char*) gemm_;
#endif
#if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__)
  cptr = (const unsigned char*) &routine_output[0];
  opcode_routine = (void *) &cptr[0];
#endif

#ifndef TIME_MKL
# define DUMP_ASSEMBLY_FILE
#endif
#ifdef DUMP_ASSEMBLY_FILE
  printf("Dumping assembly file\n");
  FILE *fp = fopen("foo.s","w");
  char buffer[80];
  fputs("\t.text\n",fp);
  fputs("\t.align 256\n",fp);
  fputs("\t.globl gemm_\n",fp);
  fputs("gemm_:\n",fp);
  for (i = 0 ; i < 7000; i+=4 ) {
    sprintf(buffer,".byte 0x%02x, 0x%02x, 0x%02x, 0x%02x\n",cptr[i],cptr[i+1],cptr[i+2],cptr[i+3]);
    fputs(buffer,fp);
  }
  fputs("\tretq\n",fp);
  fputs("\t.type gemm_,@function\n",fp);
  fputs("\t.size gemm_,.-gemm_\n",fp);
  fclose(fp);
#endif

#if defined(USE_MKL_FOR_REFERENCE) || defined(TIME_MKL)
# include <mkl.h>
  MKL_LAYOUT CLAYOUT = (layout == 101) ? MKL_ROW_MAJOR : MKL_COL_MAJOR;
  MKL_SIDE SIDE = (side == 'R' || side == 'r') ? MKL_RIGHT : MKL_LEFT;
  MKL_UPLO UPLO = (uplo == 'U' || uplo == 'u') ? MKL_UPPER : MKL_LOWER;
  MKL_TRANSPOSE TRANSA = (transa == 'N' || transa == 'n') ? MKL_NOTRANS : MKL_TRANS;
  MKL_TRANSPOSE TRANSB = (transb == 'N' || transb == 'n') ? MKL_NOTRANS : MKL_TRANS;
  MKL_DIAG DIAG = (diag == 'N' || diag == 'n') ? MKL_NONUNIT : MKL_UNIT;
  MKL_COMPACT_PACK CMP_FORMAT = mkl_get_format_compact();
#if 0
  MKL_COMPACT_PACK CMP_FORMAT = MKL_COMPACT_AVX;
#endif
#endif

#ifndef NO_ACCURACY_CHECK
  printf("Before routine, initial A(1,1)=%g A[256]=%g\n",da[0],da[256]);
#endif
#ifdef USE_PREDEFINED_ASSEMBLY
  double one = 1.0;
#endif
  double timer, firsttime = 0;
#ifdef MKL_TIMER
  double tmptimer;
  tmptimer = dsecnd_();
#else
  unsigned long long l_start, l_end;
#endif

  timer = 0.0;
  for ( j = 0 ; j < (int)ntest ; j++ ) {
    for ( i = 0 ; i < ldc*n*nmat ; i++ ) dc[i]=dc1[i];
    for ( i = 0 , num = 0; i < (int)nmat ; i+= (int)VLEND, num++ ) {
      double *Ap = &da[num*lda*k*VLEND];
      double *Bp = &db[num*ldb*n*VLEND];
      double *Cp = &dc[num*ldc*n*VLEND];
#ifdef MKL_TIMER
      tmptimer = dsecnd_();
#else
      l_start = libxsmm_timer_tick();
#endif
#if !defined(USE_XSMM_GENERATED) && !defined(USE_PREDEFINED_ASSEMBLY) && !defined(USE_KERNEL_GENERATION_DIRECTLY) && !defined(TIME_MKL) && !defined(USE_PREDEFINED_ASSEMBLY_XCT)
      gen_compact_dgemm_ ( &layout, &m, &n, &k, &dalpha, Ap, &lda, Bp, &ldb, &dbeta, Cp, &ldc, &VLEND );
#endif
#ifdef USE_XSMM_GENERATED
      mykernel ( Ap, Bp, Cp );
#endif
#ifdef USE_PREDEFINED_ASSEMBLY
      gemm_ ( Ap, Bp, Cp );
#endif
#ifdef USE_KERNEL_GENERATION_DIRECTLY
      (*opcode_routine)( Ap, Bp, Cp );
#endif
#ifdef TIME_MKL
      mkl_dgemm_compact ( CLAYOUT, TRANSA, TRANSB, m, n, k, dalpha, da, lda, db, ldb, dbeta, dc, ldc, CMP_FORMAT, nmat );
      i+=nmatd; /* because MKL will do everything */
#endif
#ifdef MKL_TIMER
      dtmp = dsecnd_() - tmptimer;
#else
      l_end = libxsmm_timer_tick();
      dtmp = libxsmm_timer_duration(l_start,l_end);
#endif
      if ( j == 0 ) firsttime=dtmp;
      timer += dtmp;
    }
  }
  if ( ntest >= 100 ) {
    /* skip the first timing: essential when using MKL */
    timer = (timer-firsttime)/((double)(ntest-1));
  } else {
    timer /= ((double)ntest);
  }

#ifndef NO_ACCURACY_CHECK
  printf("Average time to get through %u matrices: %g\n",nmat,timer);
  printf("Gflops: %g\n",(double)op_count/(timer*1.0e9));
  printf("after routine, new C(1,1)=%g C[256]=%g\n",dc[0],dc[256]);
#endif

#ifdef TEST_SINGLE
  printf("Before r4 routine, initial C(1,1)=%g C[256]=%g\n",sc[0],sc[256]);
  for ( i = 0 , num = 0; i < nmats ; i+= VLENS, num++ ) {
    float *Ap = &sa[num*lda*k*VLENS];
    float *Bp = &sb[num*ldb*n*VLENS];
    float *Cp = &sc[num*ldc*n*VLENS];
#ifdef USE_XSMM_GENERATED
    mykernel ( Ap, Bp, Cp );
#endif
  }
  printf("after r4 routine, new C(1,1)=%g C[256]=%g\n",sc[0],sc[256]);
#endif

#ifndef NO_ACCURACY_CHECK
  /* Call some reference code now on a copy of the C matrix (D) */
  double timer2 = 0.0;
  for ( j = 0 ; j < (int)ntest ; j++ ) {
    for ( i = 0 ; i < ldc*n*nmat ; i++ ) dd[i]=dc1[i];
#ifdef MKL_TIMER
    tmptimer = dsecnd_();
#else
    l_start = libxsmm_timer_tick();
#endif
#ifndef USE_MKL_FOR_REFERENCE
    compact_dgemm_ ( &layout, &transa, &transb, &m, &n, &k, &dalpha, da, &lda, db, &ldb, &dbeta, dd, &ldc, &nmat, &VLEND );
#else
    mkl_dgemm_compact ( CLAYOUT, TRANSA, TRANSB, m, n, k, dalpha, da, lda, db, ldb, dbeta, dd, ldc, CMP_FORMAT, nmat );
#endif
#ifdef MKL_TIMER
    timer2 += dsecnd_() - tmptimer;
#else
    l_end = libxsmm_timer_tick();
    timer2 += libxsmm_timer_duration(l_start,l_end);
#endif
  }
  timer2 /= ((double)ntest);
  printf("Reference time=%g Reference Gflops=%g\n",timer2,op_count/(timer2*1.0e9));

  /* Compute the residual between C and the reference D */
  dtmp = residual_d ( dc, ldc, m, n*nmat, dd, ldc, &nerrs, &ncorr );
  printf("R8 mnk=%u %u %u ldabc=%u %u %u error: %g number of errors: %u corrects: %u",m,n,k,lda,ldb,ldc,dtmp,nerrs,ncorr);
  if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*8 %u case",m,n,layout);
  printf("\n");

#ifdef TEST_SINGLE
  /* Call some reference code now on a copy of the C matrix (D) */
  compact_dgemm_ ( &layout, &transa, &transb, &m, &n, &k, &salpha, sa, &lda, sb, &ldb, &sbeta, sd, &ldc, &nmat, &VLENS );
  /* Compute the residual between C and D */
  dtmp = residual_s ( sc, ldc, m, n*nmat, sd, ldc, &nerrs, &ncorr );
  printf("R4 mnk=%u %u %u ldabc=%u %u %u error: %g number of errors: %u corrects: %u",m,n,k,lda,ldb,ldc,dtmp,nerrs,ncorr);
  if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*4 case",m,n);
  printf("\n");
#endif
#else
  for ( j = 0, nerrs = 0 ; j < ldc*n*nmat; j++ ) {
    if ( isnan(dc[j]) || isinf(dc[j]) ) {
      if ( ++nerrs < 10 ) {
        printf("WARNING: dc[%u]=%g\n",j,dc[j]);
      }
    }
  }
  printf("%g,real*8 m/n/k=%u %u %u lda-c=%u %u %u NaN/Inf=%u Time=%g Gflops=%g",op_count/(timer*1.0e9),m,n,k,lda,ldb,ldc,nerrs,timer,op_count/(timer*1.0e9));
  if ( nerrs > 0 ) printf(" -> FAILED at %ux%u real*8 case",m,n);
  printf("\n");
#endif

  free(dd); free(sd);
  free(dc); free(sc);
  free(dc1); free(sc1);
  free(db); free(sb);
  free(da); free(sa);
  return 0;
}
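/* The tester above rounds the matrix count down to a multiple of the SIMD
 * vector length (VLEN), with VLEN itself as the floor, since compact kernels
 * process VLEN interleaved matrices per call. A minimal sketch of that
 * rounding (hypothetical helper, mirroring the nmats/nmatd computation): */
static unsigned int sketch_round_nmat(unsigned int nmat, unsigned int vlen) {
  const unsigned int rounded = nmat - (nmat % vlen); /* round down to multiple */
  return (rounded < vlen ? vlen : rounded);          /* at least one batch */
}
/* e.g., sketch_round_nmat(10, 8) == 8 and sketch_round_nmat(5, 8) == 8,
 * matching nmats/nmatd as computed in the tester. */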