libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_f32_f32(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid)
{
  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/
  typedef float element_input_type;
  typedef float element_output_type;
  typedef float element_filter_type;
  element_input_type alpha = (element_input_type)1;
  element_input_type beta = (element_input_type)0;
  libxsmm_blasint lda = (libxsmm_blasint)handle->bk;
  libxsmm_blasint ldb = (libxsmm_blasint)handle->bc;
  libxsmm_blasint ldc = (libxsmm_blasint)handle->bk;

  if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) {
    libxsmm_smmfunction_reducebatch batchreduce_kernel = libxsmm_smmdispatch_reducebatch(handle->bk, handle->bn, handle->bc, &lda, &ldb, &ldc, &alpha, &beta, NULL, NULL);
# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c"
  } else {
    status = LIBXSMM_DNN_ERR_FUSEBN_UNSUPPORTED_FUSION;
  }
#else /* should not happen */
  LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid);
  status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH;
#endif
  return status;
}
libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_custom_bf16_f32(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid)
{
  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/
  typedef libxsmm_bfloat16 element_input_type;
  typedef float element_output_type;
  typedef libxsmm_bfloat16 element_filter_type;
  typedef libxsmm_smmfunction gemm_function;
  libxsmm_blasint lda = (libxsmm_blasint)handle->ofmblock;
  libxsmm_blasint ldb = (libxsmm_blasint)handle->desc.C;
  libxsmm_blasint ldc = (libxsmm_blasint)handle->desc.K;
  float alpha = (element_input_type)1;
  float beta = (element_input_type)0;

  if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) {
    gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ofmblock, handle->desc.N, handle->desc.C, &lda, &ldb, &ldc, &alpha, &beta, NULL, NULL);
# define LIBXSMM_DNN_FULLYCONNECTED_FWD_BF16_F32
# include "template/libxsmm_dnn_fullyconnected_st_fwd_custom_generic.tpl.c"
# undef LIBXSMM_DNN_FULLYCONNECTED_FWD_BF16_F32
  } else {
    status = LIBXSMM_DNN_ERR_FUSEBN_UNSUPPORTED_FUSION;
  }
#else /* should not happen */
  LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid);
  status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH;
#endif
  return status;
}
libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_bf16_bf16(libxsmm_dnn_pooling* handle, int start_thread, int tid)
{
  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/
  typedef libxsmm_bfloat16 element_input_type;
  typedef libxsmm_bfloat16 element_output_type;
# define LIBXSMM_DNN_POOLING_BWD_BF16
  if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) {
# define LIBXSMM_DNN_POOLING_BWD_MAX
    typedef int element_mask_type;
# include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c16_avx512.tpl.c"
# undef LIBXSMM_DNN_POOLING_BWD_MAX
  } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) {
# define LIBXSMM_DNN_POOLING_BWD_AVG
# include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c16_avx512.tpl.c"
# undef LIBXSMM_DNN_POOLING_BWD_AVG
  } else {
    status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING;
  }
# undef LIBXSMM_DNN_POOLING_BWD_BF16
#else /* should not happen */
  LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid);
  status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH;
#endif
  return status;
}
LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_nhwc(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid)
{
  libxsmm_dnn_err_t status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED;
  LIBXSMM_UNUSED( handle ); LIBXSMM_UNUSED( start_thread ); LIBXSMM_UNUSED( tid );
  return status;
}
LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_nhwc(libxsmm_dnn_pooling* handle, int start_thread, int tid)
{
  libxsmm_dnn_err_t status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED;
  LIBXSMM_UNUSED( handle ); LIBXSMM_UNUSED( start_thread ); LIBXSMM_UNUSED( tid );
  return status;
}
libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_i16_i32(libxsmm_dnn_layer* handle, int start_thread, int tid)
{
  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/
  typedef short element_input_type;
  typedef int element_output_type;
  typedef short element_filter_type;
  typedef libxsmm_wconvfunction libxsmm_convfunction;
# include "template/libxsmm_dnn_convolve_st_fwd_custom_custom.tpl.c"
#else /* should not happen */
  LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid);
  status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH;
#endif
  return status;
}
LIBXSMM_INTERNAL_API_DEFINITION void libxsmm_generator_gemm_header_kloop( libxsmm_generated_code* io_generated_code,
    libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping,
    const libxsmm_micro_kernel_config* i_micro_kernel_config, const unsigned int i_m_blocking, const unsigned int i_k_blocking )
{
  LIBXSMM_UNUSED(i_m_blocking);
  libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_kloop, 0);
  libxsmm_x86_instruction_register_jump_label( io_generated_code, io_loop_label_tracker );
  libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_kloop, i_k_blocking);
}
LIBXSMM_API void libxsmm_trace(FILE* stream, unsigned int depth, const int* filter_threadid, const int* filter_mindepth, const int* filter_maxnsyms)
{
#if defined(LIBXSMM_TRACE)
  unsigned int depth1 = depth + 1, threadid;
  const char *const name = libxsmm_trace_info(&depth1, &threadid, filter_threadid, filter_mindepth, filter_maxnsyms);
  if (name && *name) { /* implies that the other results are valid as well */
    const int depth0 = LIBXSMM_MAX(0 != filter_mindepth ? *filter_mindepth : internal_trace_mindepth, 0);
    assert(0 != stream/*otherwise fprintf handles the error*/);
    if ((0 == filter_threadid && 0 > internal_trace_threadid) || (0 != filter_threadid && 0 > *filter_threadid)) {
      fprintf(stream, "%*s%s@%u\n", (int)(depth1 - depth0), "", name, threadid);
    }
    else {
      fprintf(stream, "%*s%s\n", (int)(depth1 - depth0), "", name);
    }
  }
#else /* suppress warning */
  LIBXSMM_UNUSED(stream); LIBXSMM_UNUSED(depth);
  LIBXSMM_UNUSED(filter_threadid); LIBXSMM_UNUSED(filter_mindepth); LIBXSMM_UNUSED(filter_maxnsyms);
#endif
}
LIBXSMM_API int libxsmm_trace_init(int filter_threadid, int filter_mindepth, int filter_maxnsyms)
{
  int result = EXIT_SUCCESS;
  internal_trace_initialized = -1; /* disabled */
#if defined(LIBXSMM_TRACE)
# if defined(_WIN32) || defined(__CYGWIN__)
  SymSetOptions(SYMOPT_DEFERRED_LOADS | SYMOPT_UNDNAME);
  result = (FALSE != SymInitialize(GetCurrentProcess(), NULL, TRUE) ? EXIT_SUCCESS : GetLastError());
# elif !defined(LIBXSMM_NO_SYNC)
  result = pthread_key_create(&internal_trace_key, internal_delete);
# endif
  if (EXIT_SUCCESS == result) {
    internal_trace_threadid = filter_threadid;
    internal_trace_maxnsyms = filter_maxnsyms;
    internal_trace_mindepth = filter_mindepth;
    internal_trace_initialized = 0; /* enabled */
  }
#else
  LIBXSMM_UNUSED(filter_threadid); LIBXSMM_UNUSED(filter_mindepth); LIBXSMM_UNUSED(filter_maxnsyms);
#endif
  return result;
}
LIBXSMM_API void __cyg_profile_func_enter(void* this_fn, void* call_site)
{
#if defined(LIBXSMM_TRACE)
# if !defined(LIBXSMM_TRACE_DLINFO)
  LIBXSMM_UNUSED(this_fn); LIBXSMM_UNUSED(call_site); /* suppress warning */
  libxsmm_trace(stderr, 2/*no need for parent (0) but parent of parent (1)*/,
    /* inherit global settings from libxsmm_trace_init */
    NULL, NULL, NULL);
# else
  if (0 <= internal_trace_initialized && 0 != internal_trace_maxnsyms) {
# if 1
    Dl_info info;
# else
    struct {
      const char* dli_fname;  /* pathname of the shared object containing the address */
      void* dli_fbase;        /* base address at which the shared object is loaded */
      const char* dli_sname;  /* name of the nearest symbol with an address lower than the given address */
      void* dli_saddr;        /* exact address of that symbol */
    } info;
# endif
    if (0 != dladdr(this_fn, (Dl_info*)&info)) {
      if (0 != info.dli_sname) {
        fprintf(stderr, "%s\n", info.dli_sname);
      }
      else if (0 != info.dli_saddr) {
        fprintf(stderr, "0x%llx\n", (unsigned long long)info.dli_saddr);
      }
    }
  }
# endif
#else
  LIBXSMM_UNUSED(this_fn); LIBXSMM_UNUSED(call_site); /* suppress warning */
#endif
}
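/* Illustration (not part of the library): how the instrumentation hook above is typically
 * exercised. With GCC or Clang, building the application with -finstrument-functions makes
 * the compiler emit calls to __cyg_profile_func_enter/__cyg_profile_func_exit around every
 * function, which then print the resolved symbol names. The prototypes below merely mirror
 * the definitions shown above; the file name and build line are hypothetical, e.g.
 *   cc -g -finstrument-functions trace_demo.c <libxsmm trace objects> -o trace_demo */
#include <stdio.h>

int libxsmm_trace_init(int filter_threadid, int filter_mindepth, int filter_maxnsyms);
void libxsmm_trace(FILE* stream, unsigned int depth,
  const int* filter_threadid, const int* filter_mindepth, const int* filter_maxnsyms);

static void traced_work(void)
{ /* entering this function triggers __cyg_profile_func_enter when instrumented */
  libxsmm_trace(stderr, 0/*report this frame*/, NULL/*inherit global settings*/, NULL, NULL);
}

int main(void)
{
  /* negative thread filter: trace all threads; negative maxnsyms: no symbol limit */
  libxsmm_trace_init(-1, 0, -1);
  traced_work();
  return 0;
}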
LIBXSMM_INTERNAL_API_DEFINITION void libxsmm_generator_gemm_footer_kloop( libxsmm_generated_code* io_generated_code,
    libxsmm_loop_label_tracker* io_loop_label_tracker, const libxsmm_gp_reg_mapping* i_gp_reg_mapping,
    const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc,
    const unsigned int i_m_blocking, const unsigned int i_max_blocked_k, const unsigned int i_kloop_complete )
{
  LIBXSMM_UNUSED(i_m_blocking);
  libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_cmp_instruction, i_gp_reg_mapping->gp_reg_kloop, i_max_blocked_k );
  libxsmm_x86_instruction_jump_back_to_label( io_generated_code, i_micro_kernel_config->alu_jmp_instruction, io_loop_label_tracker );
  if ( i_kloop_complete != 0 ) {
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction, i_gp_reg_mapping->gp_reg_b, (i_xgemm_desc->k)*(i_micro_kernel_config->datatype_size) );
  }
}
void libxsmm_generator_dense_noarch_kernel( libxsmm_generated_code* io_generated_code,
    const libxsmm_xgemm_descriptor* i_xgemm_desc, const char* i_arch )
{
  char l_new_code[512];
  int l_max_code_length = 511;
  int l_code_length = 0;

  LIBXSMM_UNUSED(i_arch);

  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_m = 0;\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_n = 0;\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_k = 0;\n\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_n = 0; l_n < %u; l_n++ ) {\n", i_xgemm_desc->n);
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

  if ( i_xgemm_desc->beta == 0 ) {
    if ( (LIBXSMM_XGEMM_FLAG_F32PREC & i_xgemm_desc->flags) == 0 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++ ) { C[(l_n*%u)+l_m] = 0.0; }\n\n", i_xgemm_desc->m, i_xgemm_desc->ldc);
    } else {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++ ) { C[(l_n*%u)+l_m] = 0.0f; }\n\n", i_xgemm_desc->m, i_xgemm_desc->ldc);
    }
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  }

  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_k = 0; l_k < %u; l_k++ ) {\n", i_xgemm_desc->k);
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++ ) {\n", i_xgemm_desc->m);
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " C[(l_n*%u)+l_m] += A[(l_k*%u)+l_m] * B[(l_n*%u)+l_k];\n", i_xgemm_desc->ldc, i_xgemm_desc->lda, i_xgemm_desc->ldb);
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " }\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " }\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " }\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
}
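/* Illustration (not emitted verbatim): roughly the plain-C kernel produced by
 * libxsmm_generator_dense_noarch_kernel for a hypothetical single-precision case with
 * m=8, n=4, k=8, lda=ldb=ldc=8 and beta=0. The generator emits only the loop nest; the
 * wrapping signature is added here for readability. */
void sketch_noarch_smm_8x4x8(const float* A, const float* B, float* C)
{
  unsigned int l_m = 0;
  unsigned int l_n = 0;
  unsigned int l_k = 0;

  for ( l_n = 0; l_n < 4; l_n++ ) {
    /* beta == 0: reset the current column of C */
    for ( l_m = 0; l_m < 8; l_m++ ) { C[(l_n*8)+l_m] = 0.0f; }

    for ( l_k = 0; l_k < 8; l_k++ ) {
#pragma simd
      for ( l_m = 0; l_m < 8; l_m++ ) {
        C[(l_n*8)+l_m] += A[(l_k*8)+l_m] * B[(l_n*8)+l_k];
      }
    }
  }
}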
LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csc_bsparse_soa_avx256_512( libxsmm_generated_code* io_generated_code,
    const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch,
    const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values )
{
  unsigned int l_n = 0;
  unsigned int l_k = 0;
  unsigned int l_soa_width = 0;
  unsigned int l_max_cols = 0;
  unsigned int l_n_processed = 0;
  unsigned int l_n_limit = 0;
  unsigned int l_n_chunks = 0;
  unsigned int l_n_chunksize = 0;
  unsigned int l_found_mul = 0;
  unsigned int l_max_reg_block = 0;
  libxsmm_micro_kernel_config l_micro_kernel_config;
  libxsmm_loop_label_tracker l_loop_label_tracker;
  libxsmm_gp_reg_mapping l_gp_reg_mapping;

  LIBXSMM_UNUSED(i_values);

  /* select soa width */
  if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) {
    if ( strcmp(i_arch, "knl") == 0 || strcmp(i_arch, "knm") == 0 || strcmp(i_arch, "skx") == 0 || strcmp(i_arch, "clx") == 0 || strcmp(i_arch, "cpx") == 0 ) {
      l_soa_width = 8;
      l_max_reg_block = 28;
    } else {
      l_soa_width = 4;
      l_max_reg_block = 14;
    }
  } else {
    if ( strcmp(i_arch, "knl") == 0 || strcmp(i_arch, "knm") == 0 || strcmp(i_arch, "skx") == 0 || strcmp(i_arch, "clx") == 0 || strcmp(i_arch, "cpx") == 0 ) {
      l_soa_width = 16;
      l_max_reg_block = 28;
    } else {
      l_soa_width = 8;
      l_max_reg_block = 14;
    }
  }

  /* define gp register mapping */
  libxsmm_reset_x86_gp_reg_mapping( &l_gp_reg_mapping );
  /* match the calling conventions on Windows and Linux */
#if defined(_WIN32) || defined(__CYGWIN__)
  l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RCX;
  l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RDX;
  l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_R8;
  /* TODO: full support for Windows calling convention */
  l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_RDI;
  l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_RSI;
#else /* match calling convention on Linux */
  l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RDI;
  l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RSI;
  l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_RDX;
  l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_RCX;
  l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_R8;
#endif
  l_gp_reg_mapping.gp_reg_c_prefetch = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_mloop = LIBXSMM_X86_GP_REG_R12;
  l_gp_reg_mapping.gp_reg_nloop = LIBXSMM_X86_GP_REG_R13;
  l_gp_reg_mapping.gp_reg_kloop = LIBXSMM_X86_GP_REG_R14;
  l_gp_reg_mapping.gp_reg_help_0 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_1 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_2 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_3 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_4 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_5 = LIBXSMM_X86_GP_REG_UNDEF;

  /* define loop_label_tracker */
  libxsmm_reset_loop_label_tracker( &l_loop_label_tracker );

  /* define the micro kernel code gen properties */
  libxsmm_generator_gemm_init_micro_kernel_config_fullvector( &l_micro_kernel_config, i_xgemm_desc, i_arch, 0 );

  /* get max column in C */
  l_max_cols = i_xgemm_desc->n;
  for ( l_n = 0; l_n < i_xgemm_desc->n; l_n++ ) {
    if ( i_column_idx[l_n] == i_column_idx[i_xgemm_desc->n] ) {
      l_max_cols = l_n+1;
    }
  }

  /* calculate the chunk size of current columns to work on */
  l_n_chunks = ( (l_max_cols % l_max_reg_block) == 0 ) ? (l_max_cols / l_max_reg_block) : (l_max_cols / l_max_reg_block) + 1;
  assert(0 != l_n_chunks); /* mute static analysis (division-by-zero); such invalid input must be caught upfront */
  l_n_chunksize = ( (l_max_cols % l_n_chunks) == 0 ) ? (l_max_cols / l_n_chunks) : (l_max_cols / l_n_chunks) + 1;

  /* open asm */
  libxsmm_x86_instruction_open_stream( io_generated_code, &l_gp_reg_mapping, i_arch, i_xgemm_desc->prefetch );

  /* m loop */
  libxsmm_x86_instruction_register_jump_back_label( io_generated_code, &l_loop_label_tracker );
  libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_mloop, 1 );

  /* loop over n-blocks */
  l_n_processed = 0;
  l_n_limit = l_n_chunksize;
  while ( l_n_processed < l_max_cols ) {
#if 0
    printf("l_max_cols: %i, l_n_processed: %i, l_n_limit: %i\n", l_max_cols, l_n_processed, l_n_limit);
#endif
    /* load C accumulator */
    for ( l_n = 0; l_n < l_n_limit - l_n_processed; l_n++ ) {
      if (0 != (LIBXSMM_GEMM_FLAG_BETA_0 & i_xgemm_desc->flags)) { /* Beta=0 */
        libxsmm_x86_instruction_vec_compute_reg( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.vxor_instruction, l_micro_kernel_config.vector_name, l_n, l_n, l_n );
      } else {
        libxsmm_x86_instruction_vec_move( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.c_vmove_instruction, l_gp_reg_mapping.gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, (l_n_processed + l_n)*l_soa_width*l_micro_kernel_config.datatype_size, l_micro_kernel_config.vector_name, l_n, 0, 1, 0 );
      }
    }

    /* do dense soa times sparse multiplication */
    for ( l_k = 0; l_k < (unsigned int)i_xgemm_desc->k; l_k++ ) {
      unsigned int l_found_qmadd = 0;
      unsigned int l_col_k = 0;
      unsigned int l_column_active[28];
      int l_nnz_idx[28][4];

      /* reset helpers */
      for ( l_n = 0; l_n < l_n_limit - l_n_processed; l_n++ ) {
        l_column_active[l_n] = 0;
        l_nnz_idx[l_n][0] = -1; l_nnz_idx[l_n][1] = -1; l_nnz_idx[l_n][2] = -1; l_nnz_idx[l_n][3] = -1;
      }
      l_found_mul = 0;

      /* let's figure out if we can apply qmadd when being in an F32 setting and on KNM */
      if ( (l_k < ((unsigned int)i_xgemm_desc->k - 3)) && (l_micro_kernel_config.instruction_set == LIBXSMM_X86_AVX512_KNM)
           && (LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) ) {
        /* loop over the columns of B/C */
        for ( l_n = 0; l_n < l_n_limit - l_n_processed; l_n++ ) {
          unsigned int l_found = 0;
          unsigned int l_acol_k = 0;
          unsigned int l_col_elements = i_column_idx[l_n_processed+l_n+1] - i_column_idx[l_n_processed+l_n];
          unsigned int l_cur_column = i_column_idx[l_n_processed+l_n];

          for ( l_col_k = 0; l_col_k < l_col_elements; l_col_k++ ) {
            for ( l_acol_k = l_found; l_acol_k < 4; l_acol_k++ ) {
              if ( (l_k + l_acol_k) == i_row_idx[l_cur_column + l_col_k] ) {
                l_nnz_idx[l_n][l_acol_k] = l_cur_column + l_col_k;
                l_found = l_acol_k+1;
              }
              if (l_found == 4) {
                l_col_k = l_col_elements;
              }
            }
          }

          /* let's check if we can apply qmadd in col l_n */
          if ( (l_nnz_idx[l_n][0] != -1) && (l_nnz_idx[l_n][1] != -1) && (l_nnz_idx[l_n][2] != -1) && (l_nnz_idx[l_n][3] != -1) ) {
            l_column_active[l_n] = 2;
            l_found_qmadd = 1;
            l_found_mul = 1;
          } else {
            /* let's check if we have at least one entry in the column that matches one of the four entries */
            if ( (l_nnz_idx[l_n][0] != -1) || (l_nnz_idx[l_n][1] != -1) || (l_nnz_idx[l_n][2] != -1) || (l_nnz_idx[l_n][3] != -1) ) {
              l_column_active[l_n] = 1;
              l_found_mul = 1;
            } else {
              l_column_active[l_n] = 0;
            }
          }
        }
      }

      if ( l_found_qmadd == 0 ) {
        /* loop over the columns of B/C */
        for ( l_n = 0; l_n < l_n_limit - l_n_processed; l_n++ ) {
          unsigned int l_col_elements = i_column_idx[l_n_processed+l_n+1] - i_column_idx[l_n_processed+l_n];
          unsigned int l_cur_column = i_column_idx[l_n_processed+l_n];
          /* search for entries matching that k */
          for ( l_col_k = 0; l_col_k < l_col_elements; l_col_k++ ) {
            if ( l_k == i_row_idx[l_cur_column + l_col_k] ) {
              l_nnz_idx[l_n][0] = l_cur_column + l_col_k;
              l_col_k = l_col_elements;
            }
          }
          /* let's check if we have an entry in the column that matches the k from A */
          if ( (l_nnz_idx[l_n][0] != -1) ) {
            l_column_active[l_n] = 1;
            l_found_mul = 1;
          } else {
            l_column_active[l_n] = 0;
          }
        }
      }

      /* First case: we can use qmadd */
      if ( l_found_qmadd != 0 ) {
        unsigned int l_lcl_k = 0;
        for ( l_lcl_k = 0; l_lcl_k < 4; l_lcl_k++ ) {
          libxsmm_x86_instruction_vec_move( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.a_vmove_instruction, l_gp_reg_mapping.gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, (l_k+l_lcl_k)*l_soa_width*l_micro_kernel_config.datatype_size, l_micro_kernel_config.vector_name, l_max_reg_block+l_lcl_k, 0, 1, 0 );
        }

        /* loop over the columns of B/C */
        for ( l_n = 0; l_n < l_n_limit - l_n_processed; l_n++ ) {
          /* issue a qmadd */
          if ( l_column_active[l_n] == 2 ) {
            libxsmm_x86_instruction_vec_compute_qfma( io_generated_code, l_micro_kernel_config.instruction_set, LIBXSMM_X86_INSTR_V4FMADDPS, l_gp_reg_mapping.gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_nnz_idx[l_n][0] * l_micro_kernel_config.datatype_size, l_micro_kernel_config.vector_name, l_max_reg_block, l_n );
          } else if ( l_column_active[l_n] == 1 ) {
            for ( l_lcl_k = 0; l_lcl_k < 4; l_lcl_k++ ) {
              if ( l_nnz_idx[l_n][l_lcl_k] != -1 ) {
                libxsmm_x86_instruction_vec_compute_mem( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.vmul_instruction, 1, l_gp_reg_mapping.gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_nnz_idx[l_n][l_lcl_k] * l_micro_kernel_config.datatype_size, l_micro_kernel_config.vector_name, l_max_reg_block+l_lcl_k, l_n );
              }
            }
          }
        }
        /* increment by additional 3 columns */
        l_k += 3;
      } else if ( l_found_mul != 0 ) {
        libxsmm_x86_instruction_vec_move( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.a_vmove_instruction, l_gp_reg_mapping.gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, l_k*l_soa_width*l_micro_kernel_config.datatype_size, l_micro_kernel_config.vector_name, l_max_reg_block, 0, 1, 0 );
        /* loop over the columns of B/C */
        for ( l_n = 0; l_n < l_n_limit - l_n_processed; l_n++ ) {
          if ( l_nnz_idx[l_n][0] != -1 ) {
            if ( strcmp(i_arch, "knl") == 0 || strcmp(i_arch, "knm") == 0 || strcmp(i_arch, "skx") == 0 || strcmp(i_arch, "clx") == 0 || strcmp(i_arch, "cpx") == 0 ) {
              libxsmm_x86_instruction_vec_compute_mem( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.vmul_instruction, 1, l_gp_reg_mapping.gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_nnz_idx[l_n][0] * l_micro_kernel_config.datatype_size, l_micro_kernel_config.vector_name, l_max_reg_block, l_n );
            } else if ( strcmp(i_arch, "hsw") == 0 ) {
              libxsmm_x86_instruction_vec_move( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.b_vmove_instruction, l_gp_reg_mapping.gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_nnz_idx[l_n][0] * l_micro_kernel_config.datatype_size, l_micro_kernel_config.vector_name, 15, 0, 1, 0 );
              libxsmm_x86_instruction_vec_compute_reg( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.vmul_instruction, l_micro_kernel_config.vector_name, l_max_reg_block, 15, l_n );
            } else {
              libxsmm_x86_instruction_vec_move( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.b_vmove_instruction, l_gp_reg_mapping.gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, l_nnz_idx[l_n][0] * l_micro_kernel_config.datatype_size, l_micro_kernel_config.vector_name, 15, 0, 1, 0 );
              libxsmm_x86_instruction_vec_compute_reg( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.vmul_instruction, l_micro_kernel_config.vector_name, l_max_reg_block, 15, 15 );
              libxsmm_x86_instruction_vec_compute_reg( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.vadd_instruction, l_micro_kernel_config.vector_name, 15, l_n, l_n );
            }
          }
        }
      } else {
        /* shouldn't happen */
      }
    }

    /* store C accumulator */
    for ( l_n = 0; l_n < l_n_limit - l_n_processed; l_n++ ) {
      libxsmm_x86_instruction_vec_move( io_generated_code, l_micro_kernel_config.instruction_set, l_micro_kernel_config.c_vmove_instruction, l_gp_reg_mapping.gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, (l_n_processed + l_n)*l_soa_width*l_micro_kernel_config.datatype_size, l_micro_kernel_config.vector_name, l_n, 0, 0, 1 );
    }

    /* adjust n progression */
    l_n_processed += l_n_chunksize;
    l_n_limit = LIBXSMM_MIN(l_n_processed + l_n_chunksize, l_max_cols);
  }

  /* advance C pointer */
  libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_c, l_micro_kernel_config.datatype_size*l_soa_width*i_xgemm_desc->ldc);

  /* advance A pointer */
  libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_a, l_micro_kernel_config.datatype_size*l_soa_width*i_xgemm_desc->lda);

  /* close m loop */
  libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_cmp_instruction, l_gp_reg_mapping.gp_reg_mloop, i_xgemm_desc->m );
  libxsmm_x86_instruction_jump_back_to_label( io_generated_code, l_micro_kernel_config.alu_jmp_instruction, &l_loop_label_tracker );

  /* close asm */
  libxsmm_x86_instruction_close_stream( io_generated_code, &l_gp_reg_mapping, i_arch, i_xgemm_desc->prefetch );
}
LIBXSMM_API_DEFINITION void libxsmm_gemm_configure(int archid, int prefetch)
{
  int config = 0;
  LIBXSMM_UNUSED(prefetch);
  internal_gemm_prefetch = LIBXSMM_PREFETCH_AL2_AHEAD;
  internal_gemm_nt = 2;
  internal_gemm = 2;
  { /* behaviour of libxsmm_omp_?gemm routines or LD_PRELOAD ?GEMM routines
     * 0: sequential below-threshold routine (no OpenMP); may fall back to BLAS,
     * 1: OpenMP-parallelized but without internal parallel region,
     * 2: OpenMP-parallelized with internal parallel region */
    const char *const env = getenv("LIBXSMM_GEMM");
    if (0 != env && 0 != *env) {
      internal_gemm = atoi(env);
    }
  }
#if defined(LIBXSMM_EXT_GEMM_TASKS)
  { /* consider user input about using (OpenMP-)tasks; this code must be here
     * because maybe only this translation unit is compiled with OpenMP support */
    const char *const env_tasks = getenv("LIBXSMM_TASKS");
    if (0 != env_tasks && 0 != *env_tasks) {
      internal_gemm_tasks = atoi(env_tasks);
    }
  }
#endif
#if defined(__MIC__)
  LIBXSMM_UNUSED(archid);
#else
  if (LIBXSMM_X86_AVX512_MIC == archid)
#endif
  {
    internal_gemm_nt = 4;
    config = 1;
  }
  { /* attempt to set up tile sizes from the environment (LIBXSMM_M, LIBXSMM_N, and LIBXSMM_K) */
    const int tile_configs[/*configs*/][2/*DP/SP*/][3/*TILE_M,TILE_N,TILE_K*/] = {
      { { 72, 32, 16 }, { 72, 32, 16 } }, /*generic*/
      { { 72, 32, 16 }, { 72, 32, 16 } }  /*knl*/
    };
    const char* env[3];
    env[0] = getenv("LIBXSMM_M"); env[1] = getenv("LIBXSMM_N"); env[2] = getenv("LIBXSMM_K");
    internal_gemm_tile[0/*DP*/][0/*M*/] = (env[0] ? atoi(env[0]) : 0);
    internal_gemm_tile[0/*DP*/][1/*N*/] = (env[1] ? atoi(env[1]) : 0);
    internal_gemm_tile[0/*DP*/][2/*K*/] = (env[2] ? atoi(env[2]) : 0);
    /* environment-defined tile sizes apply to both DP and SP */
    internal_gemm_tile[1/*SP*/][0/*M*/] = internal_gemm_tile[0/*DP*/][0];
    internal_gemm_tile[1/*SP*/][1/*N*/] = internal_gemm_tile[0/*DP*/][1];
    internal_gemm_tile[1/*SP*/][2/*K*/] = internal_gemm_tile[0/*DP*/][2];
    /* load predefined configuration if the tile size is not set up by the environment */
    if (0 >= internal_gemm_tile[0/*DP*/][0/*M*/]) internal_gemm_tile[0][0] = tile_configs[config][0][0];
    if (0 >= internal_gemm_tile[0/*DP*/][1/*N*/]) internal_gemm_tile[0][1] = tile_configs[config][0][1];
    if (0 >= internal_gemm_tile[0/*DP*/][2/*K*/]) internal_gemm_tile[0][2] = tile_configs[config][0][2];
    if (0 >= internal_gemm_tile[1/*SP*/][0/*M*/]) internal_gemm_tile[1][0] = tile_configs[config][1][0];
    if (0 >= internal_gemm_tile[1/*SP*/][1/*N*/]) internal_gemm_tile[1][1] = tile_configs[config][1][1];
    if (0 >= internal_gemm_tile[1/*SP*/][2/*K*/]) internal_gemm_tile[1][2] = tile_configs[config][1][2];
  }
#if defined(__STATIC) && defined(LIBXSMM_BUILD) && !defined(__CYGWIN__) && \
   !(defined(__APPLE__) && defined(__MACH__) /*&& defined(__clang__)*/)
  if (0 == libxsmm_original_sgemm) {
    libxsmm_original_sgemm = LIBXSMM_FSYMBOL(__real_sgemm);
  }
#endif
#if !defined(__BLAS) || (0 != __BLAS)
  if (0 == libxsmm_original_sgemm) {
    libxsmm_original_sgemm = LIBXSMM_FSYMBOL(sgemm);
  }
#endif
#if defined(LIBXSMM_RTLD_NEXT)
  if (0 == libxsmm_original_sgemm) {
    union { const void* pv; libxsmm_sgemm_function pf; } gemm = { NULL };
    gemm.pv = dlsym(RTLD_NEXT, LIBXSMM_STRINGIFY(LIBXSMM_FSYMBOL(sgemm)));
    libxsmm_original_sgemm = gemm.pf;
  }
#endif
#if defined(__STATIC) && defined(LIBXSMM_BUILD) && !defined(__CYGWIN__) && \
   !(defined(__APPLE__) && defined(__MACH__) /*&& defined(__clang__)*/)
  if (0 == libxsmm_original_dgemm) {
    libxsmm_original_dgemm = LIBXSMM_FSYMBOL(__real_dgemm);
  }
#endif
#if !defined(__BLAS) || (0 != __BLAS)
  if (0 == libxsmm_original_dgemm) {
    libxsmm_original_dgemm = LIBXSMM_FSYMBOL(dgemm);
  }
#endif
#if defined(LIBXSMM_RTLD_NEXT)
  if (0 == libxsmm_original_dgemm) {
    union { const void* pv; libxsmm_dgemm_function pf; } gemm = { NULL };
    gemm.pv = dlsym(RTLD_NEXT, LIBXSMM_STRINGIFY(LIBXSMM_FSYMBOL(dgemm)));
    libxsmm_original_dgemm = gemm.pf;
  }
#endif
}
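/* Illustration (hypothetical helper, not part of the library): the environment variables
 * parsed by libxsmm_gemm_configure above can be set by the user before the library is
 * initialized; the values below are examples only. */
#include <stdlib.h>

static void sketch_configure_gemm_env(void)
{
  setenv("LIBXSMM_GEMM", "2", 1/*overwrite*/); /* 2: OpenMP-parallelized with internal parallel region */
  setenv("LIBXSMM_TASKS", "1", 1);             /* consider (OpenMP-)tasks where supported */
  setenv("LIBXSMM_M", "72", 1);                /* tile sizes; unset or <= 0 falls back to the */
  setenv("LIBXSMM_N", "32", 1);                /* predefined per-architecture configuration   */
  setenv("LIBXSMM_K", "16", 1);
}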
LIBXSMM_API
#if defined(_WIN32)
/*TODO: no inline*/
#elif defined(__GNUC__)
/*LIBXSMM_ATTRIBUTE(noinline)*/
#endif
const char* libxsmm_trace_info(unsigned int* depth, unsigned int* threadid, const int* filter_threadid, const int* filter_mindepth, const int* filter_maxnsyms)
{
  const char *fname = NULL;
#if defined(LIBXSMM_TRACE)
  const int max_n = (0 != depth ? (LIBXSMM_TRACE_MAXDEPTH) : 2);
  const int min_n = (0 != depth ? (LIBXSMM_TRACE_MINDEPTH + *depth) : 2);
  void *stacktrace[LIBXSMM_TRACE_MAXDEPTH], **symbol = stacktrace + LIBXSMM_MIN(0 != depth ? ((int)(*depth + 1)) : 1, max_n - 1);
  static LIBXSMM_TLS int cerberus = 0;
  int i;

  /* check against entering a recursion (recursion should not happen due to
   * attribute "no_instrument_function" but better prevent this in any case) */
  if (0 == cerberus) {
    ++cerberus;
# if defined(__GNUC__)
    __asm__("");
# endif
    i = LIBXSMM_ATOMIC_LOAD(&internal_trace_initialized, LIBXSMM_ATOMIC_RELAXED);
    if (0 <= i) { /* do nothing if not yet initialized */
      const int mindepth = (0 != filter_mindepth ? *filter_mindepth : internal_trace_mindepth);
      const int maxnsyms = (0 != filter_maxnsyms ? *filter_maxnsyms : internal_trace_maxnsyms);
      i = libxsmm_backtrace(stacktrace, max_n);
      /* filter depth against filter_mindepth and filter_maxnsyms */
      if ((0 >= mindepth || (min_n + mindepth) <= i) && (0 > maxnsyms || i <= (min_n + mindepth + maxnsyms - 1))) {
        if (min_n <= i) { /* check against min. depth */
          const int filter = (0 != filter_threadid ? *filter_threadid : internal_trace_threadid);
          int abs_tid = 0;
# if defined(_WIN32) || defined(__CYGWIN__)
          static LIBXSMM_TLS char buffer[sizeof(SYMBOL_INFO)+LIBXSMM_TRACE_SYMBOLSIZE];
          static LIBXSMM_TLS int tid = 0;
          PSYMBOL_INFO value = (PSYMBOL_INFO)buffer;
          value->SizeOfStruct = sizeof(SYMBOL_INFO);
          value->MaxNameLen = LIBXSMM_TRACE_SYMBOLSIZE - 1;
          if (0 != tid) {
            abs_tid = (0 <= tid ? tid : -tid);
          }
          else {
            abs_tid = LIBXSMM_ATOMIC_ADD_FETCH(&internal_trace_initialized, 1, LIBXSMM_ATOMIC_RELAXED);
            /* use sign bit to flag enabled fall-back for symbol resolution */
            tid = -abs_tid;
          }
          assert(0 < abs_tid);
          if (0 > filter || filter == abs_tid - 1) {
            if (FALSE != SymFromAddr(GetCurrentProcess(), (DWORD64)*symbol, NULL, value) && 0 < value->NameLen) {
              /* disable fall-back allowing unresolved symbol names */
              tid = abs_tid; /* make unsigned */
              fname = value->Name;
            }
            else if (0 > tid) { /* fall-back allowing unresolved symbol names */
# if defined(__MINGW32__)
              sprintf(buffer, "%p", *symbol);
# else
              sprintf(buffer, "0x%" PRIxPTR, (uintptr_t)*symbol);
# endif
              fname = buffer;
            }
            if (depth) *depth = i - min_n;
            if (threadid) *threadid = abs_tid - 1;
          }
# else
# if defined(LIBXSMM_NO_SYNC)
          static char raw_c;
          char */*const*/ raw_value = &raw_c; /* const: avoid warning (below / constant control-flow) */
# else
          char *const raw_value = (char*)pthread_getspecific(internal_trace_key);
# endif
          int* ivalue = 0, fd = -1;
          char* value = 0;

          if (raw_value) {
            ivalue = (int*)raw_value;
            abs_tid = (0 <= ivalue[1] ? ivalue[1] : -ivalue[1]);

            if (0 > filter || filter == abs_tid - 1) {
              fd = ivalue[0];
              if (0 <= fd && (sizeof(int) * 2) == lseek(fd, sizeof(int) * 2, SEEK_SET)) {
                value = raw_value + sizeof(int) * 2;
              }
# if !defined(NDEBUG) /* library code is expected to be mute */
              else {
                fprintf(stderr, "LIBXSMM ERROR: failed to get buffer\n");
              }
# endif
            }
          }
          else {
            char filename[] = "/tmp/.libxsmm_XXXXXX.map";
#if defined(__GLIBC__) && defined(__GLIBC_MINOR__) && LIBXSMM_VERSION2(2, 19) <= LIBXSMM_VERSION2(__GLIBC__, __GLIBC_MINOR__)
            fd = mkstemps(filename, 4/*.map*/);
#else
            char *const xpos = strrchr(filename, 'X');
            const char c = (char)(NULL != xpos ? *(xpos + 1) : 0);
            if (0 != c) {
              xpos[1] = 0;
              fd = mkstemp(filename);
              xpos[1] = c;
            }
            else {
              fd = -1;
            }
#endif
            if (0 <= fd && 0 == posix_fallocate(fd, 0, LIBXSMM_TRACE_SYMBOLSIZE)) {
              char *const buffer = (char*)mmap(NULL, LIBXSMM_TRACE_SYMBOLSIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
              if (MAP_FAILED != buffer) {
                int check = -1;
                ivalue = (int*)buffer;
                ivalue[0] = fd; /* valid file descriptor for internal_delete */
                if (
# if !defined(LIBXSMM_NO_SYNC)
                  0 == pthread_setspecific(internal_trace_key, buffer) &&
# endif
                  (sizeof(int) * 1) == read(fd, &check, sizeof(int)) &&
                  (sizeof(int) * 2) == lseek(fd, sizeof(int), SEEK_CUR) &&
                  check == fd)
                {
                  abs_tid = LIBXSMM_ATOMIC_ADD_FETCH(&internal_trace_initialized, 1, LIBXSMM_ATOMIC_RELAXED);
                  assert(0 < abs_tid);
                  /* use sign bit to flag enabled fall-back for symbol resolution */
                  ivalue[1] = -abs_tid;
                  if (0 > filter || filter == abs_tid - 1) {
                    value = buffer + sizeof(int) * 2;
                  }
                }
                else {
# if !defined(NDEBUG) /* library code is expected to be mute */
                  fprintf(stderr, "LIBXSMM ERROR: failed to setup buffer\n");
# endif
                  internal_delete(buffer);
                }
              }
# if !defined(NDEBUG)
              else {
                const int error = errno;
                fprintf(stderr, "LIBXSMM ERROR: %s (mmap allocation error #%i)\n", strerror(error), error);
              }
# endif
            }
# if !defined(NDEBUG) /* library code is expected to be mute */
            else {
              fprintf(stderr, "LIBXSMM ERROR: failed to setup file descriptor (%i)\n", fd);
            }
# endif
          }

          if (value) {
            backtrace_symbols_fd(symbol, 1, fd);
            /* attempt to parse symbol name */
            if (1 == sscanf(value, "%*[^(](%s0x", value)) {
              char* c;
              for (c = value; '+' != *c && *c; ++c);
              if ('+' == *c) {
                /* disable fall-back allowing unresolved symbol names */
                ivalue[1] = abs_tid; /* make unsigned */
                fname = value;
                *c = 0;
              }
            }
            /* fall-back to symbol address */
            if (0 > ivalue[1] && 0 == fname) {
              sprintf(value, "0x%llx", (unsigned long long)*symbol);
              fname = value;
            }
            if (depth) *depth = i - min_n;
            if (threadid) *threadid = abs_tid - 1;
          }
# endif
        }
      }
    }
    --cerberus;
  }
#else
  LIBXSMM_UNUSED(depth); LIBXSMM_UNUSED(threadid);
  LIBXSMM_UNUSED(filter_threadid); LIBXSMM_UNUSED(filter_mindepth); LIBXSMM_UNUSED(filter_maxnsyms);
#endif
  return fname;
}
void LIBXSMM_FSYMBOL(sgemm)( const char* transa, const char* transb,
  const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
  const float* alpha, const float* a, const libxsmm_blasint* lda,
  const float* b, const libxsmm_blasint* ldb,
  const float* beta, float* c, const libxsmm_blasint* ldc)
{
#if !defined(NDEBUG) /* library code is expected to be mute */
  static int error_once = 0;
  if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) {
    fprintf(stderr, "LIBXSMM: application must be linked against a LAPACK/BLAS implementation!\n");
  }
#endif
  LIBXSMM_UNUSED(transa); LIBXSMM_UNUSED(transb);
  LIBXSMM_UNUSED(m); LIBXSMM_UNUSED(n); LIBXSMM_UNUSED(k);
  LIBXSMM_UNUSED(alpha); LIBXSMM_UNUSED(a); LIBXSMM_UNUSED(lda);
  LIBXSMM_UNUSED(b); LIBXSMM_UNUSED(ldb); LIBXSMM_UNUSED(beta);
  LIBXSMM_UNUSED(c); LIBXSMM_UNUSED(ldc);
}
void libxsmm_generator_sparse_asparse( libxsmm_generated_code* io_generated_code,
    const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch,
    const unsigned int* i_row_idx, const unsigned int* i_column_idx, const double* i_values )
{
  char l_new_code[512];
  int l_max_code_length = 511;
  int l_code_length = 0;
  unsigned int l_k;
  unsigned int l_flop_count = 0;

  LIBXSMM_UNUSED(i_arch);
  LIBXSMM_UNUSED(i_values);

  /* loop over the columns of C in the generated code; inside each column we fully unroll */
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_n = 0;\n #pragma nounroll_and_jam\n for ( l_n = 0; l_n < %u; l_n++) {\n", i_xgemm_desc->n);
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

  /* reset the current column in C if needed */
  if ( i_xgemm_desc->beta == 0 ) {
    l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_m = 0;\n");
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    if ( i_xgemm_desc->m > 1 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd\n");
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    }
    if ( (LIBXSMM_GEMM_FLAG_F32PREC & i_xgemm_desc->flags) == 0 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++) {\n C[(l_n*%u)+l_m] = 0.0;\n }\n", i_xgemm_desc->m, i_xgemm_desc->ldc);
    } else {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++) {\n C[(l_n*%u)+l_m] = 0.0f;\n }\n", i_xgemm_desc->m, i_xgemm_desc->ldc);
    }
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  }

  /* loop over columns in A, rows in B and fully unroll */
  for ( l_k = 0; l_k < i_xgemm_desc->k; l_k++ ) {
    unsigned int l_column_elements = i_column_idx[l_k + 1] - i_column_idx[l_k];
    unsigned int l_z = 0;

    l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) || defined(__AVX__)\n");
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

    if ( l_column_elements > 0 ) {
      if ( (LIBXSMM_GEMM_FLAG_F32PREC & i_xgemm_desc->flags) == 0 ) {
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && defined(__AVX__)\n __m256d b%u = _mm256_broadcast_sd(&B[(l_n*%u)+%u]);\n#endif\n", l_k, i_xgemm_desc->ldb, l_k);
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && !defined(__AVX__)\n __m128d b%u = _mm_loaddup_pd(&B[(l_n*%u)+%u]);\n#endif\n", l_k, i_xgemm_desc->ldb, l_k);
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      } else {
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && defined(__AVX__)\n __m128 b%u = _mm_broadcast_ss(&B[(l_n*%u)+%u]);\n#endif\n", l_k, i_xgemm_desc->ldb, l_k);
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && !defined(__AVX__)\n __m128 b%u = _mm_load_ss(&B[(l_n*%u)+%u]); b%u = _mm_shuffle_ps(b%u, b%u, 0x00);\n#endif\n", l_k, i_xgemm_desc->ldb, l_k, l_k, l_k, l_k);
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      }
    }

    /* loop over the columns of A and look for vectorization potential */
    for ( l_z = 0; l_z < l_column_elements; l_z++ ) {
      /* 4 element vector might be possible */
      if ( (l_z < (l_column_elements - 3)) && (l_column_elements > 3) ) {
        /* check for 256bit vector instruction */
        if ((i_row_idx[i_column_idx[l_k] + l_z] + 1 == i_row_idx[i_column_idx[l_k] + l_z + 1]) &&
            (i_row_idx[i_column_idx[l_k] + l_z] + 2 == i_row_idx[i_column_idx[l_k] + l_z + 2]) &&
            (i_row_idx[i_column_idx[l_k] + l_z] + 3 == i_row_idx[i_column_idx[l_k] + l_z + 3]) &&
            (i_row_idx[i_column_idx[l_k] + l_z + 3] < i_xgemm_desc->m)) {
          libxsmm_sparse_asparse_innerloop_four_vector(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx);
          l_z += 3;
        /* check for 128bit vector instruction */
        } else if ((i_row_idx[i_column_idx[l_k] + l_z] + 1 == i_row_idx[i_column_idx[l_k] + l_z + 1]) &&
                   (i_row_idx[i_column_idx[l_k] + l_z + 1] < i_xgemm_desc->m) ) {
          libxsmm_sparse_asparse_innerloop_two_vector(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx);
          l_z++;
        /* scalar instruction */
        } else {
          if ( (i_row_idx[i_column_idx[l_k] + l_z] < i_xgemm_desc->m) ) {
            libxsmm_sparse_asparse_innerloop_scalar(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx);
          }
        }
      /* 2 element vector might be possible */
      } else if ( (l_z < (l_column_elements - 1)) && (l_column_elements > 1)) {
        /* check for 128bit vector instruction */
        if ((i_row_idx[i_column_idx[l_k] + l_z] + 1 == i_row_idx[i_column_idx[l_k] + l_z + 1]) &&
            (i_row_idx[i_column_idx[l_k] + l_z + 1] < i_xgemm_desc->m) ) {
          libxsmm_sparse_asparse_innerloop_two_vector(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx);
          l_z++;
        /* scalar instruction */
        } else {
          if ( (i_row_idx[i_column_idx[l_k] + l_z] < i_xgemm_desc->m) ) {
            libxsmm_sparse_asparse_innerloop_scalar(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx);
          }
        }
      /* scalar anyway */
      } else {
        if ( (i_row_idx[i_column_idx[l_k] + l_z] < i_xgemm_desc->m) ) {
          libxsmm_sparse_asparse_innerloop_scalar(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx);
        }
      }
    }

    /* C fallback code */
    l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#else\n");
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

    /* loop over the columns of A */
    for ( l_z = 0; l_z < l_column_elements; l_z++ ) {
      if ( (i_row_idx[i_column_idx[l_k] + l_z] < i_xgemm_desc->m) ) {
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " C[(l_n*%u)+%u] += A[%u] * B[(l_n*%u)+%u];\n", i_xgemm_desc->ldc, i_row_idx[i_column_idx[l_k] + l_z], i_column_idx[l_k] + l_z, i_xgemm_desc->ldb, l_k );
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
        l_flop_count += 2;
      }
    }

    l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#endif\n\n");
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  }

  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " }\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

  /* add flop counter */
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "\n#ifndef NDEBUG\n#ifdef _OPENMP\n#pragma omp atomic\n#endif\nlibxsmm_num_total_flops += %u;\n#endif\n", l_flop_count * i_xgemm_desc->n);
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
}
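/* Illustration (not emitted verbatim): roughly the portable fallback branch produced by
 * libxsmm_generator_sparse_asparse for a hypothetical double-precision case with n=4 and
 * ldb=ldc=8, where column k=0 of the sparse A holds two nonzeros at rows 2 and 5 (A is
 * passed as the packed array of nonzero values). The SSE3/AVX branch would instead
 * broadcast B[(l_n*8)+0] and use the vector inner-loop helpers. */
void sketch_asparse_fallback(const double* A, const double* B, double* C)
{
  unsigned int l_n = 0;
  for ( l_n = 0; l_n < 4; l_n++ ) {
    /* unrolled statements for k = 0 */
    C[(l_n*8)+2] += A[0] * B[(l_n*8)+0];
    C[(l_n*8)+5] += A[1] * B[(l_n*8)+0];
  }
}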
LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csc_bsparse( libxsmm_generated_code* io_generated_code,
    const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch,
    const unsigned int* i_row_idx, const unsigned int* i_column_idx, const double* i_values )
{
  unsigned int l_n;
  unsigned int l_z;
  unsigned int l_column_elements;
  unsigned int l_flop_count = 0;
  char l_new_code[512];
  int l_max_code_length = 511;
  int l_code_length = 0;

  LIBXSMM_UNUSED(i_values);

  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_m = 0;\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

  /* reset C if beta is zero */
  if (0 != (LIBXSMM_GEMM_FLAG_BETA_0 & i_xgemm_desc->flags)) { /* Beta=0 */
    l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_n = 0;\n");
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_n = 0; l_n < %u; l_n++) {\n", (unsigned int)i_xgemm_desc->n);
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    if ( i_xgemm_desc->m > 1 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd\n");
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma vector aligned\n");
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    }
    if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++) { C[(l_n*%u)+l_m] = 0.0; }\n", (unsigned int)i_xgemm_desc->m, (unsigned int)i_xgemm_desc->ldc);
    } else {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++) { C[(l_n*%u)+l_m] = 0.0f; }\n", (unsigned int)i_xgemm_desc->m, (unsigned int)i_xgemm_desc->ldc);
    }
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " }\n");
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  }

  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

  /* determine the correct simd pragma for each architecture */
  if ( ( strcmp( i_arch, "noarch" ) == 0 ) || ( strcmp( i_arch, "wsm" ) == 0 ) || ( strcmp( i_arch, "snb" ) == 0 ) || ( strcmp( i_arch, "hsw" ) == 0 ) ) {
    if ( i_xgemm_desc->m > 7 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd vectorlength(8)\n");
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    } else if ( i_xgemm_desc->m > 3 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd vectorlength(4)\n");
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    } else if ( i_xgemm_desc->m > 1 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd vectorlength(2)\n");
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    } else {}

    if ( (i_xgemm_desc->m > 1) && ((LIBXSMM_GEMM_FLAG_ALIGN_A & i_xgemm_desc->flags) != 0) && ((LIBXSMM_GEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0) ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma vector aligned\n");
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    }
  } else if ( ( strcmp( i_arch, "knl" ) == 0 ) || ( strcmp( i_arch, "skx" ) == 0 ) || ( strcmp( i_arch, "clx" ) == 0 ) || ( strcmp( i_arch, "cpx" ) == 0 ) ) {
    if ( (i_xgemm_desc->m > 1) && ((LIBXSMM_GEMM_FLAG_ALIGN_A & i_xgemm_desc->flags) != 0) && ((LIBXSMM_GEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0) ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd vectorlength(32)\n #pragma vector aligned\n");
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    }
  } else {
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH );
    return;
  }

  /* generate the actual kernel */
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++) {\n", (unsigned int)i_xgemm_desc->m);
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

  for ( l_n = 0; l_n < (unsigned int)i_xgemm_desc->n; l_n++ ) {
    l_column_elements = i_column_idx[l_n+1] - i_column_idx[l_n];
    for ( l_z = 0; l_z < l_column_elements; l_z++ ) {
      /* check k such that we just use rows which actually need to be multiplied */
      if ( i_row_idx[i_column_idx[l_n] + l_z] < (unsigned int)i_xgemm_desc->k ) {
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " C[%u+l_m] += A[%u+l_m] * B[%u];\n", l_n * i_xgemm_desc->ldc, i_row_idx[i_column_idx[l_n] + l_z]*i_xgemm_desc->lda, i_column_idx[l_n] + l_z);
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
        l_flop_count += 2;
      }
    }
  }

  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " }\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

  /* add flop counter */
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "\n#ifndef NDEBUG\n#ifdef _OPENMP\n#pragma omp atomic\n#endif\nlibxsmm_num_total_flops += %u;\n#endif\n", l_flop_count * (unsigned int)i_xgemm_desc->m);
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
}
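/* Illustration (not emitted verbatim): roughly the kernel body produced by
 * libxsmm_generator_spgemm_csc_bsparse for a hypothetical case with m=8, lda=ldc=8 and a
 * CSC-stored B holding nonzeros at (row 1, col 0), (row 3, col 0) and (row 0, col 1);
 * B is indexed as the packed array of nonzero values, and the beta=0 reset is omitted. */
void sketch_csc_bsparse(const double* A, const double* B, double* C)
{
  unsigned int l_m = 0;
  for ( l_m = 0; l_m < 8; l_m++ ) {
    C[0+l_m] += A[8+l_m]  * B[0]; /* column 0 of C, row 1 of A */
    C[0+l_m] += A[24+l_m] * B[1]; /* column 0 of C, row 3 of A */
    C[8+l_m] += A[0+l_m]  * B[2]; /* column 1 of C, row 0 of A */
  }
}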
void LIBXSMM_FSYMBOL(dgemm)(LIBXSMM_GEMM_CONST char* transa, LIBXSMM_GEMM_CONST char* transb,
  LIBXSMM_GEMM_CONST libxsmm_blasint* m, LIBXSMM_GEMM_CONST libxsmm_blasint* n, LIBXSMM_GEMM_CONST libxsmm_blasint* k,
  LIBXSMM_GEMM_CONST double* alpha, LIBXSMM_GEMM_CONST double* a, LIBXSMM_GEMM_CONST libxsmm_blasint* lda,
  LIBXSMM_GEMM_CONST double* b, LIBXSMM_GEMM_CONST libxsmm_blasint* ldb,
  LIBXSMM_GEMM_CONST double* beta, double* c, LIBXSMM_GEMM_CONST libxsmm_blasint* ldc)
{
#if !defined(NDEBUG) /* library code is expected to be mute */
  static int error_once = 0;
  if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) {
    fprintf(stderr, "LIBXSMM ERROR: application must be linked against LAPACK/BLAS!\n");
  }
#endif
  LIBXSMM_UNUSED(transa); LIBXSMM_UNUSED(transb);
  LIBXSMM_UNUSED(m); LIBXSMM_UNUSED(n); LIBXSMM_UNUSED(k);
  LIBXSMM_UNUSED(alpha); LIBXSMM_UNUSED(a); LIBXSMM_UNUSED(lda);
  LIBXSMM_UNUSED(b); LIBXSMM_UNUSED(ldb); LIBXSMM_UNUSED(beta);
  LIBXSMM_UNUSED(c); LIBXSMM_UNUSED(ldc);
}
LIBXSMM_API void __cyg_profile_func_exit(void* this_fn, void* call_site)
{
  LIBXSMM_UNUSED(this_fn); LIBXSMM_UNUSED(call_site); /* suppress warning */
}