libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_f32_f32(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid)
{
  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/
  typedef float element_input_type;
  typedef float element_output_type;
  typedef float element_filter_type;
  element_input_type alpha = (element_input_type)1;
  element_input_type beta = (element_input_type)0;
  libxsmm_blasint lda = (libxsmm_blasint)handle->bk;
  libxsmm_blasint ldb = (libxsmm_blasint)handle->bc;
  libxsmm_blasint ldc = (libxsmm_blasint)handle->bk;

  if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) {
    libxsmm_smmfunction_reducebatch batchreduce_kernel = libxsmm_smmdispatch_reducebatch(handle->bk, handle->bn, handle->bc, &lda, &ldb, &ldc, &alpha, &beta, NULL, NULL);
# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c"
  } else {
    status = LIBXSMM_DNN_ERR_FUSEBN_UNSUPPORTED_FUSION;
  }
#else /* should not happen */
  LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid);
  status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH;
#endif
  return status;
}
libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_custom_bf16_f32(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid)
{
  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/
  typedef libxsmm_bfloat16 element_input_type;
  typedef float element_output_type;
  typedef libxsmm_bfloat16 element_filter_type;
  typedef libxsmm_smmfunction gemm_function;
  libxsmm_blasint lda = (libxsmm_blasint)handle->ofmblock;
  libxsmm_blasint ldb = (libxsmm_blasint)handle->desc.C;
  libxsmm_blasint ldc = (libxsmm_blasint)handle->desc.K;
  float alpha = (element_input_type)1;
  float beta = (element_input_type)0;

  if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) {
    gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ofmblock, handle->desc.N, handle->desc.C, &lda, &ldb, &ldc, &alpha, &beta, NULL, NULL);
# define LIBXSMM_DNN_FULLYCONNECTED_FWD_BF16_F32
# include "template/libxsmm_dnn_fullyconnected_st_fwd_custom_generic.tpl.c"
# undef LIBXSMM_DNN_FULLYCONNECTED_FWD_BF16_F32
  } else {
    status = LIBXSMM_DNN_ERR_FUSEBN_UNSUPPORTED_FUSION;
  }
#else /* should not happen */
  LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid);
  status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH;
#endif
  return status;
}
예제 #3
0
libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_bf16_bf16(libxsmm_dnn_pooling* handle, int start_thread, int tid)
{
  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/
  typedef libxsmm_bfloat16 element_input_type;
  typedef libxsmm_bfloat16 element_output_type;

# define LIBXSMM_DNN_POOLING_BWD_BF16
  if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) {
# define LIBXSMM_DNN_POOLING_BWD_MAX
    typedef int   element_mask_type;
# include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c16_avx512.tpl.c"
# undef LIBXSMM_DNN_POOLING_BWD_MAX
  } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) {
# define LIBXSMM_DNN_POOLING_BWD_AVG
# include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c16_avx512.tpl.c"
# undef LIBXSMM_DNN_POOLING_BWD_AVG
  } else {
    status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING;
  }
# undef LIBXSMM_DNN_POOLING_BWD_BF16
#else /* should not happen */
  LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid);
  status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH;
#endif
  return status;
}
LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_nhwc(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid)
{
  libxsmm_dnn_err_t status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED;
  LIBXSMM_UNUSED( handle );
  LIBXSMM_UNUSED( start_thread );
  LIBXSMM_UNUSED( tid );
  return status;
}
예제 #5
0
LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_nhwc(libxsmm_dnn_pooling* handle, int start_thread, int tid)
{
  libxsmm_dnn_err_t status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED;
  LIBXSMM_UNUSED( handle );
  LIBXSMM_UNUSED( start_thread );
  LIBXSMM_UNUSED( tid );
  return status;
}
libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_i16_i32(libxsmm_dnn_layer* handle, int start_thread, int tid)
{
  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/
  typedef short element_input_type;
  typedef int element_output_type;
  typedef short element_filter_type;
  typedef libxsmm_wconvfunction libxsmm_convfunction;
# include "template/libxsmm_dnn_convolve_st_fwd_custom_custom.tpl.c"
#else /* should not happen */
  LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid);
  status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH;
#endif
  return status;
}
예제 #7
0
LIBXSMM_INTERNAL_API_DEFINITION
void libxsmm_generator_gemm_header_kloop( libxsmm_generated_code*             io_generated_code,
                                           libxsmm_loop_label_tracker*        io_loop_label_tracker,
                                           const libxsmm_gp_reg_mapping*      i_gp_reg_mapping,
                                           const libxsmm_micro_kernel_config* i_micro_kernel_config,
                                           const unsigned int                 i_m_blocking,
                                           const unsigned int                 i_k_blocking ) {
  LIBXSMM_UNUSED(i_m_blocking);
  libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_mov_instruction, i_gp_reg_mapping->gp_reg_kloop, 0);
  libxsmm_x86_instruction_register_jump_label( io_generated_code, io_loop_label_tracker );
  libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_kloop, i_k_blocking);
}
예제 #8
0
LIBXSMM_API void libxsmm_trace(FILE* stream, unsigned int depth, const int* filter_threadid, const int* filter_mindepth, const int* filter_maxnsyms)
{
#if defined(LIBXSMM_TRACE)
  unsigned int depth1 = depth + 1, threadid;
  const char *const name = libxsmm_trace_info(&depth1, &threadid,
    filter_threadid, filter_mindepth, filter_maxnsyms);

  if (name && *name) { /* implies actual other results to be valid */
    const int depth0 = LIBXSMM_MAX(0 != filter_mindepth ? *filter_mindepth : internal_trace_mindepth, 0);
    assert(0 != stream/*otherwise fprintf handle the error*/);
    if ((0 == filter_threadid && 0 > internal_trace_threadid) || (0 != filter_threadid && 0 > *filter_threadid)) {
      fprintf(stream, "%*s%s@%u\n", (int)(depth1 - depth0), "", name, threadid);
    }
    else {
      fprintf(stream, "%*s%s\n", (int)(depth1 - depth0), "", name);
    }
  }
#else /* suppress warning */
  LIBXSMM_UNUSED(stream); LIBXSMM_UNUSED(depth);
  LIBXSMM_UNUSED(filter_threadid);
  LIBXSMM_UNUSED(filter_mindepth);
  LIBXSMM_UNUSED(filter_maxnsyms);
#endif
}
예제 #9
0
LIBXSMM_API int libxsmm_trace_init(int filter_threadid, int filter_mindepth, int filter_maxnsyms)
{
  int result = EXIT_SUCCESS;
  internal_trace_initialized = -1; /* disabled */
#if defined(LIBXSMM_TRACE)
# if defined(_WIN32) || defined(__CYGWIN__)
  SymSetOptions(SYMOPT_DEFERRED_LOADS | SYMOPT_UNDNAME);
  result = (FALSE != SymInitialize(GetCurrentProcess(), NULL, TRUE) ? EXIT_SUCCESS : GetLastError());
# elif !defined(LIBXSMM_NO_SYNC)
  result = pthread_key_create(&internal_trace_key, internal_delete);
# endif
  if (EXIT_SUCCESS == result) {
    internal_trace_threadid = filter_threadid;
    internal_trace_maxnsyms = filter_maxnsyms;
    internal_trace_mindepth = filter_mindepth;
    internal_trace_initialized = 0; /* enabled */
  }
#else
  LIBXSMM_UNUSED(filter_threadid);
  LIBXSMM_UNUSED(filter_mindepth);
  LIBXSMM_UNUSED(filter_maxnsyms);
#endif
  return result;
}
예제 #10
0
LIBXSMM_API void __cyg_profile_func_enter(void* this_fn, void* call_site)
{
#if defined(LIBXSMM_TRACE)
# if !defined(LIBXSMM_TRACE_DLINFO)
  LIBXSMM_UNUSED(this_fn); LIBXSMM_UNUSED(call_site); /* suppress warning */
  libxsmm_trace(stderr, 2/*no need for parent (0) but parent of parent (1)*/,
    /* inherit global settings from libxsmm_trace_init */
    NULL, NULL, NULL);
# else
  if (0 <= internal_trace_initialized && 0 != internal_trace_maxnsyms) {
#   if 1
    Dl_info info;
#   else
    struct {
      const char* dli_fname;
      /* address at which shared object is loaded */
      void* dli_fbase;
      /* name of nearest symbol with address lower than address */
      const char* dli_sname;
      void* dli_saddr;
    } info;
#   endif
    if (0 != dladdr(this_fn, (Dl_info*)&info)) {
      if (0 != info.dli_sname) {
        fprintf(stderr, "%s\n", info.dli_sname);
      }
      else if (0 != info.dli_saddr) {
        fprintf(stderr, "0x%llx\n", (unsigned long long)info.dli_saddr);
      }
    }
  }
# endif
#else
  LIBXSMM_UNUSED(this_fn); LIBXSMM_UNUSED(call_site); /* suppress warning */
#endif
}
예제 #11
0
LIBXSMM_INTERNAL_API_DEFINITION
void libxsmm_generator_gemm_footer_kloop( libxsmm_generated_code*             io_generated_code,
                                           libxsmm_loop_label_tracker*        io_loop_label_tracker,
                                           const libxsmm_gp_reg_mapping*      i_gp_reg_mapping,
                                           const libxsmm_micro_kernel_config* i_micro_kernel_config,
                                           const libxsmm_gemm_descriptor*     i_xgemm_desc,
                                           const unsigned int                 i_m_blocking,
                                           const unsigned int                 i_max_blocked_k,
                                           const unsigned int                 i_kloop_complete ) {
  LIBXSMM_UNUSED(i_m_blocking);
  libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_cmp_instruction, i_gp_reg_mapping->gp_reg_kloop, i_max_blocked_k );
  libxsmm_x86_instruction_jump_back_to_label( io_generated_code, i_micro_kernel_config->alu_jmp_instruction, io_loop_label_tracker );
  if ( i_kloop_complete != 0 ) {
    libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_sub_instruction,
                                 i_gp_reg_mapping->gp_reg_b, (i_xgemm_desc->k)*(i_micro_kernel_config->datatype_size) );
  }
}
예제 #12
0
void libxsmm_generator_dense_noarch_kernel( libxsmm_generated_code*         io_generated_code,
                                            const libxsmm_xgemm_descriptor* i_xgemm_desc,
                                            const char*                     i_arch ) {
  char l_new_code[512];
  int l_max_code_length = 511;
  int l_code_length = 0;

  LIBXSMM_UNUSED(i_arch);

  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "  unsigned int l_m = 0;\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "  unsigned int l_n = 0;\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "  unsigned int l_k = 0;\n\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "  for ( l_n = 0; l_n < %u; l_n++ ) {\n", i_xgemm_desc->n);
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  if ( i_xgemm_desc->beta == 0 ) {
    if ( (LIBXSMM_XGEMM_FLAG_F32PREC & i_xgemm_desc->flags) == 0 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "    for ( l_m = 0; l_m < %u; l_m++ ) { C[(l_n*%u)+l_m] = 0.0; }\n\n", i_xgemm_desc->m, i_xgemm_desc->ldc);
    } else {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "    for ( l_m = 0; l_m < %u; l_m++ ) { C[(l_n*%u)+l_m] = 0.0f; }\n\n", i_xgemm_desc->m, i_xgemm_desc->ldc);
    }
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  }
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "    for ( l_k = 0; l_k < %u; l_k++ ) {\n", i_xgemm_desc->k);
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "      #pragma simd\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "      for ( l_m = 0; l_m < %u; l_m++ ) {\n", i_xgemm_desc->m);
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "        C[(l_n*%u)+l_m] += A[(l_k*%u)+l_m] * B[(l_n*%u)+l_k];\n", i_xgemm_desc->ldc, i_xgemm_desc->lda, i_xgemm_desc->ldb);
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "      }\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "    }\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "  }\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
}
예제 #13
0
LIBXSMM_API_INTERN
void libxsmm_generator_spgemm_csc_bsparse_soa_avx256_512( libxsmm_generated_code*         io_generated_code,
                                                          const libxsmm_gemm_descriptor*  i_xgemm_desc,
                                                          const char*                     i_arch,
                                                          const unsigned int*             i_row_idx,
                                                          const unsigned int*             i_column_idx,
                                                          const void*                     i_values ) {
  unsigned int l_n = 0;
  unsigned int l_k = 0;
  unsigned int l_soa_width = 0;
  unsigned int l_max_cols = 0;
  unsigned int l_n_processed = 0;
  unsigned int l_n_limit = 0;
  unsigned int l_n_chunks = 0;
  unsigned int l_n_chunksize = 0;
  unsigned int l_found_mul = 0;
  unsigned int l_max_reg_block = 0;

  libxsmm_micro_kernel_config l_micro_kernel_config;
  libxsmm_loop_label_tracker l_loop_label_tracker;
  libxsmm_gp_reg_mapping l_gp_reg_mapping;

  LIBXSMM_UNUSED(i_values);

  /* select soa width */
  if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype )  ) {
    if ( strcmp(i_arch, "knl") == 0 ||
         strcmp(i_arch, "knm") == 0 ||
         strcmp(i_arch, "skx") == 0 ||
         strcmp(i_arch, "clx") == 0 ||
         strcmp(i_arch, "cpx") == 0 ) {
      l_soa_width = 8;
      l_max_reg_block = 28;
    } else {
      l_soa_width = 4;
      l_max_reg_block = 14;
    }
  } else {
    if ( strcmp(i_arch, "knl") == 0 ||
         strcmp(i_arch, "knm") == 0 ||
         strcmp(i_arch, "skx") == 0 ||
         strcmp(i_arch, "clx") == 0 ||
         strcmp(i_arch, "cpx") == 0 ) {
      l_soa_width = 16;
      l_max_reg_block = 28;
    } else {
      l_soa_width = 8;
      l_max_reg_block = 14;
    }
  }

  /* define gp register mapping */
  libxsmm_reset_x86_gp_reg_mapping( &l_gp_reg_mapping );
  /* matching calling convention on Linux */
#if defined(_WIN32) || defined(__CYGWIN__)
  l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RCX;
  l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RDX;
  l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_R8;
  /* TODO: full support for Windows calling convention */
  l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_RDI;
  l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_RSI;
#else /* match calling convention on Linux */
  l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RDI;
  l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RSI;
  l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_RDX;
  l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_RCX;
  l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_R8;
#endif
  l_gp_reg_mapping.gp_reg_c_prefetch = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_mloop = LIBXSMM_X86_GP_REG_R12;
  l_gp_reg_mapping.gp_reg_nloop = LIBXSMM_X86_GP_REG_R13;
  l_gp_reg_mapping.gp_reg_kloop = LIBXSMM_X86_GP_REG_R14;
  l_gp_reg_mapping.gp_reg_help_0 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_1 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_2 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_3 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_4 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_5 = LIBXSMM_X86_GP_REG_UNDEF;

  /* define loop_label_tracker */
  libxsmm_reset_loop_label_tracker( &l_loop_label_tracker );

  /* define the micro kernel code gen properties */
  libxsmm_generator_gemm_init_micro_kernel_config_fullvector( &l_micro_kernel_config, i_xgemm_desc, i_arch, 0 );

  /* get max column in C */
  l_max_cols = i_xgemm_desc->n;
  for ( l_n = 0; l_n < i_xgemm_desc->n; l_n++ ) {
    if ( i_column_idx[l_n] == i_column_idx[i_xgemm_desc->n] ) {
      l_max_cols = l_n+1;
    }
  }

  /* calculate the chunk size of current columns to work on */
  l_n_chunks = ( (l_max_cols % l_max_reg_block) == 0 ) ? (l_max_cols / l_max_reg_block) : (l_max_cols / l_max_reg_block) + 1;
  assert(0 != l_n_chunks); /* mute static analysis (division-by-zero); such invalid input must be caught upfront */
  l_n_chunksize = ( (l_max_cols % l_n_chunks) == 0 ) ? (l_max_cols / l_n_chunks) : (l_max_cols / l_n_chunks) + 1;

  /* open asm */
  libxsmm_x86_instruction_open_stream( io_generated_code, &l_gp_reg_mapping, i_arch, i_xgemm_desc->prefetch );

  /* m loop */
  libxsmm_x86_instruction_register_jump_back_label( io_generated_code, &l_loop_label_tracker );
  libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_mloop, 1 );

  /* loop over n-blocks */
  l_n_processed = 0;
  l_n_limit = l_n_chunksize;
  while ( l_n_processed < l_max_cols ) {
#if 0
    printf("l_max_cols: %i, l_n_processed: %i, l_n_limit: %i\n", l_max_cols, l_n_processed, l_n_limit);
#endif
    /* load C accumulator */
    for ( l_n = 0; l_n < l_n_limit - l_n_processed; l_n++ ) {
      if (0 != (LIBXSMM_GEMM_FLAG_BETA_0 & i_xgemm_desc->flags)) { /* Beta=0 */
        libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
                                                 l_micro_kernel_config.instruction_set,
                                                 l_micro_kernel_config.vxor_instruction,
                                                 l_micro_kernel_config.vector_name,
                                                 l_n, l_n, l_n );
      } else {
        libxsmm_x86_instruction_vec_move( io_generated_code,
                                          l_micro_kernel_config.instruction_set,
                                          l_micro_kernel_config.c_vmove_instruction,
                                          l_gp_reg_mapping.gp_reg_c,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          (l_n_processed + l_n)*l_soa_width*l_micro_kernel_config.datatype_size,
                                          l_micro_kernel_config.vector_name,
                                          l_n, 0, 1, 0 );
      }
    }

    /* do dense soa times sparse multiplication */
    for ( l_k = 0; l_k < (unsigned int)i_xgemm_desc->k; l_k++ ) {
      unsigned int l_found_qmadd = 0;
      unsigned int l_col_k = 0;
      unsigned int l_column_active[28];
      int l_nnz_idx[28][4];

      /* reset helpers */
      for ( l_n = 0; l_n < l_n_limit - l_n_processed; l_n++ ) {
        l_column_active[l_n] = 0;
        l_nnz_idx[l_n][0] = -1; l_nnz_idx[l_n][1] = -1; l_nnz_idx[l_n][2] = -1; l_nnz_idx[l_n][3] = -1;
      }
      l_found_mul = 0;

      /* let's figure out if we can apply qmadd when being sin F32 setting and on KNM */
      if ( (l_k < ((unsigned int)i_xgemm_desc->k - 3))                       &&
           (l_micro_kernel_config.instruction_set == LIBXSMM_X86_AVX512_KNM) &&
           (LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) )               ) {
        /* loop over the columns of B/C */
        for ( l_n = 0; l_n < l_n_limit - l_n_processed; l_n++ ) {
          unsigned int l_found = 0;
          unsigned int l_acol_k = 0;
          unsigned int l_col_elements = i_column_idx[l_n_processed+l_n+1] - i_column_idx[l_n_processed+l_n];
          unsigned int l_cur_column = i_column_idx[l_n_processed+l_n];

          for ( l_col_k = 0; l_col_k < l_col_elements; l_col_k++ ) {
            for ( l_acol_k = l_found; l_acol_k < 4; l_acol_k++ ) {
              if ( (l_k + l_acol_k) == i_row_idx[l_cur_column + l_col_k] ) {
                l_nnz_idx[l_n][l_acol_k] = l_cur_column + l_col_k;
                l_found = l_acol_k+1;
              }
              if (l_found == 4) {
                l_col_k = l_col_elements;
              }
            }
          }
          /* let's check if we can apply qmadd in col l_n */
          if ( (l_nnz_idx[l_n][0] != -1) && (l_nnz_idx[l_n][1] != -1) && (l_nnz_idx[l_n][2] != -1) && (l_nnz_idx[l_n][3] != -1) ) {
            l_column_active[l_n] = 2;
            l_found_qmadd = 1;
            l_found_mul = 1;
          } else {
            /* let's check if we have at least one entry in the column that matches one of the four entries */
            if ( (l_nnz_idx[l_n][0] != -1) || (l_nnz_idx[l_n][1] != -1) || (l_nnz_idx[l_n][2] != -1) || (l_nnz_idx[l_n][3] != -1) ) {
              l_column_active[l_n] = 1;
              l_found_mul = 1;
            } else {
              l_column_active[l_n] = 0;
            }
          }
        }
      }

      if ( l_found_qmadd == 0 ) {
        /* loop over the columns of B/C */
        for ( l_n = 0; l_n < l_n_limit - l_n_processed; l_n++ ) {
          unsigned int l_col_elements = i_column_idx[l_n_processed+l_n+1] - i_column_idx[l_n_processed+l_n];
          unsigned int l_cur_column = i_column_idx[l_n_processed+l_n];
          /* search for entries matching that k */
          for ( l_col_k = 0; l_col_k < l_col_elements; l_col_k++ ) {
            if ( l_k == i_row_idx[l_cur_column + l_col_k] ) {
              l_nnz_idx[l_n][0] = l_cur_column + l_col_k;
              l_col_k = l_col_elements;
            }
          }
          /* let's check if we have an entry in the column that matches the k from A */
          if ( (l_nnz_idx[l_n][0] != -1) ) {
            l_column_active[l_n] = 1;
            l_found_mul = 1;
          } else {
            l_column_active[l_n] = 0;
          }
        }
      }

      /* First case: we can use qmadd */
      if ( l_found_qmadd != 0 ) {
        unsigned int l_lcl_k = 0;
        for ( l_lcl_k = 0; l_lcl_k < 4; l_lcl_k++ ) {
          libxsmm_x86_instruction_vec_move( io_generated_code,
                                            l_micro_kernel_config.instruction_set,
                                            l_micro_kernel_config.a_vmove_instruction,
                                            l_gp_reg_mapping.gp_reg_a,
                                            LIBXSMM_X86_GP_REG_UNDEF, 0,
                                            (l_k+l_lcl_k)*l_soa_width*l_micro_kernel_config.datatype_size,
                                            l_micro_kernel_config.vector_name,
                                            l_max_reg_block+l_lcl_k, 0, 1, 0 );
        }

        /* loop over the columns of B/C */
        for ( l_n = 0; l_n < l_n_limit - l_n_processed; l_n++ ) {
          /* issue a qmadd */
          if ( l_column_active[l_n] == 2 ) {
            libxsmm_x86_instruction_vec_compute_qfma( io_generated_code,
                                                      l_micro_kernel_config.instruction_set,
                                                      LIBXSMM_X86_INSTR_V4FMADDPS,
                                                      l_gp_reg_mapping.gp_reg_b,
                                                      LIBXSMM_X86_GP_REG_UNDEF,
                                                      0,
                                                      l_nnz_idx[l_n][0] * l_micro_kernel_config.datatype_size,
                                                      l_micro_kernel_config.vector_name,
                                                      l_max_reg_block,
                                                      l_n );
          } else if ( l_column_active[l_n] == 1 ) {
            for ( l_lcl_k = 0; l_lcl_k < 4; l_lcl_k++ ) {
              if ( l_nnz_idx[l_n][l_lcl_k] != -1 ) {
                libxsmm_x86_instruction_vec_compute_mem( io_generated_code,
                                                         l_micro_kernel_config.instruction_set,
                                                         l_micro_kernel_config.vmul_instruction,
                                                         1,
                                                         l_gp_reg_mapping.gp_reg_b,
                                                         LIBXSMM_X86_GP_REG_UNDEF,
                                                         0,
                                                         l_nnz_idx[l_n][l_lcl_k] * l_micro_kernel_config.datatype_size,
                                                         l_micro_kernel_config.vector_name,
                                                         l_max_reg_block+l_lcl_k,
                                                         l_n );
              }
            }
          }
        }
        /* increment by additional 3 columns */
        l_k += 3;
      } else if ( l_found_mul != 0 ) {
        libxsmm_x86_instruction_vec_move( io_generated_code,
                                          l_micro_kernel_config.instruction_set,
                                          l_micro_kernel_config.a_vmove_instruction,
                                          l_gp_reg_mapping.gp_reg_a,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          l_k*l_soa_width*l_micro_kernel_config.datatype_size,
                                          l_micro_kernel_config.vector_name,
                                          l_max_reg_block, 0, 1, 0 );
        /* loop over the columns of B/C */
        for ( l_n = 0; l_n < l_n_limit - l_n_processed; l_n++ ) {
          if ( l_nnz_idx[l_n][0] != -1 ) {
            if ( strcmp(i_arch, "knl") == 0 ||
                 strcmp(i_arch, "knm") == 0 ||
                 strcmp(i_arch, "skx") == 0 ||
                 strcmp(i_arch, "clx") == 0 ||
                 strcmp(i_arch, "cpx") == 0 ) {
              libxsmm_x86_instruction_vec_compute_mem( io_generated_code,
                                                       l_micro_kernel_config.instruction_set,
                                                       l_micro_kernel_config.vmul_instruction,
                                                       1,
                                                       l_gp_reg_mapping.gp_reg_b,
                                                       LIBXSMM_X86_GP_REG_UNDEF,
                                                       0,
                                                       l_nnz_idx[l_n][0] * l_micro_kernel_config.datatype_size,
                                                       l_micro_kernel_config.vector_name,
                                                       l_max_reg_block,
                                                       l_n );
            } else if ( strcmp(i_arch, "hsw") == 0 ) {
              libxsmm_x86_instruction_vec_move( io_generated_code,
                                                l_micro_kernel_config.instruction_set,
                                                l_micro_kernel_config.b_vmove_instruction,
                                                l_gp_reg_mapping.gp_reg_b,
                                                LIBXSMM_X86_GP_REG_UNDEF, 0,
                                                l_nnz_idx[l_n][0] * l_micro_kernel_config.datatype_size,
                                                l_micro_kernel_config.vector_name,
                                                15, 0, 1, 0 );
              libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
                                                       l_micro_kernel_config.instruction_set,
                                                       l_micro_kernel_config.vmul_instruction,
                                                       l_micro_kernel_config.vector_name,
                                                       l_max_reg_block,
                                                       15,
                                                       l_n );
            } else {
              libxsmm_x86_instruction_vec_move( io_generated_code,
                                                l_micro_kernel_config.instruction_set,
                                                l_micro_kernel_config.b_vmove_instruction,
                                                l_gp_reg_mapping.gp_reg_b,
                                                LIBXSMM_X86_GP_REG_UNDEF, 0,
                                                l_nnz_idx[l_n][0] * l_micro_kernel_config.datatype_size,
                                                l_micro_kernel_config.vector_name,
                                                15, 0, 1, 0 );
              libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
                                                       l_micro_kernel_config.instruction_set,
                                                       l_micro_kernel_config.vmul_instruction,
                                                       l_micro_kernel_config.vector_name,
                                                       l_max_reg_block,
                                                       15,
                                                       15 );
              libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
                                                       l_micro_kernel_config.instruction_set,
                                                       l_micro_kernel_config.vadd_instruction,
                                                       l_micro_kernel_config.vector_name,
                                                       15,
                                                       l_n,
                                                       l_n );
            }
          }
        }
      } else {
        /* shouldn't happen */
      }
    }

    /* store C accumulator */
    for ( l_n = 0; l_n < l_n_limit - l_n_processed; l_n++ ) {
      libxsmm_x86_instruction_vec_move( io_generated_code,
                                        l_micro_kernel_config.instruction_set,
                                        l_micro_kernel_config.c_vmove_instruction,
                                        l_gp_reg_mapping.gp_reg_c,
                                        LIBXSMM_X86_GP_REG_UNDEF, 0,
                                        (l_n_processed + l_n)*l_soa_width*l_micro_kernel_config.datatype_size,
                                        l_micro_kernel_config.vector_name,
                                        l_n, 0, 0, 1 );
    }

    /* adjust n progression */
    l_n_processed += l_n_chunksize;
    l_n_limit = LIBXSMM_MIN(l_n_processed + l_n_chunksize, l_max_cols);
  }

  /* advance C pointer */
  libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_c,
                                     l_micro_kernel_config.datatype_size*l_soa_width*i_xgemm_desc->ldc);

  /* advance A pointer */
  libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_a,
                                   l_micro_kernel_config.datatype_size*l_soa_width*i_xgemm_desc->lda);

  /* close m loop */
  libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_cmp_instruction, l_gp_reg_mapping.gp_reg_mloop, i_xgemm_desc->m );
  libxsmm_x86_instruction_jump_back_to_label( io_generated_code, l_micro_kernel_config.alu_jmp_instruction, &l_loop_label_tracker );

  /* close asm */
  libxsmm_x86_instruction_close_stream( io_generated_code, &l_gp_reg_mapping, i_arch, i_xgemm_desc->prefetch );
}
예제 #14
0
LIBXSMM_API_DEFINITION void libxsmm_gemm_configure(int archid, int prefetch)
{
  int config = 0;
  LIBXSMM_UNUSED(prefetch);
  internal_gemm_prefetch = LIBXSMM_PREFETCH_AL2_AHEAD;
  internal_gemm_nt = 2;
  internal_gemm = 2;
  {
    /* behaviour of libxsmm_omp_?gemm routines or LD_PRELOAD ?GEMM routines
     * 0: sequential below-threshold routine (no OpenMP); may fall-back to BLAS,
     * 1: OpenMP-parallelized but without internal parallel region,
     * 2: OpenMP-parallelized with internal parallel region" )
     */
    const char *const env = getenv("LIBXSMM_GEMM");
    if (0 != env && 0 != *env) {
      internal_gemm = atoi(env);
    }
  }
#if defined(LIBXSMM_EXT_GEMM_TASKS)
  { /* consider user input about using (OpenMP-)tasks; this code must be here
    * because maybe only this translation unit is compiled with OpenMP support
    */
    const char *const env_tasks = getenv("LIBXSMM_TASKS");
    if (0 != env_tasks && 0 != *env_tasks) {
      internal_gemm_tasks = atoi(env_tasks);
    }
  }
#endif
#if defined(__MIC__)
  LIBXSMM_UNUSED(archid);
#else
  if (LIBXSMM_X86_AVX512_MIC == archid)
#endif
  {
    internal_gemm_nt = 4;
    config = 1;
  }
  { /* attempt to setup tile sizes from the environment (LIBXSMM_M, LIBXSMM_N, and LIBXSMM_K) */
    const int tile_configs[/*configs*/][2/*DP/SP*/][3/*TILE_M,TILE_N,TILE_K*/] = {
      { { 72, 32, 16 }, { 72, 32, 16 } }, /*generic*/
      { { 72, 32, 16 }, { 72, 32, 16 } }  /*knl*/
    };
    const char* env[3];
    env[0] = getenv("LIBXSMM_M"); env[1] = getenv("LIBXSMM_N"); env[2] = getenv("LIBXSMM_K");
    internal_gemm_tile[0/*DP*/][0/*M*/] = (env[0] ? atoi(env[0]) : 0);
    internal_gemm_tile[0/*DP*/][1/*N*/] = (env[1] ? atoi(env[1]) : 0);
    internal_gemm_tile[0/*DP*/][2/*K*/] = (env[2] ? atoi(env[2]) : 0);
    /* environment-defined tile sizes applies for DP and SP */
    internal_gemm_tile[1/*SP*/][0/*M*/] = internal_gemm_tile[0/*DP*/][0];
    internal_gemm_tile[1/*SP*/][1/*N*/] = internal_gemm_tile[0/*DP*/][1];
    internal_gemm_tile[1/*SP*/][2/*K*/] = internal_gemm_tile[0/*DP*/][2];
    /* load predefined configuration if tile size is not setup by the environment */
    if (0 >= internal_gemm_tile[0/*DP*/][0/*M*/]) internal_gemm_tile[0][0] = tile_configs[config][0][0];
    if (0 >= internal_gemm_tile[0/*DP*/][1/*N*/]) internal_gemm_tile[0][1] = tile_configs[config][0][1];
    if (0 >= internal_gemm_tile[0/*DP*/][2/*K*/]) internal_gemm_tile[0][2] = tile_configs[config][0][2];
    if (0 >= internal_gemm_tile[1/*SP*/][0/*M*/]) internal_gemm_tile[1][0] = tile_configs[config][1][0];
    if (0 >= internal_gemm_tile[1/*SP*/][1/*N*/]) internal_gemm_tile[1][1] = tile_configs[config][1][1];
    if (0 >= internal_gemm_tile[1/*SP*/][2/*K*/]) internal_gemm_tile[1][2] = tile_configs[config][1][2];
  }
#if defined(__STATIC) && defined(LIBXSMM_BUILD) && !defined(__CYGWIN__) && \
  !(defined(__APPLE__) && defined(__MACH__) /*&& defined(__clang__)*/)
  if (0 == libxsmm_original_sgemm) {
    libxsmm_original_sgemm = LIBXSMM_FSYMBOL(__real_sgemm);
  }
#endif
#if !defined(__BLAS) || (0 != __BLAS)
  if (0 == libxsmm_original_sgemm) {
    libxsmm_original_sgemm = LIBXSMM_FSYMBOL(sgemm);
  }
#endif
#if defined(LIBXSMM_RTLD_NEXT)
  if (0 == libxsmm_original_sgemm) {
    union { const void* pv; libxsmm_sgemm_function pf; } gemm = { NULL };
    gemm.pv = dlsym(RTLD_NEXT, LIBXSMM_STRINGIFY(LIBXSMM_FSYMBOL(sgemm)));
    libxsmm_original_sgemm = gemm.pf;
  }
#endif
#if defined(__STATIC) && defined(LIBXSMM_BUILD) && !defined(__CYGWIN__) && \
  !(defined(__APPLE__) && defined(__MACH__) /*&& defined(__clang__)*/)
  if (0 == libxsmm_original_dgemm) {
    libxsmm_original_dgemm = LIBXSMM_FSYMBOL(__real_dgemm);
  }
#endif
#if !defined(__BLAS) || (0 != __BLAS)
  if (0 == libxsmm_original_dgemm) {
    libxsmm_original_dgemm = LIBXSMM_FSYMBOL(dgemm);
  }
#endif
#if defined(LIBXSMM_RTLD_NEXT)
  if (0 == libxsmm_original_dgemm) {
    union { const void* pv; libxsmm_dgemm_function pf; } gemm = { NULL };
    gemm.pv = dlsym(RTLD_NEXT, LIBXSMM_STRINGIFY(LIBXSMM_FSYMBOL(dgemm)));
    libxsmm_original_dgemm = gemm.pf;
  }
#endif
}
예제 #15
0
LIBXSMM_API
#if defined(_WIN32)
/*TODO: no inline*/
#elif defined(__GNUC__)
/*LIBXSMM_ATTRIBUTE(noinline)*/
#endif
const char* libxsmm_trace_info(unsigned int* depth, unsigned int* threadid, const int* filter_threadid, const int* filter_mindepth, const int* filter_maxnsyms)
{
  const char *fname = NULL;
#if defined(LIBXSMM_TRACE)
  const int max_n = (0 != depth ? (LIBXSMM_TRACE_MAXDEPTH) : 2);
  const int min_n = (0 != depth ? (LIBXSMM_TRACE_MINDEPTH + *depth) : 2);
  void *stacktrace[LIBXSMM_TRACE_MAXDEPTH], **symbol = stacktrace + LIBXSMM_MIN(0 != depth ? ((int)(*depth + 1)) : 1, max_n - 1);
  static LIBXSMM_TLS int cerberus = 0;
  int i;

  /* check against entering a recursion (recursion should not happen due to
   * attribute "no_instrument_function" but better prevent this in any case)
   */
  if (0 == cerberus) {
    ++cerberus;
# if defined(__GNUC__)
    __asm__("");
# endif
    i = LIBXSMM_ATOMIC_LOAD(&internal_trace_initialized, LIBXSMM_ATOMIC_RELAXED);
    if (0 <= i) { /* do nothing if not yet initialized */
      const int mindepth = (0 != filter_mindepth ? *filter_mindepth : internal_trace_mindepth);
      const int maxnsyms = (0 != filter_maxnsyms ? *filter_maxnsyms : internal_trace_maxnsyms);
      i = libxsmm_backtrace(stacktrace, max_n);
      /* filter depth against filter_mindepth and filter_maxnsyms */
      if ((0 >= mindepth ||      (min_n + mindepth) <= i) &&
          (0 >  maxnsyms || i <= (min_n + mindepth + maxnsyms - 1)))
      {
        if (min_n <= i) { /* check against min. depth */
          const int filter = (0 != filter_threadid ? *filter_threadid : internal_trace_threadid);
          int abs_tid = 0;
# if defined(_WIN32) || defined(__CYGWIN__)
          static LIBXSMM_TLS char buffer[sizeof(SYMBOL_INFO)+LIBXSMM_TRACE_SYMBOLSIZE];
          static LIBXSMM_TLS int tid = 0;

          PSYMBOL_INFO value = (PSYMBOL_INFO)buffer;
          value->SizeOfStruct = sizeof(SYMBOL_INFO);
          value->MaxNameLen = LIBXSMM_TRACE_SYMBOLSIZE - 1;

          if (0 != tid) {
            abs_tid = (0 <= tid ? tid : -tid);
          }
          else {
            abs_tid = LIBXSMM_ATOMIC_ADD_FETCH(&internal_trace_initialized, 1, LIBXSMM_ATOMIC_RELAXED);
            /* use sign bit to flag enabled fall-back for symbol resolution */
            tid = -abs_tid;
          }

          assert(0 < abs_tid);
          if (0 > filter || filter == abs_tid - 1) {
            if (FALSE != SymFromAddr(GetCurrentProcess(), (DWORD64)*symbol, NULL, value)
              && 0 < value->NameLen)
            {
              /* disable fall-back allowing unresolved symbol names */
              tid = abs_tid; /* make unsigned */
              fname = value->Name;
            }
            else if (0 > tid) { /* fall-back allowing unresolved symbol names */
#   if defined(__MINGW32__)
              sprintf(buffer, "%p", *symbol);
#   else
              sprintf(buffer, "0x%" PRIxPTR, (uintptr_t)*symbol);
#   endif
              fname = buffer;
            }
            if (depth) *depth = i - min_n;
            if (threadid) *threadid = abs_tid - 1;
          }
# else
#   if defined(LIBXSMM_NO_SYNC)
          static char raw_c;
          char */*const*/ raw_value = &raw_c; /* const: avoid warning (below / constant control-flow) */
#   else
          char *const raw_value = (char*)pthread_getspecific(internal_trace_key);
#   endif
          int* ivalue = 0, fd = -1;
          char* value = 0;

          if (raw_value) {
            ivalue = (int*)raw_value;
            abs_tid = (0 <= ivalue[1] ? ivalue[1] : -ivalue[1]);

            if (0 > filter || filter == abs_tid - 1) {
              fd = ivalue[0];
              if (0 <= fd && (sizeof(int) * 2) == lseek(fd, sizeof(int) * 2, SEEK_SET)) {
                value = raw_value + sizeof(int) * 2;
              }
#   if !defined(NDEBUG) /* library code is expected to be mute */
              else {
                fprintf(stderr, "LIBXSMM ERROR: failed to get buffer\n");
              }
#   endif
            }
          }
          else {
            char filename[] = "/tmp/.libxsmm_XXXXXX.map";
#if defined(__GLIBC__) && defined(__GLIBC_MINOR__) && LIBXSMM_VERSION2(2, 19) <= LIBXSMM_VERSION2(__GLIBC__, __GLIBC_MINOR__)
            fd = mkstemps(filename, 4/*.map*/);
#else
            char *const xpos = strrchr(filename, 'X');
            const char c = (char)(NULL != xpos ? *(xpos + 1) : 0);
            if (0 != c) {
              xpos[1] = 0;
              fd = mkstemp(filename);
              xpos[1] = c;
            }
            else {
              fd = -1;
            }
#endif
            if (0 <= fd && 0 == posix_fallocate(fd, 0, LIBXSMM_TRACE_SYMBOLSIZE)) {
              char *const buffer = (char*)mmap(NULL, LIBXSMM_TRACE_SYMBOLSIZE,
                PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

              if (MAP_FAILED != buffer) {
                int check = -1;
                ivalue = (int*)buffer;
                ivalue[0] = fd; /* valid file descriptor for internal_delete */

                if (
#   if !defined(LIBXSMM_NO_SYNC)
                  0 == pthread_setspecific(internal_trace_key, buffer) &&
#   endif
                     (sizeof(int) * 1) == read(fd, &check, sizeof(int))
                  && (sizeof(int) * 2) == lseek(fd, sizeof(int), SEEK_CUR)
                  && check == fd)
                {
                  abs_tid = LIBXSMM_ATOMIC_ADD_FETCH(&internal_trace_initialized, 1, LIBXSMM_ATOMIC_RELAXED);
                  assert(0 < abs_tid);
                  /* use sign bit to flag enabled fall-back for symbol resolution */
                  ivalue[1] = -abs_tid;

                  if (0 > filter || filter == abs_tid - 1) {
                    value = buffer + sizeof(int) * 2;
                  }
                }
                else {
#   if !defined(NDEBUG) /* library code is expected to be mute */
                  fprintf(stderr, "LIBXSMM ERROR: failed to setup buffer\n");
#   endif
                  internal_delete(buffer);
                }
              }
#   if !defined(NDEBUG)
              else {
                const int error = errno;
                fprintf(stderr, "LIBXSMM ERROR: %s (mmap allocation error #%i)\n",
                  strerror(error), error);
              }
#   endif
            }
#   if !defined(NDEBUG) /* library code is expected to be mute */
            else {
              fprintf(stderr, "LIBXSMM ERROR: failed to setup file descriptor (%i)\n", fd);
            }
#   endif
          }

          if (value) {
            backtrace_symbols_fd(symbol, 1, fd);

            /* attempt to parse symbol name */
            if (1 == sscanf(value, "%*[^(](%s0x", value)) {
              char* c;
              for (c = value; '+' != *c && *c; ++c);
              if ('+' == *c) {
                /* disable fall-back allowing unresolved symbol names */
                ivalue[1] = abs_tid; /* make unsigned */
                fname = value;
                *c = 0;
              }
            }

            /* fall-back to symbol address */
            if (0 > ivalue[1] && 0 == fname) {
              sprintf(value, "0x%llx", (unsigned long long)*symbol);
              fname = value;
            }

            if (depth) *depth = i - min_n;
            if (threadid) *threadid = abs_tid - 1;
          }
# endif
        }
      }
    }

    --cerberus;
  }
#else
  LIBXSMM_UNUSED(depth); LIBXSMM_UNUSED(threadid);
  LIBXSMM_UNUSED(filter_threadid);
  LIBXSMM_UNUSED(filter_mindepth);
  LIBXSMM_UNUSED(filter_maxnsyms);
#endif

  return fname;
}
예제 #16
0
void LIBXSMM_FSYMBOL(sgemm)(
  const char* transa, const char* transb,
  const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
  const float* alpha, const float* a, const libxsmm_blasint* lda,
  const float* b, const libxsmm_blasint* ldb,
  const float* beta, float* c, const libxsmm_blasint* ldc)
{
#if !defined(NDEBUG) /* library code is expected to be mute */
  static int error_once = 0;
  if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) {
    fprintf(stderr, "LIBXSMM: application must be linked against a LAPACK/BLAS implementation!\n");
  }
#endif
  LIBXSMM_UNUSED(transa); LIBXSMM_UNUSED(transb); LIBXSMM_UNUSED(m); LIBXSMM_UNUSED(n); LIBXSMM_UNUSED(k);
  LIBXSMM_UNUSED(alpha); LIBXSMM_UNUSED(a); LIBXSMM_UNUSED(lda); LIBXSMM_UNUSED(b); LIBXSMM_UNUSED(ldb);
  LIBXSMM_UNUSED(beta); LIBXSMM_UNUSED(c); LIBXSMM_UNUSED(ldc);
}
예제 #17
0
void libxsmm_generator_sparse_asparse( libxsmm_generated_code*         io_generated_code,
                                       const libxsmm_gemm_descriptor* i_xgemm_desc,
                                       const char*                     i_arch,
                                       const unsigned int*             i_row_idx,
                                       const unsigned int*             i_column_idx,
                                       const double*                   i_values ) {
  char l_new_code[512];
  int l_max_code_length = 511;
  int l_code_length = 0;
  unsigned int l_k;
  unsigned int l_flop_count = 0;

  LIBXSMM_UNUSED(i_arch);
  LIBXSMM_UNUSED(i_values);

  /* loop over columns in C in generated code, we fully unroll inside each column */
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "  unsigned int l_n = 0;\n  #pragma nounroll_and_jam\n  for ( l_n = 0; l_n < %u; l_n++) {\n", i_xgemm_desc->n);
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

  /* reset the current column in C if needed */
  if ( i_xgemm_desc->beta == 0 ) {
    l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "    unsigned int l_m = 0;\n");
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    if ( i_xgemm_desc->m > 1 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "   #pragma simd\n");
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    }
    if ( (LIBXSMM_GEMM_FLAG_F32PREC & i_xgemm_desc->flags) == 0 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "    for ( l_m = 0; l_m < %u; l_m++) {\n      C[(l_n*%u)+l_m] = 0.0;\n    }\n", i_xgemm_desc->m, i_xgemm_desc->ldc);
    } else {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "    for ( l_m = 0; l_m < %u; l_m++) {\n      C[(l_n*%u)+l_m] = 0.0f;\n    }\n", i_xgemm_desc->m, i_xgemm_desc->ldc);
    }
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  }

  /* loop over columns in A, rows in B and fully unroll */
  for ( l_k = 0; l_k < i_xgemm_desc->k; l_k++ ) {
    unsigned int l_column_elements = i_column_idx[l_k + 1] - i_column_idx[l_k];
    unsigned int l_z = 0;

    l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) || defined(__AVX__)\n");
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

    if ( l_column_elements > 0 ) {
      if ( (LIBXSMM_GEMM_FLAG_F32PREC & i_xgemm_desc->flags) == 0 ) {
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && defined(__AVX__)\n    __m256d b%u = _mm256_broadcast_sd(&B[(l_n*%u)+%u]);\n#endif\n", l_k, i_xgemm_desc->ldb, l_k);
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && !defined(__AVX__)\n    __m128d b%u = _mm_loaddup_pd(&B[(l_n*%u)+%u]);\n#endif\n", l_k, i_xgemm_desc->ldb, l_k);
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      } else {
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && defined(__AVX__)\n    __m128 b%u = _mm_broadcast_ss(&B[(l_n*%u)+%u]);\n#endif\n", l_k, i_xgemm_desc->ldb, l_k);
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && !defined(__AVX__)\n    __m128 b%u = _mm_load_ss(&B[(l_n*%u)+%u]);    b%u = _mm_shuffle_ps(b%u, b%u, 0x00);\n#endif\n", l_k, i_xgemm_desc->ldb, l_k, l_k, l_k, l_k);
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      }
    }

    /* loop over the columns of A and look for vectorization potential */
    for ( l_z = 0; l_z < l_column_elements; l_z++ ) {
      /* 4 element vector might be possible */
      if ( (l_z < (l_column_elements - 3)) && (l_column_elements > 3) ) {
        /* check for 256bit vector instruction */
        if ((i_row_idx[i_column_idx[l_k] + l_z] + 1 == i_row_idx[i_column_idx[l_k] + l_z + 1]) &&
            (i_row_idx[i_column_idx[l_k] + l_z] + 2 == i_row_idx[i_column_idx[l_k] + l_z + 2]) &&
            (i_row_idx[i_column_idx[l_k] + l_z] + 3 == i_row_idx[i_column_idx[l_k] + l_z + 3]) &&
            (i_row_idx[i_column_idx[l_k] + l_z + 3] < i_xgemm_desc->m)) {
          libxsmm_sparse_asparse_innerloop_four_vector(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx);
          l_z += 3;
        /* check for 128bit vector instruction */
        } else if ((i_row_idx[i_column_idx[l_k] + l_z] + 1 == i_row_idx[i_column_idx[l_k] + l_z + 1]) &&
                   (i_row_idx[i_column_idx[l_k] + l_z + 1] < i_xgemm_desc->m) ) {
          libxsmm_sparse_asparse_innerloop_two_vector(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx);
          l_z++;
        /* scalare instruction */
        } else {
          if ( (i_row_idx[i_column_idx[l_k] + l_z] < i_xgemm_desc->m) ) {
            libxsmm_sparse_asparse_innerloop_scalar(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx);
          }
        }
      /* 2 element vector might be possible */
      } else if ( (l_z < (l_column_elements - 1)) && (l_column_elements > 1)) {
        /* check for 128bit vector instruction */
        if ((i_row_idx[i_column_idx[l_k] + l_z] + 1 == i_row_idx[i_column_idx[l_k] + l_z + 1]) &&
            (i_row_idx[i_column_idx[l_k] + l_z + 1] < i_xgemm_desc->m) ) {
          libxsmm_sparse_asparse_innerloop_two_vector(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx);
          l_z++;
        /* scalare instruction */
        } else {
          if ( (i_row_idx[i_column_idx[l_k] + l_z] < i_xgemm_desc->m) ) {
            libxsmm_sparse_asparse_innerloop_scalar(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx);
          }
        }
      /* scalar anayways */
      } else {
        if ( (i_row_idx[i_column_idx[l_k] + l_z] < i_xgemm_desc->m) ) {
          libxsmm_sparse_asparse_innerloop_scalar(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx);
        }
      }
    }

    /* C fallback code */
    l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#else\n");
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

    /* loop over the columns of A */
    for ( l_z = 0; l_z < l_column_elements; l_z++ ) {
      if ( (i_row_idx[i_column_idx[l_k] + l_z] < i_xgemm_desc->m) ) {
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "    C[(l_n*%u)+%u] += A[%u] * B[(l_n*%u)+%u];\n", i_xgemm_desc->ldc, i_row_idx[i_column_idx[l_k] + l_z], i_column_idx[l_k] + l_z, i_xgemm_desc->ldb, l_k );
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
        l_flop_count += 2;
      }
    }

    l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#endif\n\n");
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  }

  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "  }\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

  /* add flop counter */
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "\n#ifndef NDEBUG\n#ifdef _OPENMP\n#pragma omp atomic\n#endif\nlibxsmm_num_total_flops += %u;\n#endif\n", l_flop_count * i_xgemm_desc->n);
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
}
예제 #18
0
LIBXSMM_API_INTERN
void libxsmm_generator_spgemm_csc_bsparse( libxsmm_generated_code*         io_generated_code,
                                           const libxsmm_gemm_descriptor*  i_xgemm_desc,
                                           const char*                     i_arch,
                                           const unsigned int*             i_row_idx,
                                           const unsigned int*             i_column_idx,
                                           const double*                   i_values ) {
  unsigned int l_n;
  unsigned int l_z;
  unsigned int l_column_elements;
  unsigned int l_flop_count = 0;

  char l_new_code[512];
  int l_max_code_length = 511;
  int l_code_length = 0;

  LIBXSMM_UNUSED(i_values);

  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "  unsigned int l_m = 0;\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

  /* reset C if beta is zero */
  if (0 != (LIBXSMM_GEMM_FLAG_BETA_0 & i_xgemm_desc->flags)) { /* Beta=0 */
    l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "  unsigned int l_n = 0;\n");
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "  for ( l_n = 0; l_n < %u; l_n++) {\n", (unsigned int)i_xgemm_desc->n);
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    if ( i_xgemm_desc->m > 1 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "    #pragma simd\n");
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "    #pragma vector aligned\n");
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    }
    if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "    for ( l_m = 0; l_m < %u; l_m++) { C[(l_n*%u)+l_m] = 0.0; }\n", (unsigned int)i_xgemm_desc->m, (unsigned int)i_xgemm_desc->ldc);
    } else {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "    for ( l_m = 0; l_m < %u; l_m++) { C[(l_n*%u)+l_m] = 0.0f; }\n", (unsigned int)i_xgemm_desc->m, (unsigned int)i_xgemm_desc->ldc);
    }
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "  }\n");
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  }
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

  /* determine the correct simd pragma for each architecture */
  if ( ( strcmp( i_arch, "noarch" ) == 0 ) ||
       ( strcmp( i_arch, "wsm" ) == 0 )    ||
       ( strcmp( i_arch, "snb" ) == 0 )    ||
       ( strcmp( i_arch, "hsw" ) == 0 ) ) {
    if ( i_xgemm_desc->m > 7 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "  #pragma simd vectorlength(8)\n");
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    } else if ( i_xgemm_desc->m > 3 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "  #pragma simd vectorlength(4)\n");
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    } else if ( i_xgemm_desc->m > 1 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "  #pragma simd vectorlength(2)\n");
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    } else {}

    if ( (i_xgemm_desc->m > 1)          &&
         ((LIBXSMM_GEMM_FLAG_ALIGN_A & i_xgemm_desc->flags) != 0) &&
         ((LIBXSMM_GEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0) ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "  #pragma vector aligned\n");
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    }
  } else if ( ( strcmp( i_arch, "knl" ) == 0 ) ||
              ( strcmp( i_arch, "skx" ) == 0 ) ||
              ( strcmp( i_arch, "clx" ) == 0 ) ||
              ( strcmp( i_arch, "cpx" ) == 0 ) ) {
    if ( (i_xgemm_desc->m > 1)          &&
         ((LIBXSMM_GEMM_FLAG_ALIGN_A & i_xgemm_desc->flags) != 0) &&
         ((LIBXSMM_GEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0) ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "  #pragma simd vectorlength(32)\n  #pragma vector aligned\n");
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    }
  } else {
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH );
    return;
  }

  /* generate the actuel kernel */
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "  for ( l_m = 0; l_m < %u; l_m++) {\n", (unsigned int)i_xgemm_desc->m);
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

  for ( l_n = 0; l_n < (unsigned int)i_xgemm_desc->n; l_n++ ) {
    l_column_elements = i_column_idx[l_n+1] - i_column_idx[l_n];
    for ( l_z = 0; l_z < l_column_elements; l_z++ ) {
      /* check k such that we just use rows which actually need to be multiplied */
      if ( i_row_idx[i_column_idx[l_n] + l_z] < (unsigned int)i_xgemm_desc->k ) {
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "    C[%u+l_m] += A[%u+l_m] * B[%u];\n", l_n * i_xgemm_desc->ldc, i_row_idx[i_column_idx[l_n] + l_z]*i_xgemm_desc->lda, i_column_idx[l_n] + l_z);
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
        l_flop_count += 2;
      }
    }
  }

  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "  }\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

  /* add flop counter */
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "\n#ifndef NDEBUG\n#ifdef _OPENMP\n#pragma omp atomic\n#endif\nlibxsmm_num_total_flops += %u;\n#endif\n", l_flop_count * (unsigned int)i_xgemm_desc->m);
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
}
예제 #19
0
void LIBXSMM_FSYMBOL(dgemm)(LIBXSMM_GEMM_CONST char* transa, LIBXSMM_GEMM_CONST char* transb,
  LIBXSMM_GEMM_CONST libxsmm_blasint* m, LIBXSMM_GEMM_CONST libxsmm_blasint* n, LIBXSMM_GEMM_CONST libxsmm_blasint* k,
  LIBXSMM_GEMM_CONST double* alpha, LIBXSMM_GEMM_CONST double* a, LIBXSMM_GEMM_CONST libxsmm_blasint* lda,
  LIBXSMM_GEMM_CONST double* b, LIBXSMM_GEMM_CONST libxsmm_blasint* ldb,
  LIBXSMM_GEMM_CONST double* beta, double* c, LIBXSMM_GEMM_CONST libxsmm_blasint* ldc)
{
#if !defined(NDEBUG) /* library code is expected to be mute */
  static int error_once = 0;
  if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) {
    fprintf(stderr, "LIBXSMM ERROR: application must be linked against LAPACK/BLAS!\n");
  }
#endif
  LIBXSMM_UNUSED(transa); LIBXSMM_UNUSED(transb); LIBXSMM_UNUSED(m); LIBXSMM_UNUSED(n); LIBXSMM_UNUSED(k);
  LIBXSMM_UNUSED(alpha); LIBXSMM_UNUSED(a); LIBXSMM_UNUSED(lda); LIBXSMM_UNUSED(b); LIBXSMM_UNUSED(ldb);
  LIBXSMM_UNUSED(beta); LIBXSMM_UNUSED(c); LIBXSMM_UNUSED(ldc);
}
예제 #20
0
LIBXSMM_API void __cyg_profile_func_exit(void* this_fn, void* call_site)
{
  LIBXSMM_UNUSED(this_fn); LIBXSMM_UNUSED(call_site); /* suppress warning */
}