/* Dispatch entry for SOA sparse GEMM with a CSC-encoded sparse B operand.
 * Validates the leading dimensions of the dense operands and forwards to the
 * B-sparse SOA generator; any other lda/ldb/ldc combination is rejected. */
LIBXSMM_API void libxsmm_generator_spgemm_csc_soa_kernel( libxsmm_generated_code*        io_generated_code,
                                                          const libxsmm_gemm_descriptor* i_xgemm_desc,
                                                          const char*                    i_arch,
                                                          const unsigned int*            i_row_idx,
                                                          const unsigned int*            i_column_idx,
                                                          const void*                    i_values ) {
  /* the only supported layout here: B is sparse (ldb == 0), A and C are dense */
  const int l_b_is_sparse = (i_xgemm_desc->lda > 0) && (i_xgemm_desc->ldb == 0) && (i_xgemm_desc->ldc > 0);

  if ( !l_b_is_sparse ) {
    /* something bad happened... */
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_SPGEMM_GEN );
    return;
  }

  /* check LDA: must cover the K dimension of dense A */
  if ( i_xgemm_desc->lda < i_xgemm_desc->k ) {
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDA );
    return;
  }

  /* check LDC: must cover the N dimension of dense C */
  if ( i_xgemm_desc->ldc < i_xgemm_desc->n ) {
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDC );
    return;
  }

  libxsmm_generator_spgemm_csc_bsparse_soa( io_generated_code, i_xgemm_desc, i_arch,
                                            i_row_idx, i_column_idx, i_values );
}
/* Dispatch entry for sparse GEMM with CSR-encoded data.
 * A-sparse (lda == 0) is generated via libxsmm_generator_spgemm_csr_asparse;
 * B-sparse CSR is not implemented and aborts after validating the LDs;
 * any other lda/ldb/ldc combination is rejected via the error handler. */
LIBXSMM_API void libxsmm_generator_spgemm_csr_kernel( libxsmm_generated_code*        io_generated_code,
                                                      const libxsmm_gemm_descriptor* i_xgemm_desc,
                                                      const char*                    i_arch,
                                                      const unsigned int*            i_row_idx,
                                                      const unsigned int*            i_column_idx,
                                                      const double*                  i_values ) {
  /* classify which operand is the sparse one from the leading dimensions */
  const int l_a_is_sparse = (i_xgemm_desc->lda == 0) && (i_xgemm_desc->ldb > 0) && (i_xgemm_desc->ldc > 0);
  const int l_b_is_sparse = (i_xgemm_desc->lda > 0) && (i_xgemm_desc->ldb == 0) && (i_xgemm_desc->ldc > 0);

  if ( l_a_is_sparse ) {
    /* check LDB */
    if ( i_xgemm_desc->ldb < i_xgemm_desc->n ) {
      LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDB );
      return;
    }
    /* check LDC */
    if ( i_xgemm_desc->ldc < i_xgemm_desc->n ) {
      LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDC );
      return;
    }
    libxsmm_generator_spgemm_csr_asparse( io_generated_code, i_xgemm_desc, i_arch,
                                          i_row_idx, i_column_idx, i_values );
  } else if ( l_b_is_sparse ) {
    /* check LDA */
    if ( i_xgemm_desc->lda < i_xgemm_desc->k ) {
      LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDA );
      return;
    }
    /* check LDC */
    if ( i_xgemm_desc->ldc < i_xgemm_desc->n ) {
      LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_LDC );
      return;
    }
    /* something bad happened... */
    fprintf(stderr, "LIBXSMM fatal error: B sparse for CSR data structure is not yet available!\n");
    exit(-1);
  } else {
    /* something bad happened... */
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_SPGEMM_GEN );
    return;
  }
}
/* @TODO change int based architecture value */
/* Generates a transpose kernel for the given descriptor, wrapped in an ISA
 * mismatch check; only the AVX/AVX-512 architectures listed below are handled. */
LIBXSMM_API void libxsmm_generator_transpose_kernel( libxsmm_generated_code*        io_generated_code,
                                                     const libxsmm_trans_descriptor* i_trans_desc,
                                                     const char*                    i_arch ) {
  /* architectures backed by the AVX/AVX-512 transpose generator */
  static const char *const l_supported_archs[] = { "skx", "knm", "knl", "hsw", "snb" };
  const size_t l_num_archs = sizeof(l_supported_archs) / sizeof(l_supported_archs[0]);
  size_t l_i;
  int l_arch_supported = 0;

  /* add instruction set mismatch check to code, header */
  libxsmm_generator_isa_check_header( io_generated_code, i_arch );

  for ( l_i = 0; l_i < l_num_archs; ++l_i ) {
    if ( 0 == strcmp( i_arch, l_supported_archs[l_i] ) ) {
      l_arch_supported = 1;
      break;
    }
  }

  /* generate kernel */
  if ( l_arch_supported ) {
    libxsmm_generator_transpose_avx_avx512_kernel( io_generated_code, i_trans_desc, i_arch );
  } else {
    /* TODO fix this error */
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH );
    return; /* note: footer is intentionally skipped on the error path */
  }

  /* add instruction set mismatch check to code, footer */
  libxsmm_generator_isa_check_footer( io_generated_code, i_arch );
}
/* Emits C source text (not JIT binary) for a small GEMM where B is sparse in
 * CSC format: for each nonzero B(k,n) a literal "C[..+l_m] += A[..+l_m] * B[z];"
 * statement is appended inside a single l_m loop over the M dimension.
 * Also emits an optional beta=0 reset of C and arch-specific simd pragmas.
 * i_row_idx / i_column_idx describe the CSC structure; the numeric values
 * (i_values) are unused because the generated code reads B at runtime. */
LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csc_bsparse( libxsmm_generated_code*        io_generated_code,
                                                              const libxsmm_gemm_descriptor* i_xgemm_desc,
                                                              const char*                    i_arch,
                                                              const unsigned int*            i_row_idx,
                                                              const unsigned int*            i_column_idx,
                                                              const double*                  i_values ) {
  unsigned int l_n;               /* column index into B */
  unsigned int l_z;               /* nonzero index within the current B column */
  unsigned int l_column_elements; /* number of nonzeros in the current column */
  unsigned int l_flop_count = 0;  /* flops per l_m iteration (2 per emitted FMA line) */
  char l_new_code[512];
  int l_max_code_length = 511;    /* leaves room for the implicit NUL in l_new_code */
  int l_code_length = 0;
  LIBXSMM_UNUSED(i_values);

  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_m = 0;\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

  /* reset C if beta is zero */
  if (0 != (LIBXSMM_GEMM_FLAG_BETA_0 & i_xgemm_desc->flags)) { /* Beta=0 */
    l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_n = 0;\n");
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_n = 0; l_n < %u; l_n++) {\n", (unsigned int)i_xgemm_desc->n);
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    /* only emit vectorization pragmas when the inner loop has more than one trip */
    if ( i_xgemm_desc->m > 1 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd\n");
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma vector aligned\n");
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    }
    /* literal suffix of the zero constant depends on the input precision */
    if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++) { C[(l_n*%u)+l_m] = 0.0; }\n", (unsigned int)i_xgemm_desc->m, (unsigned int)i_xgemm_desc->ldc);
    } else {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++) { C[(l_n*%u)+l_m] = 0.0f; }\n", (unsigned int)i_xgemm_desc->m, (unsigned int)i_xgemm_desc->ldc);
    }
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " }\n");
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  }
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

  /* determine the correct simd pragma for each architecture */
  if ( ( strcmp( i_arch, "noarch" ) == 0 ) ||
       ( strcmp( i_arch, "wsm" ) == 0 ) ||
       ( strcmp( i_arch, "snb" ) == 0 ) ||
       ( strcmp( i_arch, "hsw" ) == 0 ) ) {
    /* pick the largest power-of-two vector length not exceeding M */
    if ( i_xgemm_desc->m > 7 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd vectorlength(8)\n");
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    } else if ( i_xgemm_desc->m > 3 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd vectorlength(4)\n");
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    } else if ( i_xgemm_desc->m > 1 ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd vectorlength(2)\n");
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    } else {} /* m == 1: no pragma needed */
    /* aligned-access pragma requires both A and C to be flagged aligned */
    if ( (i_xgemm_desc->m > 1) && ((LIBXSMM_GEMM_FLAG_ALIGN_A & i_xgemm_desc->flags) != 0) && ((LIBXSMM_GEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0) ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma vector aligned\n");
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    }
  } else if ( ( strcmp( i_arch, "knl" ) == 0 ) ||
              ( strcmp( i_arch, "skx" ) == 0 ) ||
              ( strcmp( i_arch, "clx" ) == 0 ) ||
              ( strcmp( i_arch, "cpx" ) == 0 ) ) {
    if ( (i_xgemm_desc->m > 1) && ((LIBXSMM_GEMM_FLAG_ALIGN_A & i_xgemm_desc->flags) != 0) && ((LIBXSMM_GEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0) ) {
      l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd vectorlength(32)\n #pragma vector aligned\n");
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    }
  } else {
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH );
    return;
  }

  /* generate the actual kernel */
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++) {\n", (unsigned int)i_xgemm_desc->m);
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  /* unroll over all nonzeros of B: one FMA statement per (row,col) entry */
  for ( l_n = 0; l_n < (unsigned int)i_xgemm_desc->n; l_n++ ) {
    l_column_elements = i_column_idx[l_n+1] - i_column_idx[l_n];
    for ( l_z = 0; l_z < l_column_elements; l_z++ ) {
      /* check k such that we just use rows which actually need to be multiplied */
      if ( i_row_idx[i_column_idx[l_n] + l_z] < (unsigned int)i_xgemm_desc->k ) {
        l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " C[%u+l_m] += A[%u+l_m] * B[%u];\n", l_n * i_xgemm_desc->ldc, i_row_idx[i_column_idx[l_n] + l_z]*i_xgemm_desc->lda, i_column_idx[l_n] + l_z);
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
        l_flop_count += 2; /* one mul + one add */
      }
    }
  }
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " }\n");
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

  /* add flop counter (debug builds only; atomically when OpenMP is active) */
  l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "\n#ifndef NDEBUG\n#ifdef _OPENMP\n#pragma omp atomic\n#endif\nlibxsmm_num_total_flops += %u;\n#endif\n", l_flop_count * (unsigned int)i_xgemm_desc->m);
  libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
}
/* Emits an AVX2/AVX-512 assembly GEMM kernel where A and C are stored in a
 * row-major SOA layout. Structure: an outer M loop; inside it the N dimension
 * is processed either in one shot or in one/two equalized register-blocked
 * ranges, each delegating the K loop to the *_kloop helper. B and C pointers
 * are rewound after the N loop; A and C advance by one SOA row per M step. */
LIBXSMM_API_INTERN void libxsmm_generator_gemm_rm_ac_soa_avx256_512( libxsmm_generated_code*        io_generated_code,
                                                                     const libxsmm_gemm_descriptor* i_xgemm_desc,
                                                                     const char*                    i_arch ) {
  unsigned int l_soa_width = 0;     /* elements per SOA slot (one vector register) */
  unsigned int l_max_reg_block = 0; /* max accumulator registers usable for N blocking */
  unsigned int l_n1_range = 0;      /* extent of the first N range */
  unsigned int l_n2_range = 0;      /* extent of the second N range (0 if unused) */
  unsigned int l_n1_block = 0;      /* block size inside the first range */
  unsigned int l_n2_block = 0;      /* block size inside the second range */
  libxsmm_micro_kernel_config l_micro_kernel_config;
  libxsmm_loop_label_tracker l_loop_label_tracker;
  libxsmm_gp_reg_mapping l_gp_reg_mapping;

  /* select soa width: AVX-512 targets get 512-bit vectors and 28 usable
   * accumulators, everything else 256-bit vectors and 14 accumulators */
  if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) {
    if ( strcmp(i_arch, "knl") == 0 ||
         strcmp(i_arch, "knm") == 0 ||
         strcmp(i_arch, "skx") == 0 ||
         strcmp(i_arch, "clx") == 0 ||
         strcmp(i_arch, "cpx") == 0 ) {
      l_soa_width = 8;
      l_max_reg_block = 28;
    } else {
      l_soa_width = 4;
      l_max_reg_block = 14;
    }
  } else {
    if ( strcmp(i_arch, "knl") == 0 ||
         strcmp(i_arch, "knm") == 0 ||
         strcmp(i_arch, "skx") == 0 ||
         strcmp(i_arch, "clx") == 0 ||
         strcmp(i_arch, "cpx") == 0 ) {
      l_soa_width = 16;
      l_max_reg_block = 28;
    } else {
      l_soa_width = 8;
      l_max_reg_block = 14;
    }
  }

  /* define gp register mapping */
  libxsmm_reset_x86_gp_reg_mapping( &l_gp_reg_mapping );
  /* matching calling convention on Linux */
#if defined(_WIN32) || defined(__CYGWIN__)
  l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RCX;
  l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RDX;
  l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_R8;
  /* TODO: full support for Windows calling convention */
  l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_RDI;
  l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_RSI;
#else /* match calling convention on Linux */
  l_gp_reg_mapping.gp_reg_a = LIBXSMM_X86_GP_REG_RDI;
  l_gp_reg_mapping.gp_reg_b = LIBXSMM_X86_GP_REG_RSI;
  l_gp_reg_mapping.gp_reg_c = LIBXSMM_X86_GP_REG_RDX;
  l_gp_reg_mapping.gp_reg_a_prefetch = LIBXSMM_X86_GP_REG_RCX;
  l_gp_reg_mapping.gp_reg_b_prefetch = LIBXSMM_X86_GP_REG_R8;
#endif
  l_gp_reg_mapping.gp_reg_c_prefetch = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_mloop = LIBXSMM_X86_GP_REG_R12;
  l_gp_reg_mapping.gp_reg_nloop = LIBXSMM_X86_GP_REG_R13;
  l_gp_reg_mapping.gp_reg_kloop = LIBXSMM_X86_GP_REG_R14;
  l_gp_reg_mapping.gp_reg_help_0 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_1 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_2 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_3 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_4 = LIBXSMM_X86_GP_REG_UNDEF;
  l_gp_reg_mapping.gp_reg_help_5 = LIBXSMM_X86_GP_REG_UNDEF;

  /* define loop_label_tracker */
  libxsmm_reset_loop_label_tracker( &l_loop_label_tracker );

  /* define the micro kernel code gen properties */
  libxsmm_generator_gemm_init_micro_kernel_config_fullvector( &l_micro_kernel_config, i_xgemm_desc, i_arch, 0 );

  /* calculate the chunk size of current columns to work on;
   * nonzero return means N cannot be blocked into the register budget */
  if ( libxsmm_compute_equalized_blocking( i_xgemm_desc->n, l_max_reg_block, &l_n1_range, &l_n1_block, &l_n2_range, &l_n2_block ) ) {
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_N_BLOCK );
    return;
  }

  /* open asm */
  libxsmm_x86_instruction_open_stream( io_generated_code, &l_gp_reg_mapping, i_arch, i_xgemm_desc->prefetch );

  /* m loop */
  libxsmm_x86_instruction_register_jump_back_label( io_generated_code, &l_loop_label_tracker );
  libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_mloop, 1 );

  /* loop over n-blocks */
  if ( l_n1_block == i_xgemm_desc->n ) {
    /* no N loop at all: the whole N dimension fits into one register block */
    libxsmm_generator_gemm_rm_ac_soa_avx256_512_kloop( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_micro_kernel_config, i_xgemm_desc, i_arch, l_soa_width, i_xgemm_desc->n );
  } else if ( (l_n1_range > 0) && (l_n2_range > 0) ) {
    /* reset n loop */
    libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_nloop, 0 );
    /* we have two ranges */
    /* first range: iterate in steps of l_n1_block up to l_n1_range */
    libxsmm_x86_instruction_register_jump_back_label( io_generated_code, &l_loop_label_tracker );
    libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_nloop, l_n1_block );
    libxsmm_generator_gemm_rm_ac_soa_avx256_512_kloop( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_micro_kernel_config, i_xgemm_desc, i_arch, l_soa_width, l_n1_block );
    libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_cmp_instruction, l_gp_reg_mapping.gp_reg_nloop, l_n1_range );
    libxsmm_x86_instruction_jump_back_to_label( io_generated_code, l_micro_kernel_config.alu_jmp_instruction, &l_loop_label_tracker );
    /* second range: iterate in steps of l_n2_block until n is reached */
    libxsmm_x86_instruction_register_jump_back_label( io_generated_code, &l_loop_label_tracker );
    libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_nloop, l_n2_block );
    libxsmm_generator_gemm_rm_ac_soa_avx256_512_kloop( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_micro_kernel_config, i_xgemm_desc, i_arch, l_soa_width, l_n2_block );
    libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_cmp_instruction, l_gp_reg_mapping.gp_reg_nloop, i_xgemm_desc->n );
    libxsmm_x86_instruction_jump_back_to_label( io_generated_code, l_micro_kernel_config.alu_jmp_instruction, &l_loop_label_tracker );
    /* reset B pointer (advanced by the kloops across the whole N sweep) */
    libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_sub_instruction, l_gp_reg_mapping.gp_reg_b, i_xgemm_desc->n * l_micro_kernel_config.datatype_size );
    /* reset C pointer (SOA layout: n * soa_width elements were covered) */
    libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_sub_instruction, l_gp_reg_mapping.gp_reg_c, i_xgemm_desc->n * l_soa_width * l_micro_kernel_config.datatype_size );
  } else if ( (l_n1_range > 0) && (l_n2_range == 0) ) {
    /* reset n loop */
    libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_mov_instruction, l_gp_reg_mapping.gp_reg_nloop, 0 );
    /* we have one range */
    libxsmm_x86_instruction_register_jump_back_label( io_generated_code, &l_loop_label_tracker );
    libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_nloop, l_n1_block );
    libxsmm_generator_gemm_rm_ac_soa_avx256_512_kloop( io_generated_code, &l_loop_label_tracker, &l_gp_reg_mapping, &l_micro_kernel_config, i_xgemm_desc, i_arch, l_soa_width, l_n1_block );
    libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_cmp_instruction, l_gp_reg_mapping.gp_reg_nloop, i_xgemm_desc->n );
    libxsmm_x86_instruction_jump_back_to_label( io_generated_code, l_micro_kernel_config.alu_jmp_instruction, &l_loop_label_tracker );
    /* reset B pointer */
    libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_sub_instruction, l_gp_reg_mapping.gp_reg_b, i_xgemm_desc->n * l_micro_kernel_config.datatype_size );
    /* reset C pointer */
    libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_sub_instruction, l_gp_reg_mapping.gp_reg_c, i_xgemm_desc->n * l_soa_width * l_micro_kernel_config.datatype_size );
  } else {
    /* blocking result is inconsistent */
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_N_BLOCK );
    return;
  }

  /* advance A pointer: one SOA row of A per M iteration */
  libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_a, l_micro_kernel_config.datatype_size*l_soa_width*i_xgemm_desc->lda);

  /* advance C pointer: one SOA row of C per M iteration */
  libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_add_instruction, l_gp_reg_mapping.gp_reg_c, l_micro_kernel_config.datatype_size*l_soa_width*i_xgemm_desc->ldc);

  /* close m loop */
  libxsmm_x86_instruction_alu_imm( io_generated_code, l_micro_kernel_config.alu_cmp_instruction, l_gp_reg_mapping.gp_reg_mloop, i_xgemm_desc->m );
  libxsmm_x86_instruction_jump_back_to_label( io_generated_code, l_micro_kernel_config.alu_jmp_instruction, &l_loop_label_tracker );

  /* close asm */
  libxsmm_x86_instruction_close_stream( io_generated_code, &l_gp_reg_mapping, i_arch, i_xgemm_desc->prefetch );
}
/* Emits one AVX microkernel step: load vectors of A, broadcast/load entries of
 * B, and accumulate C tiles with separate vmul/vadd instructions (no FMA).
 * Accumulators occupy the top of the 16-register file starting at
 * 16 - n_blocking*m_blocking; the registers below are used for A/B staging.
 * i_offset >= 0 addresses B at a fixed k-offset; i_offset == -1 means B is
 * streamed and the B pointer is post-incremented by one element. */
LIBXSMM_API_INTERN void libxsmm_generator_gemm_avx_microkernel( libxsmm_generated_code*            io_generated_code,
                                                                const libxsmm_gp_reg_mapping*      i_gp_reg_mapping,
                                                                const libxsmm_micro_kernel_config* i_micro_kernel_config,
                                                                const libxsmm_gemm_descriptor*     i_xgemm_desc,
                                                                const unsigned int                 i_m_blocking,
                                                                const unsigned int                 i_n_blocking,
                                                                const int                          i_offset ) {
  /* deriving register blocking from kernel config */
  unsigned int l_m_blocking = i_m_blocking/i_micro_kernel_config->vector_length;
  /* register blocking counter in n */
  unsigned int l_n = 0;
  /* register blocking counter in m */
  unsigned int l_m = 0;
  /* start register of accumulator */
  unsigned int l_vec_reg_acc_start = 16 - (i_n_blocking * l_m_blocking);

#if !defined(NDEBUG)
  /* debug-build sanity checks on the requested blocking */
  if ( (i_n_blocking > 3) || (i_n_blocking < 1) ) {
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_N_BLOCK );
    return;
  }
  if ( i_m_blocking % i_micro_kernel_config->vector_length != 0 ) {
    LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_M_BLOCK );
    return;
  }
#endif

  if (l_m_blocking == 1) {
    /* load column vectors of A (into register i_n_blocking) */
    libxsmm_x86_instruction_vec_move( io_generated_code,
                                      i_micro_kernel_config->instruction_set,
                                      i_micro_kernel_config->a_vmove_instruction,
                                      i_gp_reg_mapping->gp_reg_a,
                                      LIBXSMM_X86_GP_REG_UNDEF, 0,
                                      0,
                                      i_micro_kernel_config->vector_name,
                                      i_n_blocking, i_micro_kernel_config->use_masking_a_c, 0 );
    /* loop over columns of B */
    for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
      /* post increment of a pointer early */
      if ( l_n == 0 ) {
        libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) );
      }
      /* different ways of using B */
      if ( i_offset != (-1) ) {
        /* fixed-offset addressing: B stays put, offset selects the k row */
        libxsmm_x86_instruction_vec_move( io_generated_code,
                                          i_micro_kernel_config->instruction_set,
                                          i_micro_kernel_config->b_vmove_instruction,
                                          i_gp_reg_mapping->gp_reg_b,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          ((i_micro_kernel_config->datatype_size) * i_offset) + (i_xgemm_desc->ldb * l_n * (i_micro_kernel_config->datatype_size)),
                                          i_micro_kernel_config->vector_name,
                                          l_n, i_micro_kernel_config->use_masking_a_c, 0 );
      } else {
        /* streaming: advance the B pointer after the last column is read */
        libxsmm_x86_instruction_vec_move( io_generated_code,
                                          i_micro_kernel_config->instruction_set,
                                          i_micro_kernel_config->b_vmove_instruction,
                                          i_gp_reg_mapping->gp_reg_b,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size,
                                          i_micro_kernel_config->vector_name,
                                          l_n, i_micro_kernel_config->use_masking_a_c, 0 );
        if ( l_n == (i_n_blocking -1) ) {
          libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b, i_micro_kernel_config->datatype_size );
        }
      }
      /* issue mul-add: product goes to a scratch register, then into the accumulator */
      libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
                                               i_micro_kernel_config->instruction_set,
                                               i_micro_kernel_config->vmul_instruction,
                                               i_micro_kernel_config->vector_name,
                                               i_n_blocking,
                                               l_n,
                                               i_n_blocking + l_n + 1 );
      libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
                                               i_micro_kernel_config->instruction_set,
                                               i_micro_kernel_config->vadd_instruction,
                                               i_micro_kernel_config->vector_name,
                                               i_n_blocking + l_n + 1,
                                               l_vec_reg_acc_start + l_n,
                                               l_vec_reg_acc_start + l_n );
    }
  } else {
    /* broadcast from B -> into vec registers 0 to i_n_blocking */
    if ( i_offset != (-1) ) {
      for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
        libxsmm_x86_instruction_vec_move( io_generated_code,
                                          i_micro_kernel_config->instruction_set,
                                          i_micro_kernel_config->b_vmove_instruction,
                                          i_gp_reg_mapping->gp_reg_b,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          ((i_micro_kernel_config->datatype_size) * i_offset) + (i_xgemm_desc->ldb * l_n * (i_micro_kernel_config->datatype_size)),
                                          i_micro_kernel_config->vector_name,
                                          l_n, i_micro_kernel_config->use_masking_a_c, 0 );
      }
    } else {
      for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
        libxsmm_x86_instruction_vec_move( io_generated_code,
                                          i_micro_kernel_config->instruction_set,
                                          i_micro_kernel_config->b_vmove_instruction,
                                          i_gp_reg_mapping->gp_reg_b,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size,
                                          i_micro_kernel_config->vector_name,
                                          l_n, i_micro_kernel_config->use_masking_a_c, 0 );
      }
      /* streaming B: step the pointer one element forward */
      libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b, i_micro_kernel_config->datatype_size );
    }
    if (l_m_blocking == 3) {
      /* load column vectors of A and multiply with all broadcasted row entries of B;
       * only one A staging register is free, so A loads are interleaved per l_m */
      for ( l_m = 0; l_m < l_m_blocking; l_m++ ) {
        libxsmm_x86_instruction_vec_move( io_generated_code,
                                          i_micro_kernel_config->instruction_set,
                                          i_micro_kernel_config->a_vmove_instruction,
                                          i_gp_reg_mapping->gp_reg_a,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          (i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_m,
                                          i_micro_kernel_config->vector_name,
                                          i_n_blocking, i_micro_kernel_config->use_masking_a_c, 0 );
        for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
          /* post increment early */
          if ( (l_m == (l_m_blocking-1)) && (l_n == 0) ) {
            libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) );
          }
          /* issue mul+add */
          libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
                                                   i_micro_kernel_config->instruction_set,
                                                   i_micro_kernel_config->vmul_instruction,
                                                   i_micro_kernel_config->vector_name,
                                                   i_n_blocking,
                                                   l_n,
                                                   i_n_blocking + l_m + 1 );
          libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
                                                   i_micro_kernel_config->instruction_set,
                                                   i_micro_kernel_config->vadd_instruction,
                                                   i_micro_kernel_config->vector_name,
                                                   i_n_blocking + l_m + 1,
                                                   l_vec_reg_acc_start + l_m + (l_m_blocking * l_n),
                                                   l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) );
        }
      }
    } else {
      /* load column vectors of A and multiply with all broadcasted row entries of B;
       * enough registers are free to stage all A vectors up front */
      for ( l_m = 0; l_m < l_m_blocking; l_m++ ) {
        libxsmm_x86_instruction_vec_move( io_generated_code,
                                          i_micro_kernel_config->instruction_set,
                                          i_micro_kernel_config->a_vmove_instruction,
                                          i_gp_reg_mapping->gp_reg_a,
                                          LIBXSMM_X86_GP_REG_UNDEF, 0,
                                          (i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_m,
                                          i_micro_kernel_config->vector_name,
                                          i_n_blocking+l_m, i_micro_kernel_config->use_masking_a_c, 0 );
      }
      for ( l_m = 0; l_m < l_m_blocking; l_m++ ) {
        for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
          /* post increment early */
          if ( (l_m == (l_m_blocking-1)) && (l_n == 0) ) {
            libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) );
          }
          /* issue mul/add */
          libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
                                                   i_micro_kernel_config->instruction_set,
                                                   i_micro_kernel_config->vmul_instruction,
                                                   i_micro_kernel_config->vector_name,
                                                   i_n_blocking + l_m,
                                                   l_n,
                                                   i_n_blocking + l_m_blocking + l_m );
          libxsmm_x86_instruction_vec_compute_reg( io_generated_code,
                                                   i_micro_kernel_config->instruction_set,
                                                   i_micro_kernel_config->vadd_instruction,
                                                   i_micro_kernel_config->vector_name,
                                                   i_n_blocking + l_m_blocking + l_m,
                                                   l_vec_reg_acc_start + l_m + (l_m_blocking * l_n),
                                                   l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) );
        }
      }
    }
  }
}