/**
 * Appends a piece of text to the string buffer held by io_generated_code.
 *
 * A new buffer is allocated that holds the old content followed by at most
 * i_append_length characters of i_code_to_append and a terminating '\0'.
 * generated_code, code_size and buffer_size are updated accordingly.
 *
 * @param io_generated_code  code object to extend; its code_type must be 0 or 1,
 *                           otherwise LIBXSMM_ERR_APPEND_STR is reported.
 * @param i_code_to_append   text to append; NULL triggers a warning on stderr and
 *                           is treated as an empty string.
 * @param i_append_length    maximum number of characters taken from
 *                           i_code_to_append; values <= 0 append nothing.
 *
 * On allocation failure LIBXSMM_ERR_ALLOC is reported and the old buffer is left
 * untouched.
 */
void libxsmm_append_code_as_string( libxsmm_generated_code* io_generated_code,
                                    const char*             i_code_to_append,
                                    const int               i_append_length ) {
  size_t l_length_1 = 0;
  size_t l_length_2 = 0;
  char* l_new_string = NULL;
  char* current_code = (char*)io_generated_code->generated_code;

  /* check if end up here accidentally */
  if ( io_generated_code->code_type > 1 ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_APPEND_STR );
    return;
  }

  /* length of the text which is already stored */
  if (current_code != NULL) {
    l_length_1 = io_generated_code->code_size;
  }

  if (i_code_to_append != NULL) {
    /* Take the characters up to the first '\0', but never more than the caller
       announced. This keeps the copy below inside the allocation even when the
       string is longer than i_append_length, and keeps code_size equal to the
       real text length when it is shorter. A negative length appends nothing. */
    if (i_append_length > 0) {
      const size_t l_limit = (size_t)i_append_length;
      while ( (l_length_2 < l_limit) && (i_code_to_append[l_length_2] != '\0') ) {
        l_length_2++;
      }
    }
  } else {
    fprintf(stderr, "LIBXSMM WARNING libxsmm_append_code_as_string was called with an empty string for appending code" );
  }

  /* allocate new string (+1 for the terminator) */
  l_new_string = (char*) malloc( (l_length_1+l_length_2+1)*sizeof(char) );
  if (l_new_string == NULL) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_ALLOC );
    return;
  }

  /* copy old content, then the new piece, then terminate */
  if (l_length_1 > 0) {
    memcpy( l_new_string, current_code, l_length_1 );
  }
  if (l_length_2 > 0) {
    memcpy( l_new_string + l_length_1, i_code_to_append, l_length_2 );
  }
  l_new_string[l_length_1+l_length_2] = '\0';

  /* free old memory and overwrite pointer; as before, the old buffer is only
     released when it actually held content */
  if (l_length_1 > 0) {
    free(current_code);
  }
  io_generated_code->generated_code = (void*)l_new_string;

  /* update counters */
  io_generated_code->code_size = (unsigned int)(l_length_1+l_length_2);
  io_generated_code->buffer_size = (io_generated_code->code_size) + 1;
}
/**
 * Emits a new numeric jump label into the generated text and records it in
 * the loop label tracker.
 *
 * The label number equals the tracker slot it occupies. Reports
 * LIBXSMM_ERR_EXCEED_JMPLBL once label_count has reached 32. For
 * code_type > 1 nothing is emitted and the tracker is left unchanged.
 *
 * @param io_generated_code      code object the label text is appended to.
 * @param io_loop_label_tracker  tracker whose label_address/label_count are updated.
 */
void libxsmm_instruction_register_jump_label( libxsmm_generated_code*     io_generated_code,
                                              libxsmm_loop_label_tracker* io_loop_label_tracker ) {
  char l_label_text[512];
  int l_label_text_capacity = 511;
  int l_label_text_length = 0;

  /* no free label slot left */
  if ( io_loop_label_tracker->label_count == 32 ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_EXCEED_JMPLBL );
    return;
  }

  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    /* @TODO-GREG call encoding here */
    return;
  }

  /* the label is named after the slot it is stored in */
  io_loop_label_tracker->label_address[io_loop_label_tracker->label_count] = io_loop_label_tracker->label_count;

  if ( io_generated_code->code_type != 0 ) {
    /* plain label line */
    l_label_text_length = libxsmm_snprintf(l_label_text, l_label_text_capacity, " %i:\n",
      io_loop_label_tracker->label_address[io_loop_label_tracker->label_count] );
  } else {
    /* label wrapped into a quoted string that ends in \n\t */
    l_label_text_length = libxsmm_snprintf(l_label_text, l_label_text_capacity, " \"%i:\\n\\t\"\n",
      io_loop_label_tracker->label_address[io_loop_label_tracker->label_count] );
  }

  libxsmm_append_code_as_string( io_generated_code, l_label_text, l_label_text_length );
  io_loop_label_tracker->label_count++;
}
/**
 * Appends a prefetch instruction of the form "<instr> <displacement>(%<base>)"
 * to the generated text.
 *
 * In debug builds any i_gp_reg_idx other than LIBXSMM_X86_GP_REG_UNDEF is
 * rejected with LIBXSMM_ERR_NO_INDEX_SCALE_ADDR. For code_type > 1 nothing is
 * emitted.
 *
 * @param io_generated_code  code object the instruction text is appended to.
 * @param i_prefetch_instr   instruction id, translated by libxsmm_get_x86_instr_name.
 * @param i_gp_reg_base      base register id, translated by libxsmm_get_x86_gp_reg_name.
 * @param i_gp_reg_idx       index register; only checked in debug builds.
 * @param i_scale            not used by this implementation.
 * @param i_displacement     displacement printed in front of the base register.
 */
void libxsmm_instruction_prefetch( libxsmm_generated_code* io_generated_code,
                                   const unsigned int      i_prefetch_instr,
                                   const unsigned int      i_gp_reg_base,
                                   const unsigned int      i_gp_reg_idx,
                                   const unsigned int      i_scale,
                                   const int               i_displacement ) {
  char l_line[512];
  int l_line_capacity = 511;
  int l_line_length = 0;
  char l_base_reg_name[4];
  char l_mnemonic[16];

#ifndef NDEBUG
  /* index/scale addressing is not available here */
  if ( i_gp_reg_idx != LIBXSMM_X86_GP_REG_UNDEF ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_NO_INDEX_SCALE_ADDR );
    return;
  }
#endif

  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    /* @TODO-GREG call encoding here */
    return;
  }

  /* translate the ids into their textual names */
  libxsmm_get_x86_gp_reg_name( i_gp_reg_base, l_base_reg_name, 3 );
  libxsmm_get_x86_instr_name( i_prefetch_instr, l_mnemonic, 15 );

  if ( io_generated_code->code_type != 0 ) {
    /* plain instruction line */
    l_line_length = libxsmm_snprintf(l_line, l_line_capacity, " %s %i(%%%s)\n",
      l_mnemonic, i_displacement, l_base_reg_name );
  } else {
    /* quoted variant: the register is prefixed by a doubled percent sign */
    l_line_length = libxsmm_snprintf(l_line, l_line_capacity, " \"%s %i(%%%%%s)\\n\\t\"\n",
      l_mnemonic, i_displacement, l_base_reg_name );
  }

  libxsmm_append_code_as_string( io_generated_code, l_line, l_line_length );
}
/**
 * Entry point of the sparse kernel generator: decides from the leading
 * dimensions which operand is sparse and forwards to the matching generator.
 *
 *  - lda == 0, ldb > 0, ldc > 0: A is sparse -> libxsmm_generator_sparse_asparse
 *  - lda > 0, ldb == 0, ldc > 0: B is sparse -> libxsmm_generator_sparse_bsparse
 *  - anything else: LIBXSMM_ERR_SPARSE_GEN is reported.
 *
 * Before forwarding, the leading dimensions of the dense operands are checked
 * (ldb >= k resp. lda >= m, and ldc >= m); a violation reports
 * LIBXSMM_ERR_LDB, LIBXSMM_ERR_LDA or LIBXSMM_ERR_LDC.
 *
 * @param io_generated_code  code object handed on to the selected generator.
 * @param i_xgemm_desc       descriptor providing m, k, lda, ldb and ldc.
 * @param i_arch             architecture string, passed through unchanged.
 * @param i_row_idx          row indices of the sparse operand, passed through.
 * @param i_column_idx       column indices of the sparse operand, passed through.
 * @param i_values           values of the sparse operand, passed through.
 */
void libxsmm_generator_sparse_kernel( libxsmm_generated_code*         io_generated_code,
                                      const libxsmm_xgemm_descriptor* i_xgemm_desc,
                                      const char*                     i_arch,
                                      const unsigned int*             i_row_idx,
                                      const unsigned int*             i_column_idx,
                                      const double*                   i_values ) {
  const int l_a_is_sparse = (i_xgemm_desc->lda == 0) && (i_xgemm_desc->ldb > 0) && (i_xgemm_desc->ldc > 0);
  const int l_b_is_sparse = (i_xgemm_desc->lda > 0) && (i_xgemm_desc->ldb == 0) && (i_xgemm_desc->ldc > 0);

  if ( l_a_is_sparse != 0 ) {
    /* B is dense: check LDB */
    if ( i_xgemm_desc->ldb < i_xgemm_desc->k ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_LDB );
      return;
    }
    /* check LDC */
    if ( i_xgemm_desc->ldc < i_xgemm_desc->m ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_LDC );
      return;
    }
    libxsmm_generator_sparse_asparse( io_generated_code, i_xgemm_desc, i_arch, i_row_idx, i_column_idx, i_values );
    return;
  }

  if ( l_b_is_sparse != 0 ) {
    /* A is dense: check LDA */
    if ( i_xgemm_desc->lda < i_xgemm_desc->m ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_LDA );
      return;
    }
    /* check LDC */
    if ( i_xgemm_desc->ldc < i_xgemm_desc->m ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_LDC );
      return;
    }
    libxsmm_generator_sparse_bsparse( io_generated_code, i_xgemm_desc, i_arch, i_row_idx, i_column_idx, i_values );
    return;
  }

  /* neither combination matched */
  libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_SPARSE_GEN );
}
/**
 * Emits a backward jump ("<instr> <label>b") to the most recently registered
 * label and removes that label from the tracker.
 *
 * Only LIBXSMM_X86_INSTR_JL is accepted (LIBXSMM_ERR_UNSUPPORTED_JUMP
 * otherwise). With no registered label LIBXSMM_ERR_NO_JMPLBL_AVAIL is
 * reported. For code_type > 1 nothing is emitted and the tracker is left
 * unchanged.
 *
 * @param io_generated_code      code object the jump text is appended to.
 * @param i_jmp_instr            jump instruction id.
 * @param io_loop_label_tracker  tracker the label is popped from.
 */
void libxsmm_instruction_jump_back_to_label( libxsmm_generated_code*     io_generated_code,
                                             const unsigned int          i_jmp_instr,
                                             libxsmm_loop_label_tracker* io_loop_label_tracker ) {
  char l_jump_text[512];
  int l_jump_text_capacity = 511;
  int l_jump_text_length = 0;
  char l_mnemonic[16];

  /* check that we just handle jl */
  if ( i_jmp_instr != LIBXSMM_X86_INSTR_JL) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_UNSUPPORTED_JUMP );
    return;
  }

  /* there has to be a label to jump back to */
  if ( io_loop_label_tracker->label_count == 0 ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_NO_JMPLBL_AVAIL );
    return;
  }

  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    /* @TODO-GREG call encoding here */
    return;
  }

  libxsmm_get_x86_instr_name( i_jmp_instr, l_mnemonic, 15 );

  /* step back to the slot of the newest label */
  io_loop_label_tracker->label_count--;

  if ( io_generated_code->code_type != 0 ) {
    /* plain instruction line */
    l_jump_text_length = libxsmm_snprintf(l_jump_text, l_jump_text_capacity, " %s %ib\n",
      l_mnemonic, io_loop_label_tracker->label_address[io_loop_label_tracker->label_count] );
  } else {
    /* instruction wrapped into a quoted string that ends in \n\t */
    l_jump_text_length = libxsmm_snprintf(l_jump_text, l_jump_text_capacity, " \"%s %ib\\n\\t\"\n",
      l_mnemonic, io_loop_label_tracker->label_address[io_loop_label_tracker->label_count] );
  }

  libxsmm_append_code_as_string( io_generated_code, l_jump_text, l_jump_text_length );

  /* clear the slot which has just been consumed */
  io_loop_label_tracker->label_address[io_loop_label_tracker->label_count] = 0;
}
/*
 * Generates the instruction sequence of one SSE3 micro-kernel step: values of
 * B are loaded into the low vector registers, vectors of A are loaded, and for
 * every (m, n) register pair a multiply followed by an add into the
 * accumulator registers is issued. All instructions are produced through the
 * libxsmm_instruction_* helpers, in the order of the calls below.
 *
 * io_generated_code      code object the instructions are appended to
 * i_gp_reg_mapping       supplies gp_reg_a and gp_reg_b
 * i_micro_kernel_config  supplies instruction ids, vector_length, datatype_size,
 *                        vector_name and use_masking_a_c
 * i_xgemm_desc           supplies lda, ldb and single_precision
 * i_m_blocking           blocking in m; divided by vector_length to obtain the
 *                        number of vector registers per column
 * i_n_blocking           blocking in n; debug builds require 1..3
 * i_offset               if != -1, B is addressed with the additional displacement
 *                        datatype_size * i_offset and gp_reg_b is not modified;
 *                        if -1, an alu_add_instruction with datatype_size is
 *                        issued on gp_reg_b after B has been loaded
 *
 * Three code paths exist, selected by l_m_blocking: == 1, == 3, and everything else.
 *
 * NOTE(review): the parameter checks exist only in debug builds (#ifndef NDEBUG);
 * in release builds a vector_length of 0 would divide by zero below.
 */
void libxsmm_generator_dense_sse3_microkernel( libxsmm_generated_code* io_generated_code,
 const libxsmm_gp_reg_mapping* i_gp_reg_mapping,
 const libxsmm_micro_kernel_config* i_micro_kernel_config,
 const libxsmm_xgemm_descriptor* i_xgemm_desc,
 const unsigned int i_m_blocking,
 const unsigned int i_n_blocking,
 const int i_offset ) {
#ifndef NDEBUG
  /* n blocking has to be 1, 2 or 3 */
  if ( (i_n_blocking > 3) || (i_n_blocking < 1) ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_N_BLOCK );
    return;
  }
  /* m blocking has to be a multiple of the vector length */
  if ( i_m_blocking % i_micro_kernel_config->vector_length != 0 ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_M_BLOCK );
    return;
  }
#endif
  /* deriving register blocking from kernel config */
  unsigned int l_m_blocking = i_m_blocking/i_micro_kernel_config->vector_length;
  /* register blocking counter in n */
  unsigned int l_n = 0;
  /* register blocking counter in m */
  unsigned int l_m = 0;
  /* start register of accumulator; the accumulators occupy the top of a
     register file which is hard-coded to 16 registers here */
  unsigned int l_vec_reg_acc_start = 16 - (i_n_blocking * l_m_blocking);

  /* ---- path 1: a single vector register in m ---- */
  if (l_m_blocking == 1) {
    /* load column vectors of A (into register i_n_blocking) */
    libxsmm_instruction_vec_move( io_generated_code,
      i_micro_kernel_config->instruction_set,
      i_micro_kernel_config->a_vmove_instruction,
      i_gp_reg_mapping->gp_reg_a,
      LIBXSMM_X86_GP_REG_UNDEF, 0,
      0,
      i_micro_kernel_config->vector_name,
      i_n_blocking, i_micro_kernel_config->use_masking_a_c, 0 );
    /* loop over columns of B */
    for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
      /* post increment of a pointer early: gp_reg_a += lda * datatype_size,
         issued once, in the first iteration */
      if ( (l_n == 0) ) {
        libxsmm_instruction_alu_imm( io_generated_code,
          i_micro_kernel_config->alu_add_instruction,
          i_gp_reg_mapping->gp_reg_a, (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) );
      }
      /* different ways of using B */
      if ( i_offset != (-1) ) {
        /* B addressed via explicit offset; gp_reg_b stays untouched */
        libxsmm_instruction_vec_move( io_generated_code,
          i_micro_kernel_config->instruction_set,
          i_micro_kernel_config->b_vmove_instruction,
          i_gp_reg_mapping->gp_reg_b,
          LIBXSMM_X86_GP_REG_UNDEF, 0,
          ((i_micro_kernel_config->datatype_size) * i_offset) + (i_xgemm_desc->ldb * l_n * (i_micro_kernel_config->datatype_size)),
          i_micro_kernel_config->vector_name,
          l_n, i_micro_kernel_config->use_masking_a_c, 0 );
        /* generate shuffle as SSE3 has no broadcast load for single precision */
        if ( (i_xgemm_desc->single_precision != 0 ) && ( i_micro_kernel_config->b_shuff_instruction != LIBXSMM_X86_INSTR_UNDEF ) ) {
          libxsmm_instruction_vec_shuffle_reg( io_generated_code,
            i_micro_kernel_config->instruction_set,
            i_micro_kernel_config->b_shuff_instruction,
            i_micro_kernel_config->vector_name,
            l_n, l_n, LIBXSMM_X86_VEC_REG_UNDEF, 0 );
        }
      } else {
        /* B addressed relative to gp_reg_b only */
        libxsmm_instruction_vec_move( io_generated_code,
          i_micro_kernel_config->instruction_set,
          i_micro_kernel_config->b_vmove_instruction,
          i_gp_reg_mapping->gp_reg_b,
          LIBXSMM_X86_GP_REG_UNDEF, 0,
          i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size,
          i_micro_kernel_config->vector_name,
          l_n, i_micro_kernel_config->use_masking_a_c, 0 );
        /* generate shuffle as SSE3 has no broadcast load for single precision */
        if ( (i_xgemm_desc->single_precision != 0 ) && ( i_micro_kernel_config->b_shuff_instruction != LIBXSMM_X86_INSTR_UNDEF ) ) {
          libxsmm_instruction_vec_shuffle_reg( io_generated_code,
            i_micro_kernel_config->instruction_set,
            i_micro_kernel_config->b_shuff_instruction,
            i_micro_kernel_config->vector_name,
            l_n, l_n, LIBXSMM_X86_VEC_REG_UNDEF, 0 );
        }
        /* after the last column of B has been loaded: gp_reg_b += datatype_size */
        if ( l_n == (i_n_blocking -1) ) {
          libxsmm_instruction_alu_imm( io_generated_code,
            i_micro_kernel_config->alu_add_instruction,
            i_gp_reg_mapping->gp_reg_b, i_micro_kernel_config->datatype_size );
        }
      }
      /* issue mul-add: the multiply involves the A register (i_n_blocking) and
         the B register (l_n); the add involves register l_n and accumulator
         l_vec_reg_acc_start + l_n */
      libxsmm_instruction_vec_compute_reg( io_generated_code,
        i_micro_kernel_config->instruction_set,
        i_micro_kernel_config->vmul_instruction,
        i_micro_kernel_config->vector_name,
        i_n_blocking, l_n, LIBXSMM_X86_VEC_REG_UNDEF );
      libxsmm_instruction_vec_compute_reg( io_generated_code,
        i_micro_kernel_config->instruction_set,
        i_micro_kernel_config->vadd_instruction,
        i_micro_kernel_config->vector_name,
        l_n, l_vec_reg_acc_start + l_n, LIBXSMM_X86_VEC_REG_UNDEF );
    }
  } else {
    /* ---- paths 2 and 3: more than one vector register in m ---- */
    /* broadcast from B -> into vec registers 0 to i_n_blocking */
    if ( i_offset != (-1) ) {
      for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
        libxsmm_instruction_vec_move( io_generated_code,
          i_micro_kernel_config->instruction_set,
          i_micro_kernel_config->b_vmove_instruction,
          i_gp_reg_mapping->gp_reg_b,
          LIBXSMM_X86_GP_REG_UNDEF, 0,
          ((i_micro_kernel_config->datatype_size) * i_offset) + (i_xgemm_desc->ldb * l_n * (i_micro_kernel_config->datatype_size)),
          i_micro_kernel_config->vector_name,
          l_n, i_micro_kernel_config->use_masking_a_c, 0 );
        /* generate shuffle as SSE3 has no broadcast load for single precision */
        if ( (i_xgemm_desc->single_precision != 0 ) && ( i_micro_kernel_config->b_shuff_instruction != LIBXSMM_X86_INSTR_UNDEF ) ) {
          libxsmm_instruction_vec_shuffle_reg( io_generated_code,
            i_micro_kernel_config->instruction_set,
            i_micro_kernel_config->b_shuff_instruction,
            i_micro_kernel_config->vector_name,
            l_n, l_n, LIBXSMM_X86_VEC_REG_UNDEF, 0 );
        }
      }
    } else {
      for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
        libxsmm_instruction_vec_move( io_generated_code,
          i_micro_kernel_config->instruction_set,
          i_micro_kernel_config->b_vmove_instruction,
          i_gp_reg_mapping->gp_reg_b,
          LIBXSMM_X86_GP_REG_UNDEF, 0,
          i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size,
          i_micro_kernel_config->vector_name,
          l_n, i_micro_kernel_config->use_masking_a_c, 0 );
        /* generate shuffle as SSE3 has no broadcast load for single precision */
        /* NOTE(review): unlike the three other B-load sites in this function, this
           shuffle is not guarded by b_shuff_instruction != LIBXSMM_X86_INSTR_UNDEF;
           confirm whether that difference is intended */
        if ( i_xgemm_desc->single_precision != 0 ) {
          libxsmm_instruction_vec_shuffle_reg( io_generated_code,
            i_micro_kernel_config->instruction_set,
            i_micro_kernel_config->b_shuff_instruction,
            i_micro_kernel_config->vector_name,
            l_n, l_n, LIBXSMM_X86_VEC_REG_UNDEF, 0 );
        }
      }
      /* all columns of B are loaded: gp_reg_b += datatype_size */
      libxsmm_instruction_alu_imm( io_generated_code,
        i_micro_kernel_config->alu_add_instruction,
        i_gp_reg_mapping->gp_reg_b, i_micro_kernel_config->datatype_size );
    }

    if (l_m_blocking == 3) {
      /* ---- path 2: three vector registers in m; A is loaded one vector at a
         time, always into register i_n_blocking ---- */
      /* load column vectors of A and multiply with all broadcasted row entries of B */
      for ( l_m = 0; l_m < l_m_blocking ; l_m++ ) {
        libxsmm_instruction_vec_move( io_generated_code,
          i_micro_kernel_config->instruction_set,
          i_micro_kernel_config->a_vmove_instruction,
          i_gp_reg_mapping->gp_reg_a,
          LIBXSMM_X86_GP_REG_UNDEF, 0,
          (i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_m,
          i_micro_kernel_config->vector_name,
          i_n_blocking, i_micro_kernel_config->use_masking_a_c, 0 );
        for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
          /* post increment early: once the last vector of A has been loaded,
             gp_reg_a += lda * datatype_size */
          if ( (l_m == (l_m_blocking-1)) && (l_n == 0) ) {
            libxsmm_instruction_alu_imm( io_generated_code,
              i_micro_kernel_config->alu_add_instruction,
              i_gp_reg_mapping->gp_reg_a, (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) );
          }
          if (l_n < i_n_blocking - 1) {
            /* issue vmove to save loads from A: a register-to-register move
               between registers i_n_blocking + l_n and i_n_blocking + l_n + 1 */
            libxsmm_instruction_vec_compute_reg( io_generated_code,
              i_micro_kernel_config->instruction_set,
              i_micro_kernel_config->a_vmove_instruction,
              i_micro_kernel_config->vector_name,
              i_n_blocking + l_n, i_n_blocking + l_n + 1, LIBXSMM_X86_VEC_REG_UNDEF );
          }
          /* issue mul+add; accumulator index is l_vec_reg_acc_start + l_m + l_m_blocking * l_n */
          libxsmm_instruction_vec_compute_reg( io_generated_code,
            i_micro_kernel_config->instruction_set,
            i_micro_kernel_config->vmul_instruction,
            i_micro_kernel_config->vector_name,
            l_n, i_n_blocking + l_n, LIBXSMM_X86_VEC_REG_UNDEF );
          libxsmm_instruction_vec_compute_reg( io_generated_code,
            i_micro_kernel_config->instruction_set,
            i_micro_kernel_config->vadd_instruction,
            i_micro_kernel_config->vector_name,
            i_n_blocking + l_n, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n), LIBXSMM_X86_VEC_REG_UNDEF );
        }
      }
    } else {
      /* ---- path 3: any other m blocking; all vectors of A are loaded up
         front into registers i_n_blocking .. i_n_blocking + l_m_blocking - 1 ---- */
      /* load column vectors of A and multiply with all broadcasted row entries of B */
      for ( l_m = 0; l_m < l_m_blocking ; l_m++ ) {
        libxsmm_instruction_vec_move( io_generated_code,
          i_micro_kernel_config->instruction_set,
          i_micro_kernel_config->a_vmove_instruction,
          i_gp_reg_mapping->gp_reg_a,
          LIBXSMM_X86_GP_REG_UNDEF, 0,
          (i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_m,
          i_micro_kernel_config->vector_name,
          i_n_blocking + l_m, i_micro_kernel_config->use_masking_a_c, 0 );
      }
      for ( l_m = 0; l_m < l_m_blocking ; l_m++ ) {
        for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
          /* post increment early: gp_reg_a += lda * datatype_size, issued once
             when the last m register is processed */
          if ( (l_m == (l_m_blocking-1)) && (l_n == 0) ) {
            libxsmm_instruction_alu_imm( io_generated_code,
              i_micro_kernel_config->alu_add_instruction,
              i_gp_reg_mapping->gp_reg_a, (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) );
          }
          if (l_n < i_n_blocking - 1) {
            /* issue vmove to save loads from A: copies are kept in the registers
               starting at i_n_blocking + l_m_blocking */
            if (l_n == 0 ) {
              /* first copy is made from the loaded A register of this l_m */
              libxsmm_instruction_vec_compute_reg( io_generated_code,
                i_micro_kernel_config->instruction_set,
                i_micro_kernel_config->a_vmove_instruction,
                i_micro_kernel_config->vector_name,
                i_n_blocking + l_m + l_n, i_n_blocking + l_m_blocking + l_n, LIBXSMM_X86_VEC_REG_UNDEF );
            } else {
              /* later copies are made from the previous copy register */
              libxsmm_instruction_vec_compute_reg( io_generated_code,
                i_micro_kernel_config->instruction_set,
                i_micro_kernel_config->a_vmove_instruction,
                i_micro_kernel_config->vector_name,
                i_n_blocking + l_m_blocking + l_n - 1, i_n_blocking + l_m_blocking + l_n, LIBXSMM_X86_VEC_REG_UNDEF );
            }
          }
          /* issue mul/add */
          if (l_n == 0 ) {
            /* first column works on the loaded A register itself */
            libxsmm_instruction_vec_compute_reg( io_generated_code,
              i_micro_kernel_config->instruction_set,
              i_micro_kernel_config->vmul_instruction,
              i_micro_kernel_config->vector_name,
              l_n, i_n_blocking + l_m + l_n, LIBXSMM_X86_VEC_REG_UNDEF );
            libxsmm_instruction_vec_compute_reg( io_generated_code,
              i_micro_kernel_config->instruction_set,
              i_micro_kernel_config->vadd_instruction,
              i_micro_kernel_config->vector_name,
              i_n_blocking + l_m + l_n, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) , LIBXSMM_X86_VEC_REG_UNDEF );
          } else {
            /* remaining columns work on the copy register of the previous step */
            libxsmm_instruction_vec_compute_reg( io_generated_code,
              i_micro_kernel_config->instruction_set,
              i_micro_kernel_config->vmul_instruction,
              i_micro_kernel_config->vector_name,
              l_n, i_n_blocking + l_m_blocking + l_n - 1, LIBXSMM_X86_VEC_REG_UNDEF );
            libxsmm_instruction_vec_compute_reg( io_generated_code,
              i_micro_kernel_config->instruction_set,
              i_micro_kernel_config->vadd_instruction,
              i_micro_kernel_config->vector_name,
              i_n_blocking + l_m_blocking + l_n - 1, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) , LIBXSMM_X86_VEC_REG_UNDEF );
          }
        }
      }
    }
  }
}
/* Releases what a failing libxsmm_sparse_csc_reader() run holds: closes the
   file (if not NULL), frees the helper array and, if i_release_output != 0,
   frees the three output arrays and resets the caller's pointers to NULL. */
static void libxsmm_sparse_csc_reader_cleanup( FILE*          i_csc_file_handle,
                                               unsigned int*  i_column_idx_id,
                                               unsigned int** io_row_idx,
                                               unsigned int** io_column_idx,
                                               double**       io_values,
                                               const int      i_release_output ) {
  if ( i_csc_file_handle != NULL ) {
    fclose( i_csc_file_handle );
  }
  free( i_column_idx_id );
  if ( i_release_output != 0 ) {
    free( *io_row_idx );
    free( *io_column_idx );
    free( *io_values );
    *io_row_idx = NULL;
    *io_column_idx = NULL;
    *io_values = NULL;
  }
}

/**
 * Reads a sparse matrix from a text file into CSC arrays.
 *
 * File layout as parsed here: lines starting with '%' are skipped; the first
 * other line holds "<rows> <columns> <elements>"; every following line holds
 * "<row> <column> <value>" with 1-based indices.
 * NOTE(review): the column pointer construction relies on the entries being
 * grouped by ascending column; this is not verified.
 *
 * @param io_generated_code  only used for error reporting.
 * @param i_csc_file_in      path of the file to read.
 * @param o_row_idx          receives a newly allocated array of *o_element_count row indices (0-based).
 * @param o_column_idx       receives a newly allocated array of *o_column_count + 1 column pointers.
 * @param o_values           receives a newly allocated array of *o_element_count values.
 * @param o_row_count        receives the number of rows.
 * @param o_column_count     receives the number of columns.
 * @param o_element_count    receives the number of stored elements.
 *
 * On success the caller owns the three arrays. On any error the matching
 * LIBXSMM_ERR_CSC_* code is reported, the file is closed, all memory allocated
 * by this function is released, and output pointers which had been set by this
 * function are reset to NULL.
 */
void libxsmm_sparse_csc_reader( libxsmm_generated_code* io_generated_code,
                                const char*             i_csc_file_in,
                                unsigned int**          o_row_idx,
                                unsigned int**          o_column_idx,
                                double**                o_values,
                                unsigned int*           o_row_count,
                                unsigned int*           o_column_count,
                                unsigned int*           o_element_count ) {
  FILE *l_csc_file_handle;
  const unsigned int l_line_length = 512;
  char l_line[512/*l_line_length*/+1];
  unsigned int l_header_read = 0;
  unsigned int* l_column_idx_id = NULL;
  unsigned int l_i = 0;

  l_csc_file_handle = fopen( i_csc_file_in, "r" );
  if ( l_csc_file_handle == NULL ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CSC_INPUT );
    return;
  }

  /* fgets gets the full buffer size, so a result of l_line_length characters
     really indicates a line which did not fit */
  while (fgets(l_line, (int)sizeof(l_line), l_csc_file_handle) != NULL) {
    if ( strlen(l_line) == l_line_length ) {
      libxsmm_sparse_csc_reader_cleanup( l_csc_file_handle, l_column_idx_id, o_row_idx, o_column_idx, o_values, (int)l_header_read );
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CSC_READ_LEN );
      return;
    }
    /* check if we are still reading comments header */
    if ( l_line[0] == '%' ) {
      continue;
    }

    if ( l_header_read == 0 ) {
      /* first line after the comment header: matrix description */
      if ( sscanf(l_line, "%u %u %u", o_row_count, o_column_count, o_element_count) != 3 ) {
        libxsmm_sparse_csc_reader_cleanup( l_csc_file_handle, l_column_idx_id, o_row_idx, o_column_idx, o_values, 0 );
        libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CSC_READ_DESC );
        return;
      }

      /* allocate zero-initialized CSC data structure matching the mtx file */
      *o_row_idx = (unsigned int*) calloc( (size_t)(*o_element_count), sizeof(unsigned int) );
      *o_column_idx = (unsigned int*) calloc( (size_t)(*o_column_count) + 1, sizeof(unsigned int) );
      *o_values = (double*) calloc( (size_t)(*o_element_count), sizeof(double) );
      l_column_idx_id = (unsigned int*) calloc( (size_t)(*o_column_count), sizeof(unsigned int) );

      /* check if allocations were successful */
      if ( ( *o_row_idx == NULL ) ||
           ( *o_column_idx == NULL ) ||
           ( *o_values == NULL ) ||
           ( l_column_idx_id == NULL ) ) {
        libxsmm_sparse_csc_reader_cleanup( l_csc_file_handle, l_column_idx_id, o_row_idx, o_column_idx, o_values, 1 );
        libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CSC_ALLOC_DATA );
        return;
      }

      /* init column idx: every column pointer starts at "end of data" */
      for ( l_i = 0; l_i < (*o_column_count + 1); l_i++) {
        (*o_column_idx)[l_i] = (*o_element_count);
      }
      (*o_column_idx)[0] = 0;
      l_i = 0;
      l_header_read = 1;
    } else {
      /* now we read the actual content */
      unsigned int l_row, l_column;
      double l_value;

      /* read a line of content */
      if ( sscanf(l_line, "%u %u %lf", &l_row, &l_column, &l_value) != 3 ) {
        libxsmm_sparse_csc_reader_cleanup( l_csc_file_handle, l_column_idx_id, o_row_idx, o_column_idx, o_values, 1 );
        libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CSC_READ_ELEMS );
        return;
      }
      /* more entries than announced would overrun the arrays */
      if ( l_i >= (*o_element_count) ) {
        libxsmm_sparse_csc_reader_cleanup( l_csc_file_handle, l_column_idx_id, o_row_idx, o_column_idx, o_values, 1 );
        libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CSC_LEN );
        return;
      }
      /* indices are 1-based and have to lie inside the announced matrix;
         0 would wrap around when converted to 0-based below */
      if ( (l_row == 0) || (l_row > (*o_row_count)) ||
           (l_column == 0) || (l_column > (*o_column_count)) ) {
        libxsmm_sparse_csc_reader_cleanup( l_csc_file_handle, l_column_idx_id, o_row_idx, o_column_idx, o_values, 1 );
        libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CSC_READ_ELEMS );
        return;
      }
      /* adjust numbers to zero-based indexing */
      l_row--;
      l_column--;
      /* add these values to row and value structure */
      (*o_row_idx)[l_i] = l_row;
      (*o_values)[l_i] = l_value;
      l_i++;
      /* mark the column as non-empty and move its end pointer */
      l_column_idx_id[l_column] = 1;
      (*o_column_idx)[l_column+1] = l_i;
    }
  }

  /* close mtx file */
  fclose( l_csc_file_handle );

  /* a file without description line left nothing allocated and no counts set */
  if ( l_header_read == 0 ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CSC_READ_DESC );
    return;
  }

  /* check if we read a file which was consistent */
  if ( l_i != (*o_element_count) ) {
    libxsmm_sparse_csc_reader_cleanup( NULL, l_column_idx_id, o_row_idx, o_column_idx, o_values, 1 );
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CSC_LEN );
    return;
  }

  /* let's handle empty columns: they end where they start */
  for ( l_i = 0; l_i < (*o_column_count); l_i++) {
    if ( l_column_idx_id[l_i] == 0 ) {
      (*o_column_idx)[l_i+1] = (*o_column_idx)[l_i];
    }
  }

  /* free helper data structure */
  free( l_column_idx_id );
}
LIBXSMM_INTERNAL_API_DEFINITION void libxsmm_generator_gemm_store_C( libxsmm_generated_code* io_generated_code, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_m_blocking, const unsigned int i_n_blocking ) { /* deriving register blocking from kernel config */ unsigned int l_m_blocking = i_m_blocking/i_micro_kernel_config->vector_length; /* register blocking counter in n */ unsigned int l_n = 0; /* register blocking counter in m */ unsigned int l_m = 0; /* start register of accumulator */ unsigned int l_vec_reg_acc_start = i_micro_kernel_config->vector_reg_count - (i_n_blocking * l_m_blocking); /* @TODO fix this test */ #if !defined(NDEBUG) if (i_micro_kernel_config->instruction_set == LIBXSMM_X86_SSE3 || i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX || i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX2 ) { if ( (i_n_blocking > 3) || (i_n_blocking < 1) || (i_m_blocking < 1) ) { libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_REG_BLOCK ); return; } } else if (i_micro_kernel_config->instruction_set == LIBXSMM_X86_IMCI || i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX512_MIC || ( (i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX512_CORE) && (i_m_blocking == i_micro_kernel_config->vector_length) ) ) { if ( (i_n_blocking > 30) || (i_n_blocking < 1) || (i_m_blocking != i_micro_kernel_config->vector_length) ) { libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_REG_BLOCK ); return; } } else if ( i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX512_CORE ) { if ( (i_n_blocking > 6) || (i_n_blocking < 1) || (i_m_blocking < 1) ) { libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_REG_BLOCK ); return; } } else {} if ( i_m_blocking % i_micro_kernel_config->vector_length != 0 ) { libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_M_BLOCK ); return; } #endif /* storing C accumulator */ /* adding to 
C, so let's load C */ for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { for ( l_m = 0; l_m < l_m_blocking; l_m++ ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->c_vmove_instruction, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, ((l_n * i_xgemm_desc->ldc) + (l_m * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size), i_micro_kernel_config->vector_name, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n), i_micro_kernel_config->use_masking_a_c, 1 ); } if ( i_xgemm_desc->prefetch == LIBXSMM_PREFETCH_BL2_VIA_C || i_xgemm_desc->prefetch == LIBXSMM_PREFETCH_AL2BL2_VIA_C || i_xgemm_desc->prefetch == LIBXSMM_PREFETCH_AL2BL2_VIA_C_AHEAD || i_xgemm_desc->prefetch == LIBXSMM_PREFETCH_AL2BL2_VIA_C_JPST) { /* determining how many prefetches we need in M direction as we just need one prefetch per cache line */ unsigned int l_m_advance = 64 / ((i_micro_kernel_config->vector_length) * (i_micro_kernel_config->datatype_size)); /* 64: hardcoded cache line length */ for (l_m = 0; l_m < l_m_blocking; l_m += l_m_advance ) { libxsmm_x86_instruction_prefetch( io_generated_code, i_micro_kernel_config->prefetch_instruction, i_gp_reg_mapping->gp_reg_b_prefetch, LIBXSMM_X86_GP_REG_UNDEF, 0, ((l_n * i_xgemm_desc->ldc) + (l_m * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size)); } } } }
LIBXSMM_INTERNAL_API_DEFINITION void libxsmm_generator_gemm_load_C( libxsmm_generated_code* io_generated_code, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_m_blocking, const unsigned int i_n_blocking ) { /* deriving register blocking from kernel config */ const unsigned int l_m_blocking = i_m_blocking/i_micro_kernel_config->vector_length; /* start register of accumulator */ const unsigned int l_vec_reg_acc_start = i_micro_kernel_config->vector_reg_count - (i_n_blocking * l_m_blocking); /* register blocking counter in n */ unsigned int l_n = 0; /* register blocking counter in m */ unsigned int l_m = 0; #if !defined(NDEBUG) /* Do some test if it's possible to generated the requested code. This is not done in release mode and therefore bad things might happen.... HUAAH */ if (i_micro_kernel_config->instruction_set == LIBXSMM_X86_SSE3 || i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX || i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX2 ) { if ( (i_n_blocking > 3) || (i_n_blocking < 1) || (i_m_blocking < 1) ) { libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_REG_BLOCK ); return; } } else if (i_micro_kernel_config->instruction_set == LIBXSMM_X86_IMCI || i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX512_MIC || ( (i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX512_CORE) && (i_m_blocking == i_micro_kernel_config->vector_length) ) ) { if ( (i_n_blocking > 30) || (i_n_blocking < 1) || (i_m_blocking != i_micro_kernel_config->vector_length) ) { libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_REG_BLOCK ); return; } } else if ( i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX512_CORE ) { if ( (i_n_blocking > 6) || (i_n_blocking < 1) || (i_m_blocking < 1) ) { libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_REG_BLOCK ); return; } } else {} if ( i_m_blocking % 
i_micro_kernel_config->vector_length != 0 ) { libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_M_BLOCK ); return; } #endif /*NDEBUG*/ /* load C accumulator */ if (i_xgemm_desc->beta == 1) { /* adding to C, so let's load C */ for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { for ( l_m = 0; l_m < l_m_blocking; l_m++ ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->c_vmove_instruction, i_gp_reg_mapping->gp_reg_c, LIBXSMM_X86_GP_REG_UNDEF, 0, ((l_n * i_xgemm_desc->ldc) + (l_m * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size), i_micro_kernel_config->vector_name, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n), i_micro_kernel_config->use_masking_a_c, 0 ); } #if 0 if ( i_xgemm_desc->prefetch == LIBXSMM_PREFETCH_CL2 || i_xgemm_desc->prefetch == LIBXSMM_PREFETCH_AL2CL2BL2_VIA_C ) { for (l_m = 0; l_m < l_m_blocking; l_m += l_m++ ) { libxsmm_x86_instruction_prefetch( io_generated_code, i_micro_kernel_config->prefetch_instruction, i_gp_reg_mapping->gp_reg_c_prefetch, LIBXSMM_X86_GP_REG_UNDEF, 0, ((l_n * i_xgemm_desc->ldc) + (l_m * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size)); } } #endif } } else { /* overwriting C, so let's xout accumulator */ for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { for ( l_m = 0; l_m < l_m_blocking; l_m++ ) { libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vxor_instruction, i_micro_kernel_config->vector_name, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n), l_vec_reg_acc_start + l_m + (l_m_blocking * l_n), l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) ); } #if 0 if ( i_xgemm_desc->prefetch == LIBXSMM_PREFETCH_CL2 || i_xgemm_desc->prefetch == LIBXSMM_PREFETCH_AL2CL2BL2_VIA_C ) { for (l_m = 0; l_m < l_m_blocking; l_m += l_m++ ) { libxsmm_x86_instruction_prefetch( io_generated_code, i_micro_kernel_config->prefetch_instruction, 
i_gp_reg_mapping->gp_reg_c_prefetch, LIBXSMM_X86_GP_REG_UNDEF, 0, ((l_n * i_xgemm_desc->ldc) + (l_m * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size)); } } #endif } } }
/*
 * Emits C source text for a multiplication in which B is sparse: every stored
 * entry of B that passes the row check below becomes one generated statement
 * of the form "C[..+l_m] += A[..+l_m] * B[..];" inside a single loop over l_m.
 *
 * io_generated_code  receives the text through libxsmm_append_code_as_string
 * i_xgemm_desc       supplies m, n, k, lda, ldc, beta and the flags tested below
 * i_arch             architecture name; selects the vectorization pragmas
 * i_row_idx          row index of each stored entry of B
 * i_column_idx       i_column_idx[c] is the position of the first stored entry
 *                    of column c, i_column_idx[c+1] the position after its last
 * i_values           not read here; the generated code refers to B by position
 *
 * An unrecognized architecture name is reported with LIBXSMM_ERR_ARCH and
 * generation stops at that point; text appended before stays in place.
 */
void libxsmm_generator_sparse_bsparse( libxsmm_generated_code*         io_generated_code,
                                       const libxsmm_xgemm_descriptor* i_xgemm_desc,
                                       const char*                     i_arch,
                                       const unsigned int*             i_row_idx,
                                       const unsigned int*             i_column_idx,
                                       const double*                   i_values ) {
  unsigned int l_col;
  unsigned int l_entry;
  unsigned int l_col_begin;
  unsigned int l_col_entries;
  unsigned int l_pos;
  unsigned int l_flops = 0;
  int l_aligned_a_c;
  int l_narrow_arch;
  int l_wide_arch;
  char l_text[512];
  int l_text_capacity = 511;
  int l_text_length = 0;

  LIBXSMM_UNUSED(i_values);

  l_text_length = LIBXSMM_SNPRINTF(l_text, l_text_capacity, " unsigned int l_m = 0;\n");
  libxsmm_append_code_as_string( io_generated_code, l_text, l_text_length );

  /* beta == 0: the generated code zeroes C before anything is accumulated */
  if ( i_xgemm_desc->beta == 0 ) {
    l_text_length = LIBXSMM_SNPRINTF(l_text, l_text_capacity, " unsigned int l_n = 0;\n");
    libxsmm_append_code_as_string( io_generated_code, l_text, l_text_length );

    l_text_length = LIBXSMM_SNPRINTF(l_text, l_text_capacity, " for ( l_n = 0; l_n < %u; l_n++) {\n", i_xgemm_desc->n);
    libxsmm_append_code_as_string( io_generated_code, l_text, l_text_length );

    if ( i_xgemm_desc->m > 1 ) {
      l_text_length = LIBXSMM_SNPRINTF(l_text, l_text_capacity, " #pragma simd\n");
      libxsmm_append_code_as_string( io_generated_code, l_text, l_text_length );
      l_text_length = LIBXSMM_SNPRINTF(l_text, l_text_capacity, " #pragma vector aligned\n");
      libxsmm_append_code_as_string( io_generated_code, l_text, l_text_length );
    }

    /* the zero literal carries an 'f' suffix only when the F32PREC flag is set */
    if ( (LIBXSMM_XGEMM_FLAG_F32PREC & i_xgemm_desc->flags) != 0 ) {
      l_text_length = LIBXSMM_SNPRINTF(l_text, l_text_capacity, " for ( l_m = 0; l_m < %u; l_m++) { C[(l_n*%u)+l_m] = 0.0f; }\n", i_xgemm_desc->m, i_xgemm_desc->ldc);
    } else {
      l_text_length = LIBXSMM_SNPRINTF(l_text, l_text_capacity, " for ( l_m = 0; l_m < %u; l_m++) { C[(l_n*%u)+l_m] = 0.0; }\n", i_xgemm_desc->m, i_xgemm_desc->ldc);
    }
    libxsmm_append_code_as_string( io_generated_code, l_text, l_text_length );

    l_text_length = LIBXSMM_SNPRINTF(l_text, l_text_capacity, " }\n");
    libxsmm_append_code_as_string( io_generated_code, l_text, l_text_length );
  }

  l_text_length = LIBXSMM_SNPRINTF(l_text, l_text_capacity, "\n");
  libxsmm_append_code_as_string( io_generated_code, l_text, l_text_length );

  /* the alignment pragma needs more than one row and both alignment flags */
  l_aligned_a_c = ( i_xgemm_desc->m > 1 )
               && ( (LIBXSMM_XGEMM_FLAG_ALIGN_A & i_xgemm_desc->flags) != 0 )
               && ( (LIBXSMM_XGEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0 );

  /* pick the simd pragma matching the architecture name */
  l_narrow_arch = ( strcmp( i_arch, "noarch" ) == 0 ) || ( strcmp( i_arch, "wsm" ) == 0 )
               || ( strcmp( i_arch, "snb" ) == 0 )    || ( strcmp( i_arch, "hsw" ) == 0 );
  l_wide_arch = ( l_narrow_arch == 0 )
             && ( ( strcmp( i_arch, "knc" ) == 0 ) || ( strcmp( i_arch, "knl" ) == 0 ) || ( strcmp( i_arch, "skx" ) == 0 ) );

  if ( (l_narrow_arch == 0) && (l_wide_arch == 0) ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_ARCH );
    return;
  }

  if ( l_narrow_arch != 0 ) {
    /* no pragma at all when m is 1 or less */
    if ( i_xgemm_desc->m > 1 ) {
      if ( i_xgemm_desc->m > 7 ) {
        l_text_length = LIBXSMM_SNPRINTF(l_text, l_text_capacity, " #pragma simd vectorlength(8)\n");
      } else if ( i_xgemm_desc->m > 3 ) {
        l_text_length = LIBXSMM_SNPRINTF(l_text, l_text_capacity, " #pragma simd vectorlength(4)\n");
      } else {
        l_text_length = LIBXSMM_SNPRINTF(l_text, l_text_capacity, " #pragma simd vectorlength(2)\n");
      }
      libxsmm_append_code_as_string( io_generated_code, l_text, l_text_length );
    }
    if ( l_aligned_a_c != 0 ) {
      l_text_length = LIBXSMM_SNPRINTF(l_text, l_text_capacity, " #pragma vector aligned\n");
      libxsmm_append_code_as_string( io_generated_code, l_text, l_text_length );
    }
  } else {
    if ( l_aligned_a_c != 0 ) {
      l_text_length = LIBXSMM_SNPRINTF(l_text, l_text_capacity, " #pragma simd vectorlength(32)\n #pragma vector aligned\n");
      libxsmm_append_code_as_string( io_generated_code, l_text, l_text_length );
    }
  }

  /* generate the actual kernel: one loop over l_m wrapping all update statements */
  l_text_length = LIBXSMM_SNPRINTF(l_text, l_text_capacity, " for ( l_m = 0; l_m < %u; l_m++) {\n", i_xgemm_desc->m);
  libxsmm_append_code_as_string( io_generated_code, l_text, l_text_length );

  for ( l_col = 0; l_col < i_xgemm_desc->n; ++l_col ) {
    l_col_begin = i_column_idx[l_col];
    l_col_entries = i_column_idx[l_col+1] - l_col_begin;

    for ( l_entry = 0; l_entry < l_col_entries; ++l_entry ) {
      l_pos = l_col_begin + l_entry;

      /* entries whose row index is not below k are left out */
      if ( i_row_idx[l_pos] >= i_xgemm_desc->k ) {
        continue;
      }

      l_text_length = LIBXSMM_SNPRINTF(l_text, l_text_capacity, " C[%u+l_m] += A[%u+l_m] * B[%u];\n", l_col * i_xgemm_desc->ldc, i_row_idx[l_pos]*i_xgemm_desc->lda, l_pos);
      libxsmm_append_code_as_string( io_generated_code, l_text, l_text_length );
      /* one multiply and one add per generated statement */
      l_flops += 2;
    }
  }

  l_text_length = LIBXSMM_SNPRINTF(l_text, l_text_capacity, " }\n");
  libxsmm_append_code_as_string( io_generated_code, l_text, l_text_length );

  /* add flop counter: statements counted above, times m */
  l_text_length = LIBXSMM_SNPRINTF(l_text, l_text_capacity, "\n#ifndef NDEBUG\n#ifdef _OPENMP\n#pragma omp atomic\n#endif\nlibxsmm_num_total_flops += %u;\n#endif\n", l_flops * i_xgemm_desc->m);
  libxsmm_append_code_as_string( io_generated_code, l_text, l_text_length );
}
LIBXSMM_INTERNAL_API_DEFINITION void libxsmm_generator_gemm_avx512_microkernel_nofsdbcst( libxsmm_generated_code* io_generated_code, const libxsmm_gp_reg_mapping* i_gp_reg_mapping, const libxsmm_micro_kernel_config* i_micro_kernel_config, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_m_blocking, const unsigned int i_n_blocking, const int i_offset ) { /* deriving register blocking from kernel config */ unsigned int l_m_blocking = i_m_blocking/i_micro_kernel_config->vector_length; /* register blocking counter in n */ unsigned int l_n = 0; /* register blocking counter in m */ unsigned int l_m = 0; /* start register of accumulator */ unsigned int l_vec_reg_acc_start = i_micro_kernel_config->vector_reg_count - (i_n_blocking * l_m_blocking); #if !defined(NDEBUG) if ( (i_n_blocking > 6) || (i_n_blocking < 1) ) { libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_N_BLOCK ); return; } if ( i_m_blocking % i_micro_kernel_config->vector_length != 0 ) { libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_M_BLOCK ); return; } #endif if (l_m_blocking == 1) { /* load column vectors of A */ libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->a_vmove_instruction, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, 0, i_micro_kernel_config->vector_name, i_n_blocking, i_micro_kernel_config->use_masking_a_c, 0 ); /* loop over columns of B */ for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { /* post increment of a pointer early */ if ( l_n == 0 ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) ); } /* different ways of using B */ if ( i_offset != (-1) ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->b_vmove_instruction, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, 
((i_micro_kernel_config->datatype_size) * i_offset) + (i_xgemm_desc->ldb * l_n * (i_micro_kernel_config->datatype_size)), i_micro_kernel_config->vector_name, l_n, i_micro_kernel_config->use_masking_a_c, 0 ); } else { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->b_vmove_instruction, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_n, i_micro_kernel_config->use_masking_a_c, 0 ); if ( l_n == (i_n_blocking -1) ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b, i_micro_kernel_config->datatype_size ); } } /* issue fma */ libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, i_n_blocking, l_n, l_vec_reg_acc_start + l_n ); } } else { /* broadcast from B -> into vec registers 0 to i_n_blocking */ if ( i_offset != (-1) ) { for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->b_vmove_instruction, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, ((i_micro_kernel_config->datatype_size) * i_offset) + (i_xgemm_desc->ldb * l_n * (i_micro_kernel_config->datatype_size)), i_micro_kernel_config->vector_name, l_n, i_micro_kernel_config->use_masking_a_c, 0 ); } } else { for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->b_vmove_instruction, i_gp_reg_mapping->gp_reg_b, LIBXSMM_X86_GP_REG_UNDEF, 0, i_xgemm_desc->ldb * l_n * i_micro_kernel_config->datatype_size, i_micro_kernel_config->vector_name, l_n, i_micro_kernel_config->use_masking_a_c, 0 ); } libxsmm_x86_instruction_alu_imm( 
io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_b, i_micro_kernel_config->datatype_size ); } if (l_m_blocking == 4) { /* load column vectors of A and multiply with all broadcasted row entries of B */ for ( l_m = 0; l_m < l_m_blocking ; l_m++ ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->a_vmove_instruction, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, (i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_m, i_micro_kernel_config->vector_name, i_n_blocking, i_micro_kernel_config->use_masking_a_c, 0 ); for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { /* post increment early */ if ( (l_m == (l_m_blocking-1)) && (l_n == 0) ) { libxsmm_x86_instruction_alu_imm( io_generated_code, i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) ); } /* issue fma */ libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, i_n_blocking, l_n, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) ); } } } else { /* load column vectors of A and multiply with all broadcasted row entries of B */ for ( l_m = 0; l_m < l_m_blocking ; l_m++ ) { libxsmm_x86_instruction_vec_move( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->a_vmove_instruction, i_gp_reg_mapping->gp_reg_a, LIBXSMM_X86_GP_REG_UNDEF, 0, (i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_m, i_micro_kernel_config->vector_name, i_n_blocking+l_m, i_micro_kernel_config->use_masking_a_c, 0 ); } for ( l_m = 0; l_m < l_m_blocking ; l_m++ ) { for ( l_n = 0; l_n < i_n_blocking; l_n++ ) { /* post increment early */ if ( (l_m == (l_m_blocking-1)) && (l_n == 0) ) { libxsmm_x86_instruction_alu_imm( io_generated_code, 
i_micro_kernel_config->alu_add_instruction, i_gp_reg_mapping->gp_reg_a, (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) ); } /* issue fma */ libxsmm_x86_instruction_vec_compute_reg( io_generated_code, i_micro_kernel_config->instruction_set, i_micro_kernel_config->vmul_instruction, i_micro_kernel_config->vector_name, i_n_blocking+l_m, l_n, l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) ); } } } } }
/* vector registers named in the clobber list for wsm/snb/hsw inline assembly,
   including the text that closes the asm statement */
#define LIBXSMM_DENSE_X86_CLOSE_XMM_CLOBBERS \
  "\"xmm0\",\"xmm1\",\"xmm2\",\"xmm3\",\"xmm4\",\"xmm5\",\"xmm6\",\"xmm7\"," \
  "\"xmm8\",\"xmm9\",\"xmm10\",\"xmm11\",\"xmm12\",\"xmm13\",\"xmm14\",\"xmm15\");\n"

/* mask, general purpose and vector registers named in the clobber list for
   every other architecture, including the text that closes the asm statement */
#define LIBXSMM_DENSE_X86_CLOSE_ZMM_CLOBBERS \
  "\"k1\",\"rax\",\"rbx\",\"rcx\",\"rdx\",\"rdi\",\"rsi\",\"r8\",\"r9\",\"r10\",\"r12\",\"r13\",\"r14\",\"r15\"," \
  "\"zmm0\",\"zmm1\",\"zmm2\",\"zmm3\",\"zmm4\",\"zmm5\",\"zmm6\",\"zmm7\"," \
  "\"zmm8\",\"zmm9\",\"zmm10\",\"zmm11\",\"zmm12\",\"zmm13\",\"zmm14\",\"zmm15\"," \
  "\"zmm16\",\"zmm17\",\"zmm18\",\"zmm19\",\"zmm20\",\"zmm21\",\"zmm22\",\"zmm23\"," \
  "\"zmm24\",\"zmm25\",\"zmm26\",\"zmm27\",\"zmm28\",\"zmm29\",\"zmm30\",\"zmm31\");\n"

/*
 * Emits the text that ends a generated dense x86 kernel.
 *
 * code_type > 1 : nothing is emitted (encoding is still a TODO).
 * code_type == 1: assembly text. Registers pushed by
 *                 libxsmm_generator_dense_x86_open_instruction_stream are
 *                 popped in the opposite order, the pointer registers are
 *                 checked not to be callee-save, and "retq" is appended.
 * code_type == 0: inline assembly. The operand list and the clobber list that
 *                 close the asm statement are appended; the operand list
 *                 depends on i_prefetch, the clobber list on i_arch.
 *
 * io_generated_code  receives the text / the error state
 * i_gp_reg_mapping   registers used for A, B, C, the prefetch pointers and the
 *                    m/n/k loop counters
 * i_arch             "wsm", "snb" and "hsw" select the register-name based
 *                    variants; any other name selects the fixed register lists
 * i_prefetch         prefetch strategy name (only read for code_type == 0)
 *
 * A callee-save pointer register is reported through libxsmm_handle_error and
 * ends generation before "retq" is emitted; pops emitted earlier stay in place.
 */
void libxsmm_generator_dense_x86_close_instruction_stream( libxsmm_generated_code*       io_generated_code,
                                                           const libxsmm_gp_reg_mapping* i_gp_reg_mapping,
                                                           const char*                   i_arch,
                                                           const char*                   i_prefetch) {
  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    /* @TODO-GREG call encoding here */
    /* @TODO-GREG: how do we interface here? */
  } else if ( io_generated_code->code_type == 1 ) {
    /* @TODO this is currently System V AMD64 RTL(C) ABI only */
    char l_new_code[512];
    int l_max_code_length = 511;
    int l_code_length = 0;
    char l_gp_reg_name[4];

    if ( (strcmp(i_arch, "wsm") == 0) || (strcmp(i_arch, "snb") == 0) || (strcmp(i_arch, "hsw") == 0) ) {
      /* only loop registers that are callee-save were pushed; restore k, n, m */
      if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_kloop ) ) {
        libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_kloop, l_gp_reg_name, 3 );
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " popq %%%s\n", l_gp_reg_name );
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      }
      if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_nloop ) ) {
        libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_nloop, l_gp_reg_name, 3 );
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " popq %%%s\n", l_gp_reg_name );
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      }
      if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_mloop ) ) {
        libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_mloop, l_gp_reg_name, 3 );
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " popq %%%s\n", l_gp_reg_name );
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      }
    } else {
      /* fixed restore sequence. The '%' of each register name is written as
         "%%": a bare "%r" is not a valid conversion specification for the
         printf-style formatter and must not be relied upon. */
      l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " popq %%r15\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " popq %%r14\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " popq %%r13\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " popq %%r12\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " popq %%rbx\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    }

    /* the pointer registers are never saved/restored, so they must not be callee-save */
    if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_b_prefetch ) ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CALLEE_SAVE_B_PREF );
      return;
    }
    if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_a_prefetch ) ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CALLEE_SAVE_A_PREF );
      return;
    }
    if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_c ) ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CALLEE_SAVE_C );
      return;
    }
    if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_b ) ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CALLEE_SAVE_B );
      return;
    }
    if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_a ) ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CALLEE_SAVE_A );
      return;
    }

    /* @TODO: I don't know if this is the correct placement in the generation process */
    l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " retq\n" );
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  } else {
    char l_new_code[1024];
    int l_max_code_length = 1023;
    int l_code_length = 0;
    char l_gp_reg_a[4];
    char l_gp_reg_b[4];
    char l_gp_reg_c[4];
    char l_gp_reg_pre_a[4];
    char l_gp_reg_pre_b[4];
    char l_gp_reg_mloop[4];
    char l_gp_reg_nloop[4];
    char l_gp_reg_kloop[4];
    /* non-zero for the architectures whose clobber list names the mapped registers */
    const int l_named_reg_arch = ( (strcmp(i_arch, "wsm") == 0) || (strcmp(i_arch, "snb") == 0) || (strcmp(i_arch, "hsw") == 0) );

    libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_a, l_gp_reg_a, 3 );
    libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_b, l_gp_reg_b, 3 );
    libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_c, l_gp_reg_c, 3 );
    libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_a_prefetch, l_gp_reg_pre_a, 3 );
    libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_b_prefetch, l_gp_reg_pre_b, 3 );
    libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_mloop, l_gp_reg_mloop, 3 );
    libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_nloop, l_gp_reg_nloop, 3 );
    libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_kloop, l_gp_reg_kloop, 3 );

    if ( ( strcmp(i_prefetch, "BL2viaC") == 0 ) || ( strcmp(i_prefetch, "curAL2_BL2viaC") == 0 ) ) {
      /* B prefetch pointer is the fourth operand */
      if ( l_named_reg_arch != 0 ) {
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length,
          " : : \"m\"(B), \"m\"(A), \"m\"(C), \"m\"(B_prefetch) : \"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\","
          LIBXSMM_DENSE_X86_CLOSE_XMM_CLOBBERS,
          l_gp_reg_a, l_gp_reg_b, l_gp_reg_c, l_gp_reg_pre_b, l_gp_reg_mloop, l_gp_reg_nloop, l_gp_reg_kloop);
      } else {
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length,
          " : : \"m\"(B), \"m\"(A), \"m\"(C), \"m\"(B_prefetch) : "
          LIBXSMM_DENSE_X86_CLOSE_ZMM_CLOBBERS);
      }
    } else if ( ( strcmp(i_prefetch, "AL2jpst") == 0 ) || ( strcmp(i_prefetch, "AL2") == 0 ) ) {
      /* A prefetch pointer is the fourth operand */
      if ( l_named_reg_arch != 0 ) {
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length,
          " : : \"m\"(B), \"m\"(A), \"m\"(C), \"m\"(A_prefetch) : \"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\","
          LIBXSMM_DENSE_X86_CLOSE_XMM_CLOBBERS,
          l_gp_reg_a, l_gp_reg_b, l_gp_reg_c, l_gp_reg_pre_a, l_gp_reg_mloop, l_gp_reg_nloop, l_gp_reg_kloop);
      } else {
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length,
          " : : \"m\"(B), \"m\"(A), \"m\"(C), \"m\"(A_prefetch) : "
          LIBXSMM_DENSE_X86_CLOSE_ZMM_CLOBBERS);
      }
    } else if ( ( strcmp(i_prefetch, "AL2jpst_BL2viaC") == 0 ) || ( strcmp(i_prefetch, "AL2_BL2viaC") == 0 ) ) {
      /* A and B prefetch pointers are the fourth and fifth operand */
      if ( l_named_reg_arch != 0 ) {
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length,
          " : : \"m\"(B), \"m\"(A), \"m\"(C), \"m\"(A_prefetch), \"m\"(B_prefetch) : \"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\","
          LIBXSMM_DENSE_X86_CLOSE_XMM_CLOBBERS,
          l_gp_reg_a, l_gp_reg_b, l_gp_reg_c, l_gp_reg_pre_a, l_gp_reg_pre_b, l_gp_reg_mloop, l_gp_reg_nloop, l_gp_reg_kloop);
      } else {
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length,
          " : : \"m\"(B), \"m\"(A), \"m\"(C), \"m\"(A_prefetch), \"m\"(B_prefetch) : "
          LIBXSMM_DENSE_X86_CLOSE_ZMM_CLOBBERS);
      }
    } else {
      /* any other prefetch name: only B, A and C are operands */
      if ( l_named_reg_arch != 0 ) {
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length,
          " : : \"m\"(B), \"m\"(A), \"m\"(C) : \"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\","
          LIBXSMM_DENSE_X86_CLOSE_XMM_CLOBBERS,
          l_gp_reg_a, l_gp_reg_b, l_gp_reg_c, l_gp_reg_mloop, l_gp_reg_nloop, l_gp_reg_kloop);
      } else {
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length,
          " : : \"m\"(B), \"m\"(A), \"m\"(C) : "
          LIBXSMM_DENSE_X86_CLOSE_ZMM_CLOBBERS);
      }
    }
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  }
}

#undef LIBXSMM_DENSE_X86_CLOSE_ZMM_CLOBBERS
#undef LIBXSMM_DENSE_X86_CLOSE_XMM_CLOBBERS
/*
 * Emits the text that starts a generated dense x86 kernel and then resets the
 * m, n and k loop counter registers to zero.
 *
 * code_type > 1 : no prologue text is emitted (encoding is still a TODO).
 * code_type == 1: assembly text. The A, B, C and prefetch pointer registers
 *                 are checked not to be callee-save; afterwards either the
 *                 callee-save loop registers (wsm/snb/hsw) or the fixed set
 *                 rbx, r12, r13, r14, r15 (any other architecture) is pushed.
 * code_type == 0: inline assembly. The asm statement is opened and one "movq"
 *                 per operand loads B, A, C and, depending on i_prefetch, the
 *                 prefetch pointers into their mapped registers.
 *
 * io_generated_code  receives the text / the error state
 * i_gp_reg_mapping   registers used for A, B, C, the prefetch pointers and the
 *                    m/n/k loop counters
 * i_arch             architecture name (only read for code_type == 1)
 * i_prefetch         prefetch strategy name (only read for code_type == 0)
 *
 * A callee-save pointer register is reported through libxsmm_handle_error and
 * ends generation immediately; in that case the loop counters are not reset.
 */
void libxsmm_generator_dense_x86_open_instruction_stream( libxsmm_generated_code*       io_generated_code,
                                                          const libxsmm_gp_reg_mapping* i_gp_reg_mapping,
                                                          const char*                   i_arch,
                                                          const char*                   i_prefetch) {
  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    /* @TODO-GREG call encoding here */
    /* @TODO-GREG: how do we interface here? */
    /* this is start of the xGEMM kernel, the registers are in the variables */
  } else if ( io_generated_code->code_type == 1 ) {
    /* @TODO this is currently System V AMD64 RTL(C) ABI only */
    char l_new_code[512];
    int l_max_code_length = 511;
    int l_code_length = 0;
    char l_gp_reg_name[4];

    /* the pointer registers are never saved, so they must not be callee-save */
    if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_a ) ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CALLEE_SAVE_A );
      return;
    }
    if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_b ) ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CALLEE_SAVE_B );
      return;
    }
    if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_c ) ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CALLEE_SAVE_C );
      return;
    }
    if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_a_prefetch ) ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CALLEE_SAVE_A_PREF );
      return;
    }
    if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_b_prefetch ) ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CALLEE_SAVE_B_PREF );
      return;
    }

    if ( (strcmp(i_arch, "wsm") == 0) || (strcmp(i_arch, "snb") == 0) || (strcmp(i_arch, "hsw") == 0) ) {
      /* save only those loop registers that are callee-save: m, n, k */
      if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_mloop ) ) {
        libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_mloop, l_gp_reg_name, 3 );
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " pushq %%%s\n", l_gp_reg_name );
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      }
      if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_nloop ) ) {
        libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_nloop, l_gp_reg_name, 3 );
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " pushq %%%s\n", l_gp_reg_name );
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      }
      if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_kloop ) ) {
        libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_kloop, l_gp_reg_name, 3 );
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " pushq %%%s\n", l_gp_reg_name );
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      }
    } else {
      /* fixed save sequence. The '%' of each register name is written as
         "%%": a bare "%r" is not a valid conversion specification for the
         printf-style formatter and must not be relied upon. */
      l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " pushq %%rbx\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " pushq %%r12\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " pushq %%r13\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " pushq %%r14\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " pushq %%r15\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    }
  } else {
    char l_new_code[512];
    int l_max_code_length = 511;
    int l_code_length = 0;
    char l_gp_reg_name[4];

    /* loading b pointer in assembly; this line also opens the asm statement */
    libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_b, l_gp_reg_name, 3 );
    l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " __asm__ __volatile__(\"movq %%0, %%%%%s\\n\\t\"\n", l_gp_reg_name );
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

    /* loading a pointer in assembly */
    libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_a, l_gp_reg_name, 3 );
    l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " \"movq %%1, %%%%%s\\n\\t\"\n", l_gp_reg_name );
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

    /* loading c pointer in assembly */
    libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_c, l_gp_reg_name, 3 );
    l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " \"movq %%2, %%%%%s\\n\\t\"\n", l_gp_reg_name );
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

    if ( ( strcmp(i_prefetch, "BL2viaC") == 0 ) || ( strcmp(i_prefetch, "curAL2_BL2viaC") == 0 ) ) {
      /* loading b prefetch pointer in assembly (operand 3) */
      libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_b_prefetch, l_gp_reg_name, 3 );
      l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " \"movq %%3, %%%%%s\\n\\t\"\n", l_gp_reg_name );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    } else if ( ( strcmp(i_prefetch, "AL2jpst") == 0 ) || ( strcmp(i_prefetch, "AL2") == 0 ) ) {
      /* loading a prefetch pointer in assembly (operand 3) */
      libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_a_prefetch, l_gp_reg_name, 3 );
      l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " \"movq %%3, %%%%%s\\n\\t\"\n", l_gp_reg_name );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    } else if ( ( strcmp(i_prefetch, "AL2jpst_BL2viaC") == 0 ) || ( strcmp(i_prefetch, "AL2_BL2viaC") == 0 ) ) {
      /* loading a and b prefetch pointer in assembly (operands 3 and 4) */
      libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_a_prefetch, l_gp_reg_name, 3 );
      l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " \"movq %%3, %%%%%s\\n\\t\"\n", l_gp_reg_name );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_b_prefetch, l_gp_reg_name, 3 );
      l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, " \"movq %%4, %%%%%s\\n\\t\"\n", l_gp_reg_name );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    } else {
      /* any other prefetch name: no prefetch pointer is loaded */
    }
  }

  /* reset loop counters */
  libxsmm_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, i_gp_reg_mapping->gp_reg_mloop, 0 );
  libxsmm_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, i_gp_reg_mapping->gp_reg_nloop, 0 );
  libxsmm_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, i_gp_reg_mapping->gp_reg_kloop, 0 );
}
/*
 * Appends the text of one vector load or store with base+displacement
 * addressing.
 *
 * io_generated_code   receives the text; code_type 0 produces a quoted inline
 *                     assembly line, code_type 1 a plain assembly line, and
 *                     code_type > 1 produces nothing (encoding is a TODO)
 * i_instruction_set   together with i_use_masking selects the masked forms
 * i_vmove_instr       instruction id, translated to its name for the text
 * i_gp_reg_base       base address register
 * i_gp_reg_idx        must be LIBXSMM_X86_GP_REG_UNDEF; anything else is
 *                     rejected in builds without NDEBUG and ignored otherwise
 * i_scale             not used
 * i_displacement      displacement printed in front of the base register
 * i_vector_name       character printed in front of "mm" in the register name
 * i_vec_reg_number_0  vector register number
 * i_use_masking       non-zero requests the masked form on AVX-512 and IMCI
 * i_is_store          zero emits a load (memory first), non-zero a store
 */
void libxsmm_instruction_vec_move( libxsmm_generated_code* io_generated_code,
                                   const unsigned int      i_instruction_set,
                                   const unsigned int      i_vmove_instr,
                                   const unsigned int      i_gp_reg_base,
                                   const unsigned int      i_gp_reg_idx,
                                   const unsigned int      i_scale,
                                   const int               i_displacement,
                                   const char              i_vector_name,
                                   const unsigned int      i_vec_reg_number_0,
                                   const unsigned int      i_use_masking,
                                   const unsigned int      i_is_store ) {
  char l_line[512];
  int l_line_capacity = 511;
  int l_line_length = 0;
  char l_base_reg[4];
  char l_mnemonic[16];
  int l_inline_asm;
  int l_is_load;
  int l_masked_avx512;
  int l_masked_imci;

#ifndef NDEBUG
  if ( i_gp_reg_idx != LIBXSMM_X86_GP_REG_UNDEF ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_NO_INDEX_SCALE_ADDR );
    return;
  }
#endif

  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    /* @TODO-GREG call encoding here */
    return;
  }

  libxsmm_get_x86_gp_reg_name( i_gp_reg_base, l_base_reg, 3 );
  libxsmm_get_x86_instr_name( i_vmove_instr, l_mnemonic, 15 );

  l_inline_asm    = ( io_generated_code->code_type == 0 );
  l_is_load       = ( i_is_store == 0 );
  l_masked_avx512 = ( (i_instruction_set == LIBXSMM_X86_AVX512) && (i_use_masking != 0) );
  l_masked_imci   = ( (l_masked_avx512 == 0) && (i_instruction_set == LIBXSMM_X86_IMCI) && (i_use_masking != 0) );

  if ( (l_masked_avx512 != 0) || (l_masked_imci != 0) ) {
    if ( l_is_load == 0 ) {
      /* masked store: the text is the same for AVX-512 and IMCI */
      if ( l_inline_asm != 0 ) {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, " \"%s %%%%%cmm%i, %i(%%%%%s)%%{%%%%k%i%%}\\n\\t\"\n", l_mnemonic, i_vector_name, i_vec_reg_number_0, i_displacement, l_base_reg, LIBXSMM_X86_IMCI_AVX512_MASK );
      } else {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, " %s %%%cmm%i, %i(%%%s){%%k%i}\n", l_mnemonic, i_vector_name, i_vec_reg_number_0, i_displacement, l_base_reg, LIBXSMM_X86_IMCI_AVX512_MASK );
      }
    } else if ( l_masked_avx512 != 0 ) {
      /* masked load on AVX-512: "{z}" follows the mask register */
      if ( l_inline_asm != 0 ) {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, " \"%s %i(%%%%%s), %%%%%cmm%i%%{%%%%k%i%%}%%{z%%}\\n\\t\"\n", l_mnemonic, i_displacement, l_base_reg, i_vector_name, i_vec_reg_number_0, LIBXSMM_X86_IMCI_AVX512_MASK );
      } else {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, " %s %i(%%%s), %%%cmm%i{%%k%i}{z}\n", l_mnemonic, i_displacement, l_base_reg, i_vector_name, i_vec_reg_number_0, LIBXSMM_X86_IMCI_AVX512_MASK );
      }
    } else {
      /* masked load on IMCI: mask register only */
      if ( l_inline_asm != 0 ) {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, " \"%s %i(%%%%%s), %%%%%cmm%i%%{%%%%k%i%%}\\n\\t\"\n", l_mnemonic, i_displacement, l_base_reg, i_vector_name, i_vec_reg_number_0, LIBXSMM_X86_IMCI_AVX512_MASK );
      } else {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, " %s %i(%%%s), %%%cmm%i{%%k%i}\n", l_mnemonic, i_displacement, l_base_reg, i_vector_name, i_vec_reg_number_0, LIBXSMM_X86_IMCI_AVX512_MASK );
      }
    }
  } else {
    /* unmasked vmovpd/ps/sd/ss: load puts the memory operand first, store the register */
    if ( l_inline_asm != 0 ) {
      if ( l_is_load != 0 ) {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, " \"%s %i(%%%%%s), %%%%%cmm%i\\n\\t\"\n", l_mnemonic, i_displacement, l_base_reg, i_vector_name, i_vec_reg_number_0 );
      } else {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, " \"%s %%%%%cmm%i, %i(%%%%%s)\\n\\t\"\n", l_mnemonic, i_vector_name, i_vec_reg_number_0, i_displacement, l_base_reg );
      }
    } else {
      if ( l_is_load != 0 ) {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, " %s %i(%%%s), %%%cmm%i\n", l_mnemonic, i_displacement, l_base_reg, i_vector_name, i_vec_reg_number_0 );
      } else {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, " %s %%%cmm%i, %i(%%%s)\n", l_mnemonic, i_vector_name, i_vec_reg_number_0, i_displacement, l_base_reg );
      }
    }
  }

  libxsmm_append_code_as_string( io_generated_code, l_line, l_line_length );
}
/*
 * Appends the text of one vector compute instruction of the form
 *   <instr> <memory operand>[{broadcast}], <vector reg 0>, <vector reg 1>
 * to io_generated_code.
 *
 * io_generated_code   code container; its code_type selects the output flavour:
 *                       >1 : nothing is emitted (encoding is still a TODO),
 *                        0 : the line is wrapped in double quotes, ends in a
 *                            literal "\n\t", and '%', '{', '}' are escaped
 *                            (presumably for use as an inline assembly string),
 *                       otherwise: a plain assembly line is emitted.
 * i_instruction_set   target instruction set; only checked when a broadcast
 *                     is requested.
 * i_vec_instr         instruction id, translated by libxsmm_get_x86_instr_name.
 * i_use_broadcast     non-zero adds a {1to8}/{1to16} decorator to the memory
 *                     operand; only accepted for LIBXSMM_X86_IMCI and
 *                     LIBXSMM_X86_AVX512, otherwise
 *                     LIBXSMM_ERR_NO_IMCI_AVX512_BCAST is reported.
 * i_gp_reg_base       base register of the memory operand.
 * i_gp_reg_idx        index register, or LIBXSMM_X86_GP_REG_UNDEF to address
 *                     with displacement(base) only.
 * i_scale             scale factor, printed only when an index register is used.
 * i_displacement      displacement of the memory operand.
 * i_vector_name       single character placed in front of "mm" to form the
 *                     vector register name.
 * i_vec_reg_number_0  number of the first vector register operand.
 * i_vec_reg_number_1  number of the second vector register operand.
 */
void libxsmm_instruction_vec_compute_mem( libxsmm_generated_code* io_generated_code,
                                          const unsigned int      i_instruction_set,
                                          const unsigned int      i_vec_instr,
                                          const unsigned int      i_use_broadcast,
                                          const unsigned int      i_gp_reg_base,
                                          const unsigned int      i_gp_reg_idx,
                                          const unsigned int      i_scale,
                                          const int               i_displacement,
                                          const char              i_vector_name,
                                          const unsigned int      i_vec_reg_number_0,
                                          const unsigned int      i_vec_reg_number_1 ) {
  char l_text[512];
  const int l_text_capacity = 511;
  int l_text_length = 0;
  char l_base_name[4];
  char l_index_name[4];
  char l_mnemonic[16];
  char l_bcast_decorator[8];
  unsigned int l_is_single = 0;
  const int l_has_index = ( i_gp_reg_idx != LIBXSMM_X86_GP_REG_UNDEF );

  /* @TODO add checks in debug mode */
  /* a broadcast decorator is only available on IMCI and AVX512 */
  if ( (i_use_broadcast != 0) &&
       (i_instruction_set != LIBXSMM_X86_IMCI) &&
       (i_instruction_set != LIBXSMM_X86_AVX512) ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_NO_IMCI_AVX512_BCAST );
    return;
  }

  if ( io_generated_code->code_type > 1 ) {
    /* @TODO-GREG call encoding here */
    return;
  }

  /* translate the numeric ids into their textual names */
  libxsmm_get_x86_gp_reg_name( i_gp_reg_base, l_base_name, 3 );
  libxsmm_get_x86_instr_name( i_vec_instr, l_mnemonic, 15 );

  /* the decorator is prepared unconditionally, it is only printed on request */
  l_is_single = libxsmm_is_x86_vec_instr_single_precision( i_vec_instr );
  if ( l_is_single != 0 ) {
    libxsmm_snprintf( l_bcast_decorator, 7, "1to16" );
  } else {
    libxsmm_snprintf( l_bcast_decorator, 7, "1to8" );
  }

  if ( l_has_index ) {
    libxsmm_get_x86_gp_reg_name( i_gp_reg_idx, l_index_name, 3 );
  }

  /* build vXYZpd/ps/sd/ss instruction with a memory operand */
  if ( io_generated_code->code_type == 0 ) {
    /* quoted flavour */
    if ( l_has_index ) {
      if ( i_use_broadcast != 0 ) {
        l_text_length = libxsmm_snprintf( l_text, l_text_capacity,
                                          " \"%s %i(%%%%%s,%%%%%s,%i)%%{%s%%}, %%%%%cmm%i, %%%%%cmm%i\\n\\t\"\n",
                                          l_mnemonic, i_displacement, l_base_name, l_index_name, i_scale, l_bcast_decorator,
                                          i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
      } else {
        l_text_length = libxsmm_snprintf( l_text, l_text_capacity,
                                          " \"%s %i(%%%%%s,%%%%%s,%i), %%%%%cmm%i, %%%%%cmm%i\\n\\t\"\n",
                                          l_mnemonic, i_displacement, l_base_name, l_index_name, i_scale,
                                          i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
      }
    } else {
      if ( i_use_broadcast != 0 ) {
        l_text_length = libxsmm_snprintf( l_text, l_text_capacity,
                                          " \"%s %i(%%%%%s)%%{%s%%}, %%%%%cmm%i, %%%%%cmm%i\\n\\t\"\n",
                                          l_mnemonic, i_displacement, l_base_name, l_bcast_decorator,
                                          i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
      } else {
        l_text_length = libxsmm_snprintf( l_text, l_text_capacity,
                                          " \"%s %i(%%%%%s), %%%%%cmm%i, %%%%%cmm%i\\n\\t\"\n",
                                          l_mnemonic, i_displacement, l_base_name,
                                          i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
      }
    }
  } else {
    /* plain assembly flavour */
    if ( l_has_index ) {
      if ( i_use_broadcast != 0 ) {
        l_text_length = libxsmm_snprintf( l_text, l_text_capacity,
                                          " %s %i(%%%s,%%%s,%i){%s}, %%%cmm%i, %%%cmm%i\n",
                                          l_mnemonic, i_displacement, l_base_name, l_index_name, i_scale, l_bcast_decorator,
                                          i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
      } else {
        l_text_length = libxsmm_snprintf( l_text, l_text_capacity,
                                          " %s %i(%%%s,%%%s,%i), %%%cmm%i, %%%cmm%i\n",
                                          l_mnemonic, i_displacement, l_base_name, l_index_name, i_scale,
                                          i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
      }
    } else {
      if ( i_use_broadcast != 0 ) {
        l_text_length = libxsmm_snprintf( l_text, l_text_capacity,
                                          " %s %i(%%%s){%s}, %%%cmm%i, %%%cmm%i\n",
                                          l_mnemonic, i_displacement, l_base_name, l_bcast_decorator,
                                          i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
      } else {
        l_text_length = libxsmm_snprintf( l_text, l_text_capacity,
                                          " %s %i(%%%s), %%%cmm%i, %%%cmm%i\n",
                                          l_mnemonic, i_displacement, l_base_name,
                                          i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
      }
    }
  }

  libxsmm_append_code_as_string( io_generated_code, l_text, l_text_length );
}
/*
 * Emits the instruction sequence of one dense AVX512 micro kernel step:
 * loads of A, optional prefetches of A, and for each of the i_n_blocking
 * columns one broadcasting compute instruction that reads B from memory.
 *
 * Two modes are selected through i_offset:
 *   i_offset != -1 : external k-unrolling. A single k step is emitted whose
 *                    displacements are derived from i_offset; i_k_blocking is
 *                    not used and no pointer is advanced.
 *   i_offset == -1 : i_k_blocking k steps are emitted. Loads of A alternate
 *                    between vector registers 0 and 1 and are issued one step
 *                    ahead; in the last step the A pointer (and the A prefetch
 *                    pointer, if such a prefetch is active) is advanced, and
 *                    afterwards the B pointer is advanced unless
 *                    i_k_blocking >= i_xgemm_desc->k.
 *
 * The second vector register operand of the compute instruction for column n
 * is vector_reg_count - i_n_blocking + n (presumably the accumulator of that
 * column - confirm against the caller).
 *
 * Only when NDEBUG is not defined: i_n_blocking > 30 is reported as
 * LIBXSMM_ERR_N_BLOCK, and a warning is printed to stderr when i_offset >= 0
 * is combined with i_k_blocking != 1.
 *
 * io_generated_code      code container the instructions are appended to.
 * i_gp_reg_mapping       supplies the registers for A, B and the A prefetch.
 * i_micro_kernel_config  supplies instruction ids, vector name, datatype
 *                        size, register count and masking flag.
 * i_xgemm_desc           supplies lda, ldb, k and the prefetch strategy name.
 * i_n_blocking           number of columns handled per k step.
 * i_k_blocking           number of k steps (internal unrolling only).
 * i_offset               -1 for internal k blocking, otherwise the k offset.
 */
void libxsmm_generator_dense_avx512_microkernel( libxsmm_generated_code*            io_generated_code,
                                                 const libxsmm_gp_reg_mapping*      i_gp_reg_mapping,
                                                 const libxsmm_micro_kernel_config* i_micro_kernel_config,
                                                 const libxsmm_xgemm_descriptor*    i_xgemm_desc,
                                                 const unsigned int                 i_n_blocking,
                                                 const unsigned int                 i_k_blocking,
                                                 const int                          i_offset ) {
  unsigned int l_col;
  unsigned int l_step;

#ifndef NDEBUG
  if ( i_n_blocking > 30 ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_N_BLOCK );
    return;
  }
  if ( (i_k_blocking != 1) && (i_offset >= 0) ) {
    fprintf(stderr, "LIBXSMM WARNING, libxsmm_generator_dense_avx512_microkernel: i_k_blocking is ignored as offset is >=0\n");
  }
#endif

  /* every offset other than -1 means: k is unrolled by the caller */
  if ( i_offset != (-1) ) {
    /* load A */
    libxsmm_instruction_vec_move( io_generated_code,
                                  i_micro_kernel_config->instruction_set,
                                  i_micro_kernel_config->a_vmove_instruction,
                                  i_gp_reg_mapping->gp_reg_a,
                                  LIBXSMM_X86_GP_REG_UNDEF, 0,
                                  i_xgemm_desc->lda * i_offset * i_micro_kernel_config->datatype_size,
                                  i_micro_kernel_config->vector_name,
                                  0,
                                  i_micro_kernel_config->use_masking_a_c, 0 );

    /* prefetch of the current A: same column, 64 bytes further down */
    if ( (strcmp( i_xgemm_desc->prefetch, "curAL2" ) == 0) ||
         (strcmp( i_xgemm_desc->prefetch, "curAL2_BL2viaC" ) == 0) ) {
      libxsmm_instruction_prefetch( io_generated_code,
                                    i_micro_kernel_config->prefetch_instruction,
                                    i_gp_reg_mapping->gp_reg_a,
                                    LIBXSMM_X86_GP_REG_UNDEF, 0,
                                    (i_xgemm_desc->lda * i_offset * i_micro_kernel_config->datatype_size) + 64 );
    }

    /* prefetch of the next A: same position, addressed through the prefetch pointer */
    if ( (strcmp( i_xgemm_desc->prefetch, "AL2jpst" ) == 0) ||
         (strcmp( i_xgemm_desc->prefetch, "AL2jpst_BL2viaC" ) == 0) ||
         (strcmp( i_xgemm_desc->prefetch, "AL2" ) == 0) ||
         (strcmp( i_xgemm_desc->prefetch, "AL2_BL2viaC" ) == 0) ) {
      libxsmm_instruction_prefetch( io_generated_code,
                                    i_micro_kernel_config->prefetch_instruction,
                                    i_gp_reg_mapping->gp_reg_a_prefetch,
                                    LIBXSMM_X86_GP_REG_UNDEF, 0,
                                    (i_xgemm_desc->lda * i_offset * i_micro_kernel_config->datatype_size) );
    }

    /* one compute instruction per column: A vector times broadcast element of B */
    for ( l_col = 0; l_col < i_n_blocking; l_col++ ) {
      libxsmm_instruction_vec_compute_mem( io_generated_code,
                                           i_micro_kernel_config->instruction_set,
                                           i_micro_kernel_config->vmul_instruction,
                                           1,
                                           i_gp_reg_mapping->gp_reg_b,
                                           LIBXSMM_X86_GP_REG_UNDEF, 0,
                                           (i_offset * i_micro_kernel_config->datatype_size) + (i_xgemm_desc->ldb * i_micro_kernel_config->datatype_size * l_col),
                                           i_micro_kernel_config->vector_name,
                                           0,
                                           i_micro_kernel_config->vector_reg_count - i_n_blocking + l_col );
    }
    return;
  }

  /* internal k blocking */
  for ( l_step = 0; l_step < i_k_blocking; l_step++ ) {
    const int l_is_last_step = ( l_step == (i_k_blocking - 1) );

    /* the very first step has to fetch its own A column into register 0 */
    if ( l_step == 0 ) {
      libxsmm_instruction_vec_move( io_generated_code,
                                    i_micro_kernel_config->instruction_set,
                                    i_micro_kernel_config->a_vmove_instruction,
                                    i_gp_reg_mapping->gp_reg_a,
                                    LIBXSMM_X86_GP_REG_UNDEF, 0,
                                    i_xgemm_desc->lda * l_step * i_micro_kernel_config->datatype_size,
                                    i_micro_kernel_config->vector_name,
                                    0,
                                    i_micro_kernel_config->use_masking_a_c, 0 );
    }

    /* every step but the last fetches the A column of the following step into
       the other register (0/1 alternate) -> hiding L1 latencies */
    if ( l_step < (i_k_blocking - 1) ) {
      libxsmm_instruction_vec_move( io_generated_code,
                                    i_micro_kernel_config->instruction_set,
                                    i_micro_kernel_config->a_vmove_instruction,
                                    i_gp_reg_mapping->gp_reg_a,
                                    LIBXSMM_X86_GP_REG_UNDEF, 0,
                                    i_xgemm_desc->lda * (l_step+1) * i_micro_kernel_config->datatype_size,
                                    i_micro_kernel_config->vector_name,
                                    (l_step+1)%2,
                                    i_micro_kernel_config->use_masking_a_c, 0 );
    }

    /* prefetch of the current A: same column, 64 bytes further down */
    if ( (strcmp( i_xgemm_desc->prefetch, "curAL2" ) == 0) ||
         (strcmp( i_xgemm_desc->prefetch, "curAL2_BL2viaC" ) == 0) ) {
      libxsmm_instruction_prefetch( io_generated_code,
                                    i_micro_kernel_config->prefetch_instruction,
                                    i_gp_reg_mapping->gp_reg_a,
                                    LIBXSMM_X86_GP_REG_UNDEF, 0,
                                    (i_xgemm_desc->lda * l_step * i_micro_kernel_config->datatype_size) + 64 );
    }

    /* prefetch of the next A: same position, addressed through the prefetch pointer */
    if ( (strcmp( i_xgemm_desc->prefetch, "AL2jpst" ) == 0) ||
         (strcmp( i_xgemm_desc->prefetch, "AL2jpst_BL2viaC" ) == 0) ||
         (strcmp( i_xgemm_desc->prefetch, "AL2" ) == 0) ||
         (strcmp( i_xgemm_desc->prefetch, "AL2_BL2viaC" ) == 0) ) {
      libxsmm_instruction_prefetch( io_generated_code,
                                    i_micro_kernel_config->prefetch_instruction,
                                    i_gp_reg_mapping->gp_reg_a_prefetch,
                                    LIBXSMM_X86_GP_REG_UNDEF, 0,
                                    (i_xgemm_desc->lda * l_step * i_micro_kernel_config->datatype_size) );
      /* the prefetch pointer moves on together with the block */
      if ( l_is_last_step ) {
        libxsmm_instruction_alu_imm( io_generated_code,
                                     i_micro_kernel_config->alu_add_instruction,
                                     i_gp_reg_mapping->gp_reg_a_prefetch,
                                     i_k_blocking * i_micro_kernel_config->datatype_size * i_xgemm_desc->lda );
      }
    }

    /* in last k-iteration: advance the A pointer */
    if ( l_is_last_step ) {
      libxsmm_instruction_alu_imm( io_generated_code,
                                   i_micro_kernel_config->alu_add_instruction,
                                   i_gp_reg_mapping->gp_reg_a,
                                   i_k_blocking * i_micro_kernel_config->datatype_size * i_xgemm_desc->lda );
    }

    /* one compute instruction per column: A vector times broadcast element of B */
    for ( l_col = 0; l_col < i_n_blocking; l_col++ ) {
      libxsmm_instruction_vec_compute_mem( io_generated_code,
                                           i_micro_kernel_config->instruction_set,
                                           i_micro_kernel_config->vmul_instruction,
                                           1,
                                           i_gp_reg_mapping->gp_reg_b,
                                           LIBXSMM_X86_GP_REG_UNDEF, 0,
                                           (l_step * i_micro_kernel_config->datatype_size)+(i_xgemm_desc->ldb * i_micro_kernel_config->datatype_size * l_col),
                                           i_micro_kernel_config->vector_name,
                                           l_step%2,
                                           i_micro_kernel_config->vector_reg_count - i_n_blocking + l_col );
    }
  }

  /* advance pointers of B only when we are not fully unrolling K */
  if ( i_k_blocking < i_xgemm_desc->k ) {
    libxsmm_instruction_alu_imm( io_generated_code,
                                 i_micro_kernel_config->alu_add_instruction,
                                 i_gp_reg_mapping->gp_reg_b,
                                 i_k_blocking * i_micro_kernel_config->datatype_size );
  }
}