Beispiel #1
0
/**
 * Appends text to the code string stored in io_generated_code.
 *
 * A new buffer is allocated that holds the existing text followed by the appended text
 * and a terminating NUL; the previous buffer is released and generated_code, code_size
 * and buffer_size are updated to describe the new buffer.
 *
 * @param io_generated_code  code container; generated_code is either NULL or a buffer
 *                           previously allocated by this function. If code_type is
 *                           greater than 1, LIBXSMM_ERR_APPEND_STR is reported and
 *                           nothing is changed.
 * @param i_code_to_append   text to append; NULL only triggers a warning and appends nothing.
 * @param i_append_length    maximum number of characters taken from i_code_to_append;
 *                           copying stops earlier at a NUL terminator, and values <= 0
 *                           append nothing.
 */
void libxsmm_append_code_as_string( libxsmm_generated_code* io_generated_code,
                                    const char*             i_code_to_append,
                                    const int               i_append_length ) {
  size_t l_length_1 = 0;
  size_t l_length_2 = 0;
  char* l_new_string = NULL;
  char* current_code = (char*)io_generated_code->generated_code;

  /* check if end up here accidentally */
  if ( io_generated_code->code_type > 1 ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_APPEND_STR );
    return;
  }

  /* length of the existing text: at most code_size, shorter if a terminator comes first */
  if (current_code != NULL) {
    const char* l_end = (const char*)memchr( current_code, '\0', (size_t)io_generated_code->code_size );
    l_length_1 = (l_end != NULL) ? (size_t)(l_end - current_code) : (size_t)io_generated_code->code_size;
  }

  /* length of the text to append: never more than i_append_length and never past its terminator,
     so the copy below cannot exceed the size of the new buffer */
  if (i_code_to_append != NULL) {
    if (i_append_length > 0) {
      const char* l_end = (const char*)memchr( i_code_to_append, '\0', (size_t)i_append_length );
      l_length_2 = (l_end != NULL) ? (size_t)(l_end - i_code_to_append) : (size_t)i_append_length;
    }
  } else {
    fprintf(stderr, "LIBXSMM WARNING libxsmm_append_code_as_string was called with an empty string for appending code" );
  }

  /* allocate new string */
  l_new_string = (char*) malloc( (l_length_1+l_length_2+1)*sizeof(char) );
  if (l_new_string == NULL) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_ALLOC );
    return;
  }

  /* copy old content, append new content, terminate */
  if (l_length_1 > 0) {
    memcpy( l_new_string, current_code, l_length_1 );
  }
  if (l_length_2 > 0) {
    memcpy( l_new_string + l_length_1, i_code_to_append, l_length_2 );
  }
  l_new_string[l_length_1+l_length_2] = '\0';

  /* free old memory (also when it only held an empty string) and overwrite pointer */
  free(current_code);
  io_generated_code->generated_code = (void*)l_new_string;

  /* update counters */
  io_generated_code->code_size = (unsigned int)(l_length_1+l_length_2);
  io_generated_code->buffer_size = (io_generated_code->code_size) + 1;
}
/* Registers a new jump label in io_loop_label_tracker and, for the textual code types
   (code_type 0 and 1), appends the label definition to io_generated_code.
   Reports LIBXSMM_ERR_EXCEED_JMPLBL when the tracker already holds 32 labels.
   For code_type > 1 nothing is emitted and the tracker is left untouched. */
void libxsmm_instruction_register_jump_label( libxsmm_generated_code*     io_generated_code,
                                              libxsmm_loop_label_tracker* io_loop_label_tracker ) {
  /* no free label slot left */
  if ( io_loop_label_tracker->label_count == 32 ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_EXCEED_JMPLBL );
    return;
  }

  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type <= 1 ) {
    char l_label_text[512];
    const int l_label_text_limit = 511;
    int l_label_text_length = 0;

    /* the label is numbered by the tracker slot it occupies */
    io_loop_label_tracker->label_address[io_loop_label_tracker->label_count] = io_loop_label_tracker->label_count;

    if ( io_generated_code->code_type != 0 ) {
      /* code_type 1: bare label line */
      l_label_text_length = libxsmm_snprintf(l_label_text, l_label_text_limit, "                       %i:\n", io_loop_label_tracker->label_address[io_loop_label_tracker->label_count] );
    } else {
      /* code_type 0: label wrapped in quotes with a literal \n\t suffix */
      l_label_text_length = libxsmm_snprintf(l_label_text, l_label_text_limit, "                       \"%i:\\n\\t\"\n", io_loop_label_tracker->label_address[io_loop_label_tracker->label_count] );
    }
    libxsmm_append_code_as_string( io_generated_code, l_label_text, l_label_text_length );

    /* the slot is now taken */
    io_loop_label_tracker->label_count += 1;
  } else {
    /* @TODO-GREG call encoding here */
  }
}
/* Appends a prefetch instruction of the form "<instr> <displacement>(%<base>)" to the
   generated code for the textual code types (code_type 0 and 1); nothing is emitted for
   code_type > 1. In debug builds an index register other than LIBXSMM_X86_GP_REG_UNDEF
   is rejected with LIBXSMM_ERR_NO_INDEX_SCALE_ADDR. i_scale is not used here. */
void libxsmm_instruction_prefetch( libxsmm_generated_code* io_generated_code,
                                   const unsigned int      i_prefetch_instr, 
                                   const unsigned int      i_gp_reg_base,
                                   const unsigned int      i_gp_reg_idx,
                                   const unsigned int      i_scale,
                                   const int               i_displacement ) {
#if !defined(NDEBUG)
  /* index/scale addressing is not supported by this emitter */
  if ( LIBXSMM_X86_GP_REG_UNDEF != i_gp_reg_idx ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_NO_INDEX_SCALE_ADDR );
    return;
  }
#endif
  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type <= 1 ) {
    char l_text[512];
    char l_base_reg_name[4];
    char l_mnemonic[16];
    const int l_text_limit = 511;
    int l_text_length = 0;

    /* translate register and instruction ids into their textual names */
    libxsmm_get_x86_gp_reg_name( i_gp_reg_base, l_base_reg_name, 3 );
    libxsmm_get_x86_instr_name( i_prefetch_instr, l_mnemonic, 15 );

    if ( io_generated_code->code_type != 0 ) {
      /* code_type 1: bare instruction line */
      l_text_length = libxsmm_snprintf(l_text, l_text_limit, "                       %s %i(%%%s)\n", l_mnemonic, i_displacement, l_base_reg_name );
    } else {
      /* code_type 0: quoted line, register prefixed by a doubled percent sign */
      l_text_length = libxsmm_snprintf(l_text, l_text_limit, "                       \"%s %i(%%%%%s)\\n\\t\"\n", l_mnemonic, i_displacement, l_base_reg_name );
    }
    libxsmm_append_code_as_string( io_generated_code, l_text, l_text_length );
  } else {
    /* @TODO-GREG call encoding here */
  }
}
Beispiel #4
0
/* Dispatches sparse kernel generation based on the leading dimensions in i_xgemm_desc:
     lda == 0, ldb > 0, ldc > 0  -> A is sparse (libxsmm_generator_sparse_asparse)
     lda > 0, ldb == 0, ldc > 0  -> B is sparse (libxsmm_generator_sparse_bsparse)
   Any other combination reports LIBXSMM_ERR_SPARSE_GEN. The dense leading dimensions
   are validated first and reported via LIBXSMM_ERR_LDA / _LDB / _LDC. */
void libxsmm_generator_sparse_kernel( libxsmm_generated_code*         io_generated_code,
                                      const libxsmm_xgemm_descriptor* i_xgemm_desc,
                                      const char*                     i_arch,
                                      const unsigned int*             i_row_idx,
                                      const unsigned int*             i_column_idx,
                                      const double*                   i_values ) {
  /* both supported layouts need a dense C */
  if ( i_xgemm_desc->ldc > 0 ) {
    /* A matrix is sparse */
    if ( (i_xgemm_desc->lda == 0) && (i_xgemm_desc->ldb > 0) ) {
      if ( i_xgemm_desc->ldb < i_xgemm_desc->k ) {
        /* LDB too small */
        libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_LDB );
      } else if ( i_xgemm_desc->ldc < i_xgemm_desc->m ) {
        /* LDC too small */
        libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_LDC );
      } else {
        libxsmm_generator_sparse_asparse( io_generated_code, i_xgemm_desc, i_arch, i_row_idx, i_column_idx, i_values );
      }
      return;
    }

    /* B matrix is sparse */
    if ( (i_xgemm_desc->lda > 0) && (i_xgemm_desc->ldb == 0) ) {
      if ( i_xgemm_desc->lda < i_xgemm_desc->m ) {
        /* LDA too small */
        libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_LDA );
      } else if ( i_xgemm_desc->ldc < i_xgemm_desc->m ) {
        /* LDC too small */
        libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_LDC );
      } else {
        libxsmm_generator_sparse_bsparse( io_generated_code, i_xgemm_desc, i_arch, i_row_idx, i_column_idx, i_values );
      }
      return;
    }
  }

  /* neither of the two supported layouts matched */
  libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_SPARSE_GEN );
}
/* Emits a backward jump ("<instr> <label>b") to the most recently registered label and
   releases that label in io_loop_label_tracker. Only LIBXSMM_X86_INSTR_JL is accepted
   (otherwise LIBXSMM_ERR_UNSUPPORTED_JUMP); an empty tracker reports
   LIBXSMM_ERR_NO_JMPLBL_AVAIL. For code_type > 1 nothing is emitted and the tracker
   is left untouched. */
void libxsmm_instruction_jump_back_to_label( libxsmm_generated_code*     io_generated_code,
                                             const unsigned int          i_jmp_instr,
                                             libxsmm_loop_label_tracker* io_loop_label_tracker ) {
  /* jl is the only jump handled here */
  if ( LIBXSMM_X86_INSTR_JL != i_jmp_instr ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_UNSUPPORTED_JUMP );
    return;
  }

  /* there must be a registered label to jump back to */
  if ( io_loop_label_tracker->label_count == 0 ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_NO_JMPLBL_AVAIL );
    return;
  }

  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type <= 1 ) {
    char l_text[512];
    char l_mnemonic[16];
    const int l_text_limit = 511;
    int l_text_length = 0;

    libxsmm_get_x86_instr_name( i_jmp_instr, l_mnemonic, 15 );

    /* pop the innermost label; label_count now indexes the slot being consumed */
    io_loop_label_tracker->label_count -= 1;

    if ( io_generated_code->code_type != 0 ) {
      /* code_type 1: bare instruction line */
      l_text_length = libxsmm_snprintf(l_text, l_text_limit, "                       %s %ib\n", l_mnemonic, io_loop_label_tracker->label_address[io_loop_label_tracker->label_count] );
    } else {
      /* code_type 0: quoted line with a literal \n\t suffix */
      l_text_length = libxsmm_snprintf(l_text, l_text_limit, "                       \"%s %ib\\n\\t\"\n", l_mnemonic, io_loop_label_tracker->label_address[io_loop_label_tracker->label_count] );
    }
    libxsmm_append_code_as_string( io_generated_code, l_text, l_text_length );

    /* clear the consumed slot */
    io_loop_label_tracker->label_address[io_loop_label_tracker->label_count] = 0;
  } else {
    /* @TODO-GREG call encoding here */
  }
}
/* Emits the load of the B entry for column i_n into vector register i_n, followed by the
   shuffle that replaces the missing SSE3 broadcast load for single precision.
   With i_offset != -1 the entry is addressed at a fixed element offset from gp_reg_b;
   with i_offset == -1 no extra offset is applied. The shuffle is only emitted when the
   kernel config provides a shuffle instruction. */
static void libxsmm_generator_dense_sse3_load_b( libxsmm_generated_code*             io_generated_code,
                                                 const libxsmm_gp_reg_mapping*       i_gp_reg_mapping,
                                                 const libxsmm_micro_kernel_config*  i_micro_kernel_config,
                                                 const libxsmm_xgemm_descriptor*     i_xgemm_desc,
                                                 const unsigned int                  i_n,
                                                 const int                           i_offset ) {
  if ( i_offset != (-1) ) {
    libxsmm_instruction_vec_move( io_generated_code,
                                  i_micro_kernel_config->instruction_set,
                                  i_micro_kernel_config->b_vmove_instruction,
                                  i_gp_reg_mapping->gp_reg_b,
                                  LIBXSMM_X86_GP_REG_UNDEF, 0,
                                  ((i_micro_kernel_config->datatype_size) * i_offset) + (i_xgemm_desc->ldb * i_n * (i_micro_kernel_config->datatype_size)),
                                  i_micro_kernel_config->vector_name,
                                  i_n, i_micro_kernel_config->use_masking_a_c, 0 );
  } else {
    libxsmm_instruction_vec_move( io_generated_code,
                                  i_micro_kernel_config->instruction_set,
                                  i_micro_kernel_config->b_vmove_instruction,
                                  i_gp_reg_mapping->gp_reg_b,
                                  LIBXSMM_X86_GP_REG_UNDEF, 0,
                                  i_xgemm_desc->ldb * i_n *  i_micro_kernel_config->datatype_size,
                                  i_micro_kernel_config->vector_name,
                                  i_n, i_micro_kernel_config->use_masking_a_c, 0 );
  }
  /* generate shuffle as SSE3 has no broadcast load for single precision */
  if ( (i_xgemm_desc->single_precision != 0 ) && ( i_micro_kernel_config->b_shuff_instruction != LIBXSMM_X86_INSTR_UNDEF ) ) {
    libxsmm_instruction_vec_shuffle_reg( io_generated_code,
                                         i_micro_kernel_config->instruction_set,
                                         i_micro_kernel_config->b_shuff_instruction,
                                         i_micro_kernel_config->vector_name,
                                         i_n,
                                         i_n,
                                         LIBXSMM_X86_VEC_REG_UNDEF,
                                         0 );
  }
}

/* Emits the SSE3 instruction sequence of one micro-kernel step: B entries are loaded into
   vector registers 0..i_n_blocking-1, vectors of A are loaded starting at register
   i_n_blocking, and the products are added into the accumulator registers, which occupy
   the last i_n_blocking * (i_m_blocking / vector_length) of the 16 vector registers.
   The A pointer is advanced by lda * datatype_size; the B pointer is advanced by
   datatype_size only when i_offset == -1.

   io_generated_code      code container the instructions are appended to
   i_gp_reg_mapping       general purpose registers holding the A and B pointers
   i_micro_kernel_config  instruction selection, vector length and datatype size
   i_xgemm_desc           descriptor providing lda, ldb and single_precision
   i_m_blocking           blocking in m; checked in debug builds to be a multiple of vector_length
   i_n_blocking           blocking in n; checked in debug builds to be within 1..3
   i_offset               element offset into B, or -1 to address B via pointer increments */
void libxsmm_generator_dense_sse3_microkernel( libxsmm_generated_code*             io_generated_code,
                                               const libxsmm_gp_reg_mapping*       i_gp_reg_mapping,
                                               const libxsmm_micro_kernel_config*  i_micro_kernel_config,
                                               const libxsmm_xgemm_descriptor*     i_xgemm_desc,
                                               const unsigned int                  i_m_blocking,
                                               const unsigned int                  i_n_blocking,
                                               const int                           i_offset ) {
  /* register blocking in m, derived from the kernel config below */
  unsigned int l_m_blocking = 0;
  /* register blocking counter in n */
  unsigned int l_n = 0;
  /* register blocking counter in m */
  unsigned int l_m = 0;
  /* start register of accumulator */
  unsigned int l_vec_reg_acc_start = 0;

#ifndef NDEBUG
  if ( (i_n_blocking > 3) || (i_n_blocking < 1) ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_N_BLOCK );
    return;
  }
  if ( i_m_blocking % i_micro_kernel_config->vector_length != 0 ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_M_BLOCK );
    return;
  }
#endif
  /* deriving register blocking from kernel config */
  l_m_blocking = i_m_blocking/i_micro_kernel_config->vector_length;
  /* accumulators occupy the upper end of the 16 vector registers */
  l_vec_reg_acc_start = 16 - (i_n_blocking * l_m_blocking);

  if (l_m_blocking == 1) {
    /* load column vectors of A */
    libxsmm_instruction_vec_move( io_generated_code,
                                  i_micro_kernel_config->instruction_set,
                                  i_micro_kernel_config->a_vmove_instruction,
                                  i_gp_reg_mapping->gp_reg_a,
                                  LIBXSMM_X86_GP_REG_UNDEF, 0,
                                  0,
                                  i_micro_kernel_config->vector_name,
                                  i_n_blocking, i_micro_kernel_config->use_masking_a_c, 0 );
    /* loop over columns of B */
    for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
      /* post increment of a pointer early */
      if ( l_n == 0 ) {
        libxsmm_instruction_alu_imm( io_generated_code,
                                     i_micro_kernel_config->alu_add_instruction,
                                     i_gp_reg_mapping->gp_reg_a,
                                     (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) );
      }
      /* load (and for single precision shuffle) the B entry of this column */
      libxsmm_generator_dense_sse3_load_b( io_generated_code, i_gp_reg_mapping, i_micro_kernel_config,
                                           i_xgemm_desc, l_n, i_offset );
      /* without a fixed offset, advance the B pointer after the last column was loaded */
      if ( (i_offset == (-1)) && (l_n == (i_n_blocking -1)) ) {
        libxsmm_instruction_alu_imm( io_generated_code,
                                     i_micro_kernel_config->alu_add_instruction,
                                     i_gp_reg_mapping->gp_reg_b,
                                     i_micro_kernel_config->datatype_size );
      }
      /* issue mul-add */
      libxsmm_instruction_vec_compute_reg( io_generated_code,
                                           i_micro_kernel_config->instruction_set,
                                           i_micro_kernel_config->vmul_instruction,
                                           i_micro_kernel_config->vector_name,
                                           i_n_blocking,
                                           l_n,
                                           LIBXSMM_X86_VEC_REG_UNDEF );
      libxsmm_instruction_vec_compute_reg( io_generated_code,
                                           i_micro_kernel_config->instruction_set,
                                           i_micro_kernel_config->vadd_instruction,
                                           i_micro_kernel_config->vector_name,
                                           l_n,
                                           l_vec_reg_acc_start + l_n,
                                           LIBXSMM_X86_VEC_REG_UNDEF );
    }
  } else {
    /* broadcast from B -> into vec registers 0 to i_n_blocking */
    for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
      libxsmm_generator_dense_sse3_load_b( io_generated_code, i_gp_reg_mapping, i_micro_kernel_config,
                                           i_xgemm_desc, l_n, i_offset );
    }
    /* without a fixed offset, advance the B pointer once all columns were loaded */
    if ( i_offset == (-1) ) {
      libxsmm_instruction_alu_imm( io_generated_code,
                                   i_micro_kernel_config->alu_add_instruction,
                                   i_gp_reg_mapping->gp_reg_b,
                                   i_micro_kernel_config->datatype_size );
    }

    if (l_m_blocking == 3) {
      /* load column vectors of A and multiply with all broadcasted row entries of B */
      for ( l_m = 0; l_m < l_m_blocking ; l_m++ ) {
        libxsmm_instruction_vec_move( io_generated_code,
                                      i_micro_kernel_config->instruction_set,
                                      i_micro_kernel_config->a_vmove_instruction,
                                      i_gp_reg_mapping->gp_reg_a,
                                      LIBXSMM_X86_GP_REG_UNDEF, 0,
                                      (i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_m,
                                      i_micro_kernel_config->vector_name,
                                      i_n_blocking, i_micro_kernel_config->use_masking_a_c, 0 );

        for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
          /* post increment early */
          if ( (l_m == (l_m_blocking-1)) && (l_n == 0) ) {
            libxsmm_instruction_alu_imm( io_generated_code,
                                         i_micro_kernel_config->alu_add_instruction,
                                         i_gp_reg_mapping->gp_reg_a,
                                         (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) );
          }
          if (l_n < i_n_blocking - 1) {
            /* issue vmove to save loads from A: keep a copy for the next column */
            libxsmm_instruction_vec_compute_reg( io_generated_code,
                                                 i_micro_kernel_config->instruction_set,
                                                 i_micro_kernel_config->a_vmove_instruction,
                                                 i_micro_kernel_config->vector_name,
                                                 i_n_blocking + l_n,
                                                 i_n_blocking + l_n + 1,
                                                 LIBXSMM_X86_VEC_REG_UNDEF );
          }
          /* issue mul+add */
          libxsmm_instruction_vec_compute_reg( io_generated_code,
                                               i_micro_kernel_config->instruction_set,
                                               i_micro_kernel_config->vmul_instruction,
                                               i_micro_kernel_config->vector_name,
                                               l_n,
                                               i_n_blocking + l_n,
                                               LIBXSMM_X86_VEC_REG_UNDEF );
          libxsmm_instruction_vec_compute_reg( io_generated_code,
                                               i_micro_kernel_config->instruction_set,
                                               i_micro_kernel_config->vadd_instruction,
                                               i_micro_kernel_config->vector_name,
                                               i_n_blocking + l_n,
                                               l_vec_reg_acc_start + l_m + (l_m_blocking * l_n),
                                               LIBXSMM_X86_VEC_REG_UNDEF );
        }
      }
    } else {
      /* load column vectors of A and multiply with all broadcasted row entries of B */
      for ( l_m = 0; l_m < l_m_blocking ; l_m++ ) {
        libxsmm_instruction_vec_move( io_generated_code,
                                      i_micro_kernel_config->instruction_set,
                                      i_micro_kernel_config->a_vmove_instruction,
                                      i_gp_reg_mapping->gp_reg_a,
                                      LIBXSMM_X86_GP_REG_UNDEF, 0,
                                      (i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_m,
                                      i_micro_kernel_config->vector_name,
                                      i_n_blocking + l_m, i_micro_kernel_config->use_masking_a_c, 0 );
      }
      for ( l_m = 0; l_m < l_m_blocking ; l_m++ ) {
        for ( l_n = 0; l_n < i_n_blocking; l_n++ ) {
          /* post increment early */
          if ( (l_m == (l_m_blocking-1)) && (l_n == 0) ) {
            libxsmm_instruction_alu_imm( io_generated_code,
                                         i_micro_kernel_config->alu_add_instruction,
                                         i_gp_reg_mapping->gp_reg_a,
                                         (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) );
          }
          if (l_n < i_n_blocking - 1) {
            /* issue vmove to save loads from A: copy into the scratch registers behind the A vectors */
            if (l_n == 0 ) {
              libxsmm_instruction_vec_compute_reg( io_generated_code,
                                                   i_micro_kernel_config->instruction_set,
                                                   i_micro_kernel_config->a_vmove_instruction,
                                                   i_micro_kernel_config->vector_name,
                                                   i_n_blocking + l_m + l_n,
                                                   i_n_blocking + l_m_blocking + l_n,
                                                   LIBXSMM_X86_VEC_REG_UNDEF );
            } else {
              libxsmm_instruction_vec_compute_reg( io_generated_code,
                                                   i_micro_kernel_config->instruction_set,
                                                   i_micro_kernel_config->a_vmove_instruction,
                                                   i_micro_kernel_config->vector_name,
                                                   i_n_blocking + l_m_blocking + l_n - 1,
                                                   i_n_blocking + l_m_blocking + l_n,
                                                   LIBXSMM_X86_VEC_REG_UNDEF );
            }
          }
          /* issue mul/add */
          if (l_n == 0 ) {
            libxsmm_instruction_vec_compute_reg( io_generated_code,
                                                 i_micro_kernel_config->instruction_set,
                                                 i_micro_kernel_config->vmul_instruction,
                                                 i_micro_kernel_config->vector_name,
                                                 l_n,
                                                 i_n_blocking + l_m + l_n,
                                                 LIBXSMM_X86_VEC_REG_UNDEF );
            libxsmm_instruction_vec_compute_reg( io_generated_code,
                                                 i_micro_kernel_config->instruction_set,
                                                 i_micro_kernel_config->vadd_instruction,
                                                 i_micro_kernel_config->vector_name,
                                                 i_n_blocking + l_m + l_n,
                                                 l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) ,
                                                 LIBXSMM_X86_VEC_REG_UNDEF );
          } else {
            libxsmm_instruction_vec_compute_reg( io_generated_code,
                                                 i_micro_kernel_config->instruction_set,
                                                 i_micro_kernel_config->vmul_instruction,
                                                 i_micro_kernel_config->vector_name,
                                                 l_n,
                                                 i_n_blocking + l_m_blocking + l_n - 1,
                                                 LIBXSMM_X86_VEC_REG_UNDEF );
            libxsmm_instruction_vec_compute_reg( io_generated_code,
                                                 i_micro_kernel_config->instruction_set,
                                                 i_micro_kernel_config->vadd_instruction,
                                                 i_micro_kernel_config->vector_name,
                                                 i_n_blocking + l_m_blocking + l_n - 1,
                                                 l_vec_reg_acc_start + l_m + (l_m_blocking * l_n) ,
                                                 LIBXSMM_X86_VEC_REG_UNDEF );
          }
        }
      }
    }
  }
}
/**
 * Reads a sparse matrix from a text file into CSC arrays.
 *
 * Lines starting with '%' are skipped. The first remaining line must hold three unsigned
 * integers (rows, columns, number of elements); every following line must hold a 1-based
 * row index, a 1-based column index and a value. Column pointers are derived from the
 * order in which the elements appear in the file.
 *
 * @param io_generated_code  used for error reporting via libxsmm_handle_error
 * @param i_csc_file_in      path of the file to read
 * @param o_row_idx          receives a malloc-family array with the 0-based row index of each element
 * @param o_column_idx       receives a malloc-family array with column_count+1 column pointers
 * @param o_values           receives a malloc-family array with the element values
 * @param o_row_count        receives the number of rows from the descriptor line
 * @param o_column_count     receives the number of columns from the descriptor line
 * @param o_element_count    receives the number of elements from the descriptor line
 *
 * On success the caller owns the three output arrays. On any error the error is reported,
 * the file is closed, and arrays allocated by this call are released again with the
 * corresponding output pointers set to NULL.
 */
void libxsmm_sparse_csc_reader( libxsmm_generated_code* io_generated_code,
                                const char*             i_csc_file_in,
                                unsigned int**          o_row_idx,
                                unsigned int**          o_column_idx,
                                double**                o_values,
                                unsigned int*           o_row_count,
                                unsigned int*           o_column_count,
                                unsigned int*           o_element_count ) {
  FILE *l_csc_file_handle;
  const unsigned int l_line_length = 512;
  char l_line[512/*l_line_length*/+1];
  unsigned int l_header_read = 0;
  /* set once the output pointers were assigned by this call */
  unsigned int l_outputs_allocated = 0;
  /* set once an error was reported; triggers the common cleanup below */
  unsigned int l_failed = 0;
  unsigned int* l_column_idx_id = NULL;
  unsigned int l_i = 0;

  l_csc_file_handle = fopen( i_csc_file_in, "r" );
  if ( l_csc_file_handle == NULL ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CSC_INPUT );
    return;
  }

  /* the buffer holds l_line_length characters plus the terminator */
  while (fgets(l_line, (int)sizeof(l_line), l_csc_file_handle) != NULL) {
    const size_t l_length = strlen(l_line);

    /* buffer filled completely without reaching the end of the line: line is too long */
    if ( (l_length == l_line_length) && (l_line[l_length-1] != '\n') ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CSC_READ_LEN );
      l_failed = 1;
      break;
    }
    /* check if we are still reading comments header */
    if ( l_line[0] == '%' ) {
      continue;
    }

    /* if we are the first line after comment header, we allocate our data structures */
    if ( l_header_read == 0 ) {
      size_t l_element_slots = 0;
      size_t l_column_slots = 0;
      size_t l_c = 0;

      if ( sscanf(l_line, "%u %u %u", o_row_count, o_column_count, o_element_count) != 3 ) {
        libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CSC_READ_DESC );
        l_failed = 1;
        break;
      }

      /* at least one slot each, so an empty matrix is not mistaken for an allocation failure;
         sizes are computed in size_t to avoid unsigned int wrap-around of column_count + 1 */
      l_element_slots = (*o_element_count > 0) ? (size_t)(*o_element_count) : 1;
      l_column_slots = (size_t)(*o_column_count) + 1;

      /* allocate zero-initialized CSC data structure matching mtx file */
      *o_row_idx = (unsigned int*) calloc( l_element_slots, sizeof(unsigned int) );
      *o_column_idx = (unsigned int*) calloc( l_column_slots, sizeof(unsigned int) );
      *o_values = (double*) calloc( l_element_slots, sizeof(double) );
      l_column_idx_id = (unsigned int*) calloc( l_column_slots, sizeof(unsigned int) );
      l_outputs_allocated = 1;

      /* check if allocations were successful */
      if ( ( *o_row_idx == NULL )      ||
           ( *o_column_idx == NULL )   ||
           ( *o_values == NULL )       ||
           ( l_column_idx_id == NULL )    ) {
        libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CSC_ALLOC_DATA );
        l_failed = 1;
        break;
      }

      /* init column idx: entry 0 stays 0, all others start at the element count */
      for ( l_c = 1; l_c < l_column_slots; l_c++ ) {
        (*o_column_idx)[l_c] = (*o_element_count);
      }

      l_i = 0;
      l_header_read = 1;
    /* now we read the actual content */
    } else {
      unsigned int l_row = 0;
      unsigned int l_column = 0;
      double l_value = 0;

      /* read a line of content */
      if ( sscanf(l_line, "%u %u %lf", &l_row, &l_column, &l_value) != 3 ) {
        libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CSC_READ_ELEMS );
        l_failed = 1;
        break;
      }
      /* more elements than announced in the descriptor line would overrun the arrays */
      if ( l_i >= (*o_element_count) ) {
        libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CSC_LEN );
        l_failed = 1;
        break;
      }
      /* indices in the file are 1-based and must lie inside the announced matrix */
      if ( (l_row == 0) || (l_row > (*o_row_count)) ||
           (l_column == 0) || (l_column > (*o_column_count)) ) {
        libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CSC_READ_ELEMS );
        l_failed = 1;
        break;
      }
      /* adjust numbers to zero-based indexing */
      l_row--;
      l_column--;
      /* add these values to row and value structure */
      (*o_row_idx)[l_i] = l_row;
      (*o_values)[l_i] = l_value;
      l_i++;
      /* handle columns: mark this column as non-empty, empty columns are fixed up below */
      l_column_idx_id[l_column] = 1;
      (*o_column_idx)[l_column+1] = l_i;
    }
  }

  /* close mtx file */
  fclose( l_csc_file_handle );

  if ( l_failed == 0 ) {
    if ( l_header_read == 0 ) {
      /* the file contained no descriptor line at all */
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CSC_READ_DESC );
      l_failed = 1;
    } else if ( l_i != (*o_element_count) ) {
      /* check if we read a file which was consistent */
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CSC_LEN );
      l_failed = 1;
    }
  }

  /* common error cleanup: release everything this call allocated */
  if ( l_failed != 0 ) {
    if ( l_outputs_allocated != 0 ) {
      free( *o_row_idx );
      free( *o_column_idx );
      free( *o_values );
      *o_row_idx = NULL;
      *o_column_idx = NULL;
      *o_values = NULL;
    }
    free( l_column_idx_id );
    return;
  }

  /* let's handle empty columns: they start where the previous column starts */
  for ( l_i = 0; l_i < (*o_column_count); l_i++) {
    if ( l_column_idx_id[l_i] == 0 ) {
      (*o_column_idx)[l_i+1] = (*o_column_idx)[l_i];
    }
  }

  /* free helper data structure */
  free( l_column_idx_id );
}
LIBXSMM_INTERNAL_API_DEFINITION
void libxsmm_generator_gemm_store_C( libxsmm_generated_code*             io_generated_code,
                                      const libxsmm_gp_reg_mapping*      i_gp_reg_mapping,
                                      const libxsmm_micro_kernel_config* i_micro_kernel_config,
                                      const libxsmm_gemm_descriptor*     i_xgemm_desc,
                                      const unsigned int                 i_m_blocking,
                                      const unsigned int                 i_n_blocking )
{
  /* Emits the vector moves that write the accumulator registers of an
     (i_m_blocking x i_n_blocking) block to C. For the "via C" prefetch
     strategies it additionally emits prefetches through the B-prefetch
     register, one per 64-byte step along M. */

  /* number of vector registers covering i_m_blocking rows */
  const unsigned int l_vecs_per_col = i_m_blocking / i_micro_kernel_config->vector_length;
  /* accumulators are the last (i_n_blocking * l_vecs_per_col) registers below vector_reg_count */
  const unsigned int l_first_acc_reg = i_micro_kernel_config->vector_reg_count - (i_n_blocking * l_vecs_per_col);
  /* column (N) counter of the register block */
  unsigned int l_col = 0;
  /* vector (M) counter of the register block */
  unsigned int l_vec = 0;

#if !defined(NDEBUG)
  /* @TODO fix this test */
  {
    /* becomes non-zero when the blocking is not accepted for the target instruction set */
    int l_reject = 0;

    if ( (i_micro_kernel_config->instruction_set == LIBXSMM_X86_SSE3) ||
         (i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX)  ||
         (i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX2) ) {
      l_reject = !( (i_n_blocking >= 1) && (i_n_blocking <= 3) && (i_m_blocking >= 1) );
    } else if ( (i_micro_kernel_config->instruction_set == LIBXSMM_X86_IMCI)       ||
                (i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX512_MIC) ||
                ( (i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX512_CORE) &&
                  (i_m_blocking == i_micro_kernel_config->vector_length) ) ) {
      l_reject = !( (i_n_blocking >= 1) && (i_n_blocking <= 30) &&
                    (i_m_blocking == i_micro_kernel_config->vector_length) );
    } else if ( i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX512_CORE ) {
      l_reject = !( (i_n_blocking >= 1) && (i_n_blocking <= 6) && (i_m_blocking >= 1) );
    }

    if ( l_reject != 0 ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_REG_BLOCK );
      return;
    }
    /* the M blocking has to be a whole number of vectors */
    if ( (i_m_blocking % i_micro_kernel_config->vector_length) != 0 ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_M_BLOCK );
      return;
    }
  }
#endif

  for ( l_col = 0; l_col < i_n_blocking; ++l_col ) {
    /* first accumulator register belonging to this column */
    const unsigned int l_col_acc_reg = l_first_acc_reg + (l_vecs_per_col * l_col);

    /* write every accumulator of this column to C
       (trailing argument 1: presumably the store direction; load_C passes 0) */
    for ( l_vec = 0; l_vec < l_vecs_per_col; ++l_vec ) {
      libxsmm_x86_instruction_vec_move(
        io_generated_code,
        i_micro_kernel_config->instruction_set,
        i_micro_kernel_config->c_vmove_instruction,
        i_gp_reg_mapping->gp_reg_c,
        LIBXSMM_X86_GP_REG_UNDEF, 0,
        ((l_col * i_xgemm_desc->ldc) + (l_vec * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size),
        i_micro_kernel_config->vector_name,
        l_col_acc_reg + l_vec,
        i_micro_kernel_config->use_masking_a_c, 1 );
    }

    if ( (i_xgemm_desc->prefetch == LIBXSMM_PREFETCH_BL2_VIA_C)          ||
         (i_xgemm_desc->prefetch == LIBXSMM_PREFETCH_AL2BL2_VIA_C)       ||
         (i_xgemm_desc->prefetch == LIBXSMM_PREFETCH_AL2BL2_VIA_C_AHEAD) ||
         (i_xgemm_desc->prefetch == LIBXSMM_PREFETCH_AL2BL2_VIA_C_JPST) ) {
      /* one prefetch per cache line is enough: step over the vectors sharing a line
         (64: hardcoded cache line length) */
      const unsigned int l_vecs_per_line = 64 / ((i_micro_kernel_config->vector_length) * (i_micro_kernel_config->datatype_size));

      l_vec = 0;
      while ( l_vec < l_vecs_per_col ) {
        libxsmm_x86_instruction_prefetch(
          io_generated_code,
          i_micro_kernel_config->prefetch_instruction,
          i_gp_reg_mapping->gp_reg_b_prefetch,
          LIBXSMM_X86_GP_REG_UNDEF, 0,
          ((l_col * i_xgemm_desc->ldc) + (l_vec * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size) );
        l_vec += l_vecs_per_line;
      }
    }
  }
}
LIBXSMM_INTERNAL_API_DEFINITION
void libxsmm_generator_gemm_load_C( libxsmm_generated_code*             io_generated_code,
                                     const libxsmm_gp_reg_mapping*      i_gp_reg_mapping,
                                     const libxsmm_micro_kernel_config* i_micro_kernel_config,
                                     const libxsmm_gemm_descriptor*     i_xgemm_desc,
                                     const unsigned int                 i_m_blocking,
                                     const unsigned int                 i_n_blocking ) {
  /* Initializes the accumulator registers of an (i_m_blocking x i_n_blocking)
     block: when beta == 1 they are loaded from C, otherwise each register is
     combined with itself using the configured vxor instruction.
     No prefetch for C is emitted by this routine. */

  /* number of vector registers covering i_m_blocking rows */
  const unsigned int l_vecs_per_col = i_m_blocking / i_micro_kernel_config->vector_length;
  /* accumulators are the last (i_n_blocking * l_vecs_per_col) registers below vector_reg_count */
  const unsigned int l_first_acc_reg = i_micro_kernel_config->vector_reg_count - (i_n_blocking * l_vecs_per_col);
  /* column (N) counter of the register block */
  unsigned int l_col = 0;
  /* vector (M) counter of the register block */
  unsigned int l_vec = 0;
  /* non-zero: C is accumulated into (beta == 1) and therefore has to be loaded */
  int l_accumulate = 0;

#if !defined(NDEBUG)
  /* Sanity checks of the requested blocking. They are compiled out in
     release builds, where an unsupported blocking is not detected. */
  {
    /* becomes non-zero when the blocking is not accepted for the target instruction set */
    int l_reject = 0;

    if ( (i_micro_kernel_config->instruction_set == LIBXSMM_X86_SSE3) ||
         (i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX)  ||
         (i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX2) ) {
      l_reject = !( (i_n_blocking >= 1) && (i_n_blocking <= 3) && (i_m_blocking >= 1) );
    } else if ( (i_micro_kernel_config->instruction_set == LIBXSMM_X86_IMCI)       ||
                (i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX512_MIC) ||
                ( (i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX512_CORE) &&
                  (i_m_blocking == i_micro_kernel_config->vector_length) ) ) {
      l_reject = !( (i_n_blocking >= 1) && (i_n_blocking <= 30) &&
                    (i_m_blocking == i_micro_kernel_config->vector_length) );
    } else if ( i_micro_kernel_config->instruction_set == LIBXSMM_X86_AVX512_CORE ) {
      l_reject = !( (i_n_blocking >= 1) && (i_n_blocking <= 6) && (i_m_blocking >= 1) );
    }

    if ( l_reject != 0 ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_REG_BLOCK );
      return;
    }
    /* the M blocking has to be a whole number of vectors */
    if ( (i_m_blocking % i_micro_kernel_config->vector_length) != 0 ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_M_BLOCK );
      return;
    }
  }
#endif /*NDEBUG*/

  l_accumulate = (i_xgemm_desc->beta == 1) ? 1 : 0;

  for ( l_col = 0; l_col < i_n_blocking; ++l_col ) {
    for ( l_vec = 0; l_vec < l_vecs_per_col; ++l_vec ) {
      /* accumulator register holding vector l_vec of column l_col */
      const unsigned int l_acc_reg = l_first_acc_reg + l_vec + (l_vecs_per_col * l_col);

      if ( l_accumulate != 0 ) {
        /* adding to C: fetch the current values of C */
        libxsmm_x86_instruction_vec_move(
          io_generated_code,
          i_micro_kernel_config->instruction_set,
          i_micro_kernel_config->c_vmove_instruction,
          i_gp_reg_mapping->gp_reg_c,
          LIBXSMM_X86_GP_REG_UNDEF, 0,
          ((l_col * i_xgemm_desc->ldc) + (l_vec * (i_micro_kernel_config->vector_length))) * (i_micro_kernel_config->datatype_size),
          i_micro_kernel_config->vector_name,
          l_acc_reg,
          i_micro_kernel_config->use_masking_a_c, 0 );
      } else {
        /* overwriting C: xor the accumulator with itself */
        libxsmm_x86_instruction_vec_compute_reg(
          io_generated_code,
          i_micro_kernel_config->instruction_set,
          i_micro_kernel_config->vxor_instruction,
          i_micro_kernel_config->vector_name,
          l_acc_reg,
          l_acc_reg,
          l_acc_reg );
      }
    }
  }
}
void libxsmm_generator_sparse_bsparse( libxsmm_generated_code*         io_generated_code,
                                       const libxsmm_xgemm_descriptor* i_xgemm_desc,
                                       const char*                     i_arch,
                                       const unsigned int*             i_row_idx,
                                       const unsigned int*             i_column_idx,
                                       const double*                   i_values ) {
  /* Appends C source text for C += A * B to io_generated_code. The non-zero
     pattern of B is read from i_row_idx / i_column_idx: every stored entry
     whose row is below k becomes one statement inside a loop over l_m.
     i_values is not read; the generated code indexes B by entry position. */

  /* scratch buffer for one piece of generated text */
  char l_text[512];
  const int l_text_cap = 511;
  int l_text_len = 0;
  /* column of B that is currently emitted */
  unsigned int l_col = 0;
  /* running index inside the current column */
  unsigned int l_k = 0;
  /* position of the first entry and number of entries of the current column */
  unsigned int l_col_begin = 0;
  unsigned int l_col_count = 0;
  /* position and row of the entry that is currently emitted */
  unsigned int l_entry = 0;
  unsigned int l_row = 0;
  /* flops of one l_m iteration of the generated loop */
  unsigned int l_flops = 0;
  /* 1: "noarch"/"wsm"/"snb"/"hsw", 2: "knc"/"knl"/"skx" */
  int l_arch_class = 0;
  /* non-zero if m > 1 and both the ALIGN_A and ALIGN_C flags are set */
  int l_aligned_a_c = 0;

  LIBXSMM_UNUSED(i_values);

  l_text_len = LIBXSMM_SNPRINTF(l_text, l_text_cap, "  unsigned int l_m = 0;\n");
  libxsmm_append_code_as_string( io_generated_code, l_text, l_text_len );

  /* beta == 0: the generated code first clears C */
  if ( i_xgemm_desc->beta == 0 ) {
    l_text_len = LIBXSMM_SNPRINTF(l_text, l_text_cap, "  unsigned int l_n = 0;\n");
    libxsmm_append_code_as_string( io_generated_code, l_text, l_text_len );
    l_text_len = LIBXSMM_SNPRINTF(l_text, l_text_cap, "  for ( l_n = 0; l_n < %u; l_n++) {\n", i_xgemm_desc->n);
    libxsmm_append_code_as_string( io_generated_code, l_text, l_text_len );
    if ( i_xgemm_desc->m > 1 ) {
      l_text_len = LIBXSMM_SNPRINTF(l_text, l_text_cap, "    #pragma simd\n");
      libxsmm_append_code_as_string( io_generated_code, l_text, l_text_len );
      l_text_len = LIBXSMM_SNPRINTF(l_text, l_text_cap, "    #pragma vector aligned\n");
      libxsmm_append_code_as_string( io_generated_code, l_text, l_text_len );
    }
    /* the literal written into C carries an 'f' suffix when the F32PREC flag is set */
    if ( (LIBXSMM_XGEMM_FLAG_F32PREC & i_xgemm_desc->flags) != 0 ) {
      l_text_len = LIBXSMM_SNPRINTF(l_text, l_text_cap, "    for ( l_m = 0; l_m < %u; l_m++) { C[(l_n*%u)+l_m] = 0.0f; }\n", i_xgemm_desc->m, i_xgemm_desc->ldc);
    } else {
      l_text_len = LIBXSMM_SNPRINTF(l_text, l_text_cap, "    for ( l_m = 0; l_m < %u; l_m++) { C[(l_n*%u)+l_m] = 0.0; }\n", i_xgemm_desc->m, i_xgemm_desc->ldc);
    }
    libxsmm_append_code_as_string( io_generated_code, l_text, l_text_len );
    l_text_len = LIBXSMM_SNPRINTF(l_text, l_text_cap, "  }\n");
    libxsmm_append_code_as_string( io_generated_code, l_text, l_text_len );
  }
  l_text_len = LIBXSMM_SNPRINTF(l_text, l_text_cap, "\n");
  libxsmm_append_code_as_string( io_generated_code, l_text, l_text_len );

  /* classify the target; unknown names are an error */
  if ( (strcmp( i_arch, "noarch" ) == 0) ||
       (strcmp( i_arch, "wsm" ) == 0)    ||
       (strcmp( i_arch, "snb" ) == 0)    ||
       (strcmp( i_arch, "hsw" ) == 0) ) {
    l_arch_class = 1;
  } else if ( (strcmp( i_arch, "knc" ) == 0) ||
              (strcmp( i_arch, "knl" ) == 0) ||
              (strcmp( i_arch, "skx" ) == 0) ) {
    l_arch_class = 2;
  } else {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_ARCH );
    return;
  }

  l_aligned_a_c = ( (i_xgemm_desc->m > 1) &&
                    ((LIBXSMM_XGEMM_FLAG_ALIGN_A & i_xgemm_desc->flags) != 0) &&
                    ((LIBXSMM_XGEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0) ) ? 1 : 0;

  /* simd pragmas in front of the kernel loop */
  if ( l_arch_class == 1 ) {
    /* the vector length hint depends on m; no hint for m <= 1 */
    if ( i_xgemm_desc->m <= 1 ) {
      /* nothing to emit */
    } else if ( i_xgemm_desc->m <= 3 ) {
      l_text_len = LIBXSMM_SNPRINTF(l_text, l_text_cap, "  #pragma simd vectorlength(2)\n");
      libxsmm_append_code_as_string( io_generated_code, l_text, l_text_len );
    } else if ( i_xgemm_desc->m <= 7 ) {
      l_text_len = LIBXSMM_SNPRINTF(l_text, l_text_cap, "  #pragma simd vectorlength(4)\n");
      libxsmm_append_code_as_string( io_generated_code, l_text, l_text_len );
    } else {
      l_text_len = LIBXSMM_SNPRINTF(l_text, l_text_cap, "  #pragma simd vectorlength(8)\n");
      libxsmm_append_code_as_string( io_generated_code, l_text, l_text_len );
    }

    if ( l_aligned_a_c != 0 ) {
      l_text_len = LIBXSMM_SNPRINTF(l_text, l_text_cap, "  #pragma vector aligned\n");
      libxsmm_append_code_as_string( io_generated_code, l_text, l_text_len );
    }
  } else {
    if ( l_aligned_a_c != 0 ) {
      l_text_len = LIBXSMM_SNPRINTF(l_text, l_text_cap, "  #pragma simd vectorlength(32)\n  #pragma vector aligned\n");
      libxsmm_append_code_as_string( io_generated_code, l_text, l_text_len );
    }
  }

  /* the actual kernel: a loop over l_m with one statement per used entry of B */
  l_text_len = LIBXSMM_SNPRINTF(l_text, l_text_cap, "  for ( l_m = 0; l_m < %u; l_m++) {\n", i_xgemm_desc->m);
  libxsmm_append_code_as_string( io_generated_code, l_text, l_text_len );

  for ( l_col = 0; l_col < i_xgemm_desc->n; ++l_col ) {
    l_col_begin = i_column_idx[l_col];
    l_col_count = i_column_idx[l_col+1] - l_col_begin;
    for ( l_k = 0; l_k < l_col_count; ++l_k ) {
      l_entry = l_col_begin + l_k;
      l_row = i_row_idx[l_entry];
      /* only rows below k take part in the multiplication */
      if ( l_row < i_xgemm_desc->k ) {
        l_text_len = LIBXSMM_SNPRINTF(l_text, l_text_cap, "    C[%u+l_m] += A[%u+l_m] * B[%u];\n", l_col * i_xgemm_desc->ldc, l_row*i_xgemm_desc->lda, l_entry);
        libxsmm_append_code_as_string( io_generated_code, l_text, l_text_len );
        l_flops += 2;
      }
    }
  }

  l_text_len = LIBXSMM_SNPRINTF(l_text, l_text_cap, "  }\n");
  libxsmm_append_code_as_string( io_generated_code, l_text, l_text_len );

  /* debug-only flop counter in the generated code: flops per l_m iteration times m */
  l_text_len = LIBXSMM_SNPRINTF(l_text, l_text_cap, "\n#ifndef NDEBUG\n#ifdef _OPENMP\n#pragma omp atomic\n#endif\nlibxsmm_num_total_flops += %u;\n#endif\n", l_flops * i_xgemm_desc->m);
  libxsmm_append_code_as_string( io_generated_code, l_text, l_text_len );
}
LIBXSMM_INTERNAL_API_DEFINITION
void libxsmm_generator_gemm_avx512_microkernel_nofsdbcst( libxsmm_generated_code*             io_generated_code,
                                                          const libxsmm_gp_reg_mapping*      i_gp_reg_mapping,
                                                          const libxsmm_micro_kernel_config* i_micro_kernel_config,
                                                          const libxsmm_gemm_descriptor*     i_xgemm_desc,
                                                          const unsigned int                 i_m_blocking,
                                                          const unsigned int                 i_n_blocking,
                                                          const int                          i_offset )
{
  /* Emits one k-step of the micro kernel: values of B are moved into vector
     registers 0..i_n_blocking-1, vectors of A into registers starting at
     i_n_blocking, and one vmul_instruction per accumulator combines them.
     i_offset == -1 addresses B without an offset and advances the B pointer
     by one element; any other value is used as an element offset into B and
     leaves the B pointer untouched. The A pointer is always advanced by lda
     elements. Three emission schedules exist, selected by the number of
     vectors per column (1, 4, anything else). */

  /* number of vector registers covering i_m_blocking rows */
  const unsigned int l_vecs_per_col = i_m_blocking / i_micro_kernel_config->vector_length;
  /* accumulators are the last (i_n_blocking * l_vecs_per_col) registers below vector_reg_count */
  const unsigned int l_first_acc_reg = i_micro_kernel_config->vector_reg_count - (i_n_blocking * l_vecs_per_col);
  /* column (N) counter of the register block */
  unsigned int l_col = 0;
  /* vector (M) counter of the register block */
  unsigned int l_vec = 0;

#if !defined(NDEBUG)
  if ( (i_n_blocking < 1) || (i_n_blocking > 6) ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_N_BLOCK );
    return;
  }
  if ( (i_m_blocking % i_micro_kernel_config->vector_length) != 0 ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_M_BLOCK );
    return;
  }
#endif

  /* ---- schedule 1: a single vector of A per column ---- */
  if ( l_vecs_per_col == 1 ) {
    /* the only vector of A goes into register i_n_blocking */
    libxsmm_x86_instruction_vec_move(
      io_generated_code,
      i_micro_kernel_config->instruction_set,
      i_micro_kernel_config->a_vmove_instruction,
      i_gp_reg_mapping->gp_reg_a,
      LIBXSMM_X86_GP_REG_UNDEF, 0,
      0,
      i_micro_kernel_config->vector_name,
      i_n_blocking, i_micro_kernel_config->use_masking_a_c, 0 );

    for ( l_col = 0; l_col < i_n_blocking; ++l_col ) {
      /* advance the A pointer right after its load, ahead of the first B access */
      if ( l_col == 0 ) {
        libxsmm_x86_instruction_alu_imm(
          io_generated_code,
          i_micro_kernel_config->alu_add_instruction,
          i_gp_reg_mapping->gp_reg_a,
          (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) );
      }

      if ( i_offset == (-1) ) {
        /* B without offset; the B pointer moves on after the last column was read */
        libxsmm_x86_instruction_vec_move(
          io_generated_code,
          i_micro_kernel_config->instruction_set,
          i_micro_kernel_config->b_vmove_instruction,
          i_gp_reg_mapping->gp_reg_b,
          LIBXSMM_X86_GP_REG_UNDEF, 0,
          i_xgemm_desc->ldb * l_col *  i_micro_kernel_config->datatype_size,
          i_micro_kernel_config->vector_name,
          l_col, i_micro_kernel_config->use_masking_a_c, 0 );
        if ( l_col == (i_n_blocking -1) ) {
          libxsmm_x86_instruction_alu_imm(
            io_generated_code,
            i_micro_kernel_config->alu_add_instruction,
            i_gp_reg_mapping->gp_reg_b,
            i_micro_kernel_config->datatype_size );
        }
      } else {
        /* B addressed with the explicit element offset */
        libxsmm_x86_instruction_vec_move(
          io_generated_code,
          i_micro_kernel_config->instruction_set,
          i_micro_kernel_config->b_vmove_instruction,
          i_gp_reg_mapping->gp_reg_b,
          LIBXSMM_X86_GP_REG_UNDEF, 0,
          ((i_micro_kernel_config->datatype_size) * i_offset) + (i_xgemm_desc->ldb * l_col * (i_micro_kernel_config->datatype_size)),
          i_micro_kernel_config->vector_name,
          l_col, i_micro_kernel_config->use_masking_a_c, 0 );
      }

      /* combine A (register i_n_blocking) with B (register l_col) into the accumulator */
      libxsmm_x86_instruction_vec_compute_reg(
        io_generated_code,
        i_micro_kernel_config->instruction_set,
        i_micro_kernel_config->vmul_instruction,
        i_micro_kernel_config->vector_name,
        i_n_blocking,
        l_col,
        l_first_acc_reg + l_col );
    }
    return;
  }

  /* ---- schedules 2 and 3: all values of B are moved into registers 0..i_n_blocking-1 first ---- */
  for ( l_col = 0; l_col < i_n_blocking; ++l_col ) {
    if ( i_offset != (-1) ) {
      libxsmm_x86_instruction_vec_move(
        io_generated_code,
        i_micro_kernel_config->instruction_set,
        i_micro_kernel_config->b_vmove_instruction,
        i_gp_reg_mapping->gp_reg_b,
        LIBXSMM_X86_GP_REG_UNDEF, 0,
        ((i_micro_kernel_config->datatype_size) * i_offset) + (i_xgemm_desc->ldb * l_col * (i_micro_kernel_config->datatype_size)),
        i_micro_kernel_config->vector_name,
        l_col, i_micro_kernel_config->use_masking_a_c, 0 );
    } else {
      libxsmm_x86_instruction_vec_move(
        io_generated_code,
        i_micro_kernel_config->instruction_set,
        i_micro_kernel_config->b_vmove_instruction,
        i_gp_reg_mapping->gp_reg_b,
        LIBXSMM_X86_GP_REG_UNDEF, 0,
        i_xgemm_desc->ldb * l_col *  i_micro_kernel_config->datatype_size,
        i_micro_kernel_config->vector_name,
        l_col, i_micro_kernel_config->use_masking_a_c, 0 );
    }
  }
  /* without an explicit offset the B pointer is advanced by one element after the loads */
  if ( i_offset == (-1) ) {
    libxsmm_x86_instruction_alu_imm(
      io_generated_code,
      i_micro_kernel_config->alu_add_instruction,
      i_gp_reg_mapping->gp_reg_b,
      i_micro_kernel_config->datatype_size );
  }

  /* ---- schedule 2: four vectors of A per column, each load is followed by its computes ---- */
  if ( l_vecs_per_col == 4 ) {
    for ( l_vec = 0; l_vec < l_vecs_per_col; ++l_vec ) {
      /* every vector of A reuses register i_n_blocking */
      libxsmm_x86_instruction_vec_move(
        io_generated_code,
        i_micro_kernel_config->instruction_set,
        i_micro_kernel_config->a_vmove_instruction,
        i_gp_reg_mapping->gp_reg_a,
        LIBXSMM_X86_GP_REG_UNDEF, 0,
        (i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_vec,
        i_micro_kernel_config->vector_name,
        i_n_blocking, i_micro_kernel_config->use_masking_a_c, 0 );

      for ( l_col = 0; l_col < i_n_blocking; ++l_col ) {
        /* advance the A pointer as soon as the last vector of A has been loaded */
        if ( (l_col == 0) && (l_vec == (l_vecs_per_col-1)) ) {
          libxsmm_x86_instruction_alu_imm(
            io_generated_code,
            i_micro_kernel_config->alu_add_instruction,
            i_gp_reg_mapping->gp_reg_a,
            (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) );
        }
        libxsmm_x86_instruction_vec_compute_reg(
          io_generated_code,
          i_micro_kernel_config->instruction_set,
          i_micro_kernel_config->vmul_instruction,
          i_micro_kernel_config->vector_name,
          i_n_blocking,
          l_col,
          l_first_acc_reg + l_vec + (l_vecs_per_col * l_col) );
      }
    }
    return;
  }

  /* ---- schedule 3: load all vectors of A into registers i_n_blocking+l_vec, then compute ---- */
  for ( l_vec = 0; l_vec < l_vecs_per_col; ++l_vec ) {
    libxsmm_x86_instruction_vec_move(
      io_generated_code,
      i_micro_kernel_config->instruction_set,
      i_micro_kernel_config->a_vmove_instruction,
      i_gp_reg_mapping->gp_reg_a,
      LIBXSMM_X86_GP_REG_UNDEF, 0,
      (i_micro_kernel_config->datatype_size) * (i_micro_kernel_config->vector_length) * l_vec,
      i_micro_kernel_config->vector_name,
      i_n_blocking+l_vec, i_micro_kernel_config->use_masking_a_c, 0 );
  }
  for ( l_vec = 0; l_vec < l_vecs_per_col; ++l_vec ) {
    for ( l_col = 0; l_col < i_n_blocking; ++l_col ) {
      /* advance the A pointer in front of the computes that use the last vector of A */
      if ( (l_col == 0) && (l_vec == (l_vecs_per_col-1)) ) {
        libxsmm_x86_instruction_alu_imm(
          io_generated_code,
          i_micro_kernel_config->alu_add_instruction,
          i_gp_reg_mapping->gp_reg_a,
          (i_xgemm_desc->lda)*(i_micro_kernel_config->datatype_size) );
      }
      libxsmm_x86_instruction_vec_compute_reg(
        io_generated_code,
        i_micro_kernel_config->instruction_set,
        i_micro_kernel_config->vmul_instruction,
        i_micro_kernel_config->vector_name,
        i_n_blocking+l_vec,
        l_col,
        l_first_acc_reg + l_vec + (l_vecs_per_col * l_col) );
    }
  }
}
void libxsmm_generator_dense_x86_close_instruction_stream( libxsmm_generated_code*       io_generated_code,
                                                           const libxsmm_gp_reg_mapping* i_gp_reg_mapping,
                                                           const char*                   i_arch, 
                                                           const char*                   i_prefetch) {
  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    /* @TODO-GREG call encoding here */
    /* @TODO-GREG: how do we interface here? */
    /* this is start of the xGEMM kernel, the registers are in the variables */
  } else if ( io_generated_code->code_type == 1 ) {
    /* @TODO this is currently System V AMD64 RTL(C) ABI only */
    char l_new_code[512];
    int l_max_code_length = 511;
    int l_code_length = 0;
    char l_gp_reg_name[4];

    if ( (strcmp(i_arch, "wsm") == 0) ||
         (strcmp(i_arch, "snb") == 0) ||
         (strcmp(i_arch, "hsw") == 0)    ) {
      if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_kloop ) ) {
        libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_kloop, l_gp_reg_name, 3 );
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       popq %%%s\n", l_gp_reg_name );
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      }
      if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_nloop ) ) {
        libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_nloop, l_gp_reg_name, 3 );
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       popq %%%s\n", l_gp_reg_name );
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      }
      if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_mloop ) ) {
        libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_mloop, l_gp_reg_name, 3 );
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       popq %%%s\n", l_gp_reg_name );
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      }
    } else {
      l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       popq %r15\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       popq %r14\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       popq %r13\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       popq %r12\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       popq %rbx\n" );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    }

    if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_b_prefetch ) ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CALLEE_SAVE_B_PREF );
      return;
    }
    if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_a_prefetch ) ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CALLEE_SAVE_A_PREF );
      return;
    }
    if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_c ) ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CALLEE_SAVE_C );
      return;
    }
    if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_b ) ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CALLEE_SAVE_B );
      return;
    }
    if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_a ) ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CALLEE_SAVE_A );
      return;
    }
    /* @TODO: I don't know if this is the correct placement in the generation process */
    l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       retq\n" );
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  } else {
    char l_new_code[1024];
    int l_max_code_length = 1023;
    int l_code_length = 0;
    char l_gp_reg_a[4];
    char l_gp_reg_b[4];
    char l_gp_reg_c[4];
    char l_gp_reg_pre_a[4];
    char l_gp_reg_pre_b[4];
    char l_gp_reg_mloop[4];
    char l_gp_reg_nloop[4];
    char l_gp_reg_kloop[4];

    libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_a, l_gp_reg_a, 3 );
    libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_b, l_gp_reg_b, 3 );
    libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_c, l_gp_reg_c, 3 );
    libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_a_prefetch, l_gp_reg_pre_a, 3 );
    libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_b_prefetch, l_gp_reg_pre_b, 3 );
    libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_mloop, l_gp_reg_mloop, 3 );
    libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_nloop, l_gp_reg_nloop, 3 );
    libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_kloop, l_gp_reg_kloop, 3 );

    if ( ( strcmp(i_prefetch, "BL2viaC") == 0 ) || 
         ( strcmp(i_prefetch, "curAL2_BL2viaC") == 0 )    ) {
      if ( (strcmp(i_arch, "wsm") == 0) ||
           (strcmp(i_arch, "snb") == 0) ||
           (strcmp(i_arch, "hsw") == 0)    ) {
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       : : \"m\"(B), \"m\"(A), \"m\"(C), \"m\"(B_prefetch) : \"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"xmm0\",\"xmm1\",\"xmm2\",\"xmm3\",\"xmm4\",\"xmm5\",\"xmm6\",\"xmm7\",\"xmm8\",\"xmm9\",\"xmm10\",\"xmm11\",\"xmm12\",\"xmm13\",\"xmm14\",\"xmm15\");\n", l_gp_reg_a, l_gp_reg_b, l_gp_reg_c, l_gp_reg_pre_b, l_gp_reg_mloop, l_gp_reg_nloop, l_gp_reg_kloop);
      } else {
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       : : \"m\"(B), \"m\"(A), \"m\"(C), \"m\"(B_prefetch) : \"k1\",\"rax\",\"rbx\",\"rcx\",\"rdx\",\"rdi\",\"rsi\",\"r8\",\"r9\",\"r10\",\"r12\",\"r13\",\"r14\",\"r15\",\"zmm0\",\"zmm1\",\"zmm2\",\"zmm3\",\"zmm4\",\"zmm5\",\"zmm6\",\"zmm7\",\"zmm8\",\"zmm9\",\"zmm10\",\"zmm11\",\"zmm12\",\"zmm13\",\"zmm14\",\"zmm15\",\"zmm16\",\"zmm17\",\"zmm18\",\"zmm19\",\"zmm20\",\"zmm21\",\"zmm22\",\"zmm23\",\"zmm24\",\"zmm25\",\"zmm26\",\"zmm27\",\"zmm28\",\"zmm29\",\"zmm30\",\"zmm31\");\n");
      }
    } else if ( ( strcmp(i_prefetch, "AL2jpst") == 0 ) ||
                ( strcmp(i_prefetch, "AL2") == 0 )        ) {
      if ( (strcmp(i_arch, "wsm") == 0) ||
           (strcmp(i_arch, "snb") == 0) ||
           (strcmp(i_arch, "hsw") == 0)    ) {
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       : : \"m\"(B), \"m\"(A), \"m\"(C), \"m\"(A_prefetch) : \"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"xmm0\",\"xmm1\",\"xmm2\",\"xmm3\",\"xmm4\",\"xmm5\",\"xmm6\",\"xmm7\",\"xmm8\",\"xmm9\",\"xmm10\",\"xmm11\",\"xmm12\",\"xmm13\",\"xmm14\",\"xmm15\");\n", l_gp_reg_a, l_gp_reg_b, l_gp_reg_c, l_gp_reg_pre_a, l_gp_reg_mloop, l_gp_reg_nloop, l_gp_reg_kloop);
      } else {
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       : : \"m\"(B), \"m\"(A), \"m\"(C), \"m\"(A_prefetch) : \"k1\",\"rax\",\"rbx\",\"rcx\",\"rdx\",\"rdi\",\"rsi\",\"r8\",\"r9\",\"r10\",\"r12\",\"r13\",\"r14\",\"r15\",\"zmm0\",\"zmm1\",\"zmm2\",\"zmm3\",\"zmm4\",\"zmm5\",\"zmm6\",\"zmm7\",\"zmm8\",\"zmm9\",\"zmm10\",\"zmm11\",\"zmm12\",\"zmm13\",\"zmm14\",\"zmm15\",\"zmm16\",\"zmm17\",\"zmm18\",\"zmm19\",\"zmm20\",\"zmm21\",\"zmm22\",\"zmm23\",\"zmm24\",\"zmm25\",\"zmm26\",\"zmm27\",\"zmm28\",\"zmm29\",\"zmm30\",\"zmm31\");\n");
      }
    } else if ( ( strcmp(i_prefetch, "AL2jpst_BL2viaC") == 0 ) ||
                ( strcmp(i_prefetch, "AL2_BL2viaC") == 0 )        ) {
      if ( (strcmp(i_arch, "wsm") == 0) ||
           (strcmp(i_arch, "snb") == 0) ||
           (strcmp(i_arch, "hsw") == 0)    ) {
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       : : \"m\"(B), \"m\"(A), \"m\"(C), \"m\"(A_prefetch), \"m\"(B_prefetch) : \"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"xmm0\",\"xmm1\",\"xmm2\",\"xmm3\",\"xmm4\",\"xmm5\",\"xmm6\",\"xmm7\",\"xmm8\",\"xmm9\",\"xmm10\",\"xmm11\",\"xmm12\",\"xmm13\",\"xmm14\",\"xmm15\");\n", l_gp_reg_a, l_gp_reg_b, l_gp_reg_c, l_gp_reg_pre_a, l_gp_reg_pre_b, l_gp_reg_mloop, l_gp_reg_nloop, l_gp_reg_kloop);
      } else {
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       : : \"m\"(B), \"m\"(A), \"m\"(C), \"m\"(A_prefetch), \"m\"(B_prefetch) : \"k1\",\"rax\",\"rbx\",\"rcx\",\"rdx\",\"rdi\",\"rsi\",\"r8\",\"r9\",\"r10\",\"r12\",\"r13\",\"r14\",\"r15\",\"zmm0\",\"zmm1\",\"zmm2\",\"zmm3\",\"zmm4\",\"zmm5\",\"zmm6\",\"zmm7\",\"zmm8\",\"zmm9\",\"zmm10\",\"zmm11\",\"zmm12\",\"zmm13\",\"zmm14\",\"zmm15\",\"zmm16\",\"zmm17\",\"zmm18\",\"zmm19\",\"zmm20\",\"zmm21\",\"zmm22\",\"zmm23\",\"zmm24\",\"zmm25\",\"zmm26\",\"zmm27\",\"zmm28\",\"zmm29\",\"zmm30\",\"zmm31\");\n");
      }
    } else {
      if ( (strcmp(i_arch, "wsm") == 0) ||
           (strcmp(i_arch, "snb") == 0) ||
           (strcmp(i_arch, "hsw") == 0)    ) {
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       : : \"m\"(B), \"m\"(A), \"m\"(C) : \"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"xmm0\",\"xmm1\",\"xmm2\",\"xmm3\",\"xmm4\",\"xmm5\",\"xmm6\",\"xmm7\",\"xmm8\",\"xmm9\",\"xmm10\",\"xmm11\",\"xmm12\",\"xmm13\",\"xmm14\",\"xmm15\");\n", l_gp_reg_a, l_gp_reg_b, l_gp_reg_c, l_gp_reg_mloop, l_gp_reg_nloop, l_gp_reg_kloop);
      } else {
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       : : \"m\"(B), \"m\"(A), \"m\"(C) : \"k1\",\"rax\",\"rbx\",\"rcx\",\"rdx\",\"rdi\",\"rsi\",\"r8\",\"r9\",\"r10\",\"r12\",\"r13\",\"r14\",\"r15\",\"zmm0\",\"zmm1\",\"zmm2\",\"zmm3\",\"zmm4\",\"zmm5\",\"zmm6\",\"zmm7\",\"zmm8\",\"zmm9\",\"zmm10\",\"zmm11\",\"zmm12\",\"zmm13\",\"zmm14\",\"zmm15\",\"zmm16\",\"zmm17\",\"zmm18\",\"zmm19\",\"zmm20\",\"zmm21\",\"zmm22\",\"zmm23\",\"zmm24\",\"zmm25\",\"zmm26\",\"zmm27\",\"zmm28\",\"zmm29\",\"zmm30\",\"zmm31\");\n");
      }
    }
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
  }
}
/**
 * Appends the opening sequence of a generated dense kernel to io_generated_code.
 *
 * What is emitted depends on io_generated_code->code_type:
 *   - > 1 : nothing is appended here (binary encoding is still a TODO).
 *   - == 1: plain assembly text. The A, B, C and both prefetch pointer registers
 *           are passed to libxsmm_check_x86_gp_reg_name_callee_save; a non-zero
 *           result raises the matching LIBXSMM_ERR_CALLEE_SAVE_* error and the
 *           function returns without emitting anything further. Otherwise
 *           "pushq" lines are appended: for "wsm"/"snb"/"hsw" only for those
 *           loop registers (m, n, k) the same check reports non-zero for, for
 *           any other i_arch for the fixed set rbx, r12, r13, r14, r15.
 *   - else: C inline assembly. The opening of an __asm__ __volatile__ statement
 *           is appended, followed by "movq" lines which load asm operands
 *           %0 (B), %1 (A), %2 (C) and, depending on i_prefetch, %3/%4 into the
 *           mapped registers.
 *
 * Unless an error return was taken, the m, n and k loop registers are finally
 * set to 0 via libxsmm_instruction_alu_imm.
 *
 * @param io_generated_code code buffer which is appended to.
 * @param i_gp_reg_mapping  general purpose registers assigned to pointers and loop counters.
 * @param i_arch            architecture string; "wsm", "snb" and "hsw" select the register-mapping driven pushes.
 * @param i_prefetch        prefetch strategy string; selects which prefetch pointers are loaded in inline assembly.
 */
void libxsmm_generator_dense_x86_open_instruction_stream( libxsmm_generated_code*       io_generated_code,
                                                           const libxsmm_gp_reg_mapping* i_gp_reg_mapping,
                                                           const char*                   i_arch,
                                                           const char*                   i_prefetch) {
  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    /* @TODO-GREG call encoding here */
    /* @TODO-GREG: how do we interface here? */
    /* this is start of the xGEMM kernel, the registers are in the variables */
  } else if ( io_generated_code->code_type == 1 ) {
    /* @TODO this is currently System V AMD64 RTL(C) ABI only */
    char l_new_code[512];
    int l_max_code_length = 511;
    int l_code_length = 0;
    char l_gp_reg_name[4];

    /* pointer registers are never saved/restored here, hence callee-save registers are rejected */
    if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_a ) ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CALLEE_SAVE_A );
      return;
    }
    if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_b ) ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CALLEE_SAVE_B );
      return;
    }
    if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_c ) ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CALLEE_SAVE_C );
      return;
    }
    if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_a_prefetch ) ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CALLEE_SAVE_A_PREF );
      return;
    }
    if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_b_prefetch ) ) {
      libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_CALLEE_SAVE_B_PREF );
      return;
    }

    if ( (strcmp(i_arch, "wsm") == 0) ||
         (strcmp(i_arch, "snb") == 0) ||
         (strcmp(i_arch, "hsw") == 0)    ) {
      /* push only those loop registers which are reported as callee-save */
      if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_mloop ) ) {
        libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_mloop, l_gp_reg_name, 3 );
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       pushq %%%s\n", l_gp_reg_name );
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      }
      if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_nloop ) ) {
        libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_nloop, l_gp_reg_name, 3 );
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       pushq %%%s\n", l_gp_reg_name );
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      }
      if ( libxsmm_check_x86_gp_reg_name_callee_save( i_gp_reg_mapping->gp_reg_kloop ) ) {
        libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_kloop, l_gp_reg_name, 3 );
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       pushq %%%s\n", l_gp_reg_name );
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      }
    } else {
      /* Fixed set of registers pushed for all other architectures. The register
         name is passed as an argument and the literal '%' is written as "%%":
         a bare "%r" inside the format string is not a valid conversion
         specification and results in undefined behavior. */
      static const char* const l_fixed_regs[] = { "rbx", "r12", "r13", "r14", "r15" };
      unsigned int l_i;

      for ( l_i = 0; l_i < (sizeof(l_fixed_regs) / sizeof(l_fixed_regs[0])); l_i++ ) {
        l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       pushq %%%s\n", l_fixed_regs[l_i] );
        libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      }
    }
  } else {
    char l_new_code[512];
    int l_max_code_length = 511;
    int l_code_length = 0;
    char l_gp_reg_name[4];

    /* loading b pointer in assembly, this line also opens the asm statement */
    libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_b, l_gp_reg_name, 3 );
    l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "  __asm__ __volatile__(\"movq %%0, %%%%%s\\n\\t\"\n", l_gp_reg_name );
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

    /* loading a pointer in assembly */
    libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_a, l_gp_reg_name, 3 );
    l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       \"movq %%1, %%%%%s\\n\\t\"\n", l_gp_reg_name );
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

    /* loading c pointer in assembly */
    libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_c, l_gp_reg_name, 3 );
    l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       \"movq %%2, %%%%%s\\n\\t\"\n", l_gp_reg_name );
    libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );

    if ( ( strcmp(i_prefetch, "BL2viaC") == 0 ) ||
         ( strcmp(i_prefetch, "curAL2_BL2viaC") == 0 )    ) {
      /* loading b prefetch pointer in assembly (operand 3) */
      libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_b_prefetch, l_gp_reg_name, 3 );
      l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       \"movq %%3, %%%%%s\\n\\t\"\n", l_gp_reg_name );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    } else if ( ( strcmp(i_prefetch, "AL2jpst") == 0 ) ||
                ( strcmp(i_prefetch, "AL2") == 0 )        ) {
      /* loading a prefetch pointer in assembly (operand 3) */
      libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_a_prefetch, l_gp_reg_name, 3 );
      l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       \"movq %%3, %%%%%s\\n\\t\"\n", l_gp_reg_name );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    } else if ( ( strcmp(i_prefetch, "AL2jpst_BL2viaC") == 0 ) ||
                ( strcmp(i_prefetch, "AL2_BL2viaC") == 0 )        ) {
      /* loading a and b prefetch pointer in assembly (operands 3 and 4) */
      libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_a_prefetch, l_gp_reg_name, 3 );
      l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       \"movq %%3, %%%%%s\\n\\t\"\n", l_gp_reg_name );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
      libxsmm_get_x86_gp_reg_name( i_gp_reg_mapping->gp_reg_b_prefetch, l_gp_reg_name, 3 );
      l_code_length = libxsmm_snprintf( l_new_code, l_max_code_length, "                       \"movq %%4, %%%%%s\\n\\t\"\n", l_gp_reg_name );
      libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length );
    } else {
      /* any other prefetch strategy: no prefetch pointer is loaded */
    }
  }

  /* reset loop counters */
  libxsmm_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, i_gp_reg_mapping->gp_reg_mloop, 0 );
  libxsmm_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, i_gp_reg_mapping->gp_reg_nloop, 0 );
  libxsmm_instruction_alu_imm( io_generated_code, LIBXSMM_X86_INSTR_MOVQ, i_gp_reg_mapping->gp_reg_kloop, 0 );
}
/**
 * Appends a single vector move (load or store) between memory and a vector
 * register to io_generated_code.
 *
 * The memory operand is printed as "displacement(%base)". When NDEBUG is not
 * defined, an i_gp_reg_idx other than LIBXSMM_X86_GP_REG_UNDEF raises
 * LIBXSMM_ERR_NO_INDEX_SCALE_ADDR and nothing is emitted; i_scale is not used.
 * For code_type > 1 nothing is emitted (binary encoding is a TODO), for
 * code_type == 0 a quoted inline-assembly line is produced, otherwise a plain
 * assembly line.
 *
 * @param io_generated_code  code buffer which is appended to.
 * @param i_instruction_set  LIBXSMM_X86_AVX512 / LIBXSMM_X86_IMCI enable the masked forms when i_use_masking != 0.
 * @param i_vmove_instr      instruction id, translated by libxsmm_get_x86_instr_name.
 * @param i_gp_reg_base      base address register.
 * @param i_gp_reg_idx       index register, see above.
 * @param i_scale            unused.
 * @param i_displacement     displacement printed in front of the base register.
 * @param i_vector_name      character printed in front of "mm" in the register name.
 * @param i_vec_reg_number_0 number of the vector register.
 * @param i_use_masking      non-zero appends mask register LIBXSMM_X86_IMCI_AVX512_MASK (AVX512/IMCI only).
 * @param i_is_store         0 emits a load (memory operand first), anything else a store (register operand first).
 */
void libxsmm_instruction_vec_move( libxsmm_generated_code* io_generated_code, 
                                   const unsigned int      i_instruction_set,
                                   const unsigned int      i_vmove_instr, 
                                   const unsigned int      i_gp_reg_base,
                                   const unsigned int      i_gp_reg_idx,
                                   const unsigned int      i_scale,
                                   const int               i_displacement,
                                   const char              i_vector_name,
                                   const unsigned int      i_vec_reg_number_0,
                                   const unsigned int      i_use_masking,
                                   const unsigned int      i_is_store ) {
  char l_line[512];
  char l_base_name[4];
  char l_mnemonic[16];
  int l_line_capacity = 511;
  int l_line_length = 0;
  int l_inline_asm = 0;
  int l_masked_avx512 = 0;
  int l_masked_imci = 0;

#ifndef NDEBUG
  /* addressing with index register and scale is rejected by this emitter */
  if ( i_gp_reg_idx != LIBXSMM_X86_GP_REG_UNDEF ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_NO_INDEX_SCALE_ADDR );
    return;
  }
#endif

  /* @TODO add checks in debug mode */
  if ( io_generated_code->code_type > 1 ) {
    /* @TODO-GREG call encoding here */
    return;
  }

  libxsmm_get_x86_gp_reg_name( i_gp_reg_base, l_base_name, 3 );
  libxsmm_get_x86_instr_name( i_vmove_instr, l_mnemonic, 15 );

  l_inline_asm    = ( io_generated_code->code_type == 0 );
  l_masked_avx512 = ( (i_instruction_set == LIBXSMM_X86_AVX512) && (i_use_masking != 0) );
  l_masked_imci   = ( (l_masked_avx512 == 0) && (i_instruction_set == LIBXSMM_X86_IMCI) && (i_use_masking != 0) );

  if ( i_is_store == 0 ) {
    /* loads: memory operand first, vector register second */
    if ( l_masked_avx512 != 0 ) {
      /* AVX512 masked load additionally carries the {z} decoration */
      if ( l_inline_asm != 0 ) {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, "                       \"%s %i(%%%%%s), %%%%%cmm%i%%{%%%%k%i%%}%%{z%%}\\n\\t\"\n", l_mnemonic, i_displacement, l_base_name, i_vector_name, i_vec_reg_number_0, LIBXSMM_X86_IMCI_AVX512_MASK );
      } else {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, "                       %s %i(%%%s), %%%cmm%i{%%k%i}{z}\n", l_mnemonic, i_displacement, l_base_name, i_vector_name, i_vec_reg_number_0, LIBXSMM_X86_IMCI_AVX512_MASK );
      }
    } else if ( l_masked_imci != 0 ) {
      if ( l_inline_asm != 0 ) {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, "                       \"%s %i(%%%%%s), %%%%%cmm%i%%{%%%%k%i%%}\\n\\t\"\n", l_mnemonic, i_displacement, l_base_name, i_vector_name, i_vec_reg_number_0, LIBXSMM_X86_IMCI_AVX512_MASK );
      } else {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, "                       %s %i(%%%s), %%%cmm%i{%%k%i}\n", l_mnemonic, i_displacement, l_base_name, i_vector_name, i_vec_reg_number_0, LIBXSMM_X86_IMCI_AVX512_MASK );
      }
    } else {
      if ( l_inline_asm != 0 ) {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, "                       \"%s %i(%%%%%s), %%%%%cmm%i\\n\\t\"\n", l_mnemonic, i_displacement, l_base_name, i_vector_name, i_vec_reg_number_0 );
      } else {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, "                       %s %i(%%%s), %%%cmm%i\n", l_mnemonic, i_displacement, l_base_name, i_vector_name, i_vec_reg_number_0 );
      }
    }
  } else {
    /* stores: vector register first, memory operand second;
       the masked form is the same text for AVX512 and IMCI */
    if ( (l_masked_avx512 != 0) || (l_masked_imci != 0) ) {
      if ( l_inline_asm != 0 ) {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, "                       \"%s %%%%%cmm%i, %i(%%%%%s)%%{%%%%k%i%%}\\n\\t\"\n", l_mnemonic, i_vector_name, i_vec_reg_number_0, i_displacement, l_base_name, LIBXSMM_X86_IMCI_AVX512_MASK );
      } else {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, "                       %s %%%cmm%i, %i(%%%s){%%k%i}\n", l_mnemonic, i_vector_name, i_vec_reg_number_0, i_displacement, l_base_name, LIBXSMM_X86_IMCI_AVX512_MASK );
      }
    } else {
      if ( l_inline_asm != 0 ) {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, "                       \"%s %%%%%cmm%i, %i(%%%%%s)\\n\\t\"\n", l_mnemonic, i_vector_name, i_vec_reg_number_0, i_displacement, l_base_name );
      } else {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, "                       %s %%%cmm%i, %i(%%%s)\n", l_mnemonic, i_vector_name, i_vec_reg_number_0, i_displacement, l_base_name );
      }
    }
  }

  libxsmm_append_code_as_string( io_generated_code, l_line, l_line_length );
}
/**
 * Appends a vector compute instruction with one memory operand and two vector
 * register operands to io_generated_code.
 *
 * The memory operand is printed as "displacement(%base)" when i_gp_reg_idx is
 * LIBXSMM_X86_GP_REG_UNDEF and as "displacement(%base,%index,scale)" otherwise.
 * With i_use_broadcast != 0 it is decorated with {1to8}, or {1to16} when
 * libxsmm_is_x86_vec_instr_single_precision reports non-zero for i_vec_instr.
 * Requesting a broadcast for an instruction set other than LIBXSMM_X86_IMCI or
 * LIBXSMM_X86_AVX512 raises LIBXSMM_ERR_NO_IMCI_AVX512_BCAST and emits nothing.
 * For code_type > 1 nothing is emitted (binary encoding is a TODO), for
 * code_type == 0 a quoted inline-assembly line is produced, otherwise a plain
 * assembly line.
 *
 * @param io_generated_code  code buffer which is appended to.
 * @param i_instruction_set  instruction set, only used for the broadcast check.
 * @param i_vec_instr        instruction id, translated by libxsmm_get_x86_instr_name.
 * @param i_use_broadcast    non-zero requests the broadcast decoration.
 * @param i_gp_reg_base      base address register.
 * @param i_gp_reg_idx       index register or LIBXSMM_X86_GP_REG_UNDEF.
 * @param i_scale            scale printed with the index register.
 * @param i_displacement     displacement printed in front of the address.
 * @param i_vector_name      character printed in front of "mm" in both register names.
 * @param i_vec_reg_number_0 number of the first printed vector register.
 * @param i_vec_reg_number_1 number of the second (last) printed vector register.
 */
void libxsmm_instruction_vec_compute_mem( libxsmm_generated_code* io_generated_code, 
                                          const unsigned int      i_instruction_set,
                                          const unsigned int      i_vec_instr,
                                          const unsigned int      i_use_broadcast, 
                                          const unsigned int      i_gp_reg_base,
                                          const unsigned int      i_gp_reg_idx,
                                          const unsigned int      i_scale,
                                          const int               i_displacement,
                                          const char              i_vector_name,                                
                                          const unsigned int      i_vec_reg_number_0,
                                          const unsigned int      i_vec_reg_number_1 ) {
  char l_line[512];
  char l_base_name[4];
  char l_index_name[4];
  char l_mnemonic[16];
  char l_bcast[8];
  int l_line_capacity = 511;
  int l_line_length = 0;
  int l_inline_asm = 0;
  int l_has_index = 0;
  unsigned int l_is_single = 0;

  /* @TODO add checks in debug mode */
  /* memory broadcasts are only available with IMCI and AVX512 */
  if ( (i_use_broadcast != 0) &&
       (i_instruction_set != LIBXSMM_X86_IMCI) &&
       (i_instruction_set != LIBXSMM_X86_AVX512) ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_NO_IMCI_AVX512_BCAST );
    return;
  }

  if ( io_generated_code->code_type > 1 ) {
    /* @TODO-GREG call encoding here */
    return;
  }

  l_inline_asm = ( io_generated_code->code_type == 0 );
  l_has_index  = ( i_gp_reg_idx != LIBXSMM_X86_GP_REG_UNDEF );

  /* textual pieces of the instruction */
  libxsmm_get_x86_gp_reg_name( i_gp_reg_base, l_base_name, 3 );
  libxsmm_get_x86_instr_name( i_vec_instr, l_mnemonic, 15 );
  l_is_single = libxsmm_is_x86_vec_instr_single_precision( i_vec_instr );
  libxsmm_snprintf( l_bcast, 7, (l_is_single == 0) ? "1to8" : "1to16" );
  if ( l_has_index != 0 ) {
    libxsmm_get_x86_gp_reg_name( i_gp_reg_idx, l_index_name, 3 );
  }

  /* build vXYZpd/ps/sd/ss instruction */
  if ( l_inline_asm != 0 ) {
    if ( l_has_index == 0 ) {
      if ( i_use_broadcast != 0 ) {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, "                       \"%s %i(%%%%%s)%%{%s%%}, %%%%%cmm%i, %%%%%cmm%i\\n\\t\"\n", l_mnemonic, i_displacement, l_base_name, l_bcast, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
      } else {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, "                       \"%s %i(%%%%%s), %%%%%cmm%i, %%%%%cmm%i\\n\\t\"\n", l_mnemonic, i_displacement, l_base_name, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
      }
    } else {
      if ( i_use_broadcast != 0 ) {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, "                       \"%s %i(%%%%%s,%%%%%s,%i)%%{%s%%}, %%%%%cmm%i, %%%%%cmm%i\\n\\t\"\n", l_mnemonic, i_displacement, l_base_name, l_index_name, i_scale, l_bcast, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
      } else {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, "                       \"%s %i(%%%%%s,%%%%%s,%i), %%%%%cmm%i, %%%%%cmm%i\\n\\t\"\n", l_mnemonic, i_displacement, l_base_name, l_index_name, i_scale, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
      }
    }
  } else {
    if ( l_has_index == 0 ) {
      if ( i_use_broadcast != 0 ) {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, "                       %s %i(%%%s){%s}, %%%cmm%i, %%%cmm%i\n", l_mnemonic, i_displacement, l_base_name, l_bcast, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
      } else {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, "                       %s %i(%%%s), %%%cmm%i, %%%cmm%i\n", l_mnemonic, i_displacement, l_base_name, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
      }
    } else {
      if ( i_use_broadcast != 0 ) {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, "                       %s %i(%%%s,%%%s,%i){%s}, %%%cmm%i, %%%cmm%i\n", l_mnemonic, i_displacement, l_base_name, l_index_name, i_scale, l_bcast, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
      } else {
        l_line_length = libxsmm_snprintf(l_line, l_line_capacity, "                       %s %i(%%%s,%%%s,%i), %%%cmm%i, %%%cmm%i\n", l_mnemonic, i_displacement, l_base_name, l_index_name, i_scale, i_vector_name, i_vec_reg_number_0, i_vector_name, i_vec_reg_number_1 );
      }
    }
  }

  libxsmm_append_code_as_string( io_generated_code, l_line, l_line_length );
}
/**
 * Emits the AVX512 micro kernel: loads of A, optional prefetches and, per
 * k step, i_n_blocking compute instructions which read B from memory with
 * broadcast enabled.
 *
 * Two modes are selected by i_offset:
 *   - i_offset != -1: a single k step at the given offset is emitted. A is
 *     loaded into vector register 0 and no pointer is advanced; i_k_blocking
 *     is not used on this path.
 *   - i_offset == -1: i_k_blocking k steps are emitted. The A loads alternate
 *     between vector registers 0 and 1 and are issued one step ahead of the
 *     compute instructions using them. In the last step the A pointer (and the
 *     A prefetch pointer, if an "AL2*" prefetch strategy is active) is
 *     advanced; the B pointer is advanced afterwards only when
 *     i_k_blocking < i_xgemm_desc->k.
 *
 * In both modes the compute instruction for column l_n uses vector register
 * vector_reg_count - i_n_blocking + l_n as its last register operand.
 *
 * When NDEBUG is not defined, i_n_blocking > 30 raises LIBXSMM_ERR_N_BLOCK and
 * a warning is printed to stderr if i_offset >= 0 is combined with
 * i_k_blocking != 1.
 *
 * @param io_generated_code     code buffer which is appended to.
 * @param i_gp_reg_mapping      registers holding the A, B and A-prefetch pointers.
 * @param i_micro_kernel_config instruction ids, vector name, register count, datatype size and masking flag.
 * @param i_xgemm_desc          lda, ldb, k and the prefetch strategy string.
 * @param i_n_blocking          number of columns of B handled per k step.
 * @param i_k_blocking          number of k steps emitted when i_offset == -1.
 * @param i_offset              -1 for internal k blocking, otherwise the k offset of the single emitted step.
 */
void libxsmm_generator_dense_avx512_microkernel( libxsmm_generated_code*             io_generated_code,
                                                 const libxsmm_gp_reg_mapping*       i_gp_reg_mapping,
                                                 const libxsmm_micro_kernel_config*  i_micro_kernel_config,
                                                 const libxsmm_xgemm_descriptor*     i_xgemm_desc,
                                                 const unsigned int                  i_n_blocking,
                                                 const unsigned int                  i_k_blocking,
                                                 const int                           i_offset ) {
  unsigned int l_col;
  unsigned int l_step;
  const char* l_prefetch;

#ifndef NDEBUG
  if ( i_n_blocking > 30 ) {
    libxsmm_handle_error( io_generated_code, LIBXSMM_ERR_N_BLOCK );
    return;
  }
  if ( (i_offset >= 0) && (i_k_blocking != 1) ) {
    fprintf(stderr, "LIBXSMM WARNING, libxsmm_generator_dense_avx512_microkernel: i_k_blocking is ignored as offset is >=0\n");
  }
#endif

  l_prefetch = i_xgemm_desc->prefetch;

  /* an offset different from -1 means the k-unrolling is done by the caller */
  if ( i_offset != (-1) ) {
    /* load A into vector register 0 */
    libxsmm_instruction_vec_move( io_generated_code,
                                  i_micro_kernel_config->instruction_set,
                                  i_micro_kernel_config->a_vmove_instruction,
                                  i_gp_reg_mapping->gp_reg_a,
                                  LIBXSMM_X86_GP_REG_UNDEF, 0,
                                  i_xgemm_desc->lda * i_offset * i_micro_kernel_config->datatype_size,
                                  i_micro_kernel_config->vector_name,
                                  0,
                                  i_micro_kernel_config->use_masking_a_c, 0 );

    /* current A prefetch: same column, 64 bytes behind the data just loaded */
    if ( (strcmp( l_prefetch, "curAL2" ) == 0) ||
         (strcmp( l_prefetch, "curAL2_BL2viaC" ) == 0) ) {
      libxsmm_instruction_prefetch( io_generated_code,
                                    i_micro_kernel_config->prefetch_instruction,
                                    i_gp_reg_mapping->gp_reg_a,
                                    LIBXSMM_X86_GP_REG_UNDEF, 0,
                                    (i_xgemm_desc->lda * i_offset * i_micro_kernel_config->datatype_size) + 64 );
    }

    /* next A prefetch: same displacement, but relative to the A prefetch pointer */
    if ( (strcmp( l_prefetch, "AL2jpst" ) == 0)         ||
         (strcmp( l_prefetch, "AL2jpst_BL2viaC" ) == 0) ||
         (strcmp( l_prefetch, "AL2" ) == 0)             ||
         (strcmp( l_prefetch, "AL2_BL2viaC" ) == 0) ) {
      libxsmm_instruction_prefetch( io_generated_code,
                                    i_micro_kernel_config->prefetch_instruction,
                                    i_gp_reg_mapping->gp_reg_a_prefetch,
                                    LIBXSMM_X86_GP_REG_UNDEF, 0,
                                    (i_xgemm_desc->lda * i_offset * i_micro_kernel_config->datatype_size) );
    }

    /* compute vectorwidth (A) * column broadcast (B) */
    for ( l_col = 0; l_col < i_n_blocking; l_col++ ) {
      libxsmm_instruction_vec_compute_mem( io_generated_code,
                                           i_micro_kernel_config->instruction_set,
                                           i_micro_kernel_config->vmul_instruction,
                                           1,
                                           i_gp_reg_mapping->gp_reg_b,
                                           LIBXSMM_X86_GP_REG_UNDEF,
                                           0,
                                           (i_offset * i_micro_kernel_config->datatype_size) + (i_xgemm_desc->ldb * i_micro_kernel_config->datatype_size * l_col),
                                           i_micro_kernel_config->vector_name,
                                           0,
                                           i_micro_kernel_config->vector_reg_count - i_n_blocking + l_col );
    }
    return;
  }

  /* internal k blocking */
  for ( l_step = 0; l_step < i_k_blocking; l_step++ ) {
    const int l_last_step = ( (l_step + 1) == i_k_blocking );

    /* the very first step has to load its own A column into register 0 */
    if ( l_step == 0 ) {
      libxsmm_instruction_vec_move( io_generated_code,
                                    i_micro_kernel_config->instruction_set,
                                    i_micro_kernel_config->a_vmove_instruction,
                                    i_gp_reg_mapping->gp_reg_a,
                                    LIBXSMM_X86_GP_REG_UNDEF, 0,
                                    i_xgemm_desc->lda * l_step * i_micro_kernel_config->datatype_size,
                                    i_micro_kernel_config->vector_name,
                                    0,
                                    i_micro_kernel_config->use_masking_a_c, 0 );
    }

    /* every step but the last one loads A for the following step into the
       other of the two A registers (pipelined load, hiding L1 latencies) */
    if ( l_last_step == 0 ) {
      libxsmm_instruction_vec_move( io_generated_code,
                                    i_micro_kernel_config->instruction_set,
                                    i_micro_kernel_config->a_vmove_instruction,
                                    i_gp_reg_mapping->gp_reg_a,
                                    LIBXSMM_X86_GP_REG_UNDEF, 0,
                                    i_xgemm_desc->lda * (l_step+1) * i_micro_kernel_config->datatype_size,
                                    i_micro_kernel_config->vector_name,
                                    (l_step+1)%2,
                                    i_micro_kernel_config->use_masking_a_c, 0 );
    }

    /* current A prefetch: same column, 64 bytes behind the data of this step */
    if ( (strcmp( l_prefetch, "curAL2" ) == 0) ||
         (strcmp( l_prefetch, "curAL2_BL2viaC" ) == 0) ) {
      libxsmm_instruction_prefetch( io_generated_code,
                                    i_micro_kernel_config->prefetch_instruction,
                                    i_gp_reg_mapping->gp_reg_a,
                                    LIBXSMM_X86_GP_REG_UNDEF, 0,
                                    (i_xgemm_desc->lda * l_step * i_micro_kernel_config->datatype_size) + 64 );
    }

    /* next A prefetch: same displacement, but relative to the A prefetch pointer */
    if ( (strcmp( l_prefetch, "AL2jpst" ) == 0)         ||
         (strcmp( l_prefetch, "AL2jpst_BL2viaC" ) == 0) ||
         (strcmp( l_prefetch, "AL2" ) == 0)             ||
         (strcmp( l_prefetch, "AL2_BL2viaC" ) == 0) ) {
      libxsmm_instruction_prefetch( io_generated_code,
                                    i_micro_kernel_config->prefetch_instruction,
                                    i_gp_reg_mapping->gp_reg_a_prefetch,
                                    LIBXSMM_X86_GP_REG_UNDEF, 0,
                                    (i_xgemm_desc->lda * l_step * i_micro_kernel_config->datatype_size) );
      /* last step: move the A prefetch pointer past the block just handled */
      if ( l_last_step != 0 ) {
        libxsmm_instruction_alu_imm( io_generated_code,
                                     i_micro_kernel_config->alu_add_instruction,
                                     i_gp_reg_mapping->gp_reg_a_prefetch,
                                     i_k_blocking * i_micro_kernel_config->datatype_size * i_xgemm_desc->lda );
      }
    }

    /* last step: all A loads have been issued, hence the A pointer can already be advanced */
    if ( l_last_step != 0 ) {
      libxsmm_instruction_alu_imm( io_generated_code,
                                   i_micro_kernel_config->alu_add_instruction,
                                   i_gp_reg_mapping->gp_reg_a,
                                   i_k_blocking * i_micro_kernel_config->datatype_size * i_xgemm_desc->lda );
    }

    /* compute vectorwidth (A) * column broadcast (B), A comes from register l_step%2 */
    for ( l_col = 0; l_col < i_n_blocking; l_col++ ) {
      libxsmm_instruction_vec_compute_mem( io_generated_code,
                                           i_micro_kernel_config->instruction_set,
                                           i_micro_kernel_config->vmul_instruction,
                                           1,
                                           i_gp_reg_mapping->gp_reg_b,
                                           LIBXSMM_X86_GP_REG_UNDEF,
                                           0,
                                           (l_step * i_micro_kernel_config->datatype_size)+(i_xgemm_desc->ldb * i_micro_kernel_config->datatype_size * l_col),
                                           i_micro_kernel_config->vector_name,
                                           l_step%2,
                                           i_micro_kernel_config->vector_reg_count - i_n_blocking + l_col );
    }
  }

  /* advance pointers of B only when we are not fully unrolling K */
  if ( i_k_blocking < i_xgemm_desc->k ) {
    libxsmm_instruction_alu_imm( io_generated_code,
                                 i_micro_kernel_config->alu_add_instruction,
                                 i_gp_reg_mapping->gp_reg_b,
                                 i_k_blocking * i_micro_kernel_config->datatype_size );
  }
}