void libxsmm_function_signature( libxsmm_generated_code* io_generated_code, const char* i_routine_name, const libxsmm_xgemm_descriptor* i_xgemm_desc ) { char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; if ( io_generated_code->code_type > 1 ) { return; } else if ( io_generated_code->code_type == 1 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, ".global %s\n.type %s, @function\n%s:\n", i_routine_name, i_routine_name, i_routine_name); } else { /* selecting the correct signature */ if (0 != (LIBXSMM_XGEMM_FLAG_F32PREC & i_xgemm_desc->flags)) { if (LIBXSMM_PREFETCH_NONE == i_xgemm_desc->prefetch) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "void %s(const float* A, const float* B, float* C) {\n", i_routine_name); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "void %s(const float* A, const float* B, float* C, const float* A_prefetch, const float* B_prefetch, const float* C_prefetch) {\n", i_routine_name); } } else { if (LIBXSMM_PREFETCH_NONE == i_xgemm_desc->prefetch) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "void %s(const double* A, const double* B, double* C) {\n", i_routine_name); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "void %s(const double* A, const double* B, double* C, const double* A_prefetch, const double* B_prefetch, const double* C_prefetch) {\n", i_routine_name); } } } libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); }
void libxsmm_close_function( libxsmm_generated_code* io_generated_code ) { if ( io_generated_code->code_type == 0 ) { char l_new_code[512]; const int l_max_code_length = 511; const int l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "}\n\n" ); libxsmm_append_code_as_string(io_generated_code, l_new_code, l_code_length ); } }
LIBXSMM_INLINE LIBXSMM_RETARGETABLE unsigned int internal_print_statistic(FILE* ostream, const char* target_arch, int precision, unsigned int linebreaks, unsigned int indent) { const internal_statistic_type statistic_sml = internal_statistic[precision][0/*SML*/]; const internal_statistic_type statistic_med = internal_statistic[precision][1/*MED*/]; const internal_statistic_type statistic_big = internal_statistic[precision][2/*BIG*/]; int printed = 0; assert(0 != ostream && 0 != target_arch && (0 <= precision && precision < 2)); if (/* omit to print anything if it is superfluous */ 0 != statistic_sml.ntry || 0 != statistic_sml.njit || 0 != statistic_sml.nsta || 0 != statistic_sml.ncol || 0 != statistic_med.ntry || 0 != statistic_med.njit || 0 != statistic_med.nsta || 0 != statistic_med.ncol || 0 != statistic_big.ntry || 0 != statistic_big.njit || 0 != statistic_big.nsta || 0 != statistic_big.ncol) { const unsigned int sml = internal_statistic_sml, med = internal_statistic_med, mnk = internal_statistic_mnk; char title[256], csml[256], cmed[256], cbig[256]; LIBXSMM_SNPRINTF(csml, sizeof(csml), "%u..%u", 0u, sml); LIBXSMM_SNPRINTF(cmed, sizeof(cmed), "%u..%u", sml + 1u, med); LIBXSMM_SNPRINTF(cbig, sizeof(cbig), "%u..%u", med + 1u, mnk); { unsigned int n = 0; for (n = 0; 0 != target_arch[n] && n < sizeof(title); ++n) { /* toupper */ const char c = target_arch[n]; title[n] = (char)(('a' <= c && c <= 'z') ? (c - 32) : c); } LIBXSMM_SNPRINTF(title + n, sizeof(title) - n, "/%s", 0 == precision ? "DP" : "SP"); for (n = 0; n < linebreaks; ++n) fprintf(ostream, "\n"); } fprintf(ostream, "%*s%-10s %6s %6s %6s %6s\n", (int)indent, "", title, "TRY" ,"JIT", "STA", "COL"); fprintf(ostream, "%*s%10s %6u %6u %6u %6u\n", (int)indent, "", csml, statistic_sml.ntry, statistic_sml.njit, statistic_sml.nsta, statistic_sml.ncol); fprintf(ostream, "%*s%10s %6u %6u %6u %6u\n", (int)indent, "", cmed, statistic_med.ntry, statistic_med.njit, statistic_med.nsta, statistic_med.ncol); fprintf(ostream, "%*s%10s %6u %6u %6u %6u\n", (int)indent, "", cbig, statistic_big.ntry, statistic_big.njit, statistic_big.nsta, statistic_big.ncol); printed = 1; } return printed; }
LIBXSMM_INLINE LIBXSMM_RETARGETABLE void internal_get_code_name(const char* target_arch, const char* jit_kind, const libxsmm_gemm_descriptor* desc, unsigned int buffer_size, char* name) { assert((0 != desc && 0 != name) || 0 == buffer_size); LIBXSMM_SNPRINTF(name, buffer_size, "libxsmm_%s_%c%c%c_%ux%ux%u_%u_%u_%u_a%i_b%i_p%i.%s", target_arch /* code path name */, 0 == (LIBXSMM_GEMM_FLAG_F32PREC & desc->flags) ? 'd' : 's', 0 == (LIBXSMM_GEMM_FLAG_TRANS_A & desc->flags) ? 'n' : 't', 0 == (LIBXSMM_GEMM_FLAG_TRANS_B & desc->flags) ? 'n' : 't', (unsigned int)desc->m, (unsigned int)desc->n, (unsigned int)desc->k, (unsigned int)desc->lda, (unsigned int)desc->ldb, (unsigned int)desc->ldc, desc->alpha, desc->beta, internal_get_prefetch(desc), 0 != jit_kind ? jit_kind : "jit"); }
LIBXSMM_INTERNAL_API_DEFINITION void libxsmm_generator_gemm_add_flop_counter( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc ) { if ( io_generated_code->code_type == 0 ) { char l_new_code[512]; const unsigned int l_max_code_length = sizeof(l_new_code) - 1; int l_code_length = 0; l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#ifndef NDEBUG\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#ifdef _OPENMP\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#pragma omp atomic\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#endif\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "libxsmm_num_total_flops += %u;\n", 2u * i_xgemm_desc->m * i_xgemm_desc->n * i_xgemm_desc->k); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF( l_new_code, l_max_code_length, "#endif\n" ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } }
LIBXSMM_API_INTERN void libxsmm_generator_spgemm_csc_bsparse( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const double* i_values ) { unsigned int l_n; unsigned int l_z; unsigned int l_column_elements; unsigned int l_flop_count = 0; char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; LIBXSMM_UNUSED(i_values); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_m = 0;\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* reset C if beta is zero */ if (0 != (LIBXSMM_GEMM_FLAG_BETA_0 & i_xgemm_desc->flags)) { /* Beta=0 */ l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_n = 0;\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_n = 0; l_n < %u; l_n++) {\n", (unsigned int)i_xgemm_desc->n); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); if ( i_xgemm_desc->m > 1 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma vector aligned\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++) { C[(l_n*%u)+l_m] = 0.0; }\n", (unsigned int)i_xgemm_desc->m, (unsigned int)i_xgemm_desc->ldc); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++) { C[(l_n*%u)+l_m] = 0.0f; }\n", (unsigned int)i_xgemm_desc->m, (unsigned int)i_xgemm_desc->ldc); } libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " }\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* determine the correct simd pragma for each architecture */ if ( ( strcmp( i_arch, "noarch" ) == 0 ) || ( strcmp( i_arch, "wsm" ) == 0 ) || ( strcmp( i_arch, "snb" ) == 0 ) || ( strcmp( i_arch, "hsw" ) == 0 ) ) { if ( i_xgemm_desc->m > 7 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd vectorlength(8)\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else if ( i_xgemm_desc->m > 3 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd vectorlength(4)\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else if ( i_xgemm_desc->m > 1 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd vectorlength(2)\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else {} if ( (i_xgemm_desc->m > 1) && ((LIBXSMM_GEMM_FLAG_ALIGN_A & i_xgemm_desc->flags) != 0) && ((LIBXSMM_GEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0) ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma vector aligned\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } } else if ( ( strcmp( i_arch, "knl" ) == 0 ) || ( strcmp( i_arch, "skx" ) == 0 ) || ( strcmp( i_arch, "clx" ) == 0 ) || ( strcmp( i_arch, "cpx" ) == 0 ) ) { if ( (i_xgemm_desc->m > 1) && ((LIBXSMM_GEMM_FLAG_ALIGN_A & i_xgemm_desc->flags) != 0) && ((LIBXSMM_GEMM_FLAG_ALIGN_C & i_xgemm_desc->flags) != 0) ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd vectorlength(32)\n #pragma vector aligned\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } } else { LIBXSMM_HANDLE_ERROR( io_generated_code, LIBXSMM_ERR_ARCH ); return; } /* generate the actuel kernel */ l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++) {\n", (unsigned int)i_xgemm_desc->m); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); for ( l_n = 0; l_n < (unsigned int)i_xgemm_desc->n; l_n++ ) { l_column_elements = i_column_idx[l_n+1] - i_column_idx[l_n]; for ( l_z = 0; l_z < l_column_elements; l_z++ ) { /* check k such that we just use rows which actually need to be multiplied */ if ( i_row_idx[i_column_idx[l_n] + l_z] < (unsigned int)i_xgemm_desc->k ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " C[%u+l_m] += A[%u+l_m] * B[%u];\n", l_n * i_xgemm_desc->ldc, i_row_idx[i_column_idx[l_n] + l_z]*i_xgemm_desc->lda, i_column_idx[l_n] + l_z); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_flop_count += 2; } } } l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " }\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* add flop counter */ l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "\n#ifndef NDEBUG\n#ifdef _OPENMP\n#pragma omp atomic\n#endif\nlibxsmm_num_total_flops += %u;\n#endif\n", l_flop_count * (unsigned int)i_xgemm_desc->m); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); }
void libxsmm_sparse_asparse_innerloop_scalar( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_k, const unsigned int i_z, const unsigned int* i_row_idx, const unsigned int* i_column_idx ) { char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; if ( (LIBXSMM_GEMM_FLAG_F32PREC & i_xgemm_desc->flags) == 0 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m128d c%u_%u = _mm_load_sd(&C[(l_n*%u)+%u]);\n", i_k, i_z, i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + i_z] ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m128d a%u_%u = _mm_load_sd(&A[%u]);\n", i_k, i_z, i_column_idx[i_k] + i_z ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && defined(__AVX__)\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " c%u_%u = _mm_add_sd(c%u_%u, _mm_mul_sd(a%u_%u, _mm256_castpd256_pd128(b%u)));\n", i_k, i_z, i_k, i_z, i_k, i_z, i_k ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#endif\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && !defined(__AVX__)\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " c%u_%u = _mm_add_sd(c%u_%u, _mm_mul_sd(a%u_%u, b%u));\n", i_k, i_z, i_k, i_z, i_k, i_z, i_k ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#endif\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " _mm_store_sd(&C[(l_n*%u)+%u], c%u_%u);\n", i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + i_z], i_k, i_z ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m128 c%u_%u = _mm_load_ss(&C[(l_n*%u)+%u]);\n", i_k, i_z, i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + i_z] ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m128 a%u_%u = _mm_load_ss(&A[%u]);\n", i_k, i_z, i_column_idx[i_k] + i_z ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " c%u_%u = _mm_add_ss(c%u_%u, _mm_mul_ss(a%u_%u, b%u));\n", i_k, i_z, i_k, i_z, i_k, i_z, i_k ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " _mm_store_ss(&C[(l_n*%u)+%u], c%u_%u);\n", i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + i_z], i_k, i_z ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } }
void libxsmm_generator_sparse_asparse( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch, const unsigned int* i_row_idx, const unsigned int* i_column_idx, const double* i_values ) { char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; unsigned int l_k; unsigned int l_flop_count = 0; LIBXSMM_UNUSED(i_arch); LIBXSMM_UNUSED(i_values); /* loop over columns in C in generated code, we fully unroll inside each column */ l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_n = 0;\n #pragma nounroll_and_jam\n for ( l_n = 0; l_n < %u; l_n++) {\n", i_xgemm_desc->n); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* reset the current column in C if needed */ if ( i_xgemm_desc->beta == 0 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_m = 0;\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); if ( i_xgemm_desc->m > 1 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } if ( (LIBXSMM_GEMM_FLAG_F32PREC & i_xgemm_desc->flags) == 0 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++) {\n C[(l_n*%u)+l_m] = 0.0;\n }\n", i_xgemm_desc->m, i_xgemm_desc->ldc); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++) {\n C[(l_n*%u)+l_m] = 0.0f;\n }\n", i_xgemm_desc->m, i_xgemm_desc->ldc); } libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } /* loop over columns in A, rows in B and fully unroll */ for ( l_k = 0; l_k < i_xgemm_desc->k; l_k++ ) { unsigned int l_column_elements = i_column_idx[l_k + 1] - i_column_idx[l_k]; unsigned int l_z = 0; l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) || defined(__AVX__)\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); if ( l_column_elements > 0 ) { if ( (LIBXSMM_GEMM_FLAG_F32PREC & i_xgemm_desc->flags) == 0 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && defined(__AVX__)\n __m256d b%u = _mm256_broadcast_sd(&B[(l_n*%u)+%u]);\n#endif\n", l_k, i_xgemm_desc->ldb, l_k); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && !defined(__AVX__)\n __m128d b%u = _mm_loaddup_pd(&B[(l_n*%u)+%u]);\n#endif\n", l_k, i_xgemm_desc->ldb, l_k); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && defined(__AVX__)\n __m128 b%u = _mm_broadcast_ss(&B[(l_n*%u)+%u]);\n#endif\n", l_k, i_xgemm_desc->ldb, l_k); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && !defined(__AVX__)\n __m128 b%u = _mm_load_ss(&B[(l_n*%u)+%u]); b%u = _mm_shuffle_ps(b%u, b%u, 0x00);\n#endif\n", l_k, i_xgemm_desc->ldb, l_k, l_k, l_k, l_k); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } } /* loop over the columns of A and look for vectorization potential */ for ( l_z = 0; l_z < l_column_elements; l_z++ ) { /* 4 element vector might be possible */ if ( (l_z < (l_column_elements - 3)) && (l_column_elements > 3) ) { /* check for 256bit vector instruction */ if ((i_row_idx[i_column_idx[l_k] + l_z] + 1 == i_row_idx[i_column_idx[l_k] + l_z + 1]) && (i_row_idx[i_column_idx[l_k] + l_z] + 2 == i_row_idx[i_column_idx[l_k] + l_z + 2]) && (i_row_idx[i_column_idx[l_k] + l_z] + 3 == i_row_idx[i_column_idx[l_k] + l_z + 3]) && (i_row_idx[i_column_idx[l_k] + l_z + 3] < i_xgemm_desc->m)) { libxsmm_sparse_asparse_innerloop_four_vector(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx); l_z += 3; /* check for 128bit vector instruction */ } else if ((i_row_idx[i_column_idx[l_k] + l_z] + 1 == i_row_idx[i_column_idx[l_k] + l_z + 1]) && (i_row_idx[i_column_idx[l_k] + l_z + 1] < i_xgemm_desc->m) ) { libxsmm_sparse_asparse_innerloop_two_vector(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx); l_z++; /* scalare instruction */ } else { if ( (i_row_idx[i_column_idx[l_k] + l_z] < i_xgemm_desc->m) ) { libxsmm_sparse_asparse_innerloop_scalar(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx); } } /* 2 element vector might be possible */ } else if ( (l_z < (l_column_elements - 1)) && (l_column_elements > 1)) { /* check for 128bit vector instruction */ if ((i_row_idx[i_column_idx[l_k] + l_z] + 1 == i_row_idx[i_column_idx[l_k] + l_z + 1]) && (i_row_idx[i_column_idx[l_k] + l_z + 1] < i_xgemm_desc->m) ) { libxsmm_sparse_asparse_innerloop_two_vector(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx); l_z++; /* scalare instruction */ } else { if ( (i_row_idx[i_column_idx[l_k] + l_z] < i_xgemm_desc->m) ) { libxsmm_sparse_asparse_innerloop_scalar(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx); } } /* scalar anayways */ } else { if ( (i_row_idx[i_column_idx[l_k] + l_z] < i_xgemm_desc->m) ) { libxsmm_sparse_asparse_innerloop_scalar(io_generated_code, i_xgemm_desc, l_k, l_z, i_row_idx, i_column_idx); } } } /* C fallback code */ l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#else\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* loop over the columns of A */ for ( l_z = 0; l_z < l_column_elements; l_z++ ) { if ( (i_row_idx[i_column_idx[l_k] + l_z] < i_xgemm_desc->m) ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " C[(l_n*%u)+%u] += A[%u] * B[(l_n*%u)+%u];\n", i_xgemm_desc->ldc, i_row_idx[i_column_idx[l_k] + l_z], i_column_idx[l_k] + l_z, i_xgemm_desc->ldb, l_k ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_flop_count += 2; } } l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#endif\n\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " }\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); /* add flop counter */ l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "\n#ifndef NDEBUG\n#ifdef _OPENMP\n#pragma omp atomic\n#endif\nlibxsmm_num_total_flops += %u;\n#endif\n", l_flop_count * i_xgemm_desc->n); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); }
LIBXSMM_API_INTERN void libxsmm_sparse_csc_asparse_innerloop_four_vector( libxsmm_generated_code* io_generated_code, const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_k, const unsigned int i_z, const unsigned int* i_row_idx, const unsigned int* i_column_idx ) { char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; if ( LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_INP( i_xgemm_desc->datatype ) ) { unsigned int l_i; unsigned int l_z = i_z; l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && defined(__AVX__)\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m256d c%u_%u = _mm256_loadu_pd(&C[(l_n*%u)+%u]);\n", i_k, i_z, (unsigned int)i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + i_z] ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m256d a%u_%u = _mm256_loadu_pd(&A[%u]);\n", i_k, i_z, i_column_idx[i_k] + i_z ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " c%u_%u = _mm256_add_pd(c%u_%u, _mm256_mul_pd(a%u_%u, b%u));\n", i_k, i_z, i_k, i_z, i_k, i_z, i_k ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " _mm256_storeu_pd(&C[(l_n*%u)+%u], c%u_%u);\n", (unsigned int)i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + i_z], i_k, i_z ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#endif\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#if defined(__SSE3__) && !defined(__AVX__)\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); for ( l_i = 0; l_i < 2; l_i++ ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m128d c%u_%u = _mm_loadu_pd(&C[(l_n*%u)+%u]);\n", i_k, l_z, (unsigned int)i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + l_z] ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m128d a%u_%u = _mm_loadu_pd(&A[%u]);\n", i_k, l_z, i_column_idx[i_k] + l_z ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " c%u_%u = _mm_add_pd(c%u_%u, _mm_mul_pd(a%u_%u, b%u));\n", i_k, l_z, i_k, l_z, i_k, l_z, i_k ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " _mm_storeu_pd(&C[(l_n*%u)+%u], c%u_%u);\n", (unsigned int)i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + l_z], i_k, l_z ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_z += 2; } l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, "#endif\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m128 c%u_%u = _mm_loadu_ps(&C[(l_n*%u)+%u]);\n", i_k, i_z, (unsigned int)i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + i_z] ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " __m128 a%u_%u = _mm_loadu_ps(&A[%u]);\n", i_k, i_z, i_column_idx[i_k] + i_z ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " c%u_%u = _mm_add_ps(c%u_%u, _mm_mul_ps(a%u_%u, b%u));\n", i_k, i_z, i_k, i_z, i_k, i_z, i_k ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " _mm_storeu_ps(&C[(l_n*%u)+%u], c%u_%u);\n", (unsigned int)i_xgemm_desc->ldc, i_row_idx[i_column_idx[i_k] + i_z], i_k, i_z ); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } }
void libxsmm_generator_dense_noarch_kernel( libxsmm_generated_code* io_generated_code, const libxsmm_xgemm_descriptor* i_xgemm_desc, const char* i_arch ) { char l_new_code[512]; int l_max_code_length = 511; int l_code_length = 0; LIBXSMM_UNUSED(i_arch); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_m = 0;\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_n = 0;\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " unsigned int l_k = 0;\n\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_n = 0; l_n < %u; l_n++ ) {\n", i_xgemm_desc->n); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); if ( i_xgemm_desc->beta == 0 ) { if ( (LIBXSMM_XGEMM_FLAG_F32PREC & i_xgemm_desc->flags) == 0 ) { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++ ) { C[(l_n*%u)+l_m] = 0.0; }\n\n", i_xgemm_desc->m, i_xgemm_desc->ldc); } else { l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++ ) { C[(l_n*%u)+l_m] = 0.0f; }\n\n", i_xgemm_desc->m, i_xgemm_desc->ldc); } libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); } l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_k = 0; l_k < %u; l_k++ ) {\n", i_xgemm_desc->k); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " #pragma simd\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " for ( l_m = 0; l_m < %u; l_m++ ) {\n", i_xgemm_desc->m); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " C[(l_n*%u)+l_m] += A[(l_k*%u)+l_m] * B[(l_n*%u)+l_k];\n", i_xgemm_desc->ldc, i_xgemm_desc->lda, i_xgemm_desc->ldb); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " }\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " }\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); l_code_length = LIBXSMM_SNPRINTF(l_new_code, l_max_code_length, " }\n"); libxsmm_append_code_as_string( io_generated_code, l_new_code, l_code_length ); }
const char* libxsmm_strerror(unsigned int i_error_code) { int l_max_error_length = 511; switch (i_error_code) { case LIBXSMM_ERR_GENERAL: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: a general error occured!\n" ); break; case LIBXSMM_ERR_ARCH_PREC: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: unknown architecture and precision!\n" ); break; case LIBXSMM_ERR_ARCH: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: unknown architecture!\n" ); break; case LIBXSMM_ERR_LDA: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: lda need to be bigger than m!\n" ); break; case LIBXSMM_ERR_LDB: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: ldb need to be bigger than k!\n" ); break; case LIBXSMM_ERR_LDC: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: ldc need to be bigger than m!\n" ); break; case LIBXSMM_ERR_SPARSE_GEN: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: could not determine which sparse code generation variant is requested!\n" ); break; case LIBXSMM_ERR_CSC_INPUT: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: could not open the specified CSC input file!\n" ); break; case LIBXSMM_ERR_CSC_READ_LEN: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: exceeded predefined line-length when reading line of CSC file!\n" ); break; case LIBXSMM_ERR_CSC_READ_DESC: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: error when reading descriptor of CSC file!\n" ); break; case LIBXSMM_ERR_CSC_READ_ELEMS: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: error when reading line of CSC file!\n" ); break; case LIBXSMM_ERR_CSC_LEN: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: number of elements read differs from number of elements specified in CSC file!\n" ); break; case LIBXSMM_ERR_N_BLOCK: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: invalid N blocking in microkernel!\n" ); break; case LIBXSMM_ERR_M_BLOCK: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: invalid M blocking in microkernel!\n" ); break; case LIBXSMM_ERR_NO_IMCI: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: IMCI architecture requested but called for a different on!\n" ); break; case LIBXSMM_ERR_REG_BLOCK: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: invalid MxN register blocking was specified!\n" ); break; case LIBXSMM_ERR_VEC_MOVE_IMCI: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: invalid vec move instruction for IMCI instruction replacement!\n" ); break; case LIBXSMM_ERR_APPEND_STR: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: append code as string was called for generation mode which does not support this!\n" ); break; case LIBXSMM_ERR_ALLOC: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: memory allocation failed!\n" ); break; case LIBXSMM_ERR_NO_IMCI_AVX512_BCAST: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: fused memory broadcast is not supported on other platforms than AVX512/IMCI!\n" ); break; case LIBXSMM_ERR_CALLEE_SAVE_A: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: reg_a cannot be callee save, since input, please use either rdi, rsi, rdx, rcx, r8, r9 for this value!\n" ); break; case LIBXSMM_ERR_CALLEE_SAVE_B: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: reg_b cannot be callee save, since input, please use either rdi, rsi, rdx, rcx, r8, r9 for this value!\n" ); break; case LIBXSMM_ERR_CALLEE_SAVE_C: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: reg_c cannot be callee save, since input, please use either rdi, rsi, rdx, rcx, r8, r9 for this value!\n" ); break; case LIBXSMM_ERR_CALLEE_SAVE_A_PREF: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: reg_a_prefetch cannot be callee save, since input, please use either rdi, rsi, rdx, rcx, r8, r9 for this value!\n" ); break; case LIBXSMM_ERR_CALLEE_SAVE_B_PREF: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: reg_b_prefetch cannot be callee save, since input, please use either rdi, rsi, rdx, rcx, r8, r9 for this value!\n" ); break; case LIBXSMM_ERR_NO_INDEX_SCALE_ADDR: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: Index + Scale addressing mode is currently not implemented!\n"); break; case LIBXSMM_ERR_UNSUPPORTED_JUMP: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: Unsupported jump instruction requested!\n"); break; case LIBXSMM_ERR_NO_JMPLBL_AVAIL: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: No destination jump label is available!\n"); break; case LIBXSMM_ERR_EXCEED_JMPLBL: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: too many nested loop, exceed loop label tracker!\n"); break; case LIBXSMM_ERR_CSC_ALLOC_DATA: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: could not alloc temporay memory for reading CSC file!\n"); break; /* default, we didn't don't know what happend */ default: LIBXSMM_SNPRINTF( libxsmm_global_error_message, l_max_error_length, " LIBXSMM ERROR: an unknown error occured!\n" ); break; } return libxsmm_global_error_message; }