/** * Block panel product * @param The original size M * @param Pointer to A at the current row,column * @param num_rows_AC The number of valid rows in A,C * @param num_acc The number of valid cols in A, and rows in B * @param Pointer to C at the current row */ void gebp_opt1(const int M, const double* A, const int num_rows_AC, const int num_acc, double* C) { // Pack A into memory for(int iter_col = 0; iter_col < num_acc; ++iter_col) { memcpy(A_pack + iter_col * num_rows_AC, A + iter_col * M, num_rows_AC * sizeof(double)); } // For each slice in B,C const int num_slice_blocks = CALC_NUM_BLOCKS(M, 2); for(int iter_slice_block = 0; iter_slice_block < num_slice_blocks; ++iter_slice_block) { const int cur_slice_pos = iter_slice_block * 2; const int num_slice = CALC_CUR_BLOCK_WIDTH(cur_slice_pos, 2, M); // Copy into kernel memory to_kdgemm_B_sized(num_acc, B_pack + cur_slice_pos * num_acc, B_kernel, num_acc, num_slice); // For each row in A,C_aux const int num_a_aux_blocks = CALC_NUM_BLOCKS(num_rows_AC, 2); for(int iter_a_aux_block = 0; iter_a_aux_block < num_a_aux_blocks; ++iter_a_aux_block) { const int cur_a_aux_pos = iter_a_aux_block * 2; const int num_a_aux = CALC_CUR_BLOCK_WIDTH(cur_a_aux_pos, 2, num_rows_AC); // Copy into kernel memory to_kdgemm_A_sized(num_rows_AC, A_pack + cur_a_aux_pos, A_kernel, num_a_aux, num_acc); // Clear the value of C clear_kdgemm_C_sized(C_kernel); // Run kernel kdgemm(A_kernel, B_kernel, C_kernel); // Store results into C_aux from_kdgemm_C_sized(num_rows_AC, C_kernel, C_aux + cur_a_aux_pos, num_a_aux, num_slice); } // Accumulate results from C_aux to C for(int iter_slice_part = 0; iter_slice_part < num_slice; ++iter_slice_part) { for(int iter_row = 0; iter_row < num_rows_AC; ++iter_row) { C[iter_row + (cur_slice_pos + iter_slice_part) * M] += C_aux[iter_row + iter_slice_part * num_rows_AC]; } } } }
/*
 * Time the matrix multiply.
 *
 * Runs kdgemm(A, B, C) in batches, starting with MIN_RUNS calls and doubling
 * the batch size until one batch takes at least MIN_SECS seconds of wall
 * clock time (omp_get_wtime).  C is cleared with matrix_clear before every
 * batch, so its previous contents are lost.
 *
 * A, B and C are handed to kdgemm unchanged, so they must already be in
 * whatever layout kdgemm expects.
 *
 * Returns the rate of the last batch in MFlop/s, counting
 * 2 * DIM_M * DIM_N * DIM_P flops per kdgemm call.  Returns 0.0 if no batch
 * was run at all (only possible when MIN_SECS <= -1.0).
 */
double time_dgemm(const double *A, const double *B, double *C)
{
    double secs = -1.0;        /* sentinel: guarantees a first batch when MIN_SECS > -1.0 */
    double mflops_sec = 0.0;   /* defined result even if the loop body never executes */
    int num_iterations = MIN_RUNS;

    while (secs < MIN_SECS) {
        matrix_clear(C, DIM_M, DIM_N);

        double start = omp_get_wtime();
        for (int i = 0; i < num_iterations; ++i) {
            kdgemm(A, B, C);
        }
        double finish = omp_get_wtime();

        /* The leading 2.0 keeps the whole product in double arithmetic. */
        double mflops = 2.0 * num_iterations * DIM_M * DIM_N * DIM_P / 1.0e6;
        secs = finish - start;
        mflops_sec = mflops / secs;

        /* Batch was too short to time reliably: try twice as many calls. */
        num_iterations *= 2;
    }
    return mflops_sec;
}
/* * Run a basic test and timing trial. */ int main(int argc, char** argv) { // Allocate space for ordinary column-major matrices double* A = malloc(DIM_M * DIM_P * sizeof(double)); double* B = malloc(DIM_P * DIM_N * sizeof(double)); double* C = malloc(DIM_M * DIM_N * sizeof(double)); // Allocate aligned scratch space for use by the kernel double* Ak = _mm_malloc(DIM_M * DIM_P * sizeof(double), 16); double* Bk = _mm_malloc(DIM_P * DIM_N * sizeof(double), 16); double* Ck = _mm_malloc(DIM_M * DIM_N * sizeof(double), 16); // Initialize the input matrices and convert to kernel format matrix_init(A, DIM_M, DIM_P); matrix_init(B, DIM_P, DIM_N); to_kdgemm_A(DIM_M, A, Ak); to_kdgemm_B(DIM_P, B, Bk); // Clear the kernel scratch output, run the kernel, convert to col major matrix_clear(Ck, DIM_M, DIM_N); kdgemm(Ak, Bk, Ck); from_kdgemm_C(DIM_M, Ck, C); // Check for agreement double max_diff = check_kdgemm(A, B, C); // Print kernel dimensions, megaflop rate, and error from check printf("%u,%u,%u,%lg,%0.0e\n", DIM_M, DIM_P, DIM_N, time_dgemm(Ak, Bk, Ck), max_diff); // Free kernel matrix space _mm_free(Ck); _mm_free(Bk); _mm_free(Ak); // Free argument matrix space free(C); free(B); free(A); return 0; }