/**
 * Block panel product.
 *
 * Packs the active block of A, then walks B and C in slices (block size 2).
 * For each slice, every group of rows of A (block size 2) is run through
 * kdgemm and the partial result is collected in C_aux; the slice is then
 * added onto C. Uses the externally defined buffers A_pack, B_pack, A_kernel,
 * B_kernel, C_kernel and C_aux; B_pack is only read here.
 *
 * @param M The original size M; also the column stride used to index A and C
 * @param A Pointer to A at the current row,column
 * @param num_rows_AC The number of valid rows in A,C
 * @param num_acc The number of valid cols in A, and rows in B
 * @param C Pointer to C at the current row
 */
void gebp_opt1(const int M, const double* A, const int num_rows_AC, const int num_acc, double* C)
{
	// Gather num_acc columns of A (stride M) into A_pack (stride num_rows_AC)
	int col = 0;
	while(col < num_acc)
	{
		memcpy(A_pack + col * num_rows_AC,
				A + col * M,
				num_rows_AC * sizeof(double));
		++col;
	}

	// Walk B and C one slice at a time
	const int slice_block_count = CALC_NUM_BLOCKS(M, 2);
	for(int slice_block = 0; slice_block < slice_block_count; ++slice_block)
	{
		const int slice_start = slice_block * 2;
		const int slice_width = CALC_CUR_BLOCK_WIDTH(slice_start, 2, M);

		// Stage this slice of B in kernel memory
		to_kdgemm_B_sized(num_acc, B_pack + slice_start * num_acc, B_kernel, num_acc, slice_width);

		// Walk the rows of A in groups, collecting partial products in C_aux
		const int row_block_count = CALC_NUM_BLOCKS(num_rows_AC, 2);
		for(int row_block = 0; row_block < row_block_count; ++row_block)
		{
			const int row_start = row_block * 2;
			const int row_count = CALC_CUR_BLOCK_WIDTH(row_start, 2, num_rows_AC);

			// Stage the rows of A, clear the kernel output, run the kernel,
			// then unpack the kernel output into C_aux
			to_kdgemm_A_sized(num_rows_AC, A_pack + row_start, A_kernel, row_count, num_acc);
			clear_kdgemm_C_sized(C_kernel);
			kdgemm(A_kernel, B_kernel, C_kernel);
			from_kdgemm_C_sized(num_rows_AC, C_kernel, C_aux + row_start, row_count, slice_width);
		}

		// Add the slice held in C_aux onto the matching columns of C
		for(int slice_col = 0; slice_col < slice_width; ++slice_col)
		{
			double* c_col = C + (slice_start + slice_col) * M;
			const int aux_base = slice_col * num_rows_AC;
			for(int row = 0; row < num_rows_AC; ++row)
			{
				c_col[row] += C_aux[row + aux_base];
			}
		}
	}

}
Ejemplo n.º 2
0
/*
 * Time the matrix multiply.
 *
 * Runs timing trials of kdgemm on A, B and C. The first trial performs
 * MIN_RUNS calls and each later trial doubles the call count; trials stop
 * once one of them takes at least MIN_SECS seconds. C is reset with
 * matrix_clear before every trial.
 *
 * Returns the rate of the last trial in MFlop/s, counting
 * 2 * DIM_M * DIM_N * DIM_P flops per kdgemm call.
 */
double time_dgemm(const double *A, const double *B, double *C)
{
    double mflops_sec;
    double secs = -1.0;
    for (int runs = MIN_RUNS; secs < MIN_SECS; runs *= 2) {
        matrix_clear(C, DIM_M, DIM_N);

        const double t0 = omp_get_wtime();
        int done = 0;
        while (done < runs) {
            kdgemm(A, B, C);
            ++done;
        }
        secs = omp_get_wtime() - t0;

        const double mflops = 2.0 * runs * DIM_M * DIM_N * DIM_P / 1.0e6;
        mflops_sec = mflops / secs;
    }
    return mflops_sec;
}
Ejemplo n.º 3
0
/*
 * Run a basic test and timing trial.
 *
 * Builds a DIM_M x DIM_P matrix A and a DIM_P x DIM_N matrix B, runs kdgemm
 * once on their kernel-format copies, compares the result using check_kdgemm,
 * and prints one comma-separated line:
 *   DIM_M,DIM_P,DIM_N,<rate returned by time_dgemm>,<difference from check>
 *
 * Returns 0 on success, or 1 (after a message on stderr) if any of the
 * buffers could not be allocated. Command-line arguments are not used.
 */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    int status = 1;

    // Allocate space for ordinary column-major matrices
    double* A = malloc(DIM_M * DIM_P * sizeof(double));
    double* B = malloc(DIM_P * DIM_N * sizeof(double));
    double* C = malloc(DIM_M * DIM_N * sizeof(double));

    // Allocate aligned scratch space for use by the kernel
    double* Ak = _mm_malloc(DIM_M * DIM_P * sizeof(double), 16);
    double* Bk = _mm_malloc(DIM_P * DIM_N * sizeof(double), 16);
    double* Ck = _mm_malloc(DIM_M * DIM_N * sizeof(double), 16);

    // Only touch the buffers once every allocation is known to have succeeded
    if (A && B && C && Ak && Bk && Ck) {
        // Initialize the input matrices and convert to kernel format
        matrix_init(A, DIM_M, DIM_P);
        matrix_init(B, DIM_P, DIM_N);
        to_kdgemm_A(DIM_M, A, Ak);
        to_kdgemm_B(DIM_P, B, Bk);

        // Clear the kernel scratch output, run the kernel, convert to col major
        matrix_clear(Ck, DIM_M, DIM_N);
        kdgemm(Ak, Bk, Ck);
        from_kdgemm_C(DIM_M, Ck, C);

        // Check for agreement
        double max_diff = check_kdgemm(A, B, C);

        // Print kernel dimensions, megaflop rate, and error from check
        printf("%u,%u,%u,%lg,%0.0e\n", DIM_M, DIM_P, DIM_N, 
               time_dgemm(Ak, Bk, Ck),
               max_diff);

        status = 0;
    } else {
        fprintf(stderr, "kdgemm test: failed to allocate matrix storage\n");
    }

    // Free kernel matrix space; NULL is skipped explicitly rather than
    // relying on how _mm_free treats a null pointer
    if (Ck) {
        _mm_free(Ck);
    }
    if (Bk) {
        _mm_free(Bk);
    }
    if (Ak) {
        _mm_free(Ak);
    }

    // Free argument matrix space (free(NULL) is a no-op)
    free(C);
    free(B);
    free(A);

    return status;
}