/** Purpose ------- ZLARFB applies a complex block reflector H or its transpose H^H to a COMPLEX_16 m by n matrix C, from the left. Arguments --------- @param[in] side magma_side_t - = MagmaLeft: apply H or H^H from the Left - = MagmaRight: apply H or H^H from the Right @param[in] trans magma_trans_t - = MagmaNoTrans: apply H (No transpose) - = Magma_ConjTrans: apply H^H (Conjugate transpose) @param[in] direct magma_direct_t Indicates how H is formed from a product of elementary reflectors - = MagmaForward: H = H(1) H(2) . . . H(k) (Forward) - = MagmaBackward: H = H(k) . . . H(2) H(1) (Backward) @param[in] storev magma_storev_t Indicates how the vectors which define the elementary reflectors are stored: - = MagmaColumnwise: Columnwise - = MagmaRowwise: Rowwise @param[in] m INTEGER The number of rows of the matrix C. @param[in] n INTEGER The number of columns of the matrix C. @param[in] k INTEGER The order of the matrix T (= the number of elementary reflectors whose product defines the block reflector). @param[in] dV COMPLEX_16 array on the GPU, dimension (LDDV,K) if STOREV = MagmaColumnwise (LDDV,M) if STOREV = MagmaRowwise and SIDE = MagmaLeft (LDDV,N) if STOREV = MagmaRowwise and SIDE = MagmaRight The matrix V. See further details. @param[in] lddv INTEGER The leading dimension of the array V. If STOREV = MagmaColumnwise and SIDE = MagmaLeft, LDDV >= max(1,M); if STOREV = MagmaColumnwise and SIDE = MagmaRight, LDDV >= max(1,N); if STOREV = MagmaRowwise, LDDV >= K. @param[in] dT COMPLEX_16 array on the GPU, dimension (LDDT,K) The triangular k by k matrix T in the representation of the block reflector. @param[in] lddt INTEGER The leading dimension of the array T. LDDT >= K. @param[in,out] dC COMPLEX_16 array on the GPU, dimension (LDDC,N) On entry, the m by n matrix C. On exit, C is overwritten by H*C, or H^H*C, or C*H, or C*H^H. @param[in] lddc INTEGER The leading dimension of the array C. LDA >= max(1,M). @param dwork (workspace) COMPLEX_16 array, dimension (LDWORK,K) @param[in] ldwork INTEGER The leading dimension of the array WORK. If SIDE = MagmaLeft, LDWORK >= max(1,N); if SIDE = MagmaRight, LDWORK >= max(1,M); Further Details --------------- The shape of the matrix V and the storage of the vectors which define the H(i) is best illustrated by the following example with n = 5 and k = 3. All elements including 0's and 1's are stored, unlike LAPACK. DIRECT = MagmaForward and DIRECT = MagmaForward and STOREV = MagmaColumnwise: STOREV = MagmaRowwise: V = ( 1 0 0 ) V = ( 1 v1 v1 v1 v1 ) ( v1 1 0 ) ( 0 1 v2 v2 v2 ) ( v1 v2 1 ) ( 0 0 1 v3 v3 ) ( v1 v2 v3 ) ( v1 v2 v3 ) DIRECT = MagmaBackward and DIRECT = MagmaBackward and STOREV = MagmaColumnwise: STOREV = MagmaRowwise: V = ( v1 v2 v3 ) V = ( v1 v1 1 0 0 ) ( v1 v2 v3 ) ( v2 v2 v2 1 0 ) ( 1 v2 v3 ) ( v3 v3 v3 v3 1 ) ( 0 1 v3 ) ( 0 0 1 ) @ingroup magma_zaux3 ********************************************************************/ template<typename Ty> magma_int_t magma_larfb_gpu( magma_side_t side, magma_trans_t trans, magma_direct_t direct, magma_storev_t storev, magma_int_t m, magma_int_t n, magma_int_t k, cl_mem dV , size_t dV_offset, magma_int_t lddv, cl_mem dT , size_t dT_offset, magma_int_t lddt, cl_mem dC , size_t dC_offset, magma_int_t lddc, cl_mem dwork, size_t dwork_offset, magma_int_t ldwork, magma_queue_t queue ) { #define dV(i_,j_) dV, (dV_offset + (i_) + (j_)*lddv) #define dT(i_,j_) dT, (dT_offset + (i_) + (j_)*lddt) #define dC(i_,j_) dC, (dC_offset + (i_) + (j_)*lddc) #define dwork(i_) dwork, (dwork_offset + (i_)) static const Ty c_zero = magma_zero<Ty>(); static const Ty c_one = magma_one<Ty>(); static const Ty c_neg_one = magma_neg_one<Ty>(); static const clblasTranspose transType = magma_is_real<Ty>() ? clblasTrans : clblasConjTrans; /* Check input arguments */ magma_int_t info = 0; if (m < 0) { info = -5; } else if (n < 0) { info = -6; } else if (k < 0) { info = -7; } else if ( ((storev == MagmaColumnwise) && (side == MagmaLeft) && lddv < std::max(1,m)) || ((storev == MagmaColumnwise) && (side == MagmaRight) && lddv < std::max(1,n)) || ((storev == MagmaRowwise) && lddv < k) ) { info = -9; } else if (lddt < k) { info = -11; } else if (lddc < std::max(1,m)) { info = -13; } else if ( ((side == MagmaLeft) && ldwork < std::max(1,n)) || ((side == MagmaRight) && ldwork < std::max(1,m)) ) { info = -15; } if (info != 0) { //magma_xerbla( __func__, -(info) ); return info; } /* Function Body */ if (m <= 0 || n <= 0) { return info; } // opposite of trans clblasTranspose transt; clblasTranspose cltrans; if (trans == MagmaNoTrans) { transt = transType; cltrans = clblasNoTrans; } else { transt = clblasNoTrans; cltrans = transType; } // whether T is upper or lower triangular clblasUplo uplo; if (direct == MagmaForward) uplo = clblasUpper; else uplo = clblasLower; // whether V is stored transposed or not clblasTranspose notransV, transV; if (storev == MagmaColumnwise) { notransV = clblasNoTrans; transV = transType; } else { notransV = transType; transV = clblasNoTrans; } gemm_func<Ty> gpu_gemm; trmm_func<Ty> gpu_trmm; cl_event event = NULL; if ( side == MagmaLeft ) { // Form H C or H^H C // Comments assume H C. When forming H^H C, T gets transposed via transt. // W = C^H V gpu_gemm(clblasColumnMajor, transType, notransV, n, k, m, c_one, dC(0,0), lddc, dV(0,0), lddv, c_zero, dwork(0), ldwork, 1, &queue, 0, nullptr, &event); // W = W T^H = C^H V T^H gpu_trmm(clblasColumnMajor, clblasRight, uplo, transt, clblasNonUnit, n, k, c_one, dT(0,0) , lddt, dwork(0), ldwork, 1, &queue, 0, nullptr, &event); // C = C - V W^H = C - V T V^H C = (I - V T V^H) C = H C gpu_gemm(clblasColumnMajor, notransV, transType, m, n, k, c_neg_one, dV(0,0), lddv, dwork(0), ldwork, c_one, dC(0,0), lddc, 1, &queue, 0, nullptr, &event); } else { // Form C H or C H^H // Comments assume C H. When forming C H^H, T gets transposed via trans. // W = C V gpu_gemm(clblasColumnMajor, clblasNoTrans, notransV, m, k, n, c_one, dC(0,0), lddc, dV(0,0), lddv, c_zero, dwork(0), ldwork, 1, &queue, 0, nullptr, &event); // W = W T = C V T gpu_trmm(clblasColumnMajor, clblasRight, uplo, cltrans, clblasNonUnit, m, k, c_one, dT(0,0), lddt, dwork(0), ldwork, 1, &queue, 0, nullptr, &event); // C = C - W V^H = C - C V T V^H = C (I - V T V^H) = C H gpu_gemm(clblasColumnMajor, clblasNoTrans, transV, m, n, k, c_neg_one, dwork(0), ldwork, dV(0,0), lddv, c_one, dC(0,0), lddc, 1, &queue, 0, nullptr, &event); } return info; } /* magma_zlarfb */
static int benchmark_blas(const int N) { real *h_A; real *h_B; real *h_C; real *h_C_ref; real alpha = 1.0f; real beta = 0.0f; int n2 = N * N; int i; real error_norm; real ref_norm; real diff; /* Allocate host memory for the matrices */ h_A = (real *)malloc(n2 * sizeof(h_A[0])); h_B = (real *)malloc(n2 * sizeof(h_B[0])); h_C = (real *)malloc(n2 * sizeof(h_C[0])); h_C_ref = (real *)malloc(n2 * sizeof(h_C[0])); /* Fill the matrices with test data */ for (i = 0; i < n2; i++) { h_A[i] = rand() / (real)RAND_MAX; h_B[i] = rand() / (real)RAND_MAX; h_C[i] = rand() / (real)RAND_MAX; h_C_ref[i] = h_C[i]; } #ifdef VERIFY /* Performs operation using plain C code*/ clock_t c_start = clock(); simple_sgemm(N, alpha, h_A, h_B, beta, h_C_ref); clock_t c_end = clock(); #endif clock_t g_start = clock(); gpu_gemm(h_A, h_B, h_C, alpha, beta, N); clock_t g_end = clock(); double g_time = (double)(g_end - g_start) / CLOCKS_PER_SEC; std::cout << N << " " << g_time << " "<< 2.0 * pow(N, 3) / g_time / 1000 /1000 / 1000; #ifdef VERIFY std::cout<<" "<< 1000.0 * (c_end - c_start) / CLOCKS_PER_SEC << std::endl; #else std::cout << std::endl; #endif #ifdef VERIFY error_norm = 0; ref_norm = 0; for (i = 0; i < n2; ++i) { diff = h_C_ref[i] - h_C[i]; error_norm += diff * diff; ref_norm += h_C_ref[i] * h_C_ref[i]; } error_norm = (real)sqrt((double)error_norm); ref_norm = (real)sqrt((double)ref_norm); if (fabs(ref_norm) < 1e-7) { fprintf(stderr, "!!!! reference norm is 0\n"); return EXIT_FAILURE; } if (error_norm / ref_norm > 1e-6f) { printf("simpleCUBLAS test failed.\n"); exit(EXIT_FAILURE); } #endif /* Memory clean up */ free(h_A); free(h_B); free(h_C); free(h_C_ref); }