//
// bli_her: object-based front-end for the Hermitian rank-1 update of the
// matrix c using the scalar alpha and the vector x.
//
// alpha is copy-cast into a local scalar object whose datatype is the target
// datatype of x. A control tree is then chosen from the storage properties
// of x and c, and the operation is handed to bli_her_int() with conjh set to
// BLIS_CONJUGATE.
//
void bli_her( obj_t* alpha, obj_t* x, obj_t* c )
{
    her_t*  her_cntl;
    obj_t   alpha_local;
    num_t   dt_x;
    bool_t  x_is_contig;
    bool_t  c_is_contig;
    bool_t  all_contig;
    bool_t  use_lrow_ucol;

    // Validate the operands, but only when error checking is switched on.
    if ( bli_error_checking_is_enabled() )
    {
        bli_her_check( BLIS_CONJUGATE, alpha, x, c );
    }

    // The local copy of alpha takes on the target datatype of x.
    dt_x = bli_obj_target_datatype( *x );

    // x counts as contiguous when its vector increment is one; c counts as
    // contiguous when it is either row-stored or column-stored.
    x_is_contig = ( bli_obj_vector_inc( *x ) == 1 );
    c_is_contig = ( bli_obj_is_row_stored( *c ) ||
                    bli_obj_is_col_stored( *c ) );
    all_contig  = ( x_is_contig && c_is_contig );

    // Make the copy-cast of alpha (no conjugation applied).
    bli_obj_init_scalar_copy_of( dt_x,
                                 BLIS_NO_CONJUGATE,
                                 alpha,
                                 &alpha_local );

    // When the blocked path will be taken, flag whichever operand is already
    // contiguous as packed so the blocked algorithm does not pack it again.
    // (Both flags cannot be set here, since that would imply all_contig.)
    if ( !all_contig )
    {
        if ( x_is_contig )
        {
            bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, *x );
        }
        if ( c_is_contig )
        {
            bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, *c );
        }
    }

    // Only two trees are needed per path to cover the four combinations of
    // lower/upper storage and row/column storage: "lower + row-stored" shares
    // a tree with "upper + not row-stored", and the remaining two
    // combinations share the other tree.
    // NOTE(review): the original comment for the blocked path says it looks
    // at "storage tilt", yet the code queries bli_obj_is_row_stored() on both
    // paths. Behavior is kept as-is; confirm whether a tilt query was meant.
    use_lrow_ucol = ( !( bli_obj_is_lower( *c ) ) ==
                      !( bli_obj_is_row_stored( *c ) ) );

    if ( all_contig )
    {
        // Everything is contiguous: use a tree that calls the unblocked
        // implementation directly, without any blocking.
        her_cntl = ( use_lrow_ucol ? her_cntl_bs_ke_lrow_ucol
                                   : her_cntl_bs_ke_lcol_urow );
    }
    else
    {
        // Otherwise use a tree that performs blocking.
        her_cntl = ( use_lrow_ucol ? her_cntl_ge_lrow_ucol
                                   : her_cntl_ge_lcol_urow );
    }

    // Call the internal back-end with the local scalar and the selected
    // tree. Passing BLIS_CONJUGATE as conjh requests the Hermitian (rather
    // than symmetric) algorithms.
    bli_her_int( BLIS_CONJUGATE,
                 &alpha_local,
                 x,
                 c,
                 her_cntl );
}
void bli_her_blk_var1( conj_t conjh, obj_t* alpha, obj_t* x, obj_t* c, her_t* cntl ) { obj_t c11, c11_pack; obj_t c10; obj_t x1, x1_pack; obj_t x0; dim_t mn; dim_t ij; dim_t b_alg; // Even though this blocked algorithm is expressed only in terms of the // lower triangular case, the upper triangular case is still supported: // when bli_acquire_mpart_tl2br() is passed a matrix that is stored in // in the upper triangle, and the requested subpartition resides in the // lower triangle (as is the case for this algorithm), the routine fills // the request as if the caller had actually requested the corresponding // "mirror" subpartition in the upper triangle, except that it marks the // subpartition for transposition (and conjugation). // Initialize objects for packing. bli_obj_init_pack( &c11_pack ); bli_obj_init_pack( &x1_pack ); // Query dimension. mn = bli_obj_length( *c ); // Partition diagonally. for ( ij = 0; ij < mn; ij += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( ij, mn, c, cntl_blocksize( cntl ) ); // Acquire partitions for C11, C10, x1, and x0. bli_acquire_mpart_tl2br( BLIS_SUBPART11, ij, b_alg, c, &c11 ); bli_acquire_mpart_tl2br( BLIS_SUBPART10, ij, b_alg, c, &c10 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, ij, b_alg, x, &x1 ); bli_acquire_vpart_f2b( BLIS_SUBPART0, ij, b_alg, x, &x0 ); // Initialize objects for packing C11 and x1 (if needed). bli_packm_init( &c11, &c11_pack, cntl_sub_packm_c11( cntl ) ); bli_packv_init( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); // Copy/pack C11, x1 (if needed). bli_packm_int( &c11, &c11_pack, cntl_sub_packm_c11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); // C10 = C10 + alpha * x1 * x0'; bli_ger_int( BLIS_NO_CONJUGATE, conjh, alpha, &x1_pack, &x0, &c10, cntl_sub_ger( cntl ) ); // C11 = C11 + alpha * x1 * x1'; bli_her_int( conjh, alpha, &x1_pack, &c11_pack, cntl_sub_her( cntl ) ); // Copy/unpack C11 (if C11 was packed). 
bli_unpackm_int( &c11_pack, &c11, cntl_sub_unpackm_c11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. bli_packm_release( &c11_pack, cntl_sub_packm_c11( cntl ) ); bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) ); }