void blx_l3_cntl_create_if
     (
       opid_t   family,
       obj_t*   a,
       obj_t*   b,
       obj_t*   c,
       cntl_t*  cntl_orig,
       cntl_t** cntl_use
     )
{
	// This is part of a hack to support mixed domain in bli_gemm_front().
	// Sometimes we need to specify a non-standard schema for A and B, and
	// we decided to transmit them via the schema field in the obj_t's
	// rather than pass them in as function parameters. Once the values
	// have been read, we immediately reset them back to their expected
	// values for unpacked objects. Notice that we do this even if the
	// caller passed in a custom control tree; that's because we still need
	// to reset the pack schema of a and b, which were modified by the
	// operation's _front() function. However, in order for this to work,
	// the level-3 thread entry function (or omp parallel region) must
	// alias thread-local copies of objects a and b.
	pack_t schema_a = bli_obj_pack_schema( a );
	pack_t schema_b = bli_obj_pack_schema( b );

	bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
	bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );

	// If the control tree pointer is NULL, we construct a default
	// tree as a function of the operation family.
	if ( cntl_orig == NULL )
	{
		*cntl_use = blx_gemm_cntl_create( family, schema_a, schema_b );
	}
	else
	{
		// If the user provided a control tree, create a copy and use it
		// instead (so that threads can use its local tree as a place to
		// cache things like pack mem_t entries).
		*cntl_use = bli_cntl_copy( cntl_orig );

		// Recursively set the family fields of the newly copied control
		// tree nodes.
		bli_cntl_mark_family( family, *cntl_use );
	}
}
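
// Illustrative sketch (not part of BLIS): the comment above requires that the
// level-3 thread entry function alias thread-local copies of a and b before
// calling blx_l3_cntl_create_if(), since the function reads and then resets
// their pack schema fields. The entry point name below is hypothetical; only
// bli_obj_alias_to() and blx_l3_cntl_create_if() refer to symbols that appear
// in this source. Wrapped in #if 0 so it is never compiled.
#if 0
void blx_gemm_thread_entry_sketch
     (
       opid_t  family,
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntl_t* cntl_orig
     )
{
	obj_t   a_local, b_local;
	cntl_t* cntl_use;

	// Make shallow, thread-local copies of a and b so that the schema
	// reads/resets performed by blx_l3_cntl_create_if() are private to
	// this thread.
	bli_obj_alias_to( a, &a_local );
	bli_obj_alias_to( b, &b_local );

	blx_l3_cntl_create_if( family, &a_local, &b_local, c,
	                       cntl_orig, &cntl_use );

	// ... invoke the operation's back-end with cntl_use, then free the
	// (possibly copied) control tree ...
}
#endif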
void bli_syr2_front
     (
       obj_t*  alpha,
       obj_t*  x,
       obj_t*  y,
       obj_t*  c,
       cntx_t* cntx
     )
{
	her2_t* her2_cntl;
	num_t   dt_targ_x;
	num_t   dt_targ_y;
	//num_t   dt_targ_c;
	bool_t  x_has_unit_inc;
	bool_t  y_has_unit_inc;
	bool_t  c_has_unit_inc;
	obj_t   alpha_local;
	num_t   dt_alpha;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_syr2_check( alpha, x, y, c );

	// Query the target datatypes of each object.
	dt_targ_x = bli_obj_target_dt( x );
	dt_targ_y = bli_obj_target_dt( y );
	//dt_targ_c = bli_obj_target_dt( c );

	// Determine whether each operand has unit stride.
	x_has_unit_inc = ( bli_obj_vector_inc( x ) == 1 );
	y_has_unit_inc = ( bli_obj_vector_inc( y ) == 1 );
	c_has_unit_inc = ( bli_obj_is_row_stored( c ) ||
	                   bli_obj_is_col_stored( c ) );

	// Create an object to hold a copy-cast of alpha. Notice that we use
	// the type union of the datatypes of x and y.
	dt_alpha = bli_dt_union( dt_targ_x, dt_targ_y );
	bli_obj_scalar_init_detached_copy_of( dt_alpha,
	                                      BLIS_NO_CONJUGATE,
	                                      alpha,
	                                      &alpha_local );

	// If all operands have unit stride, we choose a control tree for calling
	// the unblocked implementation directly without any blocking.
	if ( x_has_unit_inc &&
	     y_has_unit_inc &&
	     c_has_unit_inc )
	{
		// We use two control trees to handle the four cases corresponding to
		// combinations of upper/lower triangular storage and row/column-storage.
		// The row-stored lower triangular and column-stored upper triangular
		// trees are identical. Same for the remaining two trees.
		if ( bli_obj_is_lower( c ) )
		{
			if ( bli_obj_is_row_stored( c ) ) her2_cntl = her2_cntl_bs_ke_lrow_ucol;
			else                              her2_cntl = her2_cntl_bs_ke_lcol_urow;
		}
		else // if ( bli_obj_is_upper( c ) )
		{
			if ( bli_obj_is_row_stored( c ) ) her2_cntl = her2_cntl_bs_ke_lcol_urow;
			else                              her2_cntl = her2_cntl_bs_ke_lrow_ucol;
		}
	}
	else
	{
		// Mark objects with unit stride as already being packed. This prevents
		// unnecessary packing from happening within the blocked algorithm.
		if ( x_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, x );
		if ( y_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, y );
		if ( c_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, c );

		// Here, we make a similar choice as above, except that (1) we look
		// at storage tilt, and (2) we choose a tree that performs blocking.
		if ( bli_obj_is_lower( c ) )
		{
			if ( bli_obj_is_row_stored( c ) ) her2_cntl = her2_cntl_ge_lrow_ucol;
			else                              her2_cntl = her2_cntl_ge_lcol_urow;
		}
		else // if ( bli_obj_is_upper( c ) )
		{
			if ( bli_obj_is_row_stored( c ) ) her2_cntl = her2_cntl_ge_lcol_urow;
			else                              her2_cntl = her2_cntl_ge_lrow_ucol;
		}
	}

	// Invoke the internal back-end with the copy-cast scalar and the
	// chosen control tree. Set conjh to BLIS_NO_CONJUGATE to invoke the
	// symmetric (and not Hermitian) algorithms.
	bli_her2_int( BLIS_NO_CONJUGATE,
	              &alpha_local,
	              &alpha_local,
	              x,
	              y,
	              c,
	              cntx,
	              her2_cntl );
}
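
// Illustrative usage sketch (not part of BLIS): how an application might reach
// the front-end above through the object API. Wrapped in #if 0 so it is never
// compiled; exact object-API setter signatures vary across BLIS versions.
#if 0
#include "blis.h"

int main( void )
{
	obj_t alpha, x, y, c;
	dim_t m = 5;

	bli_init();

	bli_obj_create( BLIS_DOUBLE, m, 1, 0, 0, &x );
	bli_obj_create( BLIS_DOUBLE, m, 1, 0, 0, &y );
	bli_obj_create( BLIS_DOUBLE, m, m, 0, 0, &c );
	bli_obj_create_1x1( BLIS_DOUBLE, &alpha );

	// Mark c as symmetric with only its lower triangle stored.
	bli_obj_set_struc( BLIS_SYMMETRIC, &c );
	bli_obj_set_uplo( BLIS_LOWER, &c );

	bli_randv( &x );
	bli_randv( &y );
	bli_randm( &c );
	bli_setsc( 1.5, 0.0, &alpha );

	// c := c + alpha * ( x * y^T + y * x^T ), updating only the stored
	// (lower) triangle of c.
	bli_syr2( &alpha, &x, &y, &c );

	bli_obj_free( &x ); bli_obj_free( &y );
	bli_obj_free( &c ); bli_obj_free( &alpha );

	bli_finalize();

	return 0;
}
#endif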
void bli_trmv( obj_t* alpha,
               obj_t* a,
               obj_t* x )
{
	trmv_t* trmv_cntl;
	num_t   dt_targ_a;
	num_t   dt_targ_x;
	bool_t  a_is_contig;
	bool_t  x_is_contig;
	obj_t   alpha_local;
	num_t   dt_alpha;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_trmv_check( alpha, a, x );

	// Query the target datatypes of each object.
	dt_targ_a = bli_obj_target_datatype( *a );
	dt_targ_x = bli_obj_target_datatype( *x );

	// Determine whether each operand is stored contiguously.
	a_is_contig = ( bli_obj_is_row_stored( *a ) ||
	                bli_obj_is_col_stored( *a ) );
	x_is_contig = ( bli_obj_vector_inc( *x ) == 1 );

	// Create an object to hold a copy-cast of alpha. Notice that we use
	// the type union of the target datatypes of a and x to prevent any
	// unnecessary loss of information during the computation.
	dt_alpha = bli_datatype_union( dt_targ_a, dt_targ_x );
	bli_obj_init_scalar_copy_of( dt_alpha,
	                             BLIS_NO_CONJUGATE,
	                             alpha,
	                             &alpha_local );

	// If all operands are contiguous, we choose a control tree for calling
	// the unblocked implementation directly without any blocking.
	if ( a_is_contig && x_is_contig )
	{
		// We use two control trees to handle the four cases corresponding to
		// combinations of transposition and row/column-storage.
		// The row-stored without transpose and column-stored with transpose
		// trees are identical. Same for the remaining two trees.
		if ( bli_obj_has_notrans( *a ) )
		{
			if ( bli_obj_is_row_stored( *a ) ) trmv_cntl = trmv_cntl_bs_ke_nrow_tcol;
			else                               trmv_cntl = trmv_cntl_bs_ke_ncol_trow;
		}
		else // if ( bli_obj_has_trans( *a ) )
		{
			if ( bli_obj_is_row_stored( *a ) ) trmv_cntl = trmv_cntl_bs_ke_ncol_trow;
			else                               trmv_cntl = trmv_cntl_bs_ke_nrow_tcol;
		}
	}
	else
	{
		// Mark objects with unit stride as already being packed. This prevents
		// unnecessary packing from happening within the blocked algorithm.
		if ( a_is_contig ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, *a );
		if ( x_is_contig ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, *x );

		// Here, we make a similar choice as above, except that (1) we look
		// at storage tilt, and (2) we choose a tree that performs blocking.
		if ( bli_obj_has_notrans( *a ) )
		{
			if ( bli_obj_is_row_tilted( *a ) ) trmv_cntl = trmv_cntl_ge_nrow_tcol;
			else                               trmv_cntl = trmv_cntl_ge_ncol_trow;
		}
		else // if ( bli_obj_has_trans( *a ) )
		{
			if ( bli_obj_is_row_tilted( *a ) ) trmv_cntl = trmv_cntl_ge_ncol_trow;
			else                               trmv_cntl = trmv_cntl_ge_nrow_tcol;
		}
	}

	// Invoke the internal back-end with the copy-cast of alpha and the
	// chosen control tree.
	bli_trmv_int( &alpha_local,
	              a,
	              x,
	              trmv_cntl );
}
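
// Illustrative usage sketch (not part of BLIS): a caller-side view of the
// object API entry point above, written in the same by-value setter style
// used by this version of the code. Wrapped in #if 0 so it is never compiled;
// setter signatures differ in later BLIS versions.
#if 0
#include "blis.h"

int main( void )
{
	obj_t alpha, a, x;
	dim_t m = 4;

	bli_init();

	bli_obj_create( BLIS_DOUBLE, m, m, 0, 0, &a );
	bli_obj_create( BLIS_DOUBLE, m, 1, 0, 0, &x );
	bli_obj_create( BLIS_DOUBLE, 1, 1, 0, 0, &alpha );

	// Mark a as lower triangular with a non-unit diagonal.
	bli_obj_set_struc( BLIS_TRIANGULAR, a );
	bli_obj_set_uplo( BLIS_LOWER, a );
	bli_obj_set_diag( BLIS_NONUNIT_DIAG, a );

	bli_randm( &a );
	bli_randv( &x );
	bli_setsc( 2.0, 0.0, &alpha );

	// x := alpha * a * x, where only the lower triangle of a is read.
	bli_trmv( &alpha, &a, &x );

	bli_obj_free( &a );
	bli_obj_free( &x );
	bli_obj_free( &alpha );

	bli_finalize();

	return 0;
}
#endif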
void bli_hemv( obj_t* alpha,
               obj_t* a,
               obj_t* x,
               obj_t* beta,
               obj_t* y )
{
	hemv_t* hemv_cntl;
	num_t   dt_targ_a;
	num_t   dt_targ_x;
	num_t   dt_targ_y;
	bool_t  a_has_unit_inc;
	bool_t  x_has_unit_inc;
	bool_t  y_has_unit_inc;
	obj_t   alpha_local;
	obj_t   beta_local;
	num_t   dt_alpha;
	num_t   dt_beta;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_hemv_check( alpha, a, x, beta, y );

	// Query the target datatypes of each object.
	dt_targ_a = bli_obj_target_datatype( *a );
	dt_targ_x = bli_obj_target_datatype( *x );
	dt_targ_y = bli_obj_target_datatype( *y );

	// Determine whether each operand has unit stride.
	a_has_unit_inc = ( bli_obj_is_row_stored( *a ) ||
	                   bli_obj_is_col_stored( *a ) );
	x_has_unit_inc = ( bli_obj_vector_inc( *x ) == 1 );
	y_has_unit_inc = ( bli_obj_vector_inc( *y ) == 1 );

	// Create an object to hold a copy-cast of alpha. Notice that we use
	// the type union of the target datatypes of a and x to prevent any
	// unnecessary loss of information during the computation.
	dt_alpha = bli_datatype_union( dt_targ_a, dt_targ_x );
	bli_obj_scalar_init_detached_copy_of( dt_alpha,
	                                      BLIS_NO_CONJUGATE,
	                                      alpha,
	                                      &alpha_local );

	// Create an object to hold a copy-cast of beta. Notice that we use
	// the datatype of y. Here's why: If y is real and beta is complex,
	// there is no reason to keep beta_local in the complex domain since
	// the complex part of beta*y will not be stored. If y is complex and
	// beta is real then beta is harmlessly promoted to complex.
	dt_beta = dt_targ_y;
	bli_obj_scalar_init_detached_copy_of( dt_beta,
	                                      BLIS_NO_CONJUGATE,
	                                      beta,
	                                      &beta_local );

	// If all operands have unit stride, we choose a control tree for calling
	// the unblocked implementation directly without any blocking.
	if ( a_has_unit_inc &&
	     x_has_unit_inc &&
	     y_has_unit_inc )
	{
		// We use two control trees to handle the four cases corresponding to
		// combinations of upper/lower triangular storage and row/column-storage.
		// The row-stored lower triangular and column-stored upper triangular
		// trees are identical. Same for the remaining two trees.
		if ( bli_obj_is_lower( *a ) )
		{
			if ( bli_obj_is_row_stored( *a ) ) hemv_cntl = hemv_cntl_bs_ke_lrow_ucol;
			else                               hemv_cntl = hemv_cntl_bs_ke_lcol_urow;
		}
		else // if ( bli_obj_is_upper( *a ) )
		{
			if ( bli_obj_is_row_stored( *a ) ) hemv_cntl = hemv_cntl_bs_ke_lcol_urow;
			else                               hemv_cntl = hemv_cntl_bs_ke_lrow_ucol;
		}
	}
	else
	{
		// Mark objects with unit stride as already being packed. This prevents
		// unnecessary packing from happening within the blocked algorithm.
		if ( a_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, *a );
		if ( x_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, *x );
		if ( y_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, *y );

		// Here, we make a similar choice as above, except that (1) we look
		// at storage tilt, and (2) we choose a tree that performs blocking.
		if ( bli_obj_is_lower( *a ) )
		{
			if ( bli_obj_is_row_tilted( *a ) ) hemv_cntl = hemv_cntl_ge_lrow_ucol;
			else                               hemv_cntl = hemv_cntl_ge_lcol_urow;
		}
		else // if ( bli_obj_is_upper( *a ) )
		{
			if ( bli_obj_is_row_tilted( *a ) ) hemv_cntl = hemv_cntl_ge_lcol_urow;
			else                               hemv_cntl = hemv_cntl_ge_lrow_ucol;
		}
	}

	// Invoke the internal back-end with the copy-casts of the scalars and
	// the chosen control tree. Set conjh to BLIS_CONJUGATE to invoke the
	// Hermitian (and not symmetric) algorithms.
	bli_hemv_int( BLIS_CONJUGATE,
	              &alpha_local,
	              a,
	              x,
	              &beta_local,
	              y,
	              hemv_cntl );
}
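
// Illustrative usage sketch (not part of BLIS): a complex-domain call into the
// entry point above. Note that beta is created as a real (double) scalar; per
// the comment in bli_hemv(), it is copy-cast to the datatype of y and thus
// harmlessly promoted to double complex here. Wrapped in #if 0 so it is never
// compiled; setter signatures differ in later BLIS versions.
#if 0
#include "blis.h"

int main( void )
{
	obj_t alpha, beta, a, x, y;
	dim_t m = 4;

	bli_init();

	bli_obj_create( BLIS_DCOMPLEX, m, m, 0, 0, &a );
	bli_obj_create( BLIS_DCOMPLEX, m, 1, 0, 0, &x );
	bli_obj_create( BLIS_DCOMPLEX, m, 1, 0, 0, &y );
	bli_obj_create( BLIS_DCOMPLEX, 1, 1, 0, 0, &alpha );
	bli_obj_create( BLIS_DOUBLE,   1, 1, 0, 0, &beta );

	// Mark a as Hermitian with only its upper triangle stored.
	bli_obj_set_struc( BLIS_HERMITIAN, a );
	bli_obj_set_uplo( BLIS_UPPER, a );

	bli_randm( &a );
	bli_randv( &x );
	bli_randv( &y );
	bli_setsc( 1.0, -0.5, &alpha );
	bli_setsc( 2.0,  0.0, &beta );

	// y := beta * y + alpha * a * x, where a is Hermitian and only its
	// upper triangle is read.
	bli_hemv( &alpha, &a, &x, &beta, &y );

	bli_obj_free( &a );     bli_obj_free( &x );    bli_obj_free( &y );
	bli_obj_free( &alpha ); bli_obj_free( &beta );

	bli_finalize();

	return 0;
}
#endif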
siz_t bli_packv_init_pack
      (
        pack_t  schema,
        bszid_t bmult_id,
        obj_t*  a,
        obj_t*  p,
        cntx_t* cntx
      )
{
	num_t     dt     = bli_obj_dt( a );
	dim_t     dim_a  = bli_obj_vector_dim( a );
	dim_t     bmult  = bli_cntx_get_blksz_def_dt( dt, bmult_id, cntx );
	membrk_t* membrk = bli_cntx_membrk( cntx );

#if 0
	mem_t*    mem_p;
#endif

	dim_t     m_p_pad;
	siz_t     size_p;
	inc_t     rs_p, cs_p;
	void*     buf;

	// We begin by copying the basic fields of a.
	bli_obj_alias_to( a, p );

	// Update the dimensions.
	bli_obj_set_dims( dim_a, 1, p );

	// Reset the view offsets to (0,0).
	bli_obj_set_offs( 0, 0, p );

	// Set the pack schema in the p object to the value in the control tree
	// node.
	bli_obj_set_pack_schema( schema, p );

	// Compute the dimensions padded by the dimension multiples.
	m_p_pad = bli_align_dim_to_mult( bli_obj_vector_dim( p ), bmult );

	// Compute the size of the packed buffer.
	size_p = m_p_pad * 1 * bli_obj_elem_size( p );

#if 0
	// Extract the address of the mem_t object within p that will track
	// properties of the packed buffer.
	mem_p = bli_obj_pack_mem( *p );

	if ( bli_mem_is_unalloc( mem_p ) )
	{
		// If the mem_t object of p has not yet been allocated, then acquire
		// a memory block suitable for a vector.
		bli_membrk_acquire_v( membrk,
		                      size_p,
		                      mem_p );
	}
	else
	{
		// If the mem_t object has already been allocated, then release and
		// re-acquire the memory so there is sufficient space.
		if ( bli_mem_size( mem_p ) < size_p )
		{
			bli_membrk_release( mem_p );
			bli_membrk_acquire_v( membrk,
			                      size_p,
			                      mem_p );
		}
	}

	// Grab the buffer address from the mem_t object and copy it to the
	// main object buffer field. (Sometimes this buffer address will be
	// copied when the value is already up-to-date, because it persists
	// in the main object buffer field across loop iterations.)
	buf = bli_mem_buffer( mem_p );
	bli_obj_set_buffer( buf, p );
#endif

	// Save the padded (packed) dimensions into the packed object.
	bli_obj_set_padded_dims( m_p_pad, 1, p );

	// Set the row and column strides of p based on the pack schema.
	if ( schema == BLIS_PACKED_VECTOR )
	{
		// Set the strides to reflect a column-stored vector. Note that the
		// column stride may never be used, and is only useful to determine
		// how much space beyond the vector would need to be zero-padded, if
		// zero-padding were needed.
		rs_p = 1;
		cs_p = bli_obj_padded_length( p );

		bli_obj_set_strides( rs_p, cs_p, p );
	}

	return size_p;
}
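
// Illustrative arithmetic sketch (standalone, not BLIS code): the padding and
// sizing logic used above, i.e. rounding the vector length up to a multiple of
// the blocking multiple and sizing the packed buffer in bytes. Wrapped in
// #if 0 so it is never compiled.
#if 0
#include <stdio.h>

static size_t align_dim_to_mult( size_t dim, size_t mult )
{
	// Round dim up to the nearest multiple of mult.
	return ( ( dim + mult - 1 ) / mult ) * mult;
}

int main( void )
{
	size_t dim_a     = 1003;             // vector length of a
	size_t bmult     = 8;                // dimension multiple from the context
	size_t elem_size = sizeof( double ); // element size for BLIS_DOUBLE

	size_t m_p_pad = align_dim_to_mult( dim_a, bmult ); // 1008
	size_t size_p  = m_p_pad * 1 * elem_size;           // 8064 bytes

	// With BLIS_PACKED_VECTOR, rs_p = 1 and cs_p = m_p_pad, so elements
	// 1003..1007 of the packed buffer are where any zero-padding would go.
	printf( "padded length = %zu, buffer size = %zu bytes\n", m_p_pad, size_p );

	return 0;
}
#endif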
void bli_l3_thread_decorator
     (
       l3int_t func,
       opid_t  family,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     )
{
	// This is part of a hack to support mixed domain in bli_gemm_front().
	// Sometimes we need to specify a non-standard schema for A and B, and
	// we decided to transmit them via the schema field in the obj_t's
	// rather than pass them in as function parameters. Once the values
	// have been read, we immediately reset them back to their expected
	// values for unpacked objects.
	pack_t schema_a = bli_obj_pack_schema( a );
	pack_t schema_b = bli_obj_pack_schema( b );

	bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
	bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );

	// For sequential execution, we use only one thread.
	const dim_t n_threads = 1;

	// NOTE: The sba was initialized in bli_init().

	// Check out an array_t from the small block allocator. This is done
	// with an internal lock to ensure only one application thread accesses
	// the sba at a time. bli_sba_checkout_array() will also automatically
	// resize the array_t, if necessary.
	array_t* restrict array = bli_sba_checkout_array( n_threads );

	// Access the pool_t* for thread 0 and embed it into the rntm. We do
	// this up-front only so that we can create the global comm below.
	bli_sba_rntm_set_pool( 0, array, rntm );

	// Set the packing block allocator field of the rntm.
	bli_membrk_rntm_set_membrk( rntm );

	// Allocate a global communicator for the root thrinfo_t structures.
	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );

	{
		// NOTE: We don't need to create another copy of the rntm_t since
		// it was already copied in one of the high-level oapi functions.
		rntm_t* restrict rntm_p = rntm;

		cntl_t*    cntl_use;
		thrinfo_t* thread;

		const dim_t tid = 0;

		// Use the thread id to access the appropriate pool_t* within the
		// array_t, and use it to set the sba_pool field within the rntm_t.
		// If the pool_t* element within the array_t is NULL, it will first
		// be allocated/initialized.
		// NOTE: This is commented out because, in the single-threaded case,
		// it is redundant since it's already been done above.
		//bli_sba_rntm_set_pool( tid, array, rntm_p );

		// NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't
		// need to alias objects for A, B, and C since they were already
		// aliased in bli_*_front(). However, we may add aliasing here in the
		// future so that, with all three (_single.c, _openmp.c, _pthreads.c)
		// implementations consistently providing local aliases, we can then
		// eliminate aliasing elsewhere.

		// Create a default control tree for the operation, if needed.
		bli_l3_cntl_create_if( family, schema_a, schema_b,
		                       a, b, c, rntm_p, cntl, &cntl_use );

		// Create the root node of the thread's thrinfo_t structure.
		bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );

		func
		(
		  alpha,
		  a,
		  b,
		  beta,
		  c,
		  cntx,
		  rntm_p,
		  cntl_use,
		  thread
		);

		// Free the thread's local control tree.
		bli_l3_cntl_free( rntm_p, cntl_use, thread );

		// Free the current thread's thrinfo_t structure.
		bli_l3_thrinfo_free( rntm_p, thread );
	}

	// We shouldn't free the global communicator since it was already freed
	// by the global communicator's chief thread in bli_l3_thrinfo_free()
	// (called above).

	// Check the array_t back into the small block allocator. Similar to the
	// check-out, this is done using a lock embedded within the sba to ensure
	// mutual exclusion.
	bli_sba_checkin_array( array );
}
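
// Illustrative sketch (not verbatim BLIS code): the kind of call a level-3
// _front() function makes to hand its internal back-end to the decorator
// above. The a_local/b_local/c_local objects stand for the front-end's local
// aliases; any non-standard pack schemas for A and B would already have been
// planted in those aliases' schema fields, per the hack described at the top
// of the decorator. Wrapped in #if 0 so it is never compiled.
#if 0
	// ... inside a front-end such as bli_gemm_front(), after aliasing and
	// any schema adjustments to a_local and b_local ...
	bli_l3_thread_decorator
	(
	  bli_gemm_int,
	  BLIS_GEMM,
	  alpha,
	  &a_local,
	  &b_local,
	  beta,
	  &c_local,
	  cntx,
	  rntm,
	  cntl
	);
#endif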