void bli_hemv_cntl_init() { // Create blocksize objects. hemv_mc = bli_blksz_obj_create( BLIS_HEMV_MC_S, 0, BLIS_HEMV_MC_D, 0, BLIS_HEMV_MC_C, 0, BLIS_HEMV_MC_Z, 0 ); // Create control trees for the lowest-level kernels. These trees induce // operations on (presumably) relatively small block-subvector problems. hemv_cntl_bs_ke_lrow_ucol = bli_hemv_cntl_obj_create( BLIS_UNB_FUSED, BLIS_VARIANT1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); hemv_cntl_bs_ke_lcol_urow = bli_hemv_cntl_obj_create( BLIS_UNB_FUSED, BLIS_VARIANT3, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control trees for generally large problems. Here, we choose a // variant that prioritizes keeping a subvector of y in cache. hemv_cntl_ge_lrow_ucol = bli_hemv_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, hemv_mc, scalv_cntl, // scale y up-front packm_cntl_noscale, // pack A11 (if needed) packv_cntl, // pack x1 (if needed) packv_cntl, // pack y1 (if needed) gemv_cntl_rp_bs_dot, // gemv_n_rp needed by var2 NULL, // gemv_n_cp not used by var2 NULL, // gemv_t_rp not used by var2 gemv_cntl_rp_bs_axpy, // gemv_t_cp needed by var2 hemv_cntl_bs_ke_lrow_ucol, unpackv_cntl ); // unpack y1 (if packed) hemv_cntl_ge_lcol_urow = bli_hemv_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, hemv_mc, scalv_cntl, // scale y up-front packm_cntl_noscale, // pack A11 (if needed) packv_cntl, // pack x1 (if needed) packv_cntl, // pack y1 (if needed) gemv_cntl_rp_bs_axpy, // gemv_n_rp needed by var2 NULL, // gemv_n_cp not used by var2 NULL, // gemv_t_rp not used by var2 gemv_cntl_rp_bs_dot, // gemv_t_cp needed by var2 hemv_cntl_bs_ke_lcol_urow, unpackv_cntl ); // unpack y1 (if packed) }
// Initialize the blocksize object and the control tree node used when
// packing vectors.
void bli_packv_cntl_init()
{
    // Dimension multiple for packed vectors, taken from the per-datatype
    // BLIS_DEFAULT_VR_* macros; the second value of every pair is 0.
    packv_mult_dim = bli_blksz_obj_create( BLIS_DEFAULT_VR_S, 0,
                                           BLIS_DEFAULT_VR_D, 0,
                                           BLIS_DEFAULT_VR_C, 0,
                                           BLIS_DEFAULT_VR_Z, 0 );

    // Single unblocked (variant 1) packv node that uses the multiple above
    // and the BLIS_PACKED_VECTOR schema.
    packv_cntl = bli_packv_cntl_obj_create( BLIS_UNBLOCKED,
                                            BLIS_VARIANT1,
                                            packv_mult_dim,
                                            BLIS_PACKED_VECTOR );
}
void bli_trsv_cntl_init() { // Create blocksize objects. trsv_mc = bli_blksz_obj_create( BLIS_TRSV_MC_S, 0, BLIS_TRSV_MC_D, 0, BLIS_TRSV_MC_C, 0, BLIS_TRSV_MC_Z, 0 ); // Create control trees for the lowest-level kernels. These trees induce // operations on (presumably) relatively small block-subvector problems. trsv_cntl_bs_ke_nrow_tcol = bli_trsv_cntl_obj_create( BLIS_UNB_FUSED, BLIS_VARIANT1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); trsv_cntl_bs_ke_ncol_trow = bli_trsv_cntl_obj_create( BLIS_UNB_FUSED, BLIS_VARIANT2, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control trees for generally large problems. Here we choose a // variant that prioritizes keeping a subvector of x in cache. trsv_cntl_ge_nrow_tcol = bli_trsv_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, // use var1 to maximize x1 usage trsv_mc, scalv_cntl, // scale x up-front packm_cntl_noscale, // pack A11 (if needed) packv_cntl, // pack x1 (if needed) gemv_cntl_rp_bs_dot, // gemv_rp needed by var1 NULL, // gemv_cp not needed by var1 trsv_cntl_bs_ke_nrow_tcol, unpackv_cntl ); // unpack x1 (if needed) trsv_cntl_ge_ncol_trow = bli_trsv_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, // use var1 to maximize x1 usage trsv_mc, scalv_cntl, // scale x up-front packm_cntl_noscale, // pack A11 (if needed) packv_cntl, // pack x1 (if needed) gemv_cntl_rp_bs_axpy, // gemv_rp needed by var1 NULL, // gemv_cp not needed by var1 trsv_cntl_bs_ke_ncol_trow, unpackv_cntl ); // unpack x1 (if needed) }
void bli_her_cntl_init() { // Create blocksize objects. her_mc = bli_blksz_obj_create( BLIS_HER_MC_S, 0, BLIS_HER_MC_D, 0, BLIS_HER_MC_C, 0, BLIS_HER_MC_Z, 0 ); // Create control trees for the lowest-level kernels. These trees induce // operations on (persumably) relatively small block-subvector problems. her_cntl_bs_ke_lrow_ucol = bli_her_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1, NULL, NULL, NULL, NULL, NULL, NULL ); her_cntl_bs_ke_lcol_urow = bli_her_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT2, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control trees for generally large problems. Here, we choose // variants that partition for ger subproblems in the same direction // as the assumed storage. her_cntl_ge_lrow_ucol = bli_her_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, her_mc, packv_cntl, // pack x1 (if needed) NULL, // do NOT pack C11 ger_cntl_rp_bs_row, her_cntl_bs_ke_lrow_ucol, NULL ); // no unpacking needed her_cntl_ge_lcol_urow = bli_her_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, her_mc, packv_cntl, // pack x1 (if needed) NULL, // do NOT pack C11 ger_cntl_cp_bs_col, her_cntl_bs_ke_lcol_urow, NULL ); // no unpacking needed }
void bli_gemm4mb_cntl_init() { // Create blocksize objects for each dimension. gemm4mb_mc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MC_S/2, BLIS_MAXIMUM_MC_S/2, BLIS_DEFAULT_MC_D/2, BLIS_MAXIMUM_MC_D/2 ); gemm4mb_nc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NC_S/2, BLIS_MAXIMUM_NC_S/2, BLIS_DEFAULT_NC_D/2, BLIS_MAXIMUM_NC_D/2 ); gemm4mb_kc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KC_S, BLIS_MAXIMUM_KC_S, BLIS_DEFAULT_KC_D, BLIS_MAXIMUM_KC_D ); gemm4mb_mr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D ); gemm4mb_nr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D ); gemm4mb_kr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D ); // Attach the register blksz_t objects as blocksize multiples to the cache // blksz_t objects. bli_blksz_obj_attach_mult_to( gemm4mb_mr, gemm4mb_mc ); bli_blksz_obj_attach_mult_to( gemm4mb_nr, gemm4mb_nc ); bli_blksz_obj_attach_mult_to( gemm4mb_kr, gemm4mb_kc ); // The cache blocksizes that were scaled above need to be rounded down // to their respective nearest register blocksize multiples. Note that // this can only happen after the appropriate register blocksize is // actually attached as a multiple. bli_blksz_reduce_to_mult( gemm4mb_mc ); bli_blksz_reduce_to_mult( gemm4mb_nc ); // Attach the mr and nr blksz_t objects to each cache blksz_t object. // The primary example of why this is needed relates to nudging kc. // In hemm, symm, trmm, or trmm3, we need to know both mr and nr, // since the multiple we target in nudging depends on whether the // structured matrix is on the left or the right. 
bli_blksz_obj_attach_mr_nr_to( gemm4mb_mr, gemm4mb_nr, gemm4mb_mc ); bli_blksz_obj_attach_mr_nr_to( gemm4mb_mr, gemm4mb_nr, gemm4mb_nc ); bli_blksz_obj_attach_mr_nr_to( gemm4mb_mr, gemm4mb_nr, gemm4mb_kc ); // Create function pointer object for each datatype-specific gemm // micro-kernel. gemm4mb_ukrs = bli_func_obj_create( NULL, FALSE, NULL, FALSE, BLIS_CGEMM4MB_UKERNEL, BLIS_CGEMM4MB_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_ZGEMM4MB_UKERNEL, BLIS_ZGEMM4MB_UKERNEL_PREFERS_CONTIG_ROWS ); // Create control tree objects for packm operations. gemm4mb_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mb_mr, gemm4mb_kr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS_4MI, BLIS_BUFFER_FOR_A_BLOCK ); gemm4mb_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mb_kr, gemm4mb_nr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COL_PANELS_4MI, BLIS_BUFFER_FOR_B_PANEL ); // // Create a control tree for packing A and B, and streaming C. // // Create control tree object for lowest-level block-panel kernel. gemm4mb_cntl_bp_ke = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT3, NULL, gemm4mb_ukrs, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control tree object for outer panel (to block-panel) // problem. gemm4mb_cntl_op_bp = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4mb_mc, NULL, NULL, gemm4mb_packa_cntl, gemm4mb_packb_cntl, NULL, gemm4mb_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. gemm4mb_cntl_mm_op = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4mb_kc, NULL, NULL, NULL, NULL, NULL, gemm4mb_cntl_op_bp, NULL ); // Create control tree object for very large problem via multiple // general problems. 
gemm4mb_cntl_vl_mm = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mb_nc, NULL, NULL, NULL, NULL, NULL, gemm4mb_cntl_mm_op, NULL ); // Alias the "master" gemm control tree to a shorter name. gemm4mb_cntl = gemm4mb_cntl_vl_mm; }
void bli_gemm_cntl_init() { // Create blocksize objects for each dimension. gemm_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D, BLIS_DEFAULT_MC_C, BLIS_MAXIMUM_MC_C, BLIS_DEFAULT_MC_Z, BLIS_MAXIMUM_MC_Z ); gemm_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S, BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D, BLIS_DEFAULT_NC_C, BLIS_MAXIMUM_NC_C, BLIS_DEFAULT_NC_Z, BLIS_MAXIMUM_NC_Z ); gemm_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S, BLIS_MAXIMUM_KC_S, BLIS_DEFAULT_KC_D, BLIS_MAXIMUM_KC_D, BLIS_DEFAULT_KC_C, BLIS_MAXIMUM_KC_C, BLIS_DEFAULT_KC_Z, BLIS_MAXIMUM_KC_Z ); gemm_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D, BLIS_DEFAULT_MR_C, BLIS_PACKDIM_MR_C, BLIS_DEFAULT_MR_Z, BLIS_PACKDIM_MR_Z ); gemm_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D, BLIS_DEFAULT_NR_C, BLIS_PACKDIM_NR_C, BLIS_DEFAULT_NR_Z, BLIS_PACKDIM_NR_Z ); gemm_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D, BLIS_DEFAULT_KR_C, BLIS_PACKDIM_KR_C, BLIS_DEFAULT_KR_Z, BLIS_PACKDIM_KR_Z ); // Create objects for micro-panel alignment (in bytes). gemm_upanel_a_align = bli_blksz_obj_create( BLIS_UPANEL_A_ALIGN_SIZE_S, 0, BLIS_UPANEL_A_ALIGN_SIZE_D, 0, BLIS_UPANEL_A_ALIGN_SIZE_C, 0, BLIS_UPANEL_A_ALIGN_SIZE_Z, 0 ); gemm_upanel_b_align = bli_blksz_obj_create( BLIS_UPANEL_B_ALIGN_SIZE_S, 0, BLIS_UPANEL_B_ALIGN_SIZE_D, 0, BLIS_UPANEL_B_ALIGN_SIZE_C, 0, BLIS_UPANEL_B_ALIGN_SIZE_Z, 0 ); // Attach the register blksz_t objects as sub-blocksizes to the cache // blksz_t objects. bli_blksz_obj_attach_to( gemm_mr, gemm_mc ); bli_blksz_obj_attach_to( gemm_nr, gemm_nc ); bli_blksz_obj_attach_to( gemm_kr, gemm_kc ); // Create function pointer object for each datatype-specific gemm // micro-kernel. 
gemm_ukrs = bli_func_obj_create( BLIS_SGEMM_UKERNEL, BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_DGEMM_UKERNEL, BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_CGEMM_UKERNEL, BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_ZGEMM_UKERNEL, BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS ); // Create control tree objects for packm operations. gemm_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mr, gemm_kr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK ); gemm_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_kr, gemm_nr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COL_PANELS, BLIS_BUFFER_FOR_B_PANEL ); // // Create a control tree for packing A and B, and streaming C. // // Create control tree object for lowest-level block-panel kernel. gemm_cntl_bp_ke = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT2, NULL, gemm_ukrs, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control tree object for outer panel (to block-panel) // problem. gemm_cntl_op_bp = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mc, gemm_ukrs, NULL, gemm_packa_cntl, gemm_packb_cntl, NULL, gemm_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. gemm_cntl_mm_op = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm_kc, gemm_ukrs, NULL, NULL, NULL, NULL, gemm_cntl_op_bp, NULL ); // Create control tree object for very large problem via multiple // general problems. gemm_cntl_vl_mm = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm_nc, gemm_ukrs, NULL, NULL, NULL, NULL, gemm_cntl_mm_op, NULL ); // Alias the "master" gemm control tree to a shorter name. gemm_cntl = gemm_cntl_vl_mm; }
void bli_gemm_cntl_init() { // Create blocksize objects for each dimension. gemm_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S, BLIS_EXTEND_MC_S, BLIS_DEFAULT_MC_D, BLIS_EXTEND_MC_D, BLIS_DEFAULT_MC_C, BLIS_EXTEND_MC_C, BLIS_DEFAULT_MC_Z, BLIS_EXTEND_MC_Z ); gemm_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S, BLIS_EXTEND_NC_S, BLIS_DEFAULT_NC_D, BLIS_EXTEND_NC_D, BLIS_DEFAULT_NC_C, BLIS_EXTEND_NC_C, BLIS_DEFAULT_NC_Z, BLIS_EXTEND_NC_Z ); gemm_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S, BLIS_EXTEND_KC_S, BLIS_DEFAULT_KC_D, BLIS_EXTEND_KC_D, BLIS_DEFAULT_KC_C, BLIS_EXTEND_KC_C, BLIS_DEFAULT_KC_Z, BLIS_EXTEND_KC_Z ); gemm_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S, BLIS_EXTEND_MR_S, BLIS_DEFAULT_MR_D, BLIS_EXTEND_MR_D, BLIS_DEFAULT_MR_C, BLIS_EXTEND_MR_C, BLIS_DEFAULT_MR_Z, BLIS_EXTEND_MR_Z ); gemm_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S, BLIS_EXTEND_NR_S, BLIS_DEFAULT_NR_D, BLIS_EXTEND_NR_D, BLIS_DEFAULT_NR_C, BLIS_EXTEND_NR_C, BLIS_DEFAULT_NR_Z, BLIS_EXTEND_NR_Z ); gemm_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S, BLIS_EXTEND_KR_S, BLIS_DEFAULT_KR_D, BLIS_EXTEND_KR_D, BLIS_DEFAULT_KR_C, BLIS_EXTEND_KR_C, BLIS_DEFAULT_KR_Z, BLIS_EXTEND_KR_Z ); // Create control tree objects for packm operations. gemm_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm_mr, gemm_kr, FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK ); gemm_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm_kr, gemm_nr, FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COL_PANELS, BLIS_BUFFER_FOR_B_PANEL ); // Create control tree objects for packm/unpackm operations on C. 
gemm_packc_cntl = bli_packm_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1, gemm_mr, gemm_nr, FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COLUMNS, BLIS_BUFFER_FOR_C_PANEL ); gemm_unpackc_cntl = bli_unpackm_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1, NULL ); // no blocksize needed // // Create a control tree for packing A and B, and streaming C. // // Create control tree object for lowest-level block-panel kernel. gemm_cntl_bp_ke = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT2, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control tree object for outer panel (to block-panel) // problem. gemm_cntl_op_bp = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mc, NULL, gemm_packa_cntl, gemm_packb_cntl, NULL, gemm_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. gemm_cntl_mm_op = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm_kc, NULL, NULL, NULL, NULL, gemm_cntl_op_bp, NULL ); // Create control tree object for very large problem via multiple // general problems. gemm_cntl_vl_mm = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm_nc, NULL, NULL, NULL, NULL, gemm_cntl_mm_op, NULL ); // Alias the "master" gemm control tree to a shorter name. gemm_cntl = gemm_cntl_vl_mm; #if 0 // // Create a control tree for packing A, and streaming B and C. 
// gemm_cntl_bp_ke5 = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT5, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); gemm_cntl_pm_bp = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm_kc, NULL, gemm_packa_cntl, NULL, //gemm_packc_cntl, NULL, gemm_cntl_bp_ke5, //gemm_unpackc_cntl ); NULL ); gemm_cntl_mm_pm = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mc, NULL, NULL, NULL, NULL, gemm_cntl_pm_bp, NULL ); gemm_cntl_vl_mm5 = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm_nc, NULL, NULL, NULL, NULL, gemm_cntl_mm_pm, NULL ); gemm_cntl_packa = gemm_cntl_vl_mm5; #endif }
void bli_gemm3m3_cntl_init() { // Create blocksize objects for each dimension. // NOTE: the complex blocksizes for 3m3 are generally equal to their // corresponding real domain counterparts. However, we want to promote // similar cache footprints for the micro-panels of A and B (when // compared to executing in the real domain), and since the complex // micro-panels are three times as "fat" (due to storing real, imaginary // and real+imaginary parts), we reduce KC by a factor of 2 to // compensate. Ideally, we would reduce by a factor of 3, but that // could get messy vis-a-vis keeping KC a multiple of the register // blocksizes. gemm3m3_mc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D ); gemm3m3_nc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NC_S/3, BLIS_MAXIMUM_NC_S/3, BLIS_DEFAULT_NC_D/3, BLIS_MAXIMUM_NC_D/3 ); gemm3m3_kc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KC_S, BLIS_MAXIMUM_KC_S, BLIS_DEFAULT_KC_D, BLIS_MAXIMUM_KC_D ); gemm3m3_mr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D ); gemm3m3_nr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D ); gemm3m3_kr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D ); // Attach the register blksz_t objects as blocksize multiples to the cache // blksz_t objects. bli_blksz_obj_attach_mult_to( gemm3m3_mr, gemm3m3_mc ); bli_blksz_obj_attach_mult_to( gemm3m3_nr, gemm3m3_nc ); bli_blksz_obj_attach_mult_to( gemm3m3_kr, gemm3m3_kc ); // The cache blocksizes that were scaled above need to be rounded down // to their respective nearest register blocksize multiples. Note that // this can only happen after the appropriate register blocksize is // actually attached as a multiple. 
bli_blksz_reduce_to_mult( gemm3m3_nc ); // Attach the mr and nr blksz_t objects to each cache blksz_t object. // The primary example of why this is needed relates to nudging kc. // In hemm, symm, trmm, or trmm3, we need to know both mr and nr, // since the multiple we target in nudging depends on whether the // structured matrix is on the left or the right. bli_blksz_obj_attach_mr_nr_to( gemm3m3_mr, gemm3m3_nr, gemm3m3_mc ); bli_blksz_obj_attach_mr_nr_to( gemm3m3_mr, gemm3m3_nr, gemm3m3_nc ); bli_blksz_obj_attach_mr_nr_to( gemm3m3_mr, gemm3m3_nr, gemm3m3_kc ); // Create function pointer object for each datatype-specific gemm // micro-kernel. gemm3m3_ukrs = bli_func_obj_create( NULL, FALSE, NULL, FALSE, BLIS_CGEMM3M3_UKERNEL, BLIS_CGEMM3M3_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_ZGEMM3M3_UKERNEL, BLIS_ZGEMM3M3_UKERNEL_PREFERS_CONTIG_ROWS ); // Create control tree objects for packm operations. gemm3m3_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm3m3_kr, gemm3m3_nr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COL_PANELS_3MS, BLIS_BUFFER_FOR_B_PANEL ); // // Create a control tree for packing A and B, and streaming C. // // Create control tree object for lowest-level block-panel kernel. gemm3m3_cntl_bp_ke = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT2, NULL, gemm3m3_ukrs, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control tree object for outer panel (to block-panel) // problem. gemm3m3_cntl_op_bp = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT4, gemm3m3_mc, NULL, NULL, NULL, // packm cntl nodes accessed directly from blk_var4 gemm3m3_packb_cntl, NULL, gemm3m3_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. 
gemm3m3_cntl_mm_op = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm3m3_kc, NULL, NULL, NULL, NULL, NULL, gemm3m3_cntl_op_bp, NULL ); // Create control tree object for very large problem via multiple // general problems. gemm3m3_cntl_vl_mm = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm3m3_nc, NULL, NULL, NULL, NULL, NULL, gemm3m3_cntl_mm_op, NULL ); // Alias the "master" gemm control tree to a shorter name. gemm3m3_cntl = gemm3m3_cntl_vl_mm; }
void bli_gemv_cntl_init() { // Create blocksize objects for each dimension. gemv_mc = bli_blksz_obj_create( BLIS_DEFAULT_L2_MC_S, 0, BLIS_DEFAULT_L2_MC_D, 0, BLIS_DEFAULT_L2_MC_C, 0, BLIS_DEFAULT_L2_MC_Z, 0 ); gemv_nc = bli_blksz_obj_create( BLIS_DEFAULT_L2_NC_S, 0, BLIS_DEFAULT_L2_NC_D, 0, BLIS_DEFAULT_L2_NC_C, 0, BLIS_DEFAULT_L2_NC_Z, 0 ); // Create control trees for the lowest-level kernels. These trees induce // operations on (persumably) relatively small block-subvector problems. gemv_cntl_bs_ke_dot = bli_gemv_cntl_obj_create( BLIS_UNB_FUSED, BLIS_VARIANT1, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); gemv_cntl_bs_ke_axpy = bli_gemv_cntl_obj_create( BLIS_UNB_FUSED, BLIS_VARIANT2, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control trees for problems with relatively small m dimension // (ie: where trans(A) is a row panel problem). gemv_cntl_rp_bs_dot = bli_gemv_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemv_nc, scalv_cntl, // scale y up-front packm_cntl, // pack A1 (if needed) packv_cntl, // pack x1 (if needed) NULL, // y is not partitioned in var2 gemv_cntl_bs_ke_dot, NULL ); // y is not partitioned in var2 gemv_cntl_rp_bs_axpy = bli_gemv_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemv_nc, scalv_cntl, // scale y up-front packm_cntl, // pack A1 (if needed) packv_cntl, // pack x1 (if needed) NULL, // y is not partitioned in var2 gemv_cntl_bs_ke_axpy, NULL ); // y is not partitioned in var2 // Create control trees for problems with relatively small n dimension // (ie: where trans(A) is a column panel problem). 
gemv_cntl_cp_bs_dot = bli_gemv_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemv_mc, NULL, // no scaling in blk_var1 packm_cntl, // pack A1 (if needed) NULL, // x is not partitioned in var1 packv_cntl, // pack y1 (if needed) gemv_cntl_bs_ke_dot, unpackv_cntl ); // unpack y1 (if packed) gemv_cntl_cp_bs_axpy = bli_gemv_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemv_mc, NULL, // no scaling in blk_var1 packm_cntl, // pack A1 (if needed) NULL, // x is not partitioned in var1 packv_cntl, // pack y1 (if needed) gemv_cntl_bs_ke_axpy, unpackv_cntl ); // unpack y1 (if packed) // Create control trees for generally large problems. Here, we choose a // variant that partitions subproblems into row panels. gemv_cntl_ge_dot = bli_gemv_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemv_mc, NULL, // no scaling in blk_var1 NULL, // do not pack A1 NULL, // x is not partitioned in var1 packv_cntl, // pack y1 (if needed) gemv_cntl_rp_bs_dot, unpackv_cntl ); // unpack y1 (if packed) gemv_cntl_ge_axpy = bli_gemv_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemv_mc, NULL, // no scaling in blk_var1 NULL, // do not pack A1 NULL, // x is not partitioned in var1 packv_cntl, // pack y1 (if needed) gemv_cntl_rp_bs_axpy, unpackv_cntl ); // unpack y1 (if packed) }
void bli_packm_cntl_init() { // Create blocksize objects for m and n register blocking. We will attach // these to the packm control node so they can be used to (a) allocate a // block whose m and n dimension are multiples of mr and nr, and (b) know // how much zero-padding is necessary for edge cases. // NOTE: these alignments end up getting applied to matrices packed for // level-2 operations, even though they are not needed, and/or smaller // alignments may be sufficient. For simplicity, we choose to tweak the // dimensions of all pack matrix buffers the same amount. packm_mult_ldim = bli_blksz_obj_create( BLIS_DEFAULT_MR_S, 0, BLIS_DEFAULT_MR_D, 0, BLIS_DEFAULT_MR_C, 0, BLIS_DEFAULT_MR_Z, 0 ); packm_mult_nvec = bli_blksz_obj_create( BLIS_DEFAULT_NR_S, 0, BLIS_DEFAULT_NR_D, 0, BLIS_DEFAULT_NR_C, 0, BLIS_DEFAULT_NR_Z, 0 ); // Generally speaking, the BLIS_PACKED_ROWS and BLIS_PACKED_COLUMNS // are used by the level-2 operations, and thus densification is not // necessary. These schemas amount to simple copies to row or column // storage. These simple schemas may be used by level-3 operations, // but they should never be used for matrices with structure (since // they do not densify). // The BLIS_PACKED_ROW_PANELS and BLIS_PACKED_COL_PANELS schemas are // used only in level-3 operations. They pack to (typically) skinny // row and column panels, where the width of the panel is determined // by register blocksizes. They are configured to densify matrices // with structure, though they can also be used on matrices that // are already dense and/or have no structure. // Create control trees to pack by rows. 
packm_cntl_row = bli_packm_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1, // When packing to rows: packm_mult_nvec, // - nvec multiple is used for m dimension packm_mult_ldim, // - ldim multiple is used for n dimension FALSE, // do NOT densify structure FALSE, // do NOT invert diagonal FALSE, // do NOT iterate backwards if upper FALSE, // do NOT iterate backwards if lower BLIS_PACKED_ROWS, BLIS_BUFFER_FOR_GEN_USE ); // Create control trees to pack by columns. packm_cntl_col = bli_packm_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1, // When packing to columns: packm_mult_ldim, // - ldim multiple is used for m dimension packm_mult_nvec, // - nvec multiple is used for n dimension FALSE, // do NOT densify structure FALSE, // do NOT invert diagonal FALSE, // do NOT iterate backwards if upper FALSE, // do NOT iterate backwards if lower BLIS_PACKED_COLUMNS, BLIS_BUFFER_FOR_GEN_USE ); // Set defaults when we don't care whether the packing is by rows or // by columns. packm_cntl = packm_cntl_col; }
void bli_herk_cntl_init() { // Create blocksize objects for each dimension. herk_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S, BLIS_EXTEND_MC_S, BLIS_DEFAULT_MC_D, BLIS_EXTEND_MC_D, BLIS_DEFAULT_MC_C, BLIS_EXTEND_MC_C, BLIS_DEFAULT_MC_Z, BLIS_EXTEND_MC_Z ); herk_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S, BLIS_EXTEND_NC_S, BLIS_DEFAULT_NC_D, BLIS_EXTEND_NC_D, BLIS_DEFAULT_NC_C, BLIS_EXTEND_NC_C, BLIS_DEFAULT_NC_Z, BLIS_EXTEND_NC_Z ); herk_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S, BLIS_EXTEND_KC_S, BLIS_DEFAULT_KC_D, BLIS_EXTEND_KC_D, BLIS_DEFAULT_KC_C, BLIS_EXTEND_KC_C, BLIS_DEFAULT_KC_Z, BLIS_EXTEND_KC_Z ); herk_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S, BLIS_EXTEND_MR_S, BLIS_DEFAULT_MR_D, BLIS_EXTEND_MR_D, BLIS_DEFAULT_MR_C, BLIS_EXTEND_MR_C, BLIS_DEFAULT_MR_Z, BLIS_EXTEND_MR_Z ); herk_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S, BLIS_EXTEND_NR_S, BLIS_DEFAULT_NR_D, BLIS_EXTEND_NR_D, BLIS_DEFAULT_NR_C, BLIS_EXTEND_NR_C, BLIS_DEFAULT_NR_Z, BLIS_EXTEND_NR_Z ); herk_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S, BLIS_EXTEND_KR_S, BLIS_DEFAULT_KR_D, BLIS_EXTEND_KR_D, BLIS_DEFAULT_KR_C, BLIS_EXTEND_KR_C, BLIS_DEFAULT_KR_Z, BLIS_EXTEND_KR_Z ); herk_ni = bli_blksz_obj_create( BLIS_DEFAULT_NI_S, 0, BLIS_DEFAULT_NI_D, 0, BLIS_DEFAULT_NI_C, 0, BLIS_DEFAULT_NI_Z, 0 ); // Create control tree objects for packm operations. herk_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, herk_mr, herk_kr, FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK ); herk_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, herk_kr, herk_nr, FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? 
BLIS_PACKED_COL_PANELS, BLIS_BUFFER_FOR_B_PANEL ); // Create control tree objects for packm/unpackm operations on C. herk_packc_cntl = bli_packm_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1, herk_mr, herk_nr, FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COLUMNS, BLIS_BUFFER_FOR_GEN_USE ); herk_unpackc_cntl = bli_unpackm_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1, NULL ); // no blocksize needed // Create control tree object for lowest-level block-panel kernel. herk_cntl_bp_ke = bli_herk_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT2, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control tree object for outer panel (to block-panel) // problem. herk_cntl_op_bp = bli_herk_cntl_obj_create( BLIS_BLOCKED, //BLIS_VARIANT4, // var1 with incremental pack in iter 0 BLIS_VARIANT1, herk_mc, herk_ni, NULL, herk_packa_cntl, herk_packb_cntl, NULL, herk_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. herk_cntl_mm_op = bli_herk_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, herk_kc, NULL, NULL, NULL, NULL, NULL, herk_cntl_op_bp, NULL ); // Create control tree object for very large problem via multiple // general problems. herk_cntl_vl_mm = bli_herk_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, herk_nc, NULL, NULL, NULL, NULL, NULL, herk_cntl_mm_op, NULL ); // Alias the "master" herk control tree to a shorter name. herk_cntl = herk_cntl_vl_mm; }
void bli_gemm4m1_cntl_init() { // Create blocksize objects for each dimension. // NOTE: the complex blocksizes for 4m1 are generally equal to their // corresponding real domain counterparts. However, we want to promote // similar cache footprints for the micro-panels of A and B (when // compared to executing in the real domain), and since the complex // micro-panels are twice as "fat" (due to storing real and imaginary // parts), we reduce KC by a factor of 2 to compensate. gemm4m1_mc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D ); gemm4m1_nc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S, BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D ); gemm4m1_kc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KC_S/2, BLIS_MAXIMUM_KC_S/2, BLIS_DEFAULT_KC_D/2, BLIS_MAXIMUM_KC_D/2 ); gemm4m1_mr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D ); gemm4m1_nr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D ); gemm4m1_kr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D ); // Attach the register blksz_t objects as blocksize multiples to the cache // blksz_t objects. bli_blksz_obj_attach_mult_to( gemm4m1_mr, gemm4m1_mc ); bli_blksz_obj_attach_mult_to( gemm4m1_nr, gemm4m1_nc ); bli_blksz_obj_attach_mult_to( gemm4m1_kr, gemm4m1_kc ); // Attach the mr and nr blksz_t objects to each cache blksz_t object. // The primary example of why this is needed relates to nudging kc. // In hemm, symm, trmm, or trmm3, we need to know both mr and nr, // since the multiple we target in nudging depends on whether the // structured matrix is on the left or the right. 
bli_blksz_obj_attach_mr_nr_to( gemm4m1_mr, gemm4m1_nr, gemm4m1_mc ); bli_blksz_obj_attach_mr_nr_to( gemm4m1_mr, gemm4m1_nr, gemm4m1_nc ); bli_blksz_obj_attach_mr_nr_to( gemm4m1_mr, gemm4m1_nr, gemm4m1_kc ); // Create function pointer object for each datatype-specific gemm // micro-kernel. gemm4m1_ukrs = bli_func_obj_create( NULL, FALSE, NULL, FALSE, BLIS_CGEMM4M1_UKERNEL, BLIS_CGEMM4M1_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_ZGEMM4M1_UKERNEL, BLIS_ZGEMM4M1_UKERNEL_PREFERS_CONTIG_ROWS ); // Create control tree objects for packm operations. gemm4m1_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4m1_mr, gemm4m1_kr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS_4MI, BLIS_BUFFER_FOR_A_BLOCK ); gemm4m1_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4m1_kr, gemm4m1_nr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COL_PANELS_4MI, BLIS_BUFFER_FOR_B_PANEL ); // // Create a control tree for packing A and B, and streaming C. // // Create control tree object for lowest-level block-panel kernel. gemm4m1_cntl_bp_ke = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT2, NULL, gemm4m1_ukrs, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control tree object for outer panel (to block-panel) // problem. gemm4m1_cntl_op_bp = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4m1_mc, NULL, NULL, gemm4m1_packa_cntl, gemm4m1_packb_cntl, NULL, gemm4m1_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. gemm4m1_cntl_mm_op = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4m1_kc, NULL, NULL, NULL, NULL, NULL, gemm4m1_cntl_op_bp, NULL ); // Create control tree object for very large problem via multiple // general problems. 
gemm4m1_cntl_vl_mm = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4m1_nc, NULL, NULL, NULL, NULL, NULL, gemm4m1_cntl_mm_op, NULL ); // Alias the "master" gemm control tree to a shorter name. gemm4m1_cntl = gemm4m1_cntl_vl_mm; }
void bli_gemm4mh_cntl_init() { // Create blocksize objects for each dimension. // NOTE: the complex blocksizes for 4mh are equal to their // corresponding real domain counterparts. gemm4mh_mc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D ); gemm4mh_nc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S, BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D ); gemm4mh_kc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KC_S, BLIS_MAXIMUM_KC_S, BLIS_DEFAULT_KC_D, BLIS_MAXIMUM_KC_D ); gemm4mh_mr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D ); gemm4mh_nr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D ); gemm4mh_kr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D ); // Attach the register blksz_t objects as blocksize multiples to the cache // blksz_t objects. bli_blksz_obj_attach_mult_to( gemm4mh_mr, gemm4mh_mc ); bli_blksz_obj_attach_mult_to( gemm4mh_nr, gemm4mh_nc ); bli_blksz_obj_attach_mult_to( gemm4mh_kr, gemm4mh_kc ); // Attach the mr and nr blksz_t objects to each cache blksz_t object. // The primary example of why this is needed relates to nudging kc. // In hemm, symm, trmm, or trmm3, we need to know both mr and nr, // since the multiple we target in nudging depends on whether the // structured matrix is on the left or the right. bli_blksz_obj_attach_mr_nr_to( gemm4mh_mr, gemm4mh_nr, gemm4mh_mc ); bli_blksz_obj_attach_mr_nr_to( gemm4mh_mr, gemm4mh_nr, gemm4mh_nc ); bli_blksz_obj_attach_mr_nr_to( gemm4mh_mr, gemm4mh_nr, gemm4mh_kc ); // Create function pointer object for each datatype-specific gemm // micro-kernel. 
gemm4mh_ukrs = bli_func_obj_create( NULL, FALSE, NULL, FALSE, BLIS_CGEMM4MH_UKERNEL, BLIS_CGEMM4MH_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_ZGEMM4MH_UKERNEL, BLIS_ZGEMM4MH_UKERNEL_PREFERS_CONTIG_ROWS ); // Create control tree objects for packm operations (real only). gemm4mh_packa_cntl_ro = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_mr, gemm4mh_kr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS_RO, BLIS_BUFFER_FOR_A_BLOCK ); gemm4mh_packb_cntl_ro = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_kr, gemm4mh_nr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COL_PANELS_RO, BLIS_BUFFER_FOR_B_PANEL ); // Create control tree objects for packm operations (imag only). gemm4mh_packa_cntl_io = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_mr, gemm4mh_kr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS_IO, BLIS_BUFFER_FOR_A_BLOCK ); gemm4mh_packb_cntl_io = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_kr, gemm4mh_nr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COL_PANELS_IO, BLIS_BUFFER_FOR_B_PANEL ); // Create control tree object for lowest-level block-panel kernel. gemm4mh_cntl_bp_ke = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT2, NULL, gemm4mh_ukrs, NULL, NULL, NULL, NULL, NULL, NULL ); // // Create control tree for A.real * B.real. // // Create control tree object for outer panel (to block-panel) // problem. 
(real x real) gemm4mh_cntl_op_bp_rr = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4mh_mc, NULL, NULL, gemm4mh_packa_cntl_ro, gemm4mh_packb_cntl_ro, NULL, gemm4mh_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. (real x real) gemm4mh_cntl_mm_op_rr = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4mh_kc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_op_bp_rr, NULL ); // Create control tree object for very large problem via multiple // general problems. (real x real) gemm4mh_cntl_vl_mm_rr = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_nc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_mm_op_rr, NULL ); // // Create control tree for A.real * B.imag. // // Create control tree object for outer panel (to block-panel) // problem. (real x imag) gemm4mh_cntl_op_bp_ri = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4mh_mc, NULL, NULL, gemm4mh_packa_cntl_ro, gemm4mh_packb_cntl_io, NULL, gemm4mh_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. (real x imag) gemm4mh_cntl_mm_op_ri = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4mh_kc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_op_bp_ri, NULL ); // Create control tree object for very large problem via multiple // general problems. (real x imag) gemm4mh_cntl_vl_mm_ri = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_nc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_mm_op_ri, NULL ); // // Create control tree for A.imag * B.real. // // Create control tree object for outer panel (to block-panel) // problem. (imag x real) gemm4mh_cntl_op_bp_ir = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4mh_mc, NULL, NULL, gemm4mh_packa_cntl_io, gemm4mh_packb_cntl_ro, NULL, gemm4mh_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. 
(imag x real) gemm4mh_cntl_mm_op_ir = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4mh_kc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_op_bp_ir, NULL ); // Create control tree object for very large problem via multiple // general problems. (imag x real) gemm4mh_cntl_vl_mm_ir = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_nc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_mm_op_ir, NULL ); // // Create control tree for A.imag * B.imag. // // Create control tree object for outer panel (to block-panel) // problem. (imag x imag) gemm4mh_cntl_op_bp_ii = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4mh_mc, NULL, NULL, gemm4mh_packa_cntl_io, gemm4mh_packb_cntl_io, NULL, gemm4mh_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. (imag x imag) gemm4mh_cntl_mm_op_ii = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4mh_kc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_op_bp_ii, NULL ); // Create control tree object for very large problem via multiple // general problems. (imag x imag) gemm4mh_cntl_vl_mm_ii = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_nc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_mm_op_ii, NULL ); // Alias the "master" gemm control tree to a shorter name. gemm4mh_cntl_rr = gemm4mh_cntl_vl_mm_rr; gemm4mh_cntl_ri = gemm4mh_cntl_vl_mm_ri; gemm4mh_cntl_ir = gemm4mh_cntl_vl_mm_ir; gemm4mh_cntl_ii = gemm4mh_cntl_vl_mm_ii; }
void bli_gemm_cntl_init() { // Create blocksize objects for each dimension. gemm_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S, BLIS_EXTEND_MC_S, BLIS_DEFAULT_MC_D, BLIS_EXTEND_MC_D, BLIS_DEFAULT_MC_C, BLIS_EXTEND_MC_C, BLIS_DEFAULT_MC_Z, BLIS_EXTEND_MC_Z ); gemm_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S, BLIS_EXTEND_NC_S, BLIS_DEFAULT_NC_D, BLIS_EXTEND_NC_D, BLIS_DEFAULT_NC_C, BLIS_EXTEND_NC_C, BLIS_DEFAULT_NC_Z, BLIS_EXTEND_NC_Z ); gemm_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S, BLIS_EXTEND_KC_S, BLIS_DEFAULT_KC_D, BLIS_EXTEND_KC_D, BLIS_DEFAULT_KC_C, BLIS_EXTEND_KC_C, BLIS_DEFAULT_KC_Z, BLIS_EXTEND_KC_Z ); gemm_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S, BLIS_EXTEND_MR_S, BLIS_DEFAULT_MR_D, BLIS_EXTEND_MR_D, BLIS_DEFAULT_MR_C, BLIS_EXTEND_MR_C, BLIS_DEFAULT_MR_Z, BLIS_EXTEND_MR_Z ); gemm_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S, BLIS_EXTEND_NR_S, BLIS_DEFAULT_NR_D, BLIS_EXTEND_NR_D, BLIS_DEFAULT_NR_C, BLIS_EXTEND_NR_C, BLIS_DEFAULT_NR_Z, BLIS_EXTEND_NR_Z ); gemm_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S, 0, BLIS_DEFAULT_KR_D, 0, BLIS_DEFAULT_KR_C, 0, BLIS_DEFAULT_KR_Z, 0 ); // Create function pointer object for each datatype-specific gemm // micro-kernel. gemm_ukrs = bli_func_obj_create( BLIS_SGEMM_UKERNEL, BLIS_DGEMM_UKERNEL, BLIS_CGEMM_UKERNEL, BLIS_ZGEMM_UKERNEL ); // Create control tree objects for packm operations. gemm_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mr, gemm_kr, TRUE, // densify; used by hemm/symm FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK ); gemm_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_kr, gemm_nr, TRUE, // densify; used by hemm/symm FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? 
BLIS_PACKED_COL_PANELS, BLIS_BUFFER_FOR_B_PANEL ); // // Create a control tree for packing A and B, and streaming C. // // Create control tree object for lowest-level block-panel kernel. gemm_cntl_bp_ke = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT2, NULL, gemm_ukrs, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control tree object for outer panel (to block-panel) // problem. gemm_cntl_op_bp = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mc, NULL, NULL, gemm_packa_cntl, gemm_packb_cntl, NULL, gemm_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. gemm_cntl_mm_op = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm_kc, NULL, NULL, NULL, NULL, NULL, gemm_cntl_op_bp, NULL ); // Create control tree object for very large problem via multiple // general problems. gemm_cntl_vl_mm = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm_nc, NULL, NULL, NULL, NULL, NULL, gemm_cntl_mm_op, NULL ); // Alias the "master" gemm control tree to a shorter name. gemm_cntl = gemm_cntl_vl_mm; //bli_gemm_cntl_init_exp(); }
void bli_gemm_cntl_init() { // Create blocksize objects for each dimension. gemm_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D, BLIS_DEFAULT_MC_C, BLIS_MAXIMUM_MC_C, BLIS_DEFAULT_MC_Z, BLIS_MAXIMUM_MC_Z ); gemm_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S, BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D, BLIS_DEFAULT_NC_C, BLIS_MAXIMUM_NC_C, BLIS_DEFAULT_NC_Z, BLIS_MAXIMUM_NC_Z ); gemm_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S, BLIS_MAXIMUM_KC_S, BLIS_DEFAULT_KC_D, BLIS_MAXIMUM_KC_D, BLIS_DEFAULT_KC_C, BLIS_MAXIMUM_KC_C, BLIS_DEFAULT_KC_Z, BLIS_MAXIMUM_KC_Z ); gemm_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D, BLIS_DEFAULT_MR_C, BLIS_PACKDIM_MR_C, BLIS_DEFAULT_MR_Z, BLIS_PACKDIM_MR_Z ); gemm_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D, BLIS_DEFAULT_NR_C, BLIS_PACKDIM_NR_C, BLIS_DEFAULT_NR_Z, BLIS_PACKDIM_NR_Z ); gemm_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D, BLIS_DEFAULT_KR_C, BLIS_PACKDIM_KR_C, BLIS_DEFAULT_KR_Z, BLIS_PACKDIM_KR_Z ); // Create objects for micro-panel alignment (in bytes). gemm_upanel_a_align = bli_blksz_obj_create( BLIS_UPANEL_A_ALIGN_SIZE_S, 0, BLIS_UPANEL_A_ALIGN_SIZE_D, 0, BLIS_UPANEL_A_ALIGN_SIZE_C, 0, BLIS_UPANEL_A_ALIGN_SIZE_Z, 0 ); gemm_upanel_b_align = bli_blksz_obj_create( BLIS_UPANEL_B_ALIGN_SIZE_S, 0, BLIS_UPANEL_B_ALIGN_SIZE_D, 0, BLIS_UPANEL_B_ALIGN_SIZE_C, 0, BLIS_UPANEL_B_ALIGN_SIZE_Z, 0 ); // Attach the register blksz_t objects as blocksize multiples to the cache // blksz_t objects. bli_blksz_obj_attach_mult_to( gemm_mr, gemm_mc ); bli_blksz_obj_attach_mult_to( gemm_nr, gemm_nc ); bli_blksz_obj_attach_mult_to( gemm_kr, gemm_kc ); // Attach the mr and nr blksz_t objects to each cache blksz_t object. // The primary example of why this is needed relates to nudging kc. 
// In hemm, symm, trmm, or trmm3, we need to know both mr and nr, // since the multiple we target in nudging depends on whether the // structured matrix is on the left or the right. bli_blksz_obj_attach_mr_nr_to( gemm_mr, gemm_nr, gemm_mc ); bli_blksz_obj_attach_mr_nr_to( gemm_mr, gemm_nr, gemm_nc ); bli_blksz_obj_attach_mr_nr_to( gemm_mr, gemm_nr, gemm_kc ); // Create function pointer object for each datatype-specific gemm // micro-kernel. gemm_ukrs = bli_func_obj_create( BLIS_SGEMM_UKERNEL, BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_DGEMM_UKERNEL, BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_CGEMM_UKERNEL, BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_ZGEMM_UKERNEL, BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS ); // Create function pointer object for reference micro-kernels. gemm_ref_ukrs = bli_func_obj_create( BLIS_SGEMM_UKERNEL_REF, FALSE, BLIS_DGEMM_UKERNEL_REF, FALSE, BLIS_CGEMM_UKERNEL_REF, FALSE, BLIS_ZGEMM_UKERNEL_REF, FALSE ); // Create control tree objects for packm operations. gemm_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mr, gemm_kr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK ); gemm_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_kr, gemm_nr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COL_PANELS, BLIS_BUFFER_FOR_B_PANEL ); // // Create a control tree for packing A and B, and streaming C. // // Create control tree object for lowest-level block-panel kernel. gemm_cntl_bp_ke = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT2, NULL, gemm_ukrs, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control tree object for outer panel (to block-panel) // problem. 
gemm_cntl_op_bp = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mc, NULL, NULL, gemm_packa_cntl, gemm_packb_cntl, NULL, gemm_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. gemm_cntl_mm_op = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm_kc, NULL, NULL, NULL, NULL, NULL, gemm_cntl_op_bp, NULL ); // Create control tree object for very large problem via multiple // general problems. gemm_cntl_vl_mm = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm_nc, NULL, NULL, NULL, NULL, NULL, gemm_cntl_mm_op, NULL ); // Alias the "master" gemm control tree to a shorter name. gemm_cntl = gemm_cntl_vl_mm; }