void bli_gemm_cntl_init_exp() { // // Create a control tree for packing A, and streaming B and C. // gemm_cntl_bp_ke5 = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT5, NULL, gemm_ukrs, NULL, NULL, NULL, NULL, NULL, NULL ); gemm_cntl_pm_bp = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm_kc, NULL, NULL, gemm_packa_cntl, NULL, NULL, gemm_cntl_bp_ke5, NULL ); gemm_cntl_mm_pm = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mc, NULL, NULL, NULL, NULL, NULL, gemm_cntl_pm_bp, NULL ); gemm_cntl_vl_mm5 = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm_nc, NULL, NULL, NULL, NULL, NULL, gemm_cntl_mm_pm, NULL ); gemm_cntl5 = gemm_cntl_vl_mm5; }
void bli_gemm4mb_cntl_init() { // Create blocksize objects for each dimension. gemm4mb_mc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MC_S/2, BLIS_MAXIMUM_MC_S/2, BLIS_DEFAULT_MC_D/2, BLIS_MAXIMUM_MC_D/2 ); gemm4mb_nc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NC_S/2, BLIS_MAXIMUM_NC_S/2, BLIS_DEFAULT_NC_D/2, BLIS_MAXIMUM_NC_D/2 ); gemm4mb_kc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KC_S, BLIS_MAXIMUM_KC_S, BLIS_DEFAULT_KC_D, BLIS_MAXIMUM_KC_D ); gemm4mb_mr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D ); gemm4mb_nr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D ); gemm4mb_kr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D ); // Attach the register blksz_t objects as blocksize multiples to the cache // blksz_t objects. bli_blksz_obj_attach_mult_to( gemm4mb_mr, gemm4mb_mc ); bli_blksz_obj_attach_mult_to( gemm4mb_nr, gemm4mb_nc ); bli_blksz_obj_attach_mult_to( gemm4mb_kr, gemm4mb_kc ); // The cache blocksizes that were scaled above need to be rounded down // to their respective nearest register blocksize multiples. Note that // this can only happen after the appropriate register blocksize is // actually attached as a multiple. bli_blksz_reduce_to_mult( gemm4mb_mc ); bli_blksz_reduce_to_mult( gemm4mb_nc ); // Attach the mr and nr blksz_t objects to each cache blksz_t object. // The primary example of why this is needed relates to nudging kc. // In hemm, symm, trmm, or trmm3, we need to know both mr and nr, // since the multiple we target in nudging depends on whether the // structured matrix is on the left or the right. 
bli_blksz_obj_attach_mr_nr_to( gemm4mb_mr, gemm4mb_nr, gemm4mb_mc ); bli_blksz_obj_attach_mr_nr_to( gemm4mb_mr, gemm4mb_nr, gemm4mb_nc ); bli_blksz_obj_attach_mr_nr_to( gemm4mb_mr, gemm4mb_nr, gemm4mb_kc ); // Create function pointer object for each datatype-specific gemm // micro-kernel. gemm4mb_ukrs = bli_func_obj_create( NULL, FALSE, NULL, FALSE, BLIS_CGEMM4MB_UKERNEL, BLIS_CGEMM4MB_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_ZGEMM4MB_UKERNEL, BLIS_ZGEMM4MB_UKERNEL_PREFERS_CONTIG_ROWS ); // Create control tree objects for packm operations. gemm4mb_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mb_mr, gemm4mb_kr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS_4MI, BLIS_BUFFER_FOR_A_BLOCK ); gemm4mb_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mb_kr, gemm4mb_nr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COL_PANELS_4MI, BLIS_BUFFER_FOR_B_PANEL ); // // Create a control tree for packing A and B, and streaming C. // // Create control tree object for lowest-level block-panel kernel. gemm4mb_cntl_bp_ke = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT3, NULL, gemm4mb_ukrs, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control tree object for outer panel (to block-panel) // problem. gemm4mb_cntl_op_bp = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4mb_mc, NULL, NULL, gemm4mb_packa_cntl, gemm4mb_packb_cntl, NULL, gemm4mb_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. gemm4mb_cntl_mm_op = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4mb_kc, NULL, NULL, NULL, NULL, NULL, gemm4mb_cntl_op_bp, NULL ); // Create control tree object for very large problem via multiple // general problems. 
gemm4mb_cntl_vl_mm = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mb_nc, NULL, NULL, NULL, NULL, NULL, gemm4mb_cntl_mm_op, NULL ); // Alias the "master" gemm control tree to a shorter name. gemm4mb_cntl = gemm4mb_cntl_vl_mm; }
void bli_gemm_cntl_init() { // Create blocksize objects for each dimension. gemm_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D, BLIS_DEFAULT_MC_C, BLIS_MAXIMUM_MC_C, BLIS_DEFAULT_MC_Z, BLIS_MAXIMUM_MC_Z ); gemm_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S, BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D, BLIS_DEFAULT_NC_C, BLIS_MAXIMUM_NC_C, BLIS_DEFAULT_NC_Z, BLIS_MAXIMUM_NC_Z ); gemm_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S, BLIS_MAXIMUM_KC_S, BLIS_DEFAULT_KC_D, BLIS_MAXIMUM_KC_D, BLIS_DEFAULT_KC_C, BLIS_MAXIMUM_KC_C, BLIS_DEFAULT_KC_Z, BLIS_MAXIMUM_KC_Z ); gemm_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D, BLIS_DEFAULT_MR_C, BLIS_PACKDIM_MR_C, BLIS_DEFAULT_MR_Z, BLIS_PACKDIM_MR_Z ); gemm_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D, BLIS_DEFAULT_NR_C, BLIS_PACKDIM_NR_C, BLIS_DEFAULT_NR_Z, BLIS_PACKDIM_NR_Z ); gemm_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D, BLIS_DEFAULT_KR_C, BLIS_PACKDIM_KR_C, BLIS_DEFAULT_KR_Z, BLIS_PACKDIM_KR_Z ); // Create objects for micro-panel alignment (in bytes). gemm_upanel_a_align = bli_blksz_obj_create( BLIS_UPANEL_A_ALIGN_SIZE_S, 0, BLIS_UPANEL_A_ALIGN_SIZE_D, 0, BLIS_UPANEL_A_ALIGN_SIZE_C, 0, BLIS_UPANEL_A_ALIGN_SIZE_Z, 0 ); gemm_upanel_b_align = bli_blksz_obj_create( BLIS_UPANEL_B_ALIGN_SIZE_S, 0, BLIS_UPANEL_B_ALIGN_SIZE_D, 0, BLIS_UPANEL_B_ALIGN_SIZE_C, 0, BLIS_UPANEL_B_ALIGN_SIZE_Z, 0 ); // Attach the register blksz_t objects as sub-blocksizes to the cache // blksz_t objects. bli_blksz_obj_attach_to( gemm_mr, gemm_mc ); bli_blksz_obj_attach_to( gemm_nr, gemm_nc ); bli_blksz_obj_attach_to( gemm_kr, gemm_kc ); // Create function pointer object for each datatype-specific gemm // micro-kernel. 
gemm_ukrs = bli_func_obj_create( BLIS_SGEMM_UKERNEL, BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_DGEMM_UKERNEL, BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_CGEMM_UKERNEL, BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_ZGEMM_UKERNEL, BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS ); // Create control tree objects for packm operations. gemm_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mr, gemm_kr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK ); gemm_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_kr, gemm_nr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COL_PANELS, BLIS_BUFFER_FOR_B_PANEL ); // // Create a control tree for packing A and B, and streaming C. // // Create control tree object for lowest-level block-panel kernel. gemm_cntl_bp_ke = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT2, NULL, gemm_ukrs, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control tree object for outer panel (to block-panel) // problem. gemm_cntl_op_bp = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mc, gemm_ukrs, NULL, gemm_packa_cntl, gemm_packb_cntl, NULL, gemm_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. gemm_cntl_mm_op = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm_kc, gemm_ukrs, NULL, NULL, NULL, NULL, gemm_cntl_op_bp, NULL ); // Create control tree object for very large problem via multiple // general problems. gemm_cntl_vl_mm = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm_nc, gemm_ukrs, NULL, NULL, NULL, NULL, gemm_cntl_mm_op, NULL ); // Alias the "master" gemm control tree to a shorter name. gemm_cntl = gemm_cntl_vl_mm; }
void bli_gemm_cntl_init() { // Create blocksize objects for each dimension. gemm_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S, BLIS_EXTEND_MC_S, BLIS_DEFAULT_MC_D, BLIS_EXTEND_MC_D, BLIS_DEFAULT_MC_C, BLIS_EXTEND_MC_C, BLIS_DEFAULT_MC_Z, BLIS_EXTEND_MC_Z ); gemm_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S, BLIS_EXTEND_NC_S, BLIS_DEFAULT_NC_D, BLIS_EXTEND_NC_D, BLIS_DEFAULT_NC_C, BLIS_EXTEND_NC_C, BLIS_DEFAULT_NC_Z, BLIS_EXTEND_NC_Z ); gemm_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S, BLIS_EXTEND_KC_S, BLIS_DEFAULT_KC_D, BLIS_EXTEND_KC_D, BLIS_DEFAULT_KC_C, BLIS_EXTEND_KC_C, BLIS_DEFAULT_KC_Z, BLIS_EXTEND_KC_Z ); gemm_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S, BLIS_EXTEND_MR_S, BLIS_DEFAULT_MR_D, BLIS_EXTEND_MR_D, BLIS_DEFAULT_MR_C, BLIS_EXTEND_MR_C, BLIS_DEFAULT_MR_Z, BLIS_EXTEND_MR_Z ); gemm_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S, BLIS_EXTEND_NR_S, BLIS_DEFAULT_NR_D, BLIS_EXTEND_NR_D, BLIS_DEFAULT_NR_C, BLIS_EXTEND_NR_C, BLIS_DEFAULT_NR_Z, BLIS_EXTEND_NR_Z ); gemm_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S, BLIS_EXTEND_KR_S, BLIS_DEFAULT_KR_D, BLIS_EXTEND_KR_D, BLIS_DEFAULT_KR_C, BLIS_EXTEND_KR_C, BLIS_DEFAULT_KR_Z, BLIS_EXTEND_KR_Z ); // Create control tree objects for packm operations. gemm_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm_mr, gemm_kr, FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK ); gemm_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm_kr, gemm_nr, FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COL_PANELS, BLIS_BUFFER_FOR_B_PANEL ); // Create control tree objects for packm/unpackm operations on C. 
gemm_packc_cntl = bli_packm_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1, gemm_mr, gemm_nr, FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COLUMNS, BLIS_BUFFER_FOR_C_PANEL ); gemm_unpackc_cntl = bli_unpackm_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1, NULL ); // no blocksize needed // // Create a control tree for packing A and B, and streaming C. // // Create control tree object for lowest-level block-panel kernel. gemm_cntl_bp_ke = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT2, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control tree object for outer panel (to block-panel) // problem. gemm_cntl_op_bp = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mc, NULL, gemm_packa_cntl, gemm_packb_cntl, NULL, gemm_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. gemm_cntl_mm_op = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm_kc, NULL, NULL, NULL, NULL, gemm_cntl_op_bp, NULL ); // Create control tree object for very large problem via multiple // general problems. gemm_cntl_vl_mm = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm_nc, NULL, NULL, NULL, NULL, gemm_cntl_mm_op, NULL ); // Alias the "master" gemm control tree to a shorter name. gemm_cntl = gemm_cntl_vl_mm; #if 0 // // Create a control tree for packing A, and streaming B and C. 
// gemm_cntl_bp_ke5 = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT5, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); gemm_cntl_pm_bp = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm_kc, NULL, gemm_packa_cntl, NULL, //gemm_packc_cntl, NULL, gemm_cntl_bp_ke5, //gemm_unpackc_cntl ); NULL ); gemm_cntl_mm_pm = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mc, NULL, NULL, NULL, NULL, gemm_cntl_pm_bp, NULL ); gemm_cntl_vl_mm5 = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm_nc, NULL, NULL, NULL, NULL, gemm_cntl_mm_pm, NULL ); gemm_cntl_packa = gemm_cntl_vl_mm5; #endif }
void bli_gemm3m3_cntl_init() { // Create blocksize objects for each dimension. // NOTE: the complex blocksizes for 3m3 are generally equal to their // corresponding real domain counterparts. However, we want to promote // similar cache footprints for the micro-panels of A and B (when // compared to executing in the real domain), and since the complex // micro-panels are three times as "fat" (due to storing real, imaginary // and real+imaginary parts), we reduce KC by a factor of 2 to // compensate. Ideally, we would reduce by a factor of 3, but that // could get messy vis-a-vis keeping KC a multiple of the register // blocksizes. gemm3m3_mc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D ); gemm3m3_nc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NC_S/3, BLIS_MAXIMUM_NC_S/3, BLIS_DEFAULT_NC_D/3, BLIS_MAXIMUM_NC_D/3 ); gemm3m3_kc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KC_S, BLIS_MAXIMUM_KC_S, BLIS_DEFAULT_KC_D, BLIS_MAXIMUM_KC_D ); gemm3m3_mr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D ); gemm3m3_nr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D ); gemm3m3_kr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D ); // Attach the register blksz_t objects as blocksize multiples to the cache // blksz_t objects. bli_blksz_obj_attach_mult_to( gemm3m3_mr, gemm3m3_mc ); bli_blksz_obj_attach_mult_to( gemm3m3_nr, gemm3m3_nc ); bli_blksz_obj_attach_mult_to( gemm3m3_kr, gemm3m3_kc ); // The cache blocksizes that were scaled above need to be rounded down // to their respective nearest register blocksize multiples. Note that // this can only happen after the appropriate register blocksize is // actually attached as a multiple. 
bli_blksz_reduce_to_mult( gemm3m3_nc ); // Attach the mr and nr blksz_t objects to each cache blksz_t object. // The primary example of why this is needed relates to nudging kc. // In hemm, symm, trmm, or trmm3, we need to know both mr and nr, // since the multiple we target in nudging depends on whether the // structured matrix is on the left or the right. bli_blksz_obj_attach_mr_nr_to( gemm3m3_mr, gemm3m3_nr, gemm3m3_mc ); bli_blksz_obj_attach_mr_nr_to( gemm3m3_mr, gemm3m3_nr, gemm3m3_nc ); bli_blksz_obj_attach_mr_nr_to( gemm3m3_mr, gemm3m3_nr, gemm3m3_kc ); // Create function pointer object for each datatype-specific gemm // micro-kernel. gemm3m3_ukrs = bli_func_obj_create( NULL, FALSE, NULL, FALSE, BLIS_CGEMM3M3_UKERNEL, BLIS_CGEMM3M3_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_ZGEMM3M3_UKERNEL, BLIS_ZGEMM3M3_UKERNEL_PREFERS_CONTIG_ROWS ); // Create control tree objects for packm operations. gemm3m3_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm3m3_kr, gemm3m3_nr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COL_PANELS_3MS, BLIS_BUFFER_FOR_B_PANEL ); // // Create a control tree for packing A and B, and streaming C. // // Create control tree object for lowest-level block-panel kernel. gemm3m3_cntl_bp_ke = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT2, NULL, gemm3m3_ukrs, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control tree object for outer panel (to block-panel) // problem. gemm3m3_cntl_op_bp = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT4, gemm3m3_mc, NULL, NULL, NULL, // packm cntl nodes accessed directly from blk_var4 gemm3m3_packb_cntl, NULL, gemm3m3_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. 
gemm3m3_cntl_mm_op = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm3m3_kc, NULL, NULL, NULL, NULL, NULL, gemm3m3_cntl_op_bp, NULL ); // Create control tree object for very large problem via multiple // general problems. gemm3m3_cntl_vl_mm = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm3m3_nc, NULL, NULL, NULL, NULL, NULL, gemm3m3_cntl_mm_op, NULL ); // Alias the "master" gemm control tree to a shorter name. gemm3m3_cntl = gemm3m3_cntl_vl_mm; }
void bli_gemm4m1_cntl_init() { // Create blocksize objects for each dimension. // NOTE: the complex blocksizes for 4m1 are generally equal to their // corresponding real domain counterparts. However, we want to promote // similar cache footprints for the micro-panels of A and B (when // compared to executing in the real domain), and since the complex // micro-panels are twice as "fat" (due to storing real and imaginary // parts), we reduce KC by a factor of 2 to compensate. gemm4m1_mc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D ); gemm4m1_nc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S, BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D ); gemm4m1_kc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KC_S/2, BLIS_MAXIMUM_KC_S/2, BLIS_DEFAULT_KC_D/2, BLIS_MAXIMUM_KC_D/2 ); gemm4m1_mr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D ); gemm4m1_nr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D ); gemm4m1_kr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D ); // Attach the register blksz_t objects as blocksize multiples to the cache // blksz_t objects. bli_blksz_obj_attach_mult_to( gemm4m1_mr, gemm4m1_mc ); bli_blksz_obj_attach_mult_to( gemm4m1_nr, gemm4m1_nc ); bli_blksz_obj_attach_mult_to( gemm4m1_kr, gemm4m1_kc ); // Attach the mr and nr blksz_t objects to each cache blksz_t object. // The primary example of why this is needed relates to nudging kc. // In hemm, symm, trmm, or trmm3, we need to know both mr and nr, // since the multiple we target in nudging depends on whether the // structured matrix is on the left or the right. 
bli_blksz_obj_attach_mr_nr_to( gemm4m1_mr, gemm4m1_nr, gemm4m1_mc ); bli_blksz_obj_attach_mr_nr_to( gemm4m1_mr, gemm4m1_nr, gemm4m1_nc ); bli_blksz_obj_attach_mr_nr_to( gemm4m1_mr, gemm4m1_nr, gemm4m1_kc ); // Create function pointer object for each datatype-specific gemm // micro-kernel. gemm4m1_ukrs = bli_func_obj_create( NULL, FALSE, NULL, FALSE, BLIS_CGEMM4M1_UKERNEL, BLIS_CGEMM4M1_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_ZGEMM4M1_UKERNEL, BLIS_ZGEMM4M1_UKERNEL_PREFERS_CONTIG_ROWS ); // Create control tree objects for packm operations. gemm4m1_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4m1_mr, gemm4m1_kr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS_4MI, BLIS_BUFFER_FOR_A_BLOCK ); gemm4m1_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4m1_kr, gemm4m1_nr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COL_PANELS_4MI, BLIS_BUFFER_FOR_B_PANEL ); // // Create a control tree for packing A and B, and streaming C. // // Create control tree object for lowest-level block-panel kernel. gemm4m1_cntl_bp_ke = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT2, NULL, gemm4m1_ukrs, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control tree object for outer panel (to block-panel) // problem. gemm4m1_cntl_op_bp = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4m1_mc, NULL, NULL, gemm4m1_packa_cntl, gemm4m1_packb_cntl, NULL, gemm4m1_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. gemm4m1_cntl_mm_op = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4m1_kc, NULL, NULL, NULL, NULL, NULL, gemm4m1_cntl_op_bp, NULL ); // Create control tree object for very large problem via multiple // general problems. 
gemm4m1_cntl_vl_mm = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4m1_nc, NULL, NULL, NULL, NULL, NULL, gemm4m1_cntl_mm_op, NULL ); // Alias the "master" gemm control tree to a shorter name. gemm4m1_cntl = gemm4m1_cntl_vl_mm; }
void bli_gemm4mh_cntl_init() { // Create blocksize objects for each dimension. // NOTE: the complex blocksizes for 4mh are equal to their // corresponding real domain counterparts. gemm4mh_mc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D ); gemm4mh_nc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S, BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D ); gemm4mh_kc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KC_S, BLIS_MAXIMUM_KC_S, BLIS_DEFAULT_KC_D, BLIS_MAXIMUM_KC_D ); gemm4mh_mr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D ); gemm4mh_nr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D ); gemm4mh_kr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D ); // Attach the register blksz_t objects as blocksize multiples to the cache // blksz_t objects. bli_blksz_obj_attach_mult_to( gemm4mh_mr, gemm4mh_mc ); bli_blksz_obj_attach_mult_to( gemm4mh_nr, gemm4mh_nc ); bli_blksz_obj_attach_mult_to( gemm4mh_kr, gemm4mh_kc ); // Attach the mr and nr blksz_t objects to each cache blksz_t object. // The primary example of why this is needed relates to nudging kc. // In hemm, symm, trmm, or trmm3, we need to know both mr and nr, // since the multiple we target in nudging depends on whether the // structured matrix is on the left or the right. bli_blksz_obj_attach_mr_nr_to( gemm4mh_mr, gemm4mh_nr, gemm4mh_mc ); bli_blksz_obj_attach_mr_nr_to( gemm4mh_mr, gemm4mh_nr, gemm4mh_nc ); bli_blksz_obj_attach_mr_nr_to( gemm4mh_mr, gemm4mh_nr, gemm4mh_kc ); // Create function pointer object for each datatype-specific gemm // micro-kernel. 
gemm4mh_ukrs = bli_func_obj_create( NULL, FALSE, NULL, FALSE, BLIS_CGEMM4MH_UKERNEL, BLIS_CGEMM4MH_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_ZGEMM4MH_UKERNEL, BLIS_ZGEMM4MH_UKERNEL_PREFERS_CONTIG_ROWS ); // Create control tree objects for packm operations (real only). gemm4mh_packa_cntl_ro = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_mr, gemm4mh_kr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS_RO, BLIS_BUFFER_FOR_A_BLOCK ); gemm4mh_packb_cntl_ro = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_kr, gemm4mh_nr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COL_PANELS_RO, BLIS_BUFFER_FOR_B_PANEL ); // Create control tree objects for packm operations (imag only). gemm4mh_packa_cntl_io = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_mr, gemm4mh_kr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS_IO, BLIS_BUFFER_FOR_A_BLOCK ); gemm4mh_packb_cntl_io = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_kr, gemm4mh_nr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COL_PANELS_IO, BLIS_BUFFER_FOR_B_PANEL ); // Create control tree object for lowest-level block-panel kernel. gemm4mh_cntl_bp_ke = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT2, NULL, gemm4mh_ukrs, NULL, NULL, NULL, NULL, NULL, NULL ); // // Create control tree for A.real * B.real. // // Create control tree object for outer panel (to block-panel) // problem. 
(real x real) gemm4mh_cntl_op_bp_rr = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4mh_mc, NULL, NULL, gemm4mh_packa_cntl_ro, gemm4mh_packb_cntl_ro, NULL, gemm4mh_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. (real x real) gemm4mh_cntl_mm_op_rr = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4mh_kc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_op_bp_rr, NULL ); // Create control tree object for very large problem via multiple // general problems. (real x real) gemm4mh_cntl_vl_mm_rr = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_nc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_mm_op_rr, NULL ); // // Create control tree for A.real * B.imag. // // Create control tree object for outer panel (to block-panel) // problem. (real x imag) gemm4mh_cntl_op_bp_ri = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4mh_mc, NULL, NULL, gemm4mh_packa_cntl_ro, gemm4mh_packb_cntl_io, NULL, gemm4mh_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. (real x imag) gemm4mh_cntl_mm_op_ri = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4mh_kc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_op_bp_ri, NULL ); // Create control tree object for very large problem via multiple // general problems. (real x imag) gemm4mh_cntl_vl_mm_ri = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_nc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_mm_op_ri, NULL ); // // Create control tree for A.imag * B.real. // // Create control tree object for outer panel (to block-panel) // problem. (imag x real) gemm4mh_cntl_op_bp_ir = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4mh_mc, NULL, NULL, gemm4mh_packa_cntl_io, gemm4mh_packb_cntl_ro, NULL, gemm4mh_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. 
(imag x real) gemm4mh_cntl_mm_op_ir = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4mh_kc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_op_bp_ir, NULL ); // Create control tree object for very large problem via multiple // general problems. (imag x real) gemm4mh_cntl_vl_mm_ir = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_nc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_mm_op_ir, NULL ); // // Create control tree for A.imag * B.imag. // // Create control tree object for outer panel (to block-panel) // problem. (imag x imag) gemm4mh_cntl_op_bp_ii = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4mh_mc, NULL, NULL, gemm4mh_packa_cntl_io, gemm4mh_packb_cntl_io, NULL, gemm4mh_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. (imag x imag) gemm4mh_cntl_mm_op_ii = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4mh_kc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_op_bp_ii, NULL ); // Create control tree object for very large problem via multiple // general problems. (imag x imag) gemm4mh_cntl_vl_mm_ii = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_nc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_mm_op_ii, NULL ); // Alias the "master" gemm control tree to a shorter name. gemm4mh_cntl_rr = gemm4mh_cntl_vl_mm_rr; gemm4mh_cntl_ri = gemm4mh_cntl_vl_mm_ri; gemm4mh_cntl_ir = gemm4mh_cntl_vl_mm_ir; gemm4mh_cntl_ii = gemm4mh_cntl_vl_mm_ii; }
cntl_t* bli_gemm_cntl_create ( opid_t family )
{
    // Builds a fresh gemm control tree for the given operation family and
    // returns its root node.
    //
    // family selects the macro-kernel placed near the bottom of the tree:
    // BLIS_HERK and BLIS_TRMM get their own macro-kernels, and every other
    // value gets the gemm macro-kernel. The tree is built leaf-first, with
    // each node handed the node created before it.

    void* macro_kernel_p;

    if ( family == BLIS_HERK )
    {
        macro_kernel_p = bli_herk_x_ker_var2;
    }
    else if ( family == BLIS_TRMM )
    {
        macro_kernel_p = bli_trmm_xx_ker_var2;
    }
    else
    {
        macro_kernel_p = bli_gemm_ker_var2;
    }

    // Two nodes are created for the macro-kernel.

    // Leaf of the tree: no variant function and no sub-node. The MR
    // blocksize id is supplied because bli_thrinfo_rgrow() needs it.
    cntl_t* leaf_mr = bli_gemm_cntl_obj_create
    (
      BLIS_MR,
      NULL,
      NULL
    );

    // Macro-kernel node. NR is not used by the macro-kernel itself, but is
    // needed by bli_thrinfo_rgrow().
    cntl_t* macro_kernel_node = bli_gemm_cntl_obj_create
    (
      BLIS_NR,
      macro_kernel_p,
      leaf_mr
    );

    // Pack matrix A into row panels (MR x KR), buffered as an A block.
    cntl_t* pack_a_node = bli_packm_cntl_obj_create
    (
      bli_gemm_packa,
      bli_packm_blk_var1,
      BLIS_MR,
      BLIS_KR,
      FALSE,   // diagonal is not inverted
      FALSE,   // no reversed iteration for upper
      FALSE,   // no reversed iteration for lower
      BLIS_PACKED_ROW_PANELS,
      BLIS_BUFFER_FOR_A_BLOCK,
      macro_kernel_node
    );

    // Partition the m dimension by MC.
    cntl_t* part_mc_node = bli_gemm_cntl_obj_create
    (
      BLIS_MC,
      bli_gemm_blk_var1,
      pack_a_node
    );

    // Pack matrix B into column panels (KR x NR), buffered as a B panel.
    cntl_t* pack_b_node = bli_packm_cntl_obj_create
    (
      bli_gemm_packb,
      bli_packm_blk_var1,
      BLIS_KR,
      BLIS_NR,
      FALSE,   // diagonal is not inverted
      FALSE,   // no reversed iteration for upper
      FALSE,   // no reversed iteration for lower
      BLIS_PACKED_COL_PANELS,
      BLIS_BUFFER_FOR_B_PANEL,
      part_mc_node
    );

    // Partition the k dimension by KC.
    cntl_t* part_kc_node = bli_gemm_cntl_obj_create
    (
      BLIS_KC,
      bli_gemm_blk_var3,
      pack_b_node
    );

    // Partition the n dimension by NC; this node is the root handed back
    // to the caller.
    return bli_gemm_cntl_obj_create
    (
      BLIS_NC,
      bli_gemm_blk_var2,
      part_kc_node
    );
}
void bli_gemm_cntl_init() { // Create blocksize objects for each dimension. gemm_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S, BLIS_EXTEND_MC_S, BLIS_DEFAULT_MC_D, BLIS_EXTEND_MC_D, BLIS_DEFAULT_MC_C, BLIS_EXTEND_MC_C, BLIS_DEFAULT_MC_Z, BLIS_EXTEND_MC_Z ); gemm_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S, BLIS_EXTEND_NC_S, BLIS_DEFAULT_NC_D, BLIS_EXTEND_NC_D, BLIS_DEFAULT_NC_C, BLIS_EXTEND_NC_C, BLIS_DEFAULT_NC_Z, BLIS_EXTEND_NC_Z ); gemm_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S, BLIS_EXTEND_KC_S, BLIS_DEFAULT_KC_D, BLIS_EXTEND_KC_D, BLIS_DEFAULT_KC_C, BLIS_EXTEND_KC_C, BLIS_DEFAULT_KC_Z, BLIS_EXTEND_KC_Z ); gemm_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S, BLIS_EXTEND_MR_S, BLIS_DEFAULT_MR_D, BLIS_EXTEND_MR_D, BLIS_DEFAULT_MR_C, BLIS_EXTEND_MR_C, BLIS_DEFAULT_MR_Z, BLIS_EXTEND_MR_Z ); gemm_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S, BLIS_EXTEND_NR_S, BLIS_DEFAULT_NR_D, BLIS_EXTEND_NR_D, BLIS_DEFAULT_NR_C, BLIS_EXTEND_NR_C, BLIS_DEFAULT_NR_Z, BLIS_EXTEND_NR_Z ); gemm_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S, 0, BLIS_DEFAULT_KR_D, 0, BLIS_DEFAULT_KR_C, 0, BLIS_DEFAULT_KR_Z, 0 ); // Create function pointer object for each datatype-specific gemm // micro-kernel. gemm_ukrs = bli_func_obj_create( BLIS_SGEMM_UKERNEL, BLIS_DGEMM_UKERNEL, BLIS_CGEMM_UKERNEL, BLIS_ZGEMM_UKERNEL ); // Create control tree objects for packm operations. gemm_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mr, gemm_kr, TRUE, // densify; used by hemm/symm FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK ); gemm_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_kr, gemm_nr, TRUE, // densify; used by hemm/symm FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? 
BLIS_PACKED_COL_PANELS, BLIS_BUFFER_FOR_B_PANEL ); // // Create a control tree for packing A and B, and streaming C. // // Create control tree object for lowest-level block-panel kernel. gemm_cntl_bp_ke = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT2, NULL, gemm_ukrs, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control tree object for outer panel (to block-panel) // problem. gemm_cntl_op_bp = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mc, NULL, NULL, gemm_packa_cntl, gemm_packb_cntl, NULL, gemm_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. gemm_cntl_mm_op = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm_kc, NULL, NULL, NULL, NULL, NULL, gemm_cntl_op_bp, NULL ); // Create control tree object for very large problem via multiple // general problems. gemm_cntl_vl_mm = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm_nc, NULL, NULL, NULL, NULL, NULL, gemm_cntl_mm_op, NULL ); // Alias the "master" gemm control tree to a shorter name. gemm_cntl = gemm_cntl_vl_mm; //bli_gemm_cntl_init_exp(); }
void bli_gemm_cntl_init() { // Create blocksize objects for each dimension. gemm_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D, BLIS_DEFAULT_MC_C, BLIS_MAXIMUM_MC_C, BLIS_DEFAULT_MC_Z, BLIS_MAXIMUM_MC_Z ); gemm_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S, BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D, BLIS_DEFAULT_NC_C, BLIS_MAXIMUM_NC_C, BLIS_DEFAULT_NC_Z, BLIS_MAXIMUM_NC_Z ); gemm_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S, BLIS_MAXIMUM_KC_S, BLIS_DEFAULT_KC_D, BLIS_MAXIMUM_KC_D, BLIS_DEFAULT_KC_C, BLIS_MAXIMUM_KC_C, BLIS_DEFAULT_KC_Z, BLIS_MAXIMUM_KC_Z ); gemm_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D, BLIS_DEFAULT_MR_C, BLIS_PACKDIM_MR_C, BLIS_DEFAULT_MR_Z, BLIS_PACKDIM_MR_Z ); gemm_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D, BLIS_DEFAULT_NR_C, BLIS_PACKDIM_NR_C, BLIS_DEFAULT_NR_Z, BLIS_PACKDIM_NR_Z ); gemm_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D, BLIS_DEFAULT_KR_C, BLIS_PACKDIM_KR_C, BLIS_DEFAULT_KR_Z, BLIS_PACKDIM_KR_Z ); // Create objects for micro-panel alignment (in bytes). gemm_upanel_a_align = bli_blksz_obj_create( BLIS_UPANEL_A_ALIGN_SIZE_S, 0, BLIS_UPANEL_A_ALIGN_SIZE_D, 0, BLIS_UPANEL_A_ALIGN_SIZE_C, 0, BLIS_UPANEL_A_ALIGN_SIZE_Z, 0 ); gemm_upanel_b_align = bli_blksz_obj_create( BLIS_UPANEL_B_ALIGN_SIZE_S, 0, BLIS_UPANEL_B_ALIGN_SIZE_D, 0, BLIS_UPANEL_B_ALIGN_SIZE_C, 0, BLIS_UPANEL_B_ALIGN_SIZE_Z, 0 ); // Attach the register blksz_t objects as blocksize multiples to the cache // blksz_t objects. bli_blksz_obj_attach_mult_to( gemm_mr, gemm_mc ); bli_blksz_obj_attach_mult_to( gemm_nr, gemm_nc ); bli_blksz_obj_attach_mult_to( gemm_kr, gemm_kc ); // Attach the mr and nr blksz_t objects to each cache blksz_t object. // The primary example of why this is needed relates to nudging kc. 
// In hemm, symm, trmm, or trmm3, we need to know both mr and nr, // since the multiple we target in nudging depends on whether the // structured matrix is on the left or the right. bli_blksz_obj_attach_mr_nr_to( gemm_mr, gemm_nr, gemm_mc ); bli_blksz_obj_attach_mr_nr_to( gemm_mr, gemm_nr, gemm_nc ); bli_blksz_obj_attach_mr_nr_to( gemm_mr, gemm_nr, gemm_kc ); // Create function pointer object for each datatype-specific gemm // micro-kernel. gemm_ukrs = bli_func_obj_create( BLIS_SGEMM_UKERNEL, BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_DGEMM_UKERNEL, BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_CGEMM_UKERNEL, BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_ZGEMM_UKERNEL, BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS ); // Create function pointer object for reference micro-kernels. gemm_ref_ukrs = bli_func_obj_create( BLIS_SGEMM_UKERNEL_REF, FALSE, BLIS_DGEMM_UKERNEL_REF, FALSE, BLIS_CGEMM_UKERNEL_REF, FALSE, BLIS_ZGEMM_UKERNEL_REF, FALSE ); // Create control tree objects for packm operations. gemm_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mr, gemm_kr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK ); gemm_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_kr, gemm_nr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COL_PANELS, BLIS_BUFFER_FOR_B_PANEL ); // // Create a control tree for packing A and B, and streaming C. // // Create control tree object for lowest-level block-panel kernel. gemm_cntl_bp_ke = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT2, NULL, gemm_ukrs, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control tree object for outer panel (to block-panel) // problem. 
gemm_cntl_op_bp = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mc, NULL, NULL, gemm_packa_cntl, gemm_packb_cntl, NULL, gemm_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. gemm_cntl_mm_op = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm_kc, NULL, NULL, NULL, NULL, NULL, gemm_cntl_op_bp, NULL ); // Create control tree object for very large problem via multiple // general problems. gemm_cntl_vl_mm = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm_nc, NULL, NULL, NULL, NULL, NULL, gemm_cntl_mm_op, NULL ); // Alias the "master" gemm control tree to a shorter name. gemm_cntl = gemm_cntl_vl_mm; }