void bli_gemm4mb_cntl_init() { // Create blocksize objects for each dimension. gemm4mb_mc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MC_S/2, BLIS_MAXIMUM_MC_S/2, BLIS_DEFAULT_MC_D/2, BLIS_MAXIMUM_MC_D/2 ); gemm4mb_nc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NC_S/2, BLIS_MAXIMUM_NC_S/2, BLIS_DEFAULT_NC_D/2, BLIS_MAXIMUM_NC_D/2 ); gemm4mb_kc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KC_S, BLIS_MAXIMUM_KC_S, BLIS_DEFAULT_KC_D, BLIS_MAXIMUM_KC_D ); gemm4mb_mr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D ); gemm4mb_nr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D ); gemm4mb_kr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D ); // Attach the register blksz_t objects as blocksize multiples to the cache // blksz_t objects. bli_blksz_obj_attach_mult_to( gemm4mb_mr, gemm4mb_mc ); bli_blksz_obj_attach_mult_to( gemm4mb_nr, gemm4mb_nc ); bli_blksz_obj_attach_mult_to( gemm4mb_kr, gemm4mb_kc ); // The cache blocksizes that were scaled above need to be rounded down // to their respective nearest register blocksize multiples. Note that // this can only happen after the appropriate register blocksize is // actually attached as a multiple. bli_blksz_reduce_to_mult( gemm4mb_mc ); bli_blksz_reduce_to_mult( gemm4mb_nc ); // Attach the mr and nr blksz_t objects to each cache blksz_t object. // The primary example of why this is needed relates to nudging kc. // In hemm, symm, trmm, or trmm3, we need to know both mr and nr, // since the multiple we target in nudging depends on whether the // structured matrix is on the left or the right. bli_blksz_obj_attach_mr_nr_to( gemm4mb_mr, gemm4mb_nr, gemm4mb_mc ); bli_blksz_obj_attach_mr_nr_to( gemm4mb_mr, gemm4mb_nr, gemm4mb_nc ); bli_blksz_obj_attach_mr_nr_to( gemm4mb_mr, gemm4mb_nr, gemm4mb_kc ); // Create function pointer object for each datatype-specific gemm // micro-kernel. gemm4mb_ukrs = bli_func_obj_create( NULL, FALSE, NULL, FALSE, BLIS_CGEMM4MB_UKERNEL, BLIS_CGEMM4MB_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_ZGEMM4MB_UKERNEL, BLIS_ZGEMM4MB_UKERNEL_PREFERS_CONTIG_ROWS ); // Create control tree objects for packm operations. gemm4mb_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mb_mr, gemm4mb_kr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS_4MI, BLIS_BUFFER_FOR_A_BLOCK ); gemm4mb_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mb_kr, gemm4mb_nr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COL_PANELS_4MI, BLIS_BUFFER_FOR_B_PANEL ); // // Create a control tree for packing A and B, and streaming C. // // Create control tree object for lowest-level block-panel kernel. gemm4mb_cntl_bp_ke = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT3, NULL, gemm4mb_ukrs, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control tree object for outer panel (to block-panel) // problem. gemm4mb_cntl_op_bp = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4mb_mc, NULL, NULL, gemm4mb_packa_cntl, gemm4mb_packb_cntl, NULL, gemm4mb_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. gemm4mb_cntl_mm_op = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4mb_kc, NULL, NULL, NULL, NULL, NULL, gemm4mb_cntl_op_bp, NULL ); // Create control tree object for very large problem via multiple // general problems. gemm4mb_cntl_vl_mm = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mb_nc, NULL, NULL, NULL, NULL, NULL, gemm4mb_cntl_mm_op, NULL ); // Alias the "master" gemm control tree to a shorter name. gemm4mb_cntl = gemm4mb_cntl_vl_mm; }
void bli_gemm3m3_cntl_init() { // Create blocksize objects for each dimension. // NOTE: the complex blocksizes for 3m3 are generally equal to their // corresponding real domain counterparts. However, we want to promote // similar cache footprints for the micro-panels of A and B (when // compared to executing in the real domain), and since the complex // micro-panels are three times as "fat" (due to storing real, imaginary // and real+imaginary parts), we reduce KC by a factor of 2 to // compensate. Ideally, we would reduce by a factor of 3, but that // could get messy vis-a-vis keeping KC a multiple of the register // blocksizes. gemm3m3_mc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D ); gemm3m3_nc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NC_S/3, BLIS_MAXIMUM_NC_S/3, BLIS_DEFAULT_NC_D/3, BLIS_MAXIMUM_NC_D/3 ); gemm3m3_kc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KC_S, BLIS_MAXIMUM_KC_S, BLIS_DEFAULT_KC_D, BLIS_MAXIMUM_KC_D ); gemm3m3_mr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D ); gemm3m3_nr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D ); gemm3m3_kr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D ); // Attach the register blksz_t objects as blocksize multiples to the cache // blksz_t objects. bli_blksz_obj_attach_mult_to( gemm3m3_mr, gemm3m3_mc ); bli_blksz_obj_attach_mult_to( gemm3m3_nr, gemm3m3_nc ); bli_blksz_obj_attach_mult_to( gemm3m3_kr, gemm3m3_kc ); // The cache blocksizes that were scaled above need to be rounded down // to their respective nearest register blocksize multiples. Note that // this can only happen after the appropriate register blocksize is // actually attached as a multiple. bli_blksz_reduce_to_mult( gemm3m3_nc ); // Attach the mr and nr blksz_t objects to each cache blksz_t object. // The primary example of why this is needed relates to nudging kc. // In hemm, symm, trmm, or trmm3, we need to know both mr and nr, // since the multiple we target in nudging depends on whether the // structured matrix is on the left or the right. bli_blksz_obj_attach_mr_nr_to( gemm3m3_mr, gemm3m3_nr, gemm3m3_mc ); bli_blksz_obj_attach_mr_nr_to( gemm3m3_mr, gemm3m3_nr, gemm3m3_nc ); bli_blksz_obj_attach_mr_nr_to( gemm3m3_mr, gemm3m3_nr, gemm3m3_kc ); // Create function pointer object for each datatype-specific gemm // micro-kernel. gemm3m3_ukrs = bli_func_obj_create( NULL, FALSE, NULL, FALSE, BLIS_CGEMM3M3_UKERNEL, BLIS_CGEMM3M3_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_ZGEMM3M3_UKERNEL, BLIS_ZGEMM3M3_UKERNEL_PREFERS_CONTIG_ROWS ); // Create control tree objects for packm operations. gemm3m3_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm3m3_kr, gemm3m3_nr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COL_PANELS_3MS, BLIS_BUFFER_FOR_B_PANEL ); // // Create a control tree for packing A and B, and streaming C. // // Create control tree object for lowest-level block-panel kernel. gemm3m3_cntl_bp_ke = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT2, NULL, gemm3m3_ukrs, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control tree object for outer panel (to block-panel) // problem. gemm3m3_cntl_op_bp = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT4, gemm3m3_mc, NULL, NULL, NULL, // packm cntl nodes accessed directly from blk_var4 gemm3m3_packb_cntl, NULL, gemm3m3_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. gemm3m3_cntl_mm_op = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm3m3_kc, NULL, NULL, NULL, NULL, NULL, gemm3m3_cntl_op_bp, NULL ); // Create control tree object for very large problem via multiple // general problems. gemm3m3_cntl_vl_mm = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm3m3_nc, NULL, NULL, NULL, NULL, NULL, gemm3m3_cntl_mm_op, NULL ); // Alias the "master" gemm control tree to a shorter name. gemm3m3_cntl = gemm3m3_cntl_vl_mm; }
void bli_gemm4mh_cntl_init() { // Create blocksize objects for each dimension. // NOTE: the complex blocksizes for 4mh are equal to their // corresponding real domain counterparts. gemm4mh_mc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D ); gemm4mh_nc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S, BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D ); gemm4mh_kc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KC_S, BLIS_MAXIMUM_KC_S, BLIS_DEFAULT_KC_D, BLIS_MAXIMUM_KC_D ); gemm4mh_mr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D ); gemm4mh_nr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D ); gemm4mh_kr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D ); // Attach the register blksz_t objects as blocksize multiples to the cache // blksz_t objects. bli_blksz_obj_attach_mult_to( gemm4mh_mr, gemm4mh_mc ); bli_blksz_obj_attach_mult_to( gemm4mh_nr, gemm4mh_nc ); bli_blksz_obj_attach_mult_to( gemm4mh_kr, gemm4mh_kc ); // Attach the mr and nr blksz_t objects to each cache blksz_t object. // The primary example of why this is needed relates to nudging kc. // In hemm, symm, trmm, or trmm3, we need to know both mr and nr, // since the multiple we target in nudging depends on whether the // structured matrix is on the left or the right. bli_blksz_obj_attach_mr_nr_to( gemm4mh_mr, gemm4mh_nr, gemm4mh_mc ); bli_blksz_obj_attach_mr_nr_to( gemm4mh_mr, gemm4mh_nr, gemm4mh_nc ); bli_blksz_obj_attach_mr_nr_to( gemm4mh_mr, gemm4mh_nr, gemm4mh_kc ); // Create function pointer object for each datatype-specific gemm // micro-kernel. gemm4mh_ukrs = bli_func_obj_create( NULL, FALSE, NULL, FALSE, BLIS_CGEMM4MH_UKERNEL, BLIS_CGEMM4MH_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_ZGEMM4MH_UKERNEL, BLIS_ZGEMM4MH_UKERNEL_PREFERS_CONTIG_ROWS ); // Create control tree objects for packm operations (real only). gemm4mh_packa_cntl_ro = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_mr, gemm4mh_kr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS_RO, BLIS_BUFFER_FOR_A_BLOCK ); gemm4mh_packb_cntl_ro = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_kr, gemm4mh_nr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COL_PANELS_RO, BLIS_BUFFER_FOR_B_PANEL ); // Create control tree objects for packm operations (imag only). gemm4mh_packa_cntl_io = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_mr, gemm4mh_kr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS_IO, BLIS_BUFFER_FOR_A_BLOCK ); gemm4mh_packb_cntl_io = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_kr, gemm4mh_nr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COL_PANELS_IO, BLIS_BUFFER_FOR_B_PANEL ); // Create control tree object for lowest-level block-panel kernel. gemm4mh_cntl_bp_ke = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT2, NULL, gemm4mh_ukrs, NULL, NULL, NULL, NULL, NULL, NULL ); // // Create control tree for A.real * B.real. // // Create control tree object for outer panel (to block-panel) // problem. (real x real) gemm4mh_cntl_op_bp_rr = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4mh_mc, NULL, NULL, gemm4mh_packa_cntl_ro, gemm4mh_packb_cntl_ro, NULL, gemm4mh_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. (real x real) gemm4mh_cntl_mm_op_rr = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4mh_kc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_op_bp_rr, NULL ); // Create control tree object for very large problem via multiple // general problems. (real x real) gemm4mh_cntl_vl_mm_rr = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_nc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_mm_op_rr, NULL ); // // Create control tree for A.real * B.imag. // // Create control tree object for outer panel (to block-panel) // problem. (real x imag) gemm4mh_cntl_op_bp_ri = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4mh_mc, NULL, NULL, gemm4mh_packa_cntl_ro, gemm4mh_packb_cntl_io, NULL, gemm4mh_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. (real x imag) gemm4mh_cntl_mm_op_ri = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4mh_kc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_op_bp_ri, NULL ); // Create control tree object for very large problem via multiple // general problems. (real x imag) gemm4mh_cntl_vl_mm_ri = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_nc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_mm_op_ri, NULL ); // // Create control tree for A.imag * B.real. // // Create control tree object for outer panel (to block-panel) // problem. (imag x real) gemm4mh_cntl_op_bp_ir = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4mh_mc, NULL, NULL, gemm4mh_packa_cntl_io, gemm4mh_packb_cntl_ro, NULL, gemm4mh_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. (imag x real) gemm4mh_cntl_mm_op_ir = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4mh_kc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_op_bp_ir, NULL ); // Create control tree object for very large problem via multiple // general problems. (imag x real) gemm4mh_cntl_vl_mm_ir = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_nc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_mm_op_ir, NULL ); // // Create control tree for A.imag * B.imag. // // Create control tree object for outer panel (to block-panel) // problem. (imag x imag) gemm4mh_cntl_op_bp_ii = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4mh_mc, NULL, NULL, gemm4mh_packa_cntl_io, gemm4mh_packb_cntl_io, NULL, gemm4mh_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. (imag x imag) gemm4mh_cntl_mm_op_ii = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4mh_kc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_op_bp_ii, NULL ); // Create control tree object for very large problem via multiple // general problems. (imag x imag) gemm4mh_cntl_vl_mm_ii = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4mh_nc, NULL, NULL, NULL, NULL, NULL, gemm4mh_cntl_mm_op_ii, NULL ); // Alias the "master" gemm control tree to a shorter name. gemm4mh_cntl_rr = gemm4mh_cntl_vl_mm_rr; gemm4mh_cntl_ri = gemm4mh_cntl_vl_mm_ri; gemm4mh_cntl_ir = gemm4mh_cntl_vl_mm_ir; gemm4mh_cntl_ii = gemm4mh_cntl_vl_mm_ii; }
void bli_gemm4m1_cntl_init() { // Create blocksize objects for each dimension. // NOTE: the complex blocksizes for 4m1 are generally equal to their // corresponding real domain counterparts. However, we want to promote // similar cache footprints for the micro-panels of A and B (when // compared to executing in the real domain), and since the complex // micro-panels are twice as "fat" (due to storing real and imaginary // parts), we reduce KC by a factor of 2 to compensate. gemm4m1_mc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D ); gemm4m1_nc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S, BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D ); gemm4m1_kc = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KC_S/2, BLIS_MAXIMUM_KC_S/2, BLIS_DEFAULT_KC_D/2, BLIS_MAXIMUM_KC_D/2 ); gemm4m1_mr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D ); gemm4m1_nr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D ); gemm4m1_kr = bli_blksz_obj_create( 0, 0, 0, 0, BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D ); // Attach the register blksz_t objects as blocksize multiples to the cache // blksz_t objects. bli_blksz_obj_attach_mult_to( gemm4m1_mr, gemm4m1_mc ); bli_blksz_obj_attach_mult_to( gemm4m1_nr, gemm4m1_nc ); bli_blksz_obj_attach_mult_to( gemm4m1_kr, gemm4m1_kc ); // Attach the mr and nr blksz_t objects to each cache blksz_t object. // The primary example of why this is needed relates to nudging kc. // In hemm, symm, trmm, or trmm3, we need to know both mr and nr, // since the multiple we target in nudging depends on whether the // structured matrix is on the left or the right. bli_blksz_obj_attach_mr_nr_to( gemm4m1_mr, gemm4m1_nr, gemm4m1_mc ); bli_blksz_obj_attach_mr_nr_to( gemm4m1_mr, gemm4m1_nr, gemm4m1_nc ); bli_blksz_obj_attach_mr_nr_to( gemm4m1_mr, gemm4m1_nr, gemm4m1_kc ); // Create function pointer object for each datatype-specific gemm // micro-kernel. gemm4m1_ukrs = bli_func_obj_create( NULL, FALSE, NULL, FALSE, BLIS_CGEMM4M1_UKERNEL, BLIS_CGEMM4M1_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_ZGEMM4M1_UKERNEL, BLIS_ZGEMM4M1_UKERNEL_PREFERS_CONTIG_ROWS ); // Create control tree objects for packm operations. gemm4m1_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4m1_mr, gemm4m1_kr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS_4MI, BLIS_BUFFER_FOR_A_BLOCK ); gemm4m1_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4m1_kr, gemm4m1_nr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COL_PANELS_4MI, BLIS_BUFFER_FOR_B_PANEL ); // // Create a control tree for packing A and B, and streaming C. // // Create control tree object for lowest-level block-panel kernel. gemm4m1_cntl_bp_ke = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT2, NULL, gemm4m1_ukrs, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control tree object for outer panel (to block-panel) // problem. gemm4m1_cntl_op_bp = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4m1_mc, NULL, NULL, gemm4m1_packa_cntl, gemm4m1_packb_cntl, NULL, gemm4m1_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. gemm4m1_cntl_mm_op = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4m1_kc, NULL, NULL, NULL, NULL, NULL, gemm4m1_cntl_op_bp, NULL ); // Create control tree object for very large problem via multiple // general problems. gemm4m1_cntl_vl_mm = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4m1_nc, NULL, NULL, NULL, NULL, NULL, gemm4m1_cntl_mm_op, NULL ); // Alias the "master" gemm control tree to a shorter name. gemm4m1_cntl = gemm4m1_cntl_vl_mm; }
void bli_gemm_cntl_init() { // Create blocksize objects for each dimension. gemm_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D, BLIS_DEFAULT_MC_C, BLIS_MAXIMUM_MC_C, BLIS_DEFAULT_MC_Z, BLIS_MAXIMUM_MC_Z ); gemm_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S, BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D, BLIS_DEFAULT_NC_C, BLIS_MAXIMUM_NC_C, BLIS_DEFAULT_NC_Z, BLIS_MAXIMUM_NC_Z ); gemm_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S, BLIS_MAXIMUM_KC_S, BLIS_DEFAULT_KC_D, BLIS_MAXIMUM_KC_D, BLIS_DEFAULT_KC_C, BLIS_MAXIMUM_KC_C, BLIS_DEFAULT_KC_Z, BLIS_MAXIMUM_KC_Z ); gemm_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D, BLIS_DEFAULT_MR_C, BLIS_PACKDIM_MR_C, BLIS_DEFAULT_MR_Z, BLIS_PACKDIM_MR_Z ); gemm_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D, BLIS_DEFAULT_NR_C, BLIS_PACKDIM_NR_C, BLIS_DEFAULT_NR_Z, BLIS_PACKDIM_NR_Z ); gemm_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D, BLIS_DEFAULT_KR_C, BLIS_PACKDIM_KR_C, BLIS_DEFAULT_KR_Z, BLIS_PACKDIM_KR_Z ); // Create objects for micro-panel alignment (in bytes). gemm_upanel_a_align = bli_blksz_obj_create( BLIS_UPANEL_A_ALIGN_SIZE_S, 0, BLIS_UPANEL_A_ALIGN_SIZE_D, 0, BLIS_UPANEL_A_ALIGN_SIZE_C, 0, BLIS_UPANEL_A_ALIGN_SIZE_Z, 0 ); gemm_upanel_b_align = bli_blksz_obj_create( BLIS_UPANEL_B_ALIGN_SIZE_S, 0, BLIS_UPANEL_B_ALIGN_SIZE_D, 0, BLIS_UPANEL_B_ALIGN_SIZE_C, 0, BLIS_UPANEL_B_ALIGN_SIZE_Z, 0 ); // Attach the register blksz_t objects as blocksize multiples to the cache // blksz_t objects. bli_blksz_obj_attach_mult_to( gemm_mr, gemm_mc ); bli_blksz_obj_attach_mult_to( gemm_nr, gemm_nc ); bli_blksz_obj_attach_mult_to( gemm_kr, gemm_kc ); // Attach the mr and nr blksz_t objects to each cache blksz_t object. // The primary example of why this is needed relates to nudging kc. // In hemm, symm, trmm, or trmm3, we need to know both mr and nr, // since the multiple we target in nudging depends on whether the // structured matrix is on the left or the right. bli_blksz_obj_attach_mr_nr_to( gemm_mr, gemm_nr, gemm_mc ); bli_blksz_obj_attach_mr_nr_to( gemm_mr, gemm_nr, gemm_nc ); bli_blksz_obj_attach_mr_nr_to( gemm_mr, gemm_nr, gemm_kc ); // Create function pointer object for each datatype-specific gemm // micro-kernel. gemm_ukrs = bli_func_obj_create( BLIS_SGEMM_UKERNEL, BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_DGEMM_UKERNEL, BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_CGEMM_UKERNEL, BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS, BLIS_ZGEMM_UKERNEL, BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS ); // Create function pointer object for reference micro-kernels. gemm_ref_ukrs = bli_func_obj_create( BLIS_SGEMM_UKERNEL_REF, FALSE, BLIS_DGEMM_UKERNEL_REF, FALSE, BLIS_CGEMM_UKERNEL_REF, FALSE, BLIS_ZGEMM_UKERNEL_REF, FALSE ); // Create control tree objects for packm operations. gemm_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mr, gemm_kr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK ); gemm_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_kr, gemm_nr, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COL_PANELS, BLIS_BUFFER_FOR_B_PANEL ); // // Create a control tree for packing A and B, and streaming C. // // Create control tree object for lowest-level block-panel kernel. gemm_cntl_bp_ke = bli_gemm_cntl_obj_create( BLIS_UNB_OPT, BLIS_VARIANT2, NULL, gemm_ukrs, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control tree object for outer panel (to block-panel) // problem. gemm_cntl_op_bp = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mc, NULL, NULL, gemm_packa_cntl, gemm_packb_cntl, NULL, gemm_cntl_bp_ke, NULL ); // Create control tree object for general problem via multiple // rank-k (outer panel) updates. gemm_cntl_mm_op = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm_kc, NULL, NULL, NULL, NULL, NULL, gemm_cntl_op_bp, NULL ); // Create control tree object for very large problem via multiple // general problems. gemm_cntl_vl_mm = bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm_nc, NULL, NULL, NULL, NULL, NULL, gemm_cntl_mm_op, NULL ); // Alias the "master" gemm control tree to a shorter name. gemm_cntl = gemm_cntl_vl_mm; }